aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-07-28 18:07:55 -0400
committerIngo Molnar <mingo@elte.hu>2008-07-28 18:07:55 -0400
commitcb28a1bbdb4790378e7366d6c9ee1d2340b84f92 (patch)
tree316436f77dac75335fd2c3ef5f109e71606c50d3 /arch/x86
parentb6d4f7e3ef25beb8c658c97867d98883e69dc544 (diff)
parentf934fb19ef34730263e6afc01e8ec27a8a71470f (diff)
Merge branch 'linus' into core/generic-dma-coherent
Conflicts: arch/x86/Kconfig Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig37
-rw-r--r--arch/x86/Kconfig.cpu6
-rw-r--r--arch/x86/Kconfig.debug11
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/boot/compressed/misc.c39
-rw-r--r--arch/x86/boot/edd.c5
-rw-r--r--arch/x86/boot/pm.c6
-rw-r--r--arch/x86/configs/i386_defconfig2
-rw-r--r--arch/x86/configs/x86_64_defconfig2
-rw-r--r--arch/x86/ia32/ia32_aout.c6
-rw-r--r--arch/x86/ia32/ia32_signal.c11
-rw-r--r--arch/x86/ia32/ia32entry.S115
-rw-r--r--arch/x86/ia32/sys_ia32.c2
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/acpi/boot.c6
-rw-r--r--arch/x86/kernel/acpi/cstate.c3
-rw-r--r--arch/x86/kernel/acpi/sleep.c14
-rw-r--r--arch/x86/kernel/amd_iommu.c235
-rw-r--r--arch/x86/kernel/amd_iommu_init.c361
-rw-r--r--arch/x86/kernel/aperture_64.c1
-rw-r--r--arch/x86/kernel/apic_32.c175
-rw-r--r--arch/x86/kernel/apic_64.c26
-rw-r--r--arch/x86/kernel/apm_32.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c11
-rw-r--r--arch/x86/kernel/bios_uv.c48
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/amd_64.c2
-rw-r--r--arch/x86/kernel/cpu/bugs.c23
-rw-r--r--arch/x86/kernel/cpu/common_64.c15
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c16
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c23
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c157
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c7
-rw-r--r--arch/x86/kernel/cpu/intel.c10
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c12
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c20
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c1
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpu/proc.c2
-rw-r--r--arch/x86/kernel/cpuid.c4
-rw-r--r--arch/x86/kernel/e820.c33
-rw-r--r--arch/x86/kernel/early-quirks.c5
-rw-r--r--arch/x86/kernel/entry_32.S79
-rw-r--r--arch/x86/kernel/entry_64.S175
-rw-r--r--arch/x86/kernel/genapic_flat_64.c2
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c27
-rw-r--r--arch/x86/kernel/head64.c11
-rw-r--r--arch/x86/kernel/head_32.S8
-rw-r--r--arch/x86/kernel/head_64.S1
-rw-r--r--arch/x86/kernel/hpet.c10
-rw-r--r--arch/x86/kernel/io_apic_32.c53
-rw-r--r--arch/x86/kernel/io_apic_64.c53
-rw-r--r--arch/x86/kernel/io_delay.c3
-rw-r--r--arch/x86/kernel/ipi.c6
-rw-r--r--arch/x86/kernel/irq_32.c7
-rw-r--r--arch/x86/kernel/irqinit_64.c5
-rw-r--r--arch/x86/kernel/kdebugfs.c8
-rw-r--r--arch/x86/kernel/kprobes.c7
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_32.c39
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/microcode.c23
-rw-r--r--arch/x86/kernel/module_64.c11
-rw-r--r--arch/x86/kernel/mpparse.c208
-rw-r--r--arch/x86/kernel/msr.c4
-rw-r--r--arch/x86/kernel/nmi.c11
-rw-r--r--arch/x86/kernel/numaq_32.c197
-rw-r--r--arch/x86/kernel/paravirt.c31
-rw-r--r--arch/x86/kernel/pci-calgary_64.c160
-rw-r--r--arch/x86/kernel/pci-dma.c50
-rw-r--r--arch/x86/kernel/pci-gart_64.c8
-rw-r--r--arch/x86/kernel/pci-nommu.c16
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c4
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c58
-rw-r--r--arch/x86/kernel/ptrace.c151
-rw-r--r--arch/x86/kernel/reboot.c22
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S174
-rw-r--r--arch/x86/kernel/setup.c29
-rw-r--r--arch/x86/kernel/setup_percpu.c6
-rw-r--r--arch/x86/kernel/signal_32.c11
-rw-r--r--arch/x86/kernel/signal_64.c62
-rw-r--r--arch/x86/kernel/smpboot.c116
-rw-r--r--arch/x86/kernel/smpcommon_32.c1
-rw-r--r--arch/x86/kernel/step.c35
-rw-r--r--arch/x86/kernel/syscall_table_32.S6
-rw-r--r--arch/x86/kernel/time_32.c1
-rw-r--r--arch/x86/kernel/traps_32.c118
-rw-r--r--arch/x86/kernel/traps_64.c48
-rw-r--r--arch/x86/kernel/visws_quirks.c42
-rw-r--r--arch/x86/kernel/vmi_32.c1
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/i8254.c24
-rw-r--r--arch/x86/kvm/i8259.c9
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/lapic.c14
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c69
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/paging_tmpl.h28
-rw-r--r--arch/x86/kvm/svm.c141
-rw-r--r--arch/x86/kvm/vmx.c252
-rw-r--r--arch/x86/kvm/vmx.h12
-rw-r--r--arch/x86/kvm/x86.c406
-rw-r--r--arch/x86/kvm/x86_emulate.c257
-rw-r--r--arch/x86/lguest/boot.c1
-rw-r--r--arch/x86/mach-default/setup.c34
-rw-r--r--arch/x86/mach-es7000/es7000plat.c8
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/discontig_32.c3
-rw-r--r--arch/x86/mm/dump_pagetables.c10
-rw-r--r--arch/x86/mm/gup.c295
-rw-r--r--arch/x86/mm/hugetlbpage.c78
-rw-r--r--arch/x86/mm/init_32.c5
-rw-r--r--arch/x86/mm/init_64.c149
-rw-r--r--arch/x86/mm/ioremap.c8
-rw-r--r--arch/x86/mm/memtest.c123
-rw-r--r--arch/x86/mm/numa_64.c8
-rw-r--r--arch/x86/mm/pat.c94
-rw-r--r--arch/x86/mm/pgtable_32.c47
-rw-r--r--arch/x86/oprofile/nmi_int.c36
-rw-r--r--arch/x86/pci/Makefile12
-rw-r--r--arch/x86/pci/early.c16
-rw-r--r--arch/x86/pci/i386.c1
-rw-r--r--arch/x86/pci/legacy.c9
-rw-r--r--arch/x86/pci/numaq_32.c (renamed from arch/x86/pci/numa.c)4
-rw-r--r--arch/x86/pci/pci.h3
-rw-r--r--arch/x86/pci/visws.c23
-rw-r--r--arch/x86/vdso/Makefile2
-rw-r--r--arch/x86/vdso/vdso32-setup.c19
-rw-r--r--arch/x86/vdso/vdso32.S13
-rw-r--r--arch/x86/vdso/vma.c11
-rw-r--r--arch/x86/xen/Kconfig14
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten.c697
-rw-r--r--arch/x86/xen/mmu.c324
-rw-r--r--arch/x86/xen/mmu.h29
-rw-r--r--arch/x86/xen/multicalls.c1
-rw-r--r--arch/x86/xen/setup.c79
-rw-r--r--arch/x86/xen/smp.c310
-rw-r--r--arch/x86/xen/suspend.c5
-rw-r--r--arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S)0
-rw-r--r--arch/x86/xen/xen-asm_64.S271
-rw-r--r--arch/x86/xen/xen-head.S28
-rw-r--r--arch/x86/xen/xen-ops.h21
151 files changed, 5183 insertions, 2431 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 18a58ecfe684..3d0f2b6a5a16 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,13 +21,17 @@ config X86
21 select HAVE_UNSTABLE_SCHED_CLOCK 21 select HAVE_UNSTABLE_SCHED_CLOCK
22 select HAVE_IDE 22 select HAVE_IDE
23 select HAVE_OPROFILE 23 select HAVE_OPROFILE
24 select HAVE_IOREMAP_PROT
25 select HAVE_GET_USER_PAGES_FAST
24 select HAVE_KPROBES 26 select HAVE_KPROBES
27 select ARCH_WANT_OPTIONAL_GPIOLIB
25 select HAVE_KRETPROBES 28 select HAVE_KRETPROBES
26 select HAVE_DYNAMIC_FTRACE 29 select HAVE_DYNAMIC_FTRACE
27 select HAVE_FTRACE 30 select HAVE_FTRACE
28 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 31 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
29 select HAVE_ARCH_KGDB if !X86_VOYAGER 32 select HAVE_ARCH_KGDB if !X86_VOYAGER
30 select HAVE_GENERIC_DMA_COHERENT if X86_32 33 select HAVE_GENERIC_DMA_COHERENT if X86_32
34 select HAVE_EFFICIENT_UNALIGNED_ACCESS
31 35
32config ARCH_DEFCONFIG 36config ARCH_DEFCONFIG
33 string 37 string
@@ -330,20 +334,6 @@ config X86_BIGSMP
330 334
331endif 335endif
332 336
333config X86_RDC321X
334 bool "RDC R-321x SoC"
335 depends on X86_32
336 select M486
337 select X86_REBOOTFIXUPS
338 select GENERIC_GPIO
339 select LEDS_CLASS
340 select LEDS_GPIO
341 select NEW_LEDS
342 help
343 This option is needed for RDC R-321x system-on-chip, also known
344 as R-8610-(G).
345 If you don't have one of these chips, you should say N here.
346
347config X86_VSMP 337config X86_VSMP
348 bool "Support for ScaleMP vSMP" 338 bool "Support for ScaleMP vSMP"
349 select PARAVIRT 339 select PARAVIRT
@@ -367,6 +357,16 @@ config X86_VISWS
367 A kernel compiled for the Visual Workstation will run on general 357 A kernel compiled for the Visual Workstation will run on general
368 PCs as well. See <file:Documentation/sgi-visws.txt> for details. 358 PCs as well. See <file:Documentation/sgi-visws.txt> for details.
369 359
360config X86_RDC321X
361 bool "RDC R-321x SoC"
362 depends on X86_32
363 select M486
364 select X86_REBOOTFIXUPS
365 help
366 This option is needed for RDC R-321x system-on-chip, also known
367 as R-8610-(G).
368 If you don't have one of these chips, you should say N here.
369
370config SCHED_NO_NO_OMIT_FRAME_POINTER 370config SCHED_NO_NO_OMIT_FRAME_POINTER
371 def_bool y 371 def_bool y
372 prompt "Single-depth WCHAN output" 372 prompt "Single-depth WCHAN output"
@@ -448,7 +448,6 @@ config PARAVIRT_DEBUG
448 448
449config MEMTEST 449config MEMTEST
450 bool "Memtest" 450 bool "Memtest"
451 depends on X86_64
452 help 451 help
453 This option adds a kernel parameter 'memtest', which allows memtest 452 This option adds a kernel parameter 'memtest', which allows memtest
454 to be set. 453 to be set.
@@ -1278,6 +1277,14 @@ config CRASH_DUMP
1278 (CONFIG_RELOCATABLE=y). 1277 (CONFIG_RELOCATABLE=y).
1279 For more details see Documentation/kdump/kdump.txt 1278 For more details see Documentation/kdump/kdump.txt
1280 1279
1280config KEXEC_JUMP
1281 bool "kexec jump (EXPERIMENTAL)"
1282 depends on EXPERIMENTAL
1283 depends on KEXEC && HIBERNATION && X86_32
1284 help
1285 Jump between original kernel and kexeced kernel and invoke
1286 code in physical address mode via KEXEC
1287
1281config PHYSICAL_START 1288config PHYSICAL_START
1282 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1289 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
1283 default "0x1000000" if X86_NUMAQ 1290 default "0x1000000" if X86_NUMAQ
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index abff1b84ed5b..2c518fbc52ec 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -362,10 +362,6 @@ config X86_ALIGNMENT_16
362 def_bool y 362 def_bool y
363 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 363 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
364 364
365config X86_GOOD_APIC
366 def_bool y
367 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
368
369config X86_INTEL_USERCOPY 365config X86_INTEL_USERCOPY
370 def_bool y 366 def_bool y
371 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 367 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
@@ -418,4 +414,4 @@ config X86_MINIMUM_CPU_FAMILY
418 414
419config X86_DEBUGCTLMSR 415config X86_DEBUGCTLMSR
420 def_bool y 416 def_bool y
421 depends on !(M586MMX || M586TSC || M586 || M486 || M386) 417 depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index ae36bfa814e5..092f019e033a 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,13 +5,15 @@ config TRACE_IRQFLAGS_SUPPORT
5 5
6source "lib/Kconfig.debug" 6source "lib/Kconfig.debug"
7 7
8config NONPROMISC_DEVMEM 8config STRICT_DEVMEM
9 bool "Filter access to /dev/mem" 9 bool "Filter access to /dev/mem"
10 help 10 help
11 If this option is left off, you allow userspace access to all 11 If this option is disabled, you allow userspace (root) access to all
12 of memory, including kernel and userspace memory. Accidental 12 of memory, including kernel and userspace memory. Accidental
13 access to this is obviously disastrous, but specific access can 13 access to this is obviously disastrous, but specific access can
14 be used by people debugging the kernel. 14 be used by people debugging the kernel. Note that with PAT support
15 enabled, even in this case there are restrictions on /dev/mem
16 use due to the cache aliasing requirements.
15 17
16 If this option is switched on, the /dev/mem file only allows 18 If this option is switched on, the /dev/mem file only allows
17 userspace access to PCI space and the BIOS code and data regions. 19 userspace access to PCI space and the BIOS code and data regions.
@@ -287,7 +289,6 @@ config CPA_DEBUG
287 289
288config OPTIMIZE_INLINING 290config OPTIMIZE_INLINING
289 bool "Allow gcc to uninline functions marked 'inline'" 291 bool "Allow gcc to uninline functions marked 'inline'"
290 depends on BROKEN
291 help 292 help
292 This option determines if the kernel forces gcc to inline the functions 293 This option determines if the kernel forces gcc to inline the functions
293 developers have marked 'inline'. Doing so takes away freedom from gcc to 294 developers have marked 'inline'. Doing so takes away freedom from gcc to
@@ -298,5 +299,7 @@ config OPTIMIZE_INLINING
298 become the default in the future, until then this option is there to 299 become the default in the future, until then this option is there to
299 test gcc for this. 300 test gcc for this.
300 301
302 If unsure, say N.
303
301endmenu 304endmenu
302 305
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 919ce21ea654..f5631da585b6 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -118,11 +118,6 @@ mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ 118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
119mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ 119mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
120 120
121# RDC R-321x subarch support
122mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x
123mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/
124core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/
125
126# default subarch .h files 121# default subarch .h files
127mflags-y += -Iinclude/asm-x86/mach-default 122mflags-y += -Iinclude/asm-x86/mach-default
128 123
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index bc5553b496f7..9fea73706479 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -182,8 +182,6 @@ static unsigned outcnt;
182static int fill_inbuf(void); 182static int fill_inbuf(void);
183static void flush_window(void); 183static void flush_window(void);
184static void error(char *m); 184static void error(char *m);
185static void gzip_mark(void **);
186static void gzip_release(void **);
187 185
188/* 186/*
189 * This is set up by the setup-routine at boot-time 187 * This is set up by the setup-routine at boot-time
@@ -196,9 +194,6 @@ extern int input_len;
196 194
197static long bytes_out; 195static long bytes_out;
198 196
199static void *malloc(int size);
200static void free(void *where);
201
202static void *memset(void *s, int c, unsigned n); 197static void *memset(void *s, int c, unsigned n);
203static void *memcpy(void *dest, const void *src, unsigned n); 198static void *memcpy(void *dest, const void *src, unsigned n);
204 199
@@ -220,40 +215,6 @@ static int lines, cols;
220 215
221#include "../../../../lib/inflate.c" 216#include "../../../../lib/inflate.c"
222 217
223static void *malloc(int size)
224{
225 void *p;
226
227 if (size < 0)
228 error("Malloc error");
229 if (free_mem_ptr <= 0)
230 error("Memory error");
231
232 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
233
234 p = (void *)free_mem_ptr;
235 free_mem_ptr += size;
236
237 if (free_mem_ptr >= free_mem_end_ptr)
238 error("Out of memory");
239
240 return p;
241}
242
243static void free(void *where)
244{ /* Don't care */
245}
246
247static void gzip_mark(void **ptr)
248{
249 *ptr = (void *) free_mem_ptr;
250}
251
252static void gzip_release(void **ptr)
253{
254 free_mem_ptr = (memptr) *ptr;
255}
256
257static void scroll(void) 218static void scroll(void)
258{ 219{
259 int i; 220 int i;
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 03399d64013b..d93cbc6464d0 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -167,9 +167,8 @@ void query_edd(void)
167 * Scan the BIOS-supported hard disks and query EDD 167 * Scan the BIOS-supported hard disks and query EDD
168 * information... 168 * information...
169 */ 169 */
170 get_edd_info(devno, &ei); 170 if (!get_edd_info(devno, &ei)
171 171 && boot_params.eddbuf_entries < EDDMAXNR) {
172 if (boot_params.eddbuf_entries < EDDMAXNR) {
173 memcpy(edp, &ei, sizeof ei); 172 memcpy(edp, &ei, sizeof ei);
174 edp++; 173 edp++;
175 boot_params.eddbuf_entries++; 174 boot_params.eddbuf_entries++;
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 328956fdb59e..85a1cd8a8ff8 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -98,12 +98,6 @@ static void reset_coprocessor(void)
98/* 98/*
99 * Set up the GDT 99 * Set up the GDT
100 */ 100 */
101#define GDT_ENTRY(flags, base, limit) \
102 (((u64)(base & 0xff000000) << 32) | \
103 ((u64)flags << 40) | \
104 ((u64)(limit & 0x00ff0000) << 32) | \
105 ((u64)(base & 0x00ffffff) << 16) | \
106 ((u64)(limit & 0x0000ffff)))
107 101
108struct gdt_ptr { 102struct gdt_ptr {
109 u16 len; 103 u16 len;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 9bc34e2033ec..4d73f53287b6 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -2047,7 +2047,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2047# CONFIG_SAMPLES is not set 2047# CONFIG_SAMPLES is not set
2048# CONFIG_KGDB is not set 2048# CONFIG_KGDB is not set
2049CONFIG_HAVE_ARCH_KGDB=y 2049CONFIG_HAVE_ARCH_KGDB=y
2050# CONFIG_NONPROMISC_DEVMEM is not set 2050# CONFIG_STRICT_DEVMEM is not set
2051CONFIG_EARLY_PRINTK=y 2051CONFIG_EARLY_PRINTK=y
2052CONFIG_DEBUG_STACKOVERFLOW=y 2052CONFIG_DEBUG_STACKOVERFLOW=y
2053CONFIG_DEBUG_STACK_USAGE=y 2053CONFIG_DEBUG_STACK_USAGE=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index ae5124e064d4..a40452429625 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -2012,7 +2012,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2012# CONFIG_SAMPLES is not set 2012# CONFIG_SAMPLES is not set
2013# CONFIG_KGDB is not set 2013# CONFIG_KGDB is not set
2014CONFIG_HAVE_ARCH_KGDB=y 2014CONFIG_HAVE_ARCH_KGDB=y
2015# CONFIG_NONPROMISC_DEVMEM is not set 2015# CONFIG_STRICT_DEVMEM is not set
2016CONFIG_EARLY_PRINTK=y 2016CONFIG_EARLY_PRINTK=y
2017CONFIG_DEBUG_STACKOVERFLOW=y 2017CONFIG_DEBUG_STACKOVERFLOW=y
2018CONFIG_DEBUG_STACK_USAGE=y 2018CONFIG_DEBUG_STACK_USAGE=y
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 58cccb6483b0..a0e1dbe67dc1 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -441,12 +441,6 @@ beyond_if:
441 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 441 regs->r8 = regs->r9 = regs->r10 = regs->r11 =
442 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; 442 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
443 set_fs(USER_DS); 443 set_fs(USER_DS);
444 if (unlikely(current->ptrace & PT_PTRACED)) {
445 if (current->ptrace & PT_TRACE_EXEC)
446 ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
447 else
448 send_sig(SIGTRAP, current, 0);
449 }
450 return 0; 444 return 0;
451} 445}
452 446
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index cb3856a18c85..20af4c79579a 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -36,6 +36,11 @@
36 36
37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
38 38
39#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
40 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
41 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
42 X86_EFLAGS_CF)
43
39asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); 44asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
40void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 45void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
41 46
@@ -248,7 +253,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
248 regs->ss |= 3; 253 regs->ss |= 3;
249 254
250 err |= __get_user(tmpflags, &sc->flags); 255 err |= __get_user(tmpflags, &sc->flags);
251 regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); 256 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
252 /* disable syscall checks */ 257 /* disable syscall checks */
253 regs->orig_ax = -1; 258 regs->orig_ax = -1;
254 259
@@ -515,7 +520,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
515 compat_sigset_t *set, struct pt_regs *regs) 520 compat_sigset_t *set, struct pt_regs *regs)
516{ 521{
517 struct rt_sigframe __user *frame; 522 struct rt_sigframe __user *frame;
518 struct exec_domain *ed = current_thread_info()->exec_domain;
519 void __user *restorer; 523 void __user *restorer;
520 int err = 0; 524 int err = 0;
521 525
@@ -538,8 +542,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
538 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 542 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
539 goto give_sigsegv; 543 goto give_sigsegv;
540 544
541 err |= __put_user((ed && ed->signal_invmap && sig < 32 545 err |= __put_user(sig, &frame->sig);
542 ? ed->signal_invmap[sig] : sig), &frame->sig);
543 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 546 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
544 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 547 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
545 err |= copy_siginfo_to_user32(&frame->info, info); 548 err |= copy_siginfo_to_user32(&frame->info, info);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 20371d0635e4..ffc1bb4fed7d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -15,6 +15,16 @@
15#include <asm/irqflags.h> 15#include <asm/irqflags.h>
16#include <linux/linkage.h> 16#include <linux/linkage.h>
17 17
18/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
19#include <linux/elf-em.h>
20#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
21#define __AUDIT_ARCH_LE 0x40000000
22
23#ifndef CONFIG_AUDITSYSCALL
24#define sysexit_audit int_ret_from_sys_call
25#define sysretl_audit int_ret_from_sys_call
26#endif
27
18#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
19 29
20 .macro IA32_ARG_FIXUP noebp=0 30 .macro IA32_ARG_FIXUP noebp=0
@@ -37,6 +47,11 @@
37 movq %rax,R8(%rsp) 47 movq %rax,R8(%rsp)
38 .endm 48 .endm
39 49
50 /*
51 * Reload arg registers from stack in case ptrace changed them.
52 * We don't reload %eax because syscall_trace_enter() returned
53 * the value it wants us to use in the table lookup.
54 */
40 .macro LOAD_ARGS32 offset 55 .macro LOAD_ARGS32 offset
41 movl \offset(%rsp),%r11d 56 movl \offset(%rsp),%r11d
42 movl \offset+8(%rsp),%r10d 57 movl \offset+8(%rsp),%r10d
@@ -46,7 +61,6 @@
46 movl \offset+48(%rsp),%edx 61 movl \offset+48(%rsp),%edx
47 movl \offset+56(%rsp),%esi 62 movl \offset+56(%rsp),%esi
48 movl \offset+64(%rsp),%edi 63 movl \offset+64(%rsp),%edi
49 movl \offset+72(%rsp),%eax
50 .endm 64 .endm
51 65
52 .macro CFI_STARTPROC32 simple 66 .macro CFI_STARTPROC32 simple
@@ -137,21 +151,22 @@ ENTRY(ia32_sysenter_target)
137 .previous 151 .previous
138 GET_THREAD_INFO(%r10) 152 GET_THREAD_INFO(%r10)
139 orl $TS_COMPAT,TI_status(%r10) 153 orl $TS_COMPAT,TI_status(%r10)
140 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 154 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
141 TI_flags(%r10)
142 CFI_REMEMBER_STATE 155 CFI_REMEMBER_STATE
143 jnz sysenter_tracesys 156 jnz sysenter_tracesys
144sysenter_do_call:
145 cmpl $(IA32_NR_syscalls-1),%eax 157 cmpl $(IA32_NR_syscalls-1),%eax
146 ja ia32_badsys 158 ja ia32_badsys
159sysenter_do_call:
147 IA32_ARG_FIXUP 1 160 IA32_ARG_FIXUP 1
161sysenter_dispatch:
148 call *ia32_sys_call_table(,%rax,8) 162 call *ia32_sys_call_table(,%rax,8)
149 movq %rax,RAX-ARGOFFSET(%rsp) 163 movq %rax,RAX-ARGOFFSET(%rsp)
150 GET_THREAD_INFO(%r10) 164 GET_THREAD_INFO(%r10)
151 DISABLE_INTERRUPTS(CLBR_NONE) 165 DISABLE_INTERRUPTS(CLBR_NONE)
152 TRACE_IRQS_OFF 166 TRACE_IRQS_OFF
153 testl $_TIF_ALLWORK_MASK,TI_flags(%r10) 167 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
154 jnz int_ret_from_sys_call 168 jnz sysexit_audit
169sysexit_from_sys_call:
155 andl $~TS_COMPAT,TI_status(%r10) 170 andl $~TS_COMPAT,TI_status(%r10)
156 /* clear IF, that popfq doesn't enable interrupts early */ 171 /* clear IF, that popfq doesn't enable interrupts early */
157 andl $~0x200,EFLAGS-R11(%rsp) 172 andl $~0x200,EFLAGS-R11(%rsp)
@@ -167,9 +182,63 @@ sysenter_do_call:
167 TRACE_IRQS_ON 182 TRACE_IRQS_ON
168 ENABLE_INTERRUPTS_SYSEXIT32 183 ENABLE_INTERRUPTS_SYSEXIT32
169 184
170sysenter_tracesys: 185#ifdef CONFIG_AUDITSYSCALL
186 .macro auditsys_entry_common
187 movl %esi,%r9d /* 6th arg: 4th syscall arg */
188 movl %edx,%r8d /* 5th arg: 3rd syscall arg */
189 /* (already in %ecx) 4th arg: 2nd syscall arg */
190 movl %ebx,%edx /* 3rd arg: 1st syscall arg */
191 movl %eax,%esi /* 2nd arg: syscall number */
192 movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
193 call audit_syscall_entry
194 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
195 cmpl $(IA32_NR_syscalls-1),%eax
196 ja ia32_badsys
197 movl %ebx,%edi /* reload 1st syscall arg */
198 movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
199 movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
200 movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
201 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
202 .endm
203
204 .macro auditsys_exit exit,ebpsave=RBP
205 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
206 jnz int_ret_from_sys_call
207 TRACE_IRQS_ON
208 sti
209 movl %eax,%esi /* second arg, syscall return value */
210 cmpl $0,%eax /* is it < 0? */
211 setl %al /* 1 if so, 0 if not */
212 movzbl %al,%edi /* zero-extend that into %edi */
213 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
214 call audit_syscall_exit
215 GET_THREAD_INFO(%r10)
216 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
217 movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
218 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
219 cli
220 TRACE_IRQS_OFF
221 testl %edi,TI_flags(%r10)
222 jnz int_with_check
223 jmp \exit
224 .endm
225
226sysenter_auditsys:
171 CFI_RESTORE_STATE 227 CFI_RESTORE_STATE
228 auditsys_entry_common
229 movl %ebp,%r9d /* reload 6th syscall arg */
230 jmp sysenter_dispatch
231
232sysexit_audit:
233 auditsys_exit sysexit_from_sys_call
234#endif
235
236sysenter_tracesys:
172 xchgl %r9d,%ebp 237 xchgl %r9d,%ebp
238#ifdef CONFIG_AUDITSYSCALL
239 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
240 jz sysenter_auditsys
241#endif
173 SAVE_REST 242 SAVE_REST
174 CLEAR_RREGS 243 CLEAR_RREGS
175 movq %r9,R9(%rsp) 244 movq %r9,R9(%rsp)
@@ -242,21 +311,22 @@ ENTRY(ia32_cstar_target)
242 .previous 311 .previous
243 GET_THREAD_INFO(%r10) 312 GET_THREAD_INFO(%r10)
244 orl $TS_COMPAT,TI_status(%r10) 313 orl $TS_COMPAT,TI_status(%r10)
245 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 314 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
246 TI_flags(%r10)
247 CFI_REMEMBER_STATE 315 CFI_REMEMBER_STATE
248 jnz cstar_tracesys 316 jnz cstar_tracesys
249cstar_do_call: 317cstar_do_call:
250 cmpl $IA32_NR_syscalls-1,%eax 318 cmpl $IA32_NR_syscalls-1,%eax
251 ja ia32_badsys 319 ja ia32_badsys
252 IA32_ARG_FIXUP 1 320 IA32_ARG_FIXUP 1
321cstar_dispatch:
253 call *ia32_sys_call_table(,%rax,8) 322 call *ia32_sys_call_table(,%rax,8)
254 movq %rax,RAX-ARGOFFSET(%rsp) 323 movq %rax,RAX-ARGOFFSET(%rsp)
255 GET_THREAD_INFO(%r10) 324 GET_THREAD_INFO(%r10)
256 DISABLE_INTERRUPTS(CLBR_NONE) 325 DISABLE_INTERRUPTS(CLBR_NONE)
257 TRACE_IRQS_OFF 326 TRACE_IRQS_OFF
258 testl $_TIF_ALLWORK_MASK,TI_flags(%r10) 327 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
259 jnz int_ret_from_sys_call 328 jnz sysretl_audit
329sysretl_from_sys_call:
260 andl $~TS_COMPAT,TI_status(%r10) 330 andl $~TS_COMPAT,TI_status(%r10)
261 RESTORE_ARGS 1,-ARG_SKIP,1,1,1 331 RESTORE_ARGS 1,-ARG_SKIP,1,1,1
262 movl RIP-ARGOFFSET(%rsp),%ecx 332 movl RIP-ARGOFFSET(%rsp),%ecx
@@ -268,8 +338,23 @@ cstar_do_call:
268 CFI_RESTORE rsp 338 CFI_RESTORE rsp
269 USERGS_SYSRET32 339 USERGS_SYSRET32
270 340
271cstar_tracesys: 341#ifdef CONFIG_AUDITSYSCALL
342cstar_auditsys:
272 CFI_RESTORE_STATE 343 CFI_RESTORE_STATE
344 movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
345 auditsys_entry_common
346 movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
347 jmp cstar_dispatch
348
349sysretl_audit:
350 auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
351#endif
352
353cstar_tracesys:
354#ifdef CONFIG_AUDITSYSCALL
355 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
356 jz cstar_auditsys
357#endif
273 xchgl %r9d,%ebp 358 xchgl %r9d,%ebp
274 SAVE_REST 359 SAVE_REST
275 CLEAR_RREGS 360 CLEAR_RREGS
@@ -321,6 +406,7 @@ ENTRY(ia32_syscall)
321 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ 406 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
322 /*CFI_REL_OFFSET cs,CS-RIP*/ 407 /*CFI_REL_OFFSET cs,CS-RIP*/
323 CFI_REL_OFFSET rip,RIP-RIP 408 CFI_REL_OFFSET rip,RIP-RIP
409 PARAVIRT_ADJUST_EXCEPTION_FRAME
324 SWAPGS 410 SWAPGS
325 /* 411 /*
326 * No need to follow this irqs on/off section: the syscall 412 * No need to follow this irqs on/off section: the syscall
@@ -336,8 +422,7 @@ ENTRY(ia32_syscall)
336 SAVE_ARGS 0,0,1 422 SAVE_ARGS 0,0,1
337 GET_THREAD_INFO(%r10) 423 GET_THREAD_INFO(%r10)
338 orl $TS_COMPAT,TI_status(%r10) 424 orl $TS_COMPAT,TI_status(%r10)
339 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 425 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
340 TI_flags(%r10)
341 jnz ia32_tracesys 426 jnz ia32_tracesys
342ia32_do_syscall: 427ia32_do_syscall:
343 cmpl $(IA32_NR_syscalls-1),%eax 428 cmpl $(IA32_NR_syscalls-1),%eax
@@ -741,4 +826,10 @@ ia32_sys_call_table:
741 .quad sys32_fallocate 826 .quad sys32_fallocate
742 .quad compat_sys_timerfd_settime /* 325 */ 827 .quad compat_sys_timerfd_settime /* 325 */
743 .quad compat_sys_timerfd_gettime 828 .quad compat_sys_timerfd_gettime
829 .quad compat_sys_signalfd4
830 .quad sys_eventfd2
831 .quad sys_epoll_create1
832 .quad sys_dup3 /* 330 */
833 .quad sys_pipe2
834 .quad sys_inotify_init1
744ia32_syscall_end: 835ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index f00afdf61e67..d3c64088b981 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -238,7 +238,7 @@ asmlinkage long sys32_pipe(int __user *fd)
238 int retval; 238 int retval;
239 int fds[2]; 239 int fds[2];
240 240
241 retval = do_pipe(fds); 241 retval = do_pipe_flags(fds, 0);
242 if (retval) 242 if (retval)
243 goto out; 243 goto out;
244 if (copy_to_user(fd, fds, sizeof(fds))) 244 if (copy_to_user(fd, fds, sizeof(fds)))
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index da140611bb57..3db651fc8ec5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -7,9 +7,10 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
9ifdef CONFIG_FTRACE 9ifdef CONFIG_FTRACE
10# Do not profile debug utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg 11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt.o = -pg
13endif 14endif
14 15
15# 16#
@@ -102,6 +103,7 @@ obj-$(CONFIG_OLPC) += olpc.o
102# 64 bit specific files 103# 64 bit specific files
103ifeq ($(CONFIG_X86_64),y) 104ifeq ($(CONFIG_X86_64),y)
104 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o 105 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
106 obj-y += bios_uv.o
105 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 107 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
106 obj-$(CONFIG_AUDIT) += audit_64.o 108 obj-$(CONFIG_AUDIT) += audit_64.o
107 109
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index f489d7a9be92..fa88a1d71290 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1021,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1021 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; 1021 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
1022#endif 1022#endif
1023 set_bit(MP_ISA_BUS, mp_bus_not_pci); 1023 set_bit(MP_ISA_BUS, mp_bus_not_pci);
1024 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); 1024 pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
1025 1025
1026#ifdef CONFIG_X86_ES7000 1026#ifdef CONFIG_X86_ES7000
1027 /* 1027 /*
@@ -1127,8 +1127,8 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1127 return gsi; 1127 return gsi;
1128 } 1128 }
1129 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { 1129 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
1130 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", 1130 pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
1131 mp_ioapic_routing[ioapic].apic_id, ioapic_pin); 1131 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1132#ifdef CONFIG_X86_32 1132#ifdef CONFIG_X86_32
1133 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); 1133 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1134#else 1134#else
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index c2502eb9aa83..9220cf46aa10 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -73,6 +73,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
73 struct cpuinfo_x86 *c = &cpu_data(cpu); 73 struct cpuinfo_x86 *c = &cpu_data(cpu);
74 74
75 cpumask_t saved_mask; 75 cpumask_t saved_mask;
76 cpumask_of_cpu_ptr(new_mask, cpu);
76 int retval; 77 int retval;
77 unsigned int eax, ebx, ecx, edx; 78 unsigned int eax, ebx, ecx, edx;
78 unsigned int edx_part; 79 unsigned int edx_part;
@@ -91,7 +92,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
91 92
92 /* Make sure we are running on right CPU */ 93 /* Make sure we are running on right CPU */
93 saved_mask = current->cpus_allowed; 94 saved_mask = current->cpus_allowed;
94 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 95 retval = set_cpus_allowed_ptr(current, new_mask);
95 if (retval) 96 if (retval)
96 return -1; 97 return -1;
97 98
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 868de3d5c39d..fa2161d5003b 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -9,6 +9,7 @@
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/dmi.h> 10#include <linux/dmi.h>
11#include <linux/cpumask.h> 11#include <linux/cpumask.h>
12#include <asm/segment.h>
12 13
13#include "realmode/wakeup.h" 14#include "realmode/wakeup.h"
14#include "sleep.h" 15#include "sleep.h"
@@ -23,15 +24,6 @@ static unsigned long acpi_realmode;
23static char temp_stack[10240]; 24static char temp_stack[10240];
24#endif 25#endif
25 26
26/* XXX: this macro should move to asm-x86/segment.h and be shared with the
27 boot code... */
28#define GDT_ENTRY(flags, base, limit) \
29 (((u64)(base & 0xff000000) << 32) | \
30 ((u64)flags << 40) | \
31 ((u64)(limit & 0x00ff0000) << 32) | \
32 ((u64)(base & 0x00ffffff) << 16) | \
33 ((u64)(limit & 0x0000ffff)))
34
35/** 27/**
36 * acpi_save_state_mem - save kernel state 28 * acpi_save_state_mem - save kernel state
37 * 29 *
@@ -158,6 +150,10 @@ static int __init acpi_sleep_setup(char *str)
158 acpi_realmode_flags |= 2; 150 acpi_realmode_flags |= 2;
159 if (strncmp(str, "s3_beep", 7) == 0) 151 if (strncmp(str, "s3_beep", 7) == 0)
160 acpi_realmode_flags |= 4; 152 acpi_realmode_flags |= 4;
153#ifdef CONFIG_HIBERNATION
154 if (strncmp(str, "s4_nohwsig", 10) == 0)
155 acpi_no_s4_hw_signature();
156#endif
161 if (strncmp(str, "old_ordering", 12) == 0) 157 if (strncmp(str, "old_ordering", 12) == 0)
162 acpi_old_suspend_ordering(); 158 acpi_old_suspend_ordering();
163 str = strchr(str, ','); 159 str = strchr(str, ',');
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f2766d84c7a0..74697408576f 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -23,7 +23,7 @@
23#include <linux/scatterlist.h> 23#include <linux/scatterlist.h>
24#include <linux/iommu-helper.h> 24#include <linux/iommu-helper.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/gart.h> 26#include <asm/iommu.h>
27#include <asm/amd_iommu_types.h> 27#include <asm/amd_iommu_types.h>
28#include <asm/amd_iommu.h> 28#include <asm/amd_iommu.h>
29 29
@@ -32,21 +32,37 @@
32#define to_pages(addr, size) \ 32#define to_pages(addr, size) \
33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) 33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
34 34
35#define EXIT_LOOP_COUNT 10000000
36
35static DEFINE_RWLOCK(amd_iommu_devtable_lock); 37static DEFINE_RWLOCK(amd_iommu_devtable_lock);
36 38
37struct command { 39/*
40 * general struct to manage commands send to an IOMMU
41 */
42struct iommu_cmd {
38 u32 data[4]; 43 u32 data[4];
39}; 44};
40 45
41static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 46static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
42 struct unity_map_entry *e); 47 struct unity_map_entry *e);
43 48
49/* returns !0 if the IOMMU is caching non-present entries in its TLB */
44static int iommu_has_npcache(struct amd_iommu *iommu) 50static int iommu_has_npcache(struct amd_iommu *iommu)
45{ 51{
46 return iommu->cap & IOMMU_CAP_NPCACHE; 52 return iommu->cap & IOMMU_CAP_NPCACHE;
47} 53}
48 54
49static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 55/****************************************************************************
56 *
57 * IOMMU command queuing functions
58 *
59 ****************************************************************************/
60
61/*
62 * Writes the command to the IOMMUs command buffer and informs the
63 * hardware about the new command. Must be called with iommu->lock held.
64 */
65static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
50{ 66{
51 u32 tail, head; 67 u32 tail, head;
52 u8 *target; 68 u8 *target;
@@ -63,7 +79,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
63 return 0; 79 return 0;
64} 80}
65 81
66static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 82/*
83 * General queuing function for commands. Takes iommu->lock and calls
84 * __iommu_queue_command().
85 */
86static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
67{ 87{
68 unsigned long flags; 88 unsigned long flags;
69 int ret; 89 int ret;
@@ -75,16 +95,24 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
75 return ret; 95 return ret;
76} 96}
77 97
98/*
99 * This function is called whenever we need to ensure that the IOMMU has
100 * completed execution of all commands we sent. It sends a
101 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
102 * us about that by writing a value to a physical address we pass with
103 * the command.
104 */
78static int iommu_completion_wait(struct amd_iommu *iommu) 105static int iommu_completion_wait(struct amd_iommu *iommu)
79{ 106{
80 int ret; 107 int ret;
81 struct command cmd; 108 struct iommu_cmd cmd;
82 volatile u64 ready = 0; 109 volatile u64 ready = 0;
83 unsigned long ready_phys = virt_to_phys(&ready); 110 unsigned long ready_phys = virt_to_phys(&ready);
111 unsigned long i = 0;
84 112
85 memset(&cmd, 0, sizeof(cmd)); 113 memset(&cmd, 0, sizeof(cmd));
86 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; 114 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
87 cmd.data[1] = HIGH_U32(ready_phys); 115 cmd.data[1] = upper_32_bits(ready_phys);
88 cmd.data[2] = 1; /* value written to 'ready' */ 116 cmd.data[2] = 1; /* value written to 'ready' */
89 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); 117 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
90 118
@@ -95,15 +123,23 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
95 if (ret) 123 if (ret)
96 return ret; 124 return ret;
97 125
98 while (!ready) 126 while (!ready && (i < EXIT_LOOP_COUNT)) {
127 ++i;
99 cpu_relax(); 128 cpu_relax();
129 }
130
131 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
132 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
100 133
101 return 0; 134 return 0;
102} 135}
103 136
137/*
138 * Command send function for invalidating a device table entry
139 */
104static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 140static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
105{ 141{
106 struct command cmd; 142 struct iommu_cmd cmd;
107 143
108 BUG_ON(iommu == NULL); 144 BUG_ON(iommu == NULL);
109 145
@@ -116,20 +152,23 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
116 return iommu_queue_command(iommu, &cmd); 152 return iommu_queue_command(iommu, &cmd);
117} 153}
118 154
155/*
156 * Generic command send function for invalidaing TLB entries
157 */
119static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, 158static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
120 u64 address, u16 domid, int pde, int s) 159 u64 address, u16 domid, int pde, int s)
121{ 160{
122 struct command cmd; 161 struct iommu_cmd cmd;
123 162
124 memset(&cmd, 0, sizeof(cmd)); 163 memset(&cmd, 0, sizeof(cmd));
125 address &= PAGE_MASK; 164 address &= PAGE_MASK;
126 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); 165 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
127 cmd.data[1] |= domid; 166 cmd.data[1] |= domid;
128 cmd.data[2] = LOW_U32(address); 167 cmd.data[2] = LOW_U32(address);
129 cmd.data[3] = HIGH_U32(address); 168 cmd.data[3] = upper_32_bits(address);
130 if (s) 169 if (s) /* size bit - we flush more than one 4kb page */
131 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 170 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
132 if (pde) 171 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
133 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 172 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
134 173
135 iommu->need_sync = 1; 174 iommu->need_sync = 1;
@@ -137,6 +176,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
137 return iommu_queue_command(iommu, &cmd); 176 return iommu_queue_command(iommu, &cmd);
138} 177}
139 178
179/*
180 * TLB invalidation function which is called from the mapping functions.
181 * It invalidates a single PTE if the range to flush is within a single
182 * page. Otherwise it flushes the whole TLB of the IOMMU.
183 */
140static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 184static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
141 u64 address, size_t size) 185 u64 address, size_t size)
142{ 186{
@@ -159,6 +203,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
159 return 0; 203 return 0;
160} 204}
161 205
206/****************************************************************************
207 *
208 * The functions below are used the create the page table mappings for
209 * unity mapped regions.
210 *
211 ****************************************************************************/
212
213/*
214 * Generic mapping functions. It maps a physical address into a DMA
215 * address space. It allocates the page table pages if necessary.
216 * In the future it can be extended to a generic mapping function
217 * supporting all features of AMD IOMMU page tables like level skipping
218 * and full 64 bit address spaces.
219 */
162static int iommu_map(struct protection_domain *dom, 220static int iommu_map(struct protection_domain *dom,
163 unsigned long bus_addr, 221 unsigned long bus_addr,
164 unsigned long phys_addr, 222 unsigned long phys_addr,
@@ -209,6 +267,10 @@ static int iommu_map(struct protection_domain *dom,
209 return 0; 267 return 0;
210} 268}
211 269
270/*
271 * This function checks if a specific unity mapping entry is needed for
272 * this specific IOMMU.
273 */
212static int iommu_for_unity_map(struct amd_iommu *iommu, 274static int iommu_for_unity_map(struct amd_iommu *iommu,
213 struct unity_map_entry *entry) 275 struct unity_map_entry *entry)
214{ 276{
@@ -223,6 +285,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
223 return 0; 285 return 0;
224} 286}
225 287
288/*
289 * Init the unity mappings for a specific IOMMU in the system
290 *
291 * Basically iterates over all unity mapping entries and applies them to
292 * the default domain DMA of that IOMMU if necessary.
293 */
226static int iommu_init_unity_mappings(struct amd_iommu *iommu) 294static int iommu_init_unity_mappings(struct amd_iommu *iommu)
227{ 295{
228 struct unity_map_entry *entry; 296 struct unity_map_entry *entry;
@@ -239,6 +307,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
239 return 0; 307 return 0;
240} 308}
241 309
310/*
311 * This function actually applies the mapping to the page table of the
312 * dma_ops domain.
313 */
242static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 314static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
243 struct unity_map_entry *e) 315 struct unity_map_entry *e)
244{ 316{
@@ -261,6 +333,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
261 return 0; 333 return 0;
262} 334}
263 335
336/*
337 * Inits the unity mappings required for a specific device
338 */
264static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 339static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
265 u16 devid) 340 u16 devid)
266{ 341{
@@ -278,12 +353,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
278 return 0; 353 return 0;
279} 354}
280 355
356/****************************************************************************
357 *
358 * The next functions belong to the address allocator for the dma_ops
359 * interface functions. They work like the allocators in the other IOMMU
360 * drivers. Its basically a bitmap which marks the allocated pages in
361 * the aperture. Maybe it could be enhanced in the future to a more
362 * efficient allocator.
363 *
364 ****************************************************************************/
281static unsigned long dma_mask_to_pages(unsigned long mask) 365static unsigned long dma_mask_to_pages(unsigned long mask)
282{ 366{
283 return (mask >> PAGE_SHIFT) + 367 return (mask >> PAGE_SHIFT) +
284 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); 368 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
285} 369}
286 370
371/*
372 * The address allocator core function.
373 *
374 * called with domain->lock held
375 */
287static unsigned long dma_ops_alloc_addresses(struct device *dev, 376static unsigned long dma_ops_alloc_addresses(struct device *dev,
288 struct dma_ops_domain *dom, 377 struct dma_ops_domain *dom,
289 unsigned int pages) 378 unsigned int pages)
@@ -317,6 +406,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
317 return address; 406 return address;
318} 407}
319 408
409/*
410 * The address free function.
411 *
412 * called with domain->lock held
413 */
320static void dma_ops_free_addresses(struct dma_ops_domain *dom, 414static void dma_ops_free_addresses(struct dma_ops_domain *dom,
321 unsigned long address, 415 unsigned long address,
322 unsigned int pages) 416 unsigned int pages)
@@ -325,6 +419,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
325 iommu_area_free(dom->bitmap, address, pages); 419 iommu_area_free(dom->bitmap, address, pages);
326} 420}
327 421
422/****************************************************************************
423 *
424 * The next functions belong to the domain allocation. A domain is
425 * allocated for every IOMMU as the default domain. If device isolation
426 * is enabled, every device get its own domain. The most important thing
427 * about domains is the page table mapping the DMA address space they
428 * contain.
429 *
430 ****************************************************************************/
431
328static u16 domain_id_alloc(void) 432static u16 domain_id_alloc(void)
329{ 433{
330 unsigned long flags; 434 unsigned long flags;
@@ -342,6 +446,10 @@ static u16 domain_id_alloc(void)
342 return id; 446 return id;
343} 447}
344 448
449/*
450 * Used to reserve address ranges in the aperture (e.g. for exclusion
451 * ranges.
452 */
345static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, 453static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
346 unsigned long start_page, 454 unsigned long start_page,
347 unsigned int pages) 455 unsigned int pages)
@@ -382,6 +490,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
382 free_page((unsigned long)p1); 490 free_page((unsigned long)p1);
383} 491}
384 492
493/*
494 * Free a domain, only used if something went wrong in the
495 * allocation path and we need to free an already allocated page table
496 */
385static void dma_ops_domain_free(struct dma_ops_domain *dom) 497static void dma_ops_domain_free(struct dma_ops_domain *dom)
386{ 498{
387 if (!dom) 499 if (!dom)
@@ -396,6 +508,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
396 kfree(dom); 508 kfree(dom);
397} 509}
398 510
511/*
512 * Allocates a new protection domain usable for the dma_ops functions.
513 * It also intializes the page table and the address allocator data
514 * structures required for the dma_ops interface
515 */
399static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 516static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
400 unsigned order) 517 unsigned order)
401{ 518{
@@ -436,6 +553,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
436 dma_dom->bitmap[0] = 1; 553 dma_dom->bitmap[0] = 1;
437 dma_dom->next_bit = 0; 554 dma_dom->next_bit = 0;
438 555
556 /* Intialize the exclusion range if necessary */
439 if (iommu->exclusion_start && 557 if (iommu->exclusion_start &&
440 iommu->exclusion_start < dma_dom->aperture_size) { 558 iommu->exclusion_start < dma_dom->aperture_size) {
441 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 559 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
@@ -444,6 +562,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
444 dma_ops_reserve_addresses(dma_dom, startpage, pages); 562 dma_ops_reserve_addresses(dma_dom, startpage, pages);
445 } 563 }
446 564
565 /*
566 * At the last step, build the page tables so we don't need to
567 * allocate page table pages in the dma_ops mapping/unmapping
568 * path.
569 */
447 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 570 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
448 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 571 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
449 GFP_KERNEL); 572 GFP_KERNEL);
@@ -472,6 +595,10 @@ free_dma_dom:
472 return NULL; 595 return NULL;
473} 596}
474 597
598/*
599 * Find out the protection domain structure for a given PCI device. This
600 * will give us the pointer to the page table root for example.
601 */
475static struct protection_domain *domain_for_device(u16 devid) 602static struct protection_domain *domain_for_device(u16 devid)
476{ 603{
477 struct protection_domain *dom; 604 struct protection_domain *dom;
@@ -484,6 +611,10 @@ static struct protection_domain *domain_for_device(u16 devid)
484 return dom; 611 return dom;
485} 612}
486 613
614/*
615 * If a device is not yet associated with a domain, this function does
616 * assigns it visible for the hardware
617 */
487static void set_device_domain(struct amd_iommu *iommu, 618static void set_device_domain(struct amd_iommu *iommu,
488 struct protection_domain *domain, 619 struct protection_domain *domain,
489 u16 devid) 620 u16 devid)
@@ -508,6 +639,19 @@ static void set_device_domain(struct amd_iommu *iommu,
508 iommu->need_sync = 1; 639 iommu->need_sync = 1;
509} 640}
510 641
642/*****************************************************************************
643 *
644 * The next functions belong to the dma_ops mapping/unmapping code.
645 *
646 *****************************************************************************/
647
648/*
649 * In the dma_ops path we only have the struct device. This function
650 * finds the corresponding IOMMU, the protection domain and the
651 * requestor id for a given device.
652 * If the device is not yet associated with a domain this is also done
653 * in this function.
654 */
511static int get_device_resources(struct device *dev, 655static int get_device_resources(struct device *dev,
512 struct amd_iommu **iommu, 656 struct amd_iommu **iommu,
513 struct protection_domain **domain, 657 struct protection_domain **domain,
@@ -520,9 +664,10 @@ static int get_device_resources(struct device *dev,
520 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); 664 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
521 665
522 pcidev = to_pci_dev(dev); 666 pcidev = to_pci_dev(dev);
523 _bdf = (pcidev->bus->number << 8) | pcidev->devfn; 667 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
524 668
525 if (_bdf >= amd_iommu_last_bdf) { 669 /* device not translated by any IOMMU in the system? */
670 if (_bdf > amd_iommu_last_bdf) {
526 *iommu = NULL; 671 *iommu = NULL;
527 *domain = NULL; 672 *domain = NULL;
528 *bdf = 0xffff; 673 *bdf = 0xffff;
@@ -547,6 +692,10 @@ static int get_device_resources(struct device *dev,
547 return 1; 692 return 1;
548} 693}
549 694
695/*
696 * This is the generic map function. It maps one 4kb page at paddr to
697 * the given address in the DMA address space for the domain.
698 */
550static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 699static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
551 struct dma_ops_domain *dom, 700 struct dma_ops_domain *dom,
552 unsigned long address, 701 unsigned long address,
@@ -578,6 +727,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
578 return (dma_addr_t)address; 727 return (dma_addr_t)address;
579} 728}
580 729
730/*
731 * The generic unmapping function for on page in the DMA address space.
732 */
581static void dma_ops_domain_unmap(struct amd_iommu *iommu, 733static void dma_ops_domain_unmap(struct amd_iommu *iommu,
582 struct dma_ops_domain *dom, 734 struct dma_ops_domain *dom,
583 unsigned long address) 735 unsigned long address)
@@ -597,6 +749,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
597 *pte = 0ULL; 749 *pte = 0ULL;
598} 750}
599 751
752/*
753 * This function contains common code for mapping of a physically
754 * contiguous memory region into DMA address space. It is uses by all
755 * mapping functions provided by this IOMMU driver.
756 * Must be called with the domain lock held.
757 */
600static dma_addr_t __map_single(struct device *dev, 758static dma_addr_t __map_single(struct device *dev,
601 struct amd_iommu *iommu, 759 struct amd_iommu *iommu,
602 struct dma_ops_domain *dma_dom, 760 struct dma_ops_domain *dma_dom,
@@ -628,6 +786,10 @@ out:
628 return address; 786 return address;
629} 787}
630 788
789/*
790 * Does the reverse of the __map_single function. Must be called with
791 * the domain lock held too
792 */
631static void __unmap_single(struct amd_iommu *iommu, 793static void __unmap_single(struct amd_iommu *iommu,
632 struct dma_ops_domain *dma_dom, 794 struct dma_ops_domain *dma_dom,
633 dma_addr_t dma_addr, 795 dma_addr_t dma_addr,
@@ -652,6 +814,9 @@ static void __unmap_single(struct amd_iommu *iommu,
652 dma_ops_free_addresses(dma_dom, dma_addr, pages); 814 dma_ops_free_addresses(dma_dom, dma_addr, pages);
653} 815}
654 816
817/*
818 * The exported map_single function for dma_ops.
819 */
655static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, 820static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
656 size_t size, int dir) 821 size_t size, int dir)
657{ 822{
@@ -664,6 +829,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
664 get_device_resources(dev, &iommu, &domain, &devid); 829 get_device_resources(dev, &iommu, &domain, &devid);
665 830
666 if (iommu == NULL || domain == NULL) 831 if (iommu == NULL || domain == NULL)
832 /* device not handled by any AMD IOMMU */
667 return (dma_addr_t)paddr; 833 return (dma_addr_t)paddr;
668 834
669 spin_lock_irqsave(&domain->lock, flags); 835 spin_lock_irqsave(&domain->lock, flags);
@@ -683,6 +849,9 @@ out:
683 return addr; 849 return addr;
684} 850}
685 851
852/*
853 * The exported unmap_single function for dma_ops.
854 */
686static void unmap_single(struct device *dev, dma_addr_t dma_addr, 855static void unmap_single(struct device *dev, dma_addr_t dma_addr,
687 size_t size, int dir) 856 size_t size, int dir)
688{ 857{
@@ -692,6 +861,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
692 u16 devid; 861 u16 devid;
693 862
694 if (!get_device_resources(dev, &iommu, &domain, &devid)) 863 if (!get_device_resources(dev, &iommu, &domain, &devid))
864 /* device not handled by any AMD IOMMU */
695 return; 865 return;
696 866
697 spin_lock_irqsave(&domain->lock, flags); 867 spin_lock_irqsave(&domain->lock, flags);
@@ -706,6 +876,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
706 spin_unlock_irqrestore(&domain->lock, flags); 876 spin_unlock_irqrestore(&domain->lock, flags);
707} 877}
708 878
879/*
880 * This is a special map_sg function which is used if we should map a
881 * device which is not handled by an AMD IOMMU in the system.
882 */
709static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, 883static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
710 int nelems, int dir) 884 int nelems, int dir)
711{ 885{
@@ -720,6 +894,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
720 return nelems; 894 return nelems;
721} 895}
722 896
897/*
898 * The exported map_sg function for dma_ops (handles scatter-gather
899 * lists).
900 */
723static int map_sg(struct device *dev, struct scatterlist *sglist, 901static int map_sg(struct device *dev, struct scatterlist *sglist,
724 int nelems, int dir) 902 int nelems, int dir)
725{ 903{
@@ -775,6 +953,10 @@ unmap:
775 goto out; 953 goto out;
776} 954}
777 955
956/*
957 * The exported map_sg function for dma_ops (handles scatter-gather
958 * lists).
959 */
778static void unmap_sg(struct device *dev, struct scatterlist *sglist, 960static void unmap_sg(struct device *dev, struct scatterlist *sglist,
779 int nelems, int dir) 961 int nelems, int dir)
780{ 962{
@@ -804,6 +986,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
804 spin_unlock_irqrestore(&domain->lock, flags); 986 spin_unlock_irqrestore(&domain->lock, flags);
805} 987}
806 988
989/*
990 * The exported alloc_coherent function for dma_ops.
991 */
807static void *alloc_coherent(struct device *dev, size_t size, 992static void *alloc_coherent(struct device *dev, size_t size,
808 dma_addr_t *dma_addr, gfp_t flag) 993 dma_addr_t *dma_addr, gfp_t flag)
809{ 994{
@@ -851,6 +1036,11 @@ out:
851 return virt_addr; 1036 return virt_addr;
852} 1037}
853 1038
1039/*
1040 * The exported free_coherent function for dma_ops.
1041 * FIXME: fix the generic x86 DMA layer so that it actually calls that
1042 * function.
1043 */
854static void free_coherent(struct device *dev, size_t size, 1044static void free_coherent(struct device *dev, size_t size,
855 void *virt_addr, dma_addr_t dma_addr) 1045 void *virt_addr, dma_addr_t dma_addr)
856{ 1046{
@@ -879,6 +1069,8 @@ free_mem:
879} 1069}
880 1070
881/* 1071/*
1072 * The function for pre-allocating protection domains.
1073 *
882 * If the driver core informs the DMA layer if a driver grabs a device 1074 * If the driver core informs the DMA layer if a driver grabs a device
883 * we don't need to preallocate the protection domains anymore. 1075 * we don't need to preallocate the protection domains anymore.
884 * For now we have to. 1076 * For now we have to.
@@ -893,7 +1085,7 @@ void prealloc_protection_domains(void)
893 1085
894 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1086 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
895 devid = (dev->bus->number << 8) | dev->devfn; 1087 devid = (dev->bus->number << 8) | dev->devfn;
896 if (devid >= amd_iommu_last_bdf) 1088 if (devid > amd_iommu_last_bdf)
897 continue; 1089 continue;
898 devid = amd_iommu_alias_table[devid]; 1090 devid = amd_iommu_alias_table[devid];
899 if (domain_for_device(devid)) 1091 if (domain_for_device(devid))
@@ -921,12 +1113,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
921 .unmap_sg = unmap_sg, 1113 .unmap_sg = unmap_sg,
922}; 1114};
923 1115
1116/*
1117 * The function which clues the AMD IOMMU driver into dma_ops.
1118 */
924int __init amd_iommu_init_dma_ops(void) 1119int __init amd_iommu_init_dma_ops(void)
925{ 1120{
926 struct amd_iommu *iommu; 1121 struct amd_iommu *iommu;
927 int order = amd_iommu_aperture_order; 1122 int order = amd_iommu_aperture_order;
928 int ret; 1123 int ret;
929 1124
1125 /*
1126 * first allocate a default protection domain for every IOMMU we
1127 * found in the system. Devices not assigned to any other
1128 * protection domain will be assigned to the default one.
1129 */
930 list_for_each_entry(iommu, &amd_iommu_list, list) { 1130 list_for_each_entry(iommu, &amd_iommu_list, list) {
931 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1131 iommu->default_dom = dma_ops_domain_alloc(iommu, order);
932 if (iommu->default_dom == NULL) 1132 if (iommu->default_dom == NULL)
@@ -936,6 +1136,10 @@ int __init amd_iommu_init_dma_ops(void)
936 goto free_domains; 1136 goto free_domains;
937 } 1137 }
938 1138
1139 /*
1140 * If device isolation is enabled, pre-allocate the protection
1141 * domains for each device.
1142 */
939 if (amd_iommu_isolate) 1143 if (amd_iommu_isolate)
940 prealloc_protection_domains(); 1144 prealloc_protection_domains();
941 1145
@@ -947,6 +1151,7 @@ int __init amd_iommu_init_dma_ops(void)
947 gart_iommu_aperture = 0; 1151 gart_iommu_aperture = 0;
948#endif 1152#endif
949 1153
1154 /* Make the driver finally visible to the drivers */
950 dma_ops = &amd_iommu_dma_ops; 1155 dma_ops = &amd_iommu_dma_ops;
951 1156
952 return 0; 1157 return 0;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 2a13e430437d..d9a9da597e79 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -25,20 +25,13 @@
25#include <asm/pci-direct.h> 25#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 26#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 27#include <asm/amd_iommu.h>
28#include <asm/gart.h> 28#include <asm/iommu.h>
29 29
30/* 30/*
31 * definitions for the ACPI scanning code 31 * definitions for the ACPI scanning code
32 */ 32 */
33#define UPDATE_LAST_BDF(x) do {\
34 if ((x) > amd_iommu_last_bdf) \
35 amd_iommu_last_bdf = (x); \
36 } while (0);
37
38#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
39#define PCI_BUS(x) (((x) >> 8) & 0xff) 33#define PCI_BUS(x) (((x) >> 8) & 0xff)
40#define IVRS_HEADER_LENGTH 48 34#define IVRS_HEADER_LENGTH 48
41#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
42 35
43#define ACPI_IVHD_TYPE 0x10 36#define ACPI_IVHD_TYPE 0x10
44#define ACPI_IVMD_TYPE_ALL 0x20 37#define ACPI_IVMD_TYPE_ALL 0x20
@@ -71,6 +64,17 @@
71#define ACPI_DEVFLAG_LINT1 0x80 64#define ACPI_DEVFLAG_LINT1 0x80
72#define ACPI_DEVFLAG_ATSDIS 0x10000000 65#define ACPI_DEVFLAG_ATSDIS 0x10000000
73 66
67/*
68 * ACPI table definitions
69 *
70 * These data structures are laid over the table to parse the important values
71 * out of it.
72 */
73
74/*
75 * structure describing one IOMMU in the ACPI table. Typically followed by one
76 * or more ivhd_entrys.
77 */
74struct ivhd_header { 78struct ivhd_header {
75 u8 type; 79 u8 type;
76 u8 flags; 80 u8 flags;
@@ -83,6 +87,10 @@ struct ivhd_header {
83 u32 reserved; 87 u32 reserved;
84} __attribute__((packed)); 88} __attribute__((packed));
85 89
90/*
91 * A device entry describing which devices a specific IOMMU translates and
92 * which requestor ids they use.
93 */
86struct ivhd_entry { 94struct ivhd_entry {
87 u8 type; 95 u8 type;
88 u16 devid; 96 u16 devid;
@@ -90,6 +98,10 @@ struct ivhd_entry {
90 u32 ext; 98 u32 ext;
91} __attribute__((packed)); 99} __attribute__((packed));
92 100
101/*
102 * An AMD IOMMU memory definition structure. It defines things like exclusion
103 * ranges for devices and regions that should be unity mapped.
104 */
93struct ivmd_header { 105struct ivmd_header {
94 u8 type; 106 u8 type;
95 u8 flags; 107 u8 flags;
@@ -103,22 +115,80 @@ struct ivmd_header {
103 115
104static int __initdata amd_iommu_detected; 116static int __initdata amd_iommu_detected;
105 117
106u16 amd_iommu_last_bdf; 118u16 amd_iommu_last_bdf; /* largest PCI device id we have
107struct list_head amd_iommu_unity_map; 119 to handle */
108unsigned amd_iommu_aperture_order = 26; 120LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
109int amd_iommu_isolate; 121 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */
124
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */
110 127
111struct list_head amd_iommu_list; 128/*
129 * Pointer to the device table which is shared by all AMD IOMMUs
130 * it is indexed by the PCI device id or the HT unit id and contains
131 * information about the domain the device belongs to as well as the
132 * page table root pointer.
133 */
112struct dev_table_entry *amd_iommu_dev_table; 134struct dev_table_entry *amd_iommu_dev_table;
135
136/*
137 * The alias table is a driver specific data structure which contains the
138 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
139 * More than one device can share the same requestor id.
140 */
113u16 *amd_iommu_alias_table; 141u16 *amd_iommu_alias_table;
142
143/*
144 * The rlookup table is used to find the IOMMU which is responsible
145 * for a specific device. It is also indexed by the PCI device id.
146 */
114struct amd_iommu **amd_iommu_rlookup_table; 147struct amd_iommu **amd_iommu_rlookup_table;
148
149/*
150 * The pd table (protection domain table) is used to find the protection domain
151 * data structure a device belongs to. Indexed with the PCI device id too.
152 */
115struct protection_domain **amd_iommu_pd_table; 153struct protection_domain **amd_iommu_pd_table;
154
155/*
156 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
157 * to know which ones are already in use.
158 */
116unsigned long *amd_iommu_pd_alloc_bitmap; 159unsigned long *amd_iommu_pd_alloc_bitmap;
117 160
118static u32 dev_table_size; 161static u32 dev_table_size; /* size of the device table */
119static u32 alias_table_size; 162static u32 alias_table_size; /* size of the alias table */
120static u32 rlookup_table_size; 163static u32 rlookup_table_size; /* size if the rlookup table */
121 164
165static inline void update_last_devid(u16 devid)
166{
167 if (devid > amd_iommu_last_bdf)
168 amd_iommu_last_bdf = devid;
169}
170
171static inline unsigned long tbl_size(int entry_size)
172{
173 unsigned shift = PAGE_SHIFT +
174 get_order(amd_iommu_last_bdf * entry_size);
175
176 return 1UL << shift;
177}
178
179/****************************************************************************
180 *
181 * AMD IOMMU MMIO register space handling functions
182 *
183 * These functions are used to program the IOMMU device registers in
184 * MMIO space required for that driver.
185 *
186 ****************************************************************************/
187
188/*
189 * This function set the exclusion range in the IOMMU. DMA accesses to the
190 * exclusion range are passed through untranslated
191 */
122static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 192static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
123{ 193{
124 u64 start = iommu->exclusion_start & PAGE_MASK; 194 u64 start = iommu->exclusion_start & PAGE_MASK;
@@ -137,6 +207,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
137 &entry, sizeof(entry)); 207 &entry, sizeof(entry));
138} 208}
139 209
210/* Programs the physical address of the device table into the IOMMU hardware */
140static void __init iommu_set_device_table(struct amd_iommu *iommu) 211static void __init iommu_set_device_table(struct amd_iommu *iommu)
141{ 212{
142 u32 entry; 213 u32 entry;
@@ -149,6 +220,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
149 &entry, sizeof(entry)); 220 &entry, sizeof(entry));
150} 221}
151 222
223/* Generic functions to enable/disable certain features of the IOMMU. */
152static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 224static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
153{ 225{
154 u32 ctrl; 226 u32 ctrl;
@@ -167,6 +239,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
167 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
168} 240}
169 241
242/* Function to enable the hardware */
170void __init iommu_enable(struct amd_iommu *iommu) 243void __init iommu_enable(struct amd_iommu *iommu)
171{ 244{
172 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
@@ -176,6 +249,10 @@ void __init iommu_enable(struct amd_iommu *iommu)
176 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
177} 250}
178 251
252/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one.
255 */
179static u8 * __init iommu_map_mmio_space(u64 address) 256static u8 * __init iommu_map_mmio_space(u64 address)
180{ 257{
181 u8 *ret; 258 u8 *ret;
@@ -199,16 +276,33 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
199 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); 276 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
200} 277}
201 278
279/****************************************************************************
280 *
281 * The functions below belong to the first pass of AMD IOMMU ACPI table
282 * parsing. In this pass we try to find out the highest device id this
283 * code has to handle. Upon this information the size of the shared data
284 * structures is determined later.
285 *
286 ****************************************************************************/
287
288/*
289 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU
291 */
202static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) 292static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
203{ 293{
204 u32 cap; 294 u32 cap;
205 295
206 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 296 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
207 UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); 297 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
208 298
209 return 0; 299 return 0;
210} 300}
211 301
302/*
303 * After reading the highest device id from the IOMMU PCI capability header
304 * this function looks if there is a higher device id defined in the ACPI table
305 */
212static int __init find_last_devid_from_ivhd(struct ivhd_header *h) 306static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
213{ 307{
214 u8 *p = (void *)h, *end = (void *)h; 308 u8 *p = (void *)h, *end = (void *)h;
@@ -229,7 +323,8 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
229 case IVHD_DEV_RANGE_END: 323 case IVHD_DEV_RANGE_END:
230 case IVHD_DEV_ALIAS: 324 case IVHD_DEV_ALIAS:
231 case IVHD_DEV_EXT_SELECT: 325 case IVHD_DEV_EXT_SELECT:
232 UPDATE_LAST_BDF(dev->devid); 326 /* all the above subfield types refer to device ids */
327 update_last_devid(dev->devid);
233 break; 328 break;
234 default: 329 default:
235 break; 330 break;
@@ -242,6 +337,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
242 return 0; 337 return 0;
243} 338}
244 339
340/*
341 * Iterate over all IVHD entries in the ACPI table and find the highest device
342 * id which we need to handle. This is the first of three functions which parse
343 * the ACPI table. So we check the checksum here.
344 */
245static int __init find_last_devid_acpi(struct acpi_table_header *table) 345static int __init find_last_devid_acpi(struct acpi_table_header *table)
246{ 346{
247 int i; 347 int i;
@@ -277,19 +377,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
277 return 0; 377 return 0;
278} 378}
279 379
380/****************************************************************************
381 *
382 * The following functions belong the the code path which parses the ACPI table
383 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
384 * data structures, initialize the device/alias/rlookup table and also
385 * basically initialize the hardware.
386 *
387 ****************************************************************************/
388
389/*
390 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
391 * write commands to that buffer later and the IOMMU will execute them
392 * asynchronously
393 */
280static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) 394static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
281{ 395{
282 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, 396 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
283 get_order(CMD_BUFFER_SIZE)); 397 get_order(CMD_BUFFER_SIZE));
284 u64 entry = 0; 398 u64 entry;
285 399
286 if (cmd_buf == NULL) 400 if (cmd_buf == NULL)
287 return NULL; 401 return NULL;
288 402
289 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 403 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
290 404
291 memset(cmd_buf, 0, CMD_BUFFER_SIZE);
292
293 entry = (u64)virt_to_phys(cmd_buf); 405 entry = (u64)virt_to_phys(cmd_buf);
294 entry |= MMIO_CMD_SIZE_512; 406 entry |= MMIO_CMD_SIZE_512;
295 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 407 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -302,11 +414,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
302 414
303static void __init free_command_buffer(struct amd_iommu *iommu) 415static void __init free_command_buffer(struct amd_iommu *iommu)
304{ 416{
305 if (iommu->cmd_buf) 417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
306 free_pages((unsigned long)iommu->cmd_buf,
307 get_order(CMD_BUFFER_SIZE));
308} 418}
309 419
420/* sets a specific bit in the device table entry. */
310static void set_dev_entry_bit(u16 devid, u8 bit) 421static void set_dev_entry_bit(u16 devid, u8 bit)
311{ 422{
312 int i = (bit >> 5) & 0x07; 423 int i = (bit >> 5) & 0x07;
@@ -315,7 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
315 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 426 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
316} 427}
317 428
318static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) 429/* Writes the specific IOMMU for a device into the rlookup table */
430static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
431{
432 amd_iommu_rlookup_table[devid] = iommu;
433}
434
435/*
436 * This function takes the device specific flags read from the ACPI
437 * table and sets up the device table entry with that information
438 */
439static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
440 u16 devid, u32 flags, u32 ext_flags)
319{ 441{
320 if (flags & ACPI_DEVFLAG_INITPASS) 442 if (flags & ACPI_DEVFLAG_INITPASS)
321 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); 443 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@@ -331,13 +453,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
331 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); 453 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
332 if (flags & ACPI_DEVFLAG_LINT1) 454 if (flags & ACPI_DEVFLAG_LINT1)
333 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 455 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
334}
335 456
336static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 457 set_iommu_for_device(iommu, devid);
337{
338 amd_iommu_rlookup_table[devid] = iommu;
339} 458}
340 459
460/*
461 * Reads the device exclusion range from ACPI and initialize IOMMU with
462 * it
463 */
341static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) 464static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
342{ 465{
343 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 466 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@@ -346,12 +469,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
346 return; 469 return;
347 470
348 if (iommu) { 471 if (iommu) {
472 /*
473 * We only can configure exclusion ranges per IOMMU, not
474 * per device. But we can enable the exclusion range per
475 * device. This is done here
476 */
349 set_dev_entry_bit(m->devid, DEV_ENTRY_EX); 477 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
350 iommu->exclusion_start = m->range_start; 478 iommu->exclusion_start = m->range_start;
351 iommu->exclusion_length = m->range_length; 479 iommu->exclusion_length = m->range_length;
352 } 480 }
353} 481}
354 482
483/*
484 * This function reads some important data from the IOMMU PCI space and
485 * initializes the driver data structure with it. It reads the hardware
486 * capabilities and the first/last device entries
487 */
355static void __init init_iommu_from_pci(struct amd_iommu *iommu) 488static void __init init_iommu_from_pci(struct amd_iommu *iommu)
356{ 489{
357 int bus = PCI_BUS(iommu->devid); 490 int bus = PCI_BUS(iommu->devid);
@@ -363,10 +496,16 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
363 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
364 497
365 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
366 iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); 499 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
367 iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); 500 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range));
368} 503}
369 504
505/*
506 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
507 * initializes the hardware and our data structures with it.
508 */
370static void __init init_iommu_from_acpi(struct amd_iommu *iommu, 509static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
371 struct ivhd_header *h) 510 struct ivhd_header *h)
372{ 511{
@@ -374,7 +513,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
374 u8 *end = p, flags = 0; 513 u8 *end = p, flags = 0;
375 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; 514 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
376 u32 ext_flags = 0; 515 u32 ext_flags = 0;
377 bool alias = 0; 516 bool alias = false;
378 struct ivhd_entry *e; 517 struct ivhd_entry *e;
379 518
380 /* 519 /*
@@ -414,22 +553,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
414 case IVHD_DEV_ALL: 553 case IVHD_DEV_ALL:
415 for (dev_i = iommu->first_device; 554 for (dev_i = iommu->first_device;
416 dev_i <= iommu->last_device; ++dev_i) 555 dev_i <= iommu->last_device; ++dev_i)
417 set_dev_entry_from_acpi(dev_i, e->flags, 0); 556 set_dev_entry_from_acpi(iommu, dev_i,
557 e->flags, 0);
418 break; 558 break;
419 case IVHD_DEV_SELECT: 559 case IVHD_DEV_SELECT:
420 devid = e->devid; 560 devid = e->devid;
421 set_dev_entry_from_acpi(devid, e->flags, 0); 561 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
422 break; 562 break;
423 case IVHD_DEV_SELECT_RANGE_START: 563 case IVHD_DEV_SELECT_RANGE_START:
424 devid_start = e->devid; 564 devid_start = e->devid;
425 flags = e->flags; 565 flags = e->flags;
426 ext_flags = 0; 566 ext_flags = 0;
427 alias = 0; 567 alias = false;
428 break; 568 break;
429 case IVHD_DEV_ALIAS: 569 case IVHD_DEV_ALIAS:
430 devid = e->devid; 570 devid = e->devid;
431 devid_to = e->ext >> 8; 571 devid_to = e->ext >> 8;
432 set_dev_entry_from_acpi(devid, e->flags, 0); 572 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
433 amd_iommu_alias_table[devid] = devid_to; 573 amd_iommu_alias_table[devid] = devid_to;
434 break; 574 break;
435 case IVHD_DEV_ALIAS_RANGE: 575 case IVHD_DEV_ALIAS_RANGE:
@@ -437,24 +577,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
437 flags = e->flags; 577 flags = e->flags;
438 devid_to = e->ext >> 8; 578 devid_to = e->ext >> 8;
439 ext_flags = 0; 579 ext_flags = 0;
440 alias = 1; 580 alias = true;
441 break; 581 break;
442 case IVHD_DEV_EXT_SELECT: 582 case IVHD_DEV_EXT_SELECT:
443 devid = e->devid; 583 devid = e->devid;
444 set_dev_entry_from_acpi(devid, e->flags, e->ext); 584 set_dev_entry_from_acpi(iommu, devid, e->flags,
585 e->ext);
445 break; 586 break;
446 case IVHD_DEV_EXT_SELECT_RANGE: 587 case IVHD_DEV_EXT_SELECT_RANGE:
447 devid_start = e->devid; 588 devid_start = e->devid;
448 flags = e->flags; 589 flags = e->flags;
449 ext_flags = e->ext; 590 ext_flags = e->ext;
450 alias = 0; 591 alias = false;
451 break; 592 break;
452 case IVHD_DEV_RANGE_END: 593 case IVHD_DEV_RANGE_END:
453 devid = e->devid; 594 devid = e->devid;
454 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 595 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
455 if (alias) 596 if (alias)
456 amd_iommu_alias_table[dev_i] = devid_to; 597 amd_iommu_alias_table[dev_i] = devid_to;
457 set_dev_entry_from_acpi( 598 set_dev_entry_from_acpi(iommu,
458 amd_iommu_alias_table[dev_i], 599 amd_iommu_alias_table[dev_i],
459 flags, ext_flags); 600 flags, ext_flags);
460 } 601 }
@@ -467,6 +608,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
467 } 608 }
468} 609}
469 610
611/* Initializes the device->iommu mapping for the driver */
470static int __init init_iommu_devices(struct amd_iommu *iommu) 612static int __init init_iommu_devices(struct amd_iommu *iommu)
471{ 613{
472 u16 i; 614 u16 i;
@@ -494,6 +636,11 @@ static void __init free_iommu_all(void)
494 } 636 }
495} 637}
496 638
639/*
640 * This function clues the initialization function for one IOMMU
641 * together and also allocates the command buffer and programs the
642 * hardware. It does NOT enable the IOMMU. This is done afterwards.
643 */
497static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 644static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
498{ 645{
499 spin_lock_init(&iommu->lock); 646 spin_lock_init(&iommu->lock);
@@ -521,6 +668,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
521 return 0; 668 return 0;
522} 669}
523 670
671/*
672 * Iterates over all IOMMU entries in the ACPI table, allocates the
673 * IOMMU structure and initializes it with init_iommu_one()
674 */
524static int __init init_iommu_all(struct acpi_table_header *table) 675static int __init init_iommu_all(struct acpi_table_header *table)
525{ 676{
526 u8 *p = (u8 *)table, *end = (u8 *)table; 677 u8 *p = (u8 *)table, *end = (u8 *)table;
@@ -528,8 +679,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
528 struct amd_iommu *iommu; 679 struct amd_iommu *iommu;
529 int ret; 680 int ret;
530 681
531 INIT_LIST_HEAD(&amd_iommu_list);
532
533 end += table->length; 682 end += table->length;
534 p += IVRS_HEADER_LENGTH; 683 p += IVRS_HEADER_LENGTH;
535 684
@@ -555,6 +704,14 @@ static int __init init_iommu_all(struct acpi_table_header *table)
555 return 0; 704 return 0;
556} 705}
557 706
707/****************************************************************************
708 *
709 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are
711 * gathered (like exclusion and unity mapping reanges).
712 *
713 ****************************************************************************/
714
558static void __init free_unity_maps(void) 715static void __init free_unity_maps(void)
559{ 716{
560 struct unity_map_entry *entry, *next; 717 struct unity_map_entry *entry, *next;
@@ -565,6 +722,7 @@ static void __init free_unity_maps(void)
565 } 722 }
566} 723}
567 724
725/* called when we find an exclusion range definition in ACPI */
568static int __init init_exclusion_range(struct ivmd_header *m) 726static int __init init_exclusion_range(struct ivmd_header *m)
569{ 727{
570 int i; 728 int i;
@@ -574,7 +732,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
574 set_device_exclusion_range(m->devid, m); 732 set_device_exclusion_range(m->devid, m);
575 break; 733 break;
576 case ACPI_IVMD_TYPE_ALL: 734 case ACPI_IVMD_TYPE_ALL:
577 for (i = 0; i < amd_iommu_last_bdf; ++i) 735 for (i = 0; i <= amd_iommu_last_bdf; ++i)
578 set_device_exclusion_range(i, m); 736 set_device_exclusion_range(i, m);
579 break; 737 break;
580 case ACPI_IVMD_TYPE_RANGE: 738 case ACPI_IVMD_TYPE_RANGE:
@@ -588,6 +746,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
588 return 0; 746 return 0;
589} 747}
590 748
749/* called for unity map ACPI definition */
591static int __init init_unity_map_range(struct ivmd_header *m) 750static int __init init_unity_map_range(struct ivmd_header *m)
592{ 751{
593 struct unity_map_entry *e = 0; 752 struct unity_map_entry *e = 0;
@@ -619,13 +778,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
619 return 0; 778 return 0;
620} 779}
621 780
781/* iterates over all memory definitions we find in the ACPI table */
622static int __init init_memory_definitions(struct acpi_table_header *table) 782static int __init init_memory_definitions(struct acpi_table_header *table)
623{ 783{
624 u8 *p = (u8 *)table, *end = (u8 *)table; 784 u8 *p = (u8 *)table, *end = (u8 *)table;
625 struct ivmd_header *m; 785 struct ivmd_header *m;
626 786
627 INIT_LIST_HEAD(&amd_iommu_unity_map);
628
629 end += table->length; 787 end += table->length;
630 p += IVRS_HEADER_LENGTH; 788 p += IVRS_HEADER_LENGTH;
631 789
@@ -642,6 +800,10 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
642 return 0; 800 return 0;
643} 801}
644 802
803/*
804 * This function finally enables all IOMMUs found in the system after
805 * they have been initialized
806 */
645static void __init enable_iommus(void) 807static void __init enable_iommus(void)
646{ 808{
647 struct amd_iommu *iommu; 809 struct amd_iommu *iommu;
@@ -678,6 +840,34 @@ static struct sys_device device_amd_iommu = {
678 .cls = &amd_iommu_sysdev_class, 840 .cls = &amd_iommu_sysdev_class,
679}; 841};
680 842
843/*
844 * This is the core init function for AMD IOMMU hardware in the system.
845 * This function is called from the generic x86 DMA layer initialization
846 * code.
847 *
848 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
849 * three times:
850 *
851 * 1 pass) Find the highest PCI device id the driver has to handle.
852 * Upon this information the size of the data structures is
853 * determined that needs to be allocated.
854 *
855 * 2 pass) Initialize the data structures just allocated with the
856 * information in the ACPI table about available AMD IOMMUs
857 * in the system. It also maps the PCI devices in the
858 * system to specific IOMMUs
859 *
860 * 3 pass) After the basic data structures are allocated and
861 * initialized we update them with information about memory
862 * remapping requirements parsed out of the ACPI table in
863 * this last pass.
864 *
865 * After that the hardware is initialized and ready to go. In the last
866 * step we do some Linux specific things like registering the driver in
867 * the dma_ops interface and initializing the suspend/resume support
868 * functions. Finally it prints some information about AMD IOMMUs and
869 * the driver state and enables the hardware.
870 */
681int __init amd_iommu_init(void) 871int __init amd_iommu_init(void)
682{ 872{
683 int i, ret = 0; 873 int i, ret = 0;
@@ -699,14 +889,14 @@ int __init amd_iommu_init(void)
699 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) 889 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
700 return -ENODEV; 890 return -ENODEV;
701 891
702 dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); 892 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
703 alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); 893 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
704 rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); 894 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
705 895
706 ret = -ENOMEM; 896 ret = -ENOMEM;
707 897
708 /* Device table - directly used by all IOMMUs */ 898 /* Device table - directly used by all IOMMUs */
709 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, 899 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
710 get_order(dev_table_size)); 900 get_order(dev_table_size));
711 if (amd_iommu_dev_table == NULL) 901 if (amd_iommu_dev_table == NULL)
712 goto out; 902 goto out;
@@ -730,27 +920,23 @@ int __init amd_iommu_init(void)
730 * Protection Domain table - maps devices to protection domains 920 * Protection Domain table - maps devices to protection domains
731 * This table has the same size as the rlookup_table 921 * This table has the same size as the rlookup_table
732 */ 922 */
733 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, 923 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
734 get_order(rlookup_table_size)); 924 get_order(rlookup_table_size));
735 if (amd_iommu_pd_table == NULL) 925 if (amd_iommu_pd_table == NULL)
736 goto free; 926 goto free;
737 927
738 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, 928 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
929 GFP_KERNEL | __GFP_ZERO,
739 get_order(MAX_DOMAIN_ID/8)); 930 get_order(MAX_DOMAIN_ID/8));
740 if (amd_iommu_pd_alloc_bitmap == NULL) 931 if (amd_iommu_pd_alloc_bitmap == NULL)
741 goto free; 932 goto free;
742 933
743 /* 934 /*
744 * memory is allocated now; initialize the device table with all zeroes 935 * let all alias entries point to itself
745 * and let all alias entries point to itself
746 */ 936 */
747 memset(amd_iommu_dev_table, 0, dev_table_size); 937 for (i = 0; i <= amd_iommu_last_bdf; ++i)
748 for (i = 0; i < amd_iommu_last_bdf; ++i)
749 amd_iommu_alias_table[i] = i; 938 amd_iommu_alias_table[i] = i;
750 939
751 memset(amd_iommu_pd_table, 0, rlookup_table_size);
752 memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
753
754 /* 940 /*
755 * never allocate domain 0 because its used as the non-allocated and 941 * never allocate domain 0 because its used as the non-allocated and
756 * error value placeholder 942 * error value placeholder
@@ -795,24 +981,19 @@ out:
795 return ret; 981 return ret;
796 982
797free: 983free:
798 if (amd_iommu_pd_alloc_bitmap) 984 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
799 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
800 985
801 if (amd_iommu_pd_table) 986 free_pages((unsigned long)amd_iommu_pd_table,
802 free_pages((unsigned long)amd_iommu_pd_table, 987 get_order(rlookup_table_size));
803 get_order(rlookup_table_size));
804 988
805 if (amd_iommu_rlookup_table) 989 free_pages((unsigned long)amd_iommu_rlookup_table,
806 free_pages((unsigned long)amd_iommu_rlookup_table, 990 get_order(rlookup_table_size));
807 get_order(rlookup_table_size));
808 991
809 if (amd_iommu_alias_table) 992 free_pages((unsigned long)amd_iommu_alias_table,
810 free_pages((unsigned long)amd_iommu_alias_table, 993 get_order(alias_table_size));
811 get_order(alias_table_size));
812 994
813 if (amd_iommu_dev_table) 995 free_pages((unsigned long)amd_iommu_dev_table,
814 free_pages((unsigned long)amd_iommu_dev_table, 996 get_order(dev_table_size));
815 get_order(dev_table_size));
816 997
817 free_iommu_all(); 998 free_iommu_all();
818 999
@@ -821,6 +1002,13 @@ free:
821 goto out; 1002 goto out;
822} 1003}
823 1004
1005/****************************************************************************
1006 *
1007 * Early detect code. This code runs at IOMMU detection time in the DMA
1008 * layer. It just looks if there is an IVRS ACPI table to detect AMD
1009 * IOMMUs
1010 *
1011 ****************************************************************************/
824static int __init early_amd_iommu_detect(struct acpi_table_header *table) 1012static int __init early_amd_iommu_detect(struct acpi_table_header *table)
825{ 1013{
826 return 0; 1014 return 0;
@@ -828,7 +1016,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
828 1016
829void __init amd_iommu_detect(void) 1017void __init amd_iommu_detect(void)
830{ 1018{
831 if (swiotlb || no_iommu || iommu_detected) 1019 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
832 return; 1020 return;
833 1021
834 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1022 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@@ -841,6 +1029,13 @@ void __init amd_iommu_detect(void)
841 } 1029 }
842} 1030}
843 1031
1032/****************************************************************************
1033 *
1034 * Parsing functions for the AMD IOMMU specific kernel command line
1035 * options.
1036 *
1037 ****************************************************************************/
1038
844static int __init parse_amd_iommu_options(char *str) 1039static int __init parse_amd_iommu_options(char *str)
845{ 1040{
846 for (; *str; ++str) { 1041 for (; *str; ++str) {
@@ -853,20 +1048,10 @@ static int __init parse_amd_iommu_options(char *str)
853 1048
854static int __init parse_amd_iommu_size_options(char *str) 1049static int __init parse_amd_iommu_size_options(char *str)
855{ 1050{
856 for (; *str; ++str) { 1051 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
857 if (strcmp(str, "32M") == 0) 1052
858 amd_iommu_aperture_order = 25; 1053 if ((order > 24) && (order < 31))
859 if (strcmp(str, "64M") == 0) 1054 amd_iommu_aperture_order = order;
860 amd_iommu_aperture_order = 26;
861 if (strcmp(str, "128M") == 0)
862 amd_iommu_aperture_order = 27;
863 if (strcmp(str, "256M") == 0)
864 amd_iommu_aperture_order = 28;
865 if (strcmp(str, "512M") == 0)
866 amd_iommu_aperture_order = 29;
867 if (strcmp(str, "1G") == 0)
868 amd_iommu_aperture_order = 30;
869 }
870 1055
871 return 1; 1056 return 1;
872} 1057}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9f907806c1a5..44e21826db11 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/io.h> 23#include <asm/io.h>
24#include <asm/iommu.h>
24#include <asm/gart.h> 25#include <asm/gart.h>
25#include <asm/pci-direct.h> 26#include <asm/pci-direct.h>
26#include <asm/dma.h> 27#include <asm/dma.h>
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index a437d027f20b..d6c898358371 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
75/* 75/*
76 * Debug level, exported for io_apic.c 76 * Debug level, exported for io_apic.c
77 */ 77 */
78int apic_verbosity; 78unsigned int apic_verbosity;
79 79
80int pic_mode; 80int pic_mode;
81 81
@@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
177 /* Level triggered for 82489DX */ 177 /* Level triggered for 82489DX */
178 if (!lapic_is_integrated()) 178 if (!lapic_is_integrated())
179 v |= APIC_LVT_LEVEL_TRIGGER; 179 v |= APIC_LVT_LEVEL_TRIGGER;
180 apic_write_around(APIC_LVT0, v); 180 apic_write(APIC_LVT0, v);
181} 181}
182 182
183/** 183/**
@@ -212,9 +212,6 @@ int lapic_get_maxlvt(void)
212 * this function twice on the boot CPU, once with a bogus timeout 212 * this function twice on the boot CPU, once with a bogus timeout
213 * value, second time for real. The other (noncalibrating) CPUs 213 * value, second time for real. The other (noncalibrating) CPUs
214 * call this function only once, with the real, calibrated value. 214 * call this function only once, with the real, calibrated value.
215 *
216 * We do reads before writes even if unnecessary, to get around the
217 * P5 APIC double write bug.
218 */ 215 */
219static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 216static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
220{ 217{
@@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
229 if (!irqen) 226 if (!irqen)
230 lvtt_value |= APIC_LVT_MASKED; 227 lvtt_value |= APIC_LVT_MASKED;
231 228
232 apic_write_around(APIC_LVTT, lvtt_value); 229 apic_write(APIC_LVTT, lvtt_value);
233 230
234 /* 231 /*
235 * Divide PICLK by 16 232 * Divide PICLK by 16
236 */ 233 */
237 tmp_value = apic_read(APIC_TDCR); 234 tmp_value = apic_read(APIC_TDCR);
238 apic_write_around(APIC_TDCR, (tmp_value 235 apic_write(APIC_TDCR,
239 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 236 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
240 | APIC_TDR_DIV_16); 237 APIC_TDR_DIV_16);
241 238
242 if (!oneshot) 239 if (!oneshot)
243 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 240 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
244} 241}
245 242
246/* 243/*
@@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
249static int lapic_next_event(unsigned long delta, 246static int lapic_next_event(unsigned long delta,
250 struct clock_event_device *evt) 247 struct clock_event_device *evt)
251{ 248{
252 apic_write_around(APIC_TMICT, delta); 249 apic_write(APIC_TMICT, delta);
253 return 0; 250 return 0;
254} 251}
255 252
@@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
278 case CLOCK_EVT_MODE_SHUTDOWN: 275 case CLOCK_EVT_MODE_SHUTDOWN:
279 v = apic_read(APIC_LVTT); 276 v = apic_read(APIC_LVTT);
280 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 277 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
281 apic_write_around(APIC_LVTT, v); 278 apic_write(APIC_LVTT, v);
282 break; 279 break;
283 case CLOCK_EVT_MODE_RESUME: 280 case CLOCK_EVT_MODE_RESUME:
284 /* Nothing to do here */ 281 /* Nothing to do here */
@@ -372,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
372 } 369 }
373} 370}
374 371
375/* 372static int __init calibrate_APIC_clock(void)
376 * Setup the boot APIC
377 *
378 * Calibrate and verify the result.
379 */
380void __init setup_boot_APIC_clock(void)
381{ 373{
382 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 374 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
383 const long pm_100ms = PMTMR_TICKS_PER_SEC/10; 375 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
@@ -387,24 +379,6 @@ void __init setup_boot_APIC_clock(void)
387 long delta, deltapm; 379 long delta, deltapm;
388 int pm_referenced = 0; 380 int pm_referenced = 0;
389 381
390 /*
391 * The local apic timer can be disabled via the kernel
392 * commandline or from the CPU detection code. Register the lapic
393 * timer as a dummy clock event source on SMP systems, so the
394 * broadcast mechanism is used. On UP systems simply ignore it.
395 */
396 if (local_apic_timer_disabled) {
397 /* No broadcast on UP ! */
398 if (num_possible_cpus() > 1) {
399 lapic_clockevent.mult = 1;
400 setup_APIC_timer();
401 }
402 return;
403 }
404
405 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
406 "calibrating APIC timer ...\n");
407
408 local_irq_disable(); 382 local_irq_disable();
409 383
410 /* Replace the global interrupt handler */ 384 /* Replace the global interrupt handler */
@@ -489,8 +463,6 @@ void __init setup_boot_APIC_clock(void)
489 calibration_result / (1000000 / HZ), 463 calibration_result / (1000000 / HZ),
490 calibration_result % (1000000 / HZ)); 464 calibration_result % (1000000 / HZ));
491 465
492 local_apic_timer_verify_ok = 1;
493
494 /* 466 /*
495 * Do a sanity check on the APIC calibration result 467 * Do a sanity check on the APIC calibration result
496 */ 468 */
@@ -498,12 +470,11 @@ void __init setup_boot_APIC_clock(void)
498 local_irq_enable(); 470 local_irq_enable();
499 printk(KERN_WARNING 471 printk(KERN_WARNING
500 "APIC frequency too slow, disabling apic timer\n"); 472 "APIC frequency too slow, disabling apic timer\n");
501 /* No broadcast on UP ! */ 473 return -1;
502 if (num_possible_cpus() > 1)
503 setup_APIC_timer();
504 return;
505 } 474 }
506 475
476 local_apic_timer_verify_ok = 1;
477
507 /* We trust the pm timer based calibration */ 478 /* We trust the pm timer based calibration */
508 if (!pm_referenced) { 479 if (!pm_referenced) {
509 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 480 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -543,22 +514,55 @@ void __init setup_boot_APIC_clock(void)
543 if (!local_apic_timer_verify_ok) { 514 if (!local_apic_timer_verify_ok) {
544 printk(KERN_WARNING 515 printk(KERN_WARNING
545 "APIC timer disabled due to verification failure.\n"); 516 "APIC timer disabled due to verification failure.\n");
517 return -1;
518 }
519
520 return 0;
521}
522
523/*
524 * Setup the boot APIC
525 *
526 * Calibrate and verify the result.
527 */
528void __init setup_boot_APIC_clock(void)
529{
530 /*
531 * The local apic timer can be disabled via the kernel
532 * commandline or from the CPU detection code. Register the lapic
533 * timer as a dummy clock event source on SMP systems, so the
534 * broadcast mechanism is used. On UP systems simply ignore it.
535 */
536 if (local_apic_timer_disabled) {
546 /* No broadcast on UP ! */ 537 /* No broadcast on UP ! */
547 if (num_possible_cpus() == 1) 538 if (num_possible_cpus() > 1) {
548 return; 539 lapic_clockevent.mult = 1;
549 } else { 540 setup_APIC_timer();
550 /* 541 }
551 * If nmi_watchdog is set to IO_APIC, we need the 542 return;
552 * PIT/HPET going. Otherwise register lapic as a dummy
553 * device.
554 */
555 if (nmi_watchdog != NMI_IO_APIC)
556 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
557 else
558 printk(KERN_WARNING "APIC timer registered as dummy,"
559 " due to nmi_watchdog=%d!\n", nmi_watchdog);
560 } 543 }
561 544
545 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
546 "calibrating APIC timer ...\n");
547
548 if (calibrate_APIC_clock()) {
549 /* No broadcast on UP ! */
550 if (num_possible_cpus() > 1)
551 setup_APIC_timer();
552 return;
553 }
554
555 /*
556 * If nmi_watchdog is set to IO_APIC, we need the
557 * PIT/HPET going. Otherwise register lapic as a dummy
558 * device.
559 */
560 if (nmi_watchdog != NMI_IO_APIC)
561 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
562 else
563 printk(KERN_WARNING "APIC timer registered as dummy,"
564 " due to nmi_watchdog=%d!\n", nmi_watchdog);
565
562 /* Setup the lapic or request the broadcast */ 566 /* Setup the lapic or request the broadcast */
563 setup_APIC_timer(); 567 setup_APIC_timer();
564} 568}
@@ -693,44 +697,44 @@ void clear_local_APIC(void)
693 */ 697 */
694 if (maxlvt >= 3) { 698 if (maxlvt >= 3) {
695 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ 699 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
696 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); 700 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
697 } 701 }
698 /* 702 /*
699 * Careful: we have to set masks only first to deassert 703 * Careful: we have to set masks only first to deassert
700 * any level-triggered sources. 704 * any level-triggered sources.
701 */ 705 */
702 v = apic_read(APIC_LVTT); 706 v = apic_read(APIC_LVTT);
703 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 707 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
704 v = apic_read(APIC_LVT0); 708 v = apic_read(APIC_LVT0);
705 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 709 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
706 v = apic_read(APIC_LVT1); 710 v = apic_read(APIC_LVT1);
707 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); 711 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
708 if (maxlvt >= 4) { 712 if (maxlvt >= 4) {
709 v = apic_read(APIC_LVTPC); 713 v = apic_read(APIC_LVTPC);
710 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); 714 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
711 } 715 }
712 716
713 /* lets not touch this if we didn't frob it */ 717 /* lets not touch this if we didn't frob it */
714#ifdef CONFIG_X86_MCE_P4THERMAL 718#ifdef CONFIG_X86_MCE_P4THERMAL
715 if (maxlvt >= 5) { 719 if (maxlvt >= 5) {
716 v = apic_read(APIC_LVTTHMR); 720 v = apic_read(APIC_LVTTHMR);
717 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); 721 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
718 } 722 }
719#endif 723#endif
720 /* 724 /*
721 * Clean APIC state for other OSs: 725 * Clean APIC state for other OSs:
722 */ 726 */
723 apic_write_around(APIC_LVTT, APIC_LVT_MASKED); 727 apic_write(APIC_LVTT, APIC_LVT_MASKED);
724 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 728 apic_write(APIC_LVT0, APIC_LVT_MASKED);
725 apic_write_around(APIC_LVT1, APIC_LVT_MASKED); 729 apic_write(APIC_LVT1, APIC_LVT_MASKED);
726 if (maxlvt >= 3) 730 if (maxlvt >= 3)
727 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); 731 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
728 if (maxlvt >= 4) 732 if (maxlvt >= 4)
729 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); 733 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
730 734
731#ifdef CONFIG_X86_MCE_P4THERMAL 735#ifdef CONFIG_X86_MCE_P4THERMAL
732 if (maxlvt >= 5) 736 if (maxlvt >= 5)
733 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); 737 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
734#endif 738#endif
735 /* Integrated APIC (!82489DX) ? */ 739 /* Integrated APIC (!82489DX) ? */
736 if (lapic_is_integrated()) { 740 if (lapic_is_integrated()) {
@@ -756,7 +760,7 @@ void disable_local_APIC(void)
756 */ 760 */
757 value = apic_read(APIC_SPIV); 761 value = apic_read(APIC_SPIV);
758 value &= ~APIC_SPIV_APIC_ENABLED; 762 value &= ~APIC_SPIV_APIC_ENABLED;
759 apic_write_around(APIC_SPIV, value); 763 apic_write(APIC_SPIV, value);
760 764
761 /* 765 /*
762 * When LAPIC was disabled by the BIOS and enabled by the kernel, 766 * When LAPIC was disabled by the BIOS and enabled by the kernel,
@@ -865,8 +869,8 @@ void __init sync_Arb_IDs(void)
865 apic_wait_icr_idle(); 869 apic_wait_icr_idle();
866 870
867 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 871 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
868 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 872 apic_write(APIC_ICR,
869 | APIC_DM_INIT); 873 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
870} 874}
871 875
872/* 876/*
@@ -902,16 +906,16 @@ void __init init_bsp_APIC(void)
902 else 906 else
903 value |= APIC_SPIV_FOCUS_DISABLED; 907 value |= APIC_SPIV_FOCUS_DISABLED;
904 value |= SPURIOUS_APIC_VECTOR; 908 value |= SPURIOUS_APIC_VECTOR;
905 apic_write_around(APIC_SPIV, value); 909 apic_write(APIC_SPIV, value);
906 910
907 /* 911 /*
908 * Set up the virtual wire mode. 912 * Set up the virtual wire mode.
909 */ 913 */
910 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 914 apic_write(APIC_LVT0, APIC_DM_EXTINT);
911 value = APIC_DM_NMI; 915 value = APIC_DM_NMI;
912 if (!lapic_is_integrated()) /* 82489DX */ 916 if (!lapic_is_integrated()) /* 82489DX */
913 value |= APIC_LVT_LEVEL_TRIGGER; 917 value |= APIC_LVT_LEVEL_TRIGGER;
914 apic_write_around(APIC_LVT1, value); 918 apic_write(APIC_LVT1, value);
915} 919}
916 920
917static void __cpuinit lapic_setup_esr(void) 921static void __cpuinit lapic_setup_esr(void)
@@ -926,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void)
926 930
927 /* enables sending errors */ 931 /* enables sending errors */
928 value = ERROR_APIC_VECTOR; 932 value = ERROR_APIC_VECTOR;
929 apic_write_around(APIC_LVTERR, value); 933 apic_write(APIC_LVTERR, value);
930 /* 934 /*
931 * spec says clear errors after enabling vector. 935 * spec says clear errors after enabling vector.
932 */ 936 */
@@ -989,7 +993,7 @@ void __cpuinit setup_local_APIC(void)
989 */ 993 */
990 value = apic_read(APIC_TASKPRI); 994 value = apic_read(APIC_TASKPRI);
991 value &= ~APIC_TPRI_MASK; 995 value &= ~APIC_TPRI_MASK;
992 apic_write_around(APIC_TASKPRI, value); 996 apic_write(APIC_TASKPRI, value);
993 997
994 /* 998 /*
995 * After a crash, we no longer service the interrupts and a pending 999 * After a crash, we no longer service the interrupts and a pending
@@ -1047,7 +1051,7 @@ void __cpuinit setup_local_APIC(void)
1047 * Set spurious IRQ vector 1051 * Set spurious IRQ vector
1048 */ 1052 */
1049 value |= SPURIOUS_APIC_VECTOR; 1053 value |= SPURIOUS_APIC_VECTOR;
1050 apic_write_around(APIC_SPIV, value); 1054 apic_write(APIC_SPIV, value);
1051 1055
1052 /* 1056 /*
1053 * Set up LVT0, LVT1: 1057 * Set up LVT0, LVT1:
@@ -1069,7 +1073,7 @@ void __cpuinit setup_local_APIC(void)
1069 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1073 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
1070 smp_processor_id()); 1074 smp_processor_id());
1071 } 1075 }
1072 apic_write_around(APIC_LVT0, value); 1076 apic_write(APIC_LVT0, value);
1073 1077
1074 /* 1078 /*
1075 * only the BP should see the LINT1 NMI signal, obviously. 1079 * only the BP should see the LINT1 NMI signal, obviously.
@@ -1080,7 +1084,7 @@ void __cpuinit setup_local_APIC(void)
1080 value = APIC_DM_NMI | APIC_LVT_MASKED; 1084 value = APIC_DM_NMI | APIC_LVT_MASKED;
1081 if (!integrated) /* 82489DX */ 1085 if (!integrated) /* 82489DX */
1082 value |= APIC_LVT_LEVEL_TRIGGER; 1086 value |= APIC_LVT_LEVEL_TRIGGER;
1083 apic_write_around(APIC_LVT1, value); 1087 apic_write(APIC_LVT1, value);
1084} 1088}
1085 1089
1086void __cpuinit end_local_APIC_setup(void) 1090void __cpuinit end_local_APIC_setup(void)
@@ -1091,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void)
1091 /* Disable the local apic timer */ 1095 /* Disable the local apic timer */
1092 value = apic_read(APIC_LVTT); 1096 value = apic_read(APIC_LVTT);
1093 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1097 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1094 apic_write_around(APIC_LVTT, value); 1098 apic_write(APIC_LVTT, value);
1095 1099
1096 setup_apic_nmi_watchdog(NULL); 1100 setup_apic_nmi_watchdog(NULL);
1097 apic_pm_activate(); 1101 apic_pm_activate();
@@ -1214,9 +1218,6 @@ int apic_version[MAX_APICS];
1214 1218
1215int __init APIC_init_uniprocessor(void) 1219int __init APIC_init_uniprocessor(void)
1216{ 1220{
1217 if (disable_apic)
1218 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1219
1220 if (!smp_found_config && !cpu_has_apic) 1221 if (!smp_found_config && !cpu_has_apic)
1221 return -1; 1222 return -1;
1222 1223
@@ -1419,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1419 value &= ~APIC_VECTOR_MASK; 1420 value &= ~APIC_VECTOR_MASK;
1420 value |= APIC_SPIV_APIC_ENABLED; 1421 value |= APIC_SPIV_APIC_ENABLED;
1421 value |= 0xf; 1422 value |= 0xf;
1422 apic_write_around(APIC_SPIV, value); 1423 apic_write(APIC_SPIV, value);
1423 1424
1424 if (!virt_wire_setup) { 1425 if (!virt_wire_setup) {
1425 /* 1426 /*
@@ -1432,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1432 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1433 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1433 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1434 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1434 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); 1435 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1435 apic_write_around(APIC_LVT0, value); 1436 apic_write(APIC_LVT0, value);
1436 } else { 1437 } else {
1437 /* Disable LVT0 */ 1438 /* Disable LVT0 */
1438 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 1439 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1439 } 1440 }
1440 1441
1441 /* 1442 /*
@@ -1449,7 +1450,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1449 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1450 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1450 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1451 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1451 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1452 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1452 apic_write_around(APIC_LVT1, value); 1453 apic_write(APIC_LVT1, value);
1453 } 1454 }
1454} 1455}
1455 1456
@@ -1700,7 +1701,7 @@ early_param("lapic", parse_lapic);
1700static int __init parse_nolapic(char *arg) 1701static int __init parse_nolapic(char *arg)
1701{ 1702{
1702 disable_apic = 1; 1703 disable_apic = 1;
1703 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1704 setup_clear_cpu_cap(X86_FEATURE_APIC);
1704 return 0; 1705 return 0;
1705} 1706}
1706early_param("nolapic", parse_nolapic); 1707early_param("nolapic", parse_nolapic);
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 1e3d32e27c14..7f1f030da7ee 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
54/* 54/*
55 * Debug level, exported for io_apic.c 55 * Debug level, exported for io_apic.c
56 */ 56 */
57int apic_verbosity; 57unsigned int apic_verbosity;
58 58
59/* Have we found an MP table */ 59/* Have we found an MP table */
60int smp_found_config; 60int smp_found_config;
@@ -314,7 +314,7 @@ static void setup_APIC_timer(void)
314 314
315#define TICK_COUNT 100000000 315#define TICK_COUNT 100000000
316 316
317static void __init calibrate_APIC_clock(void) 317static int __init calibrate_APIC_clock(void)
318{ 318{
319 unsigned apic, apic_start; 319 unsigned apic, apic_start;
320 unsigned long tsc, tsc_start; 320 unsigned long tsc, tsc_start;
@@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void)
368 clockevent_delta2ns(0xF, &lapic_clockevent); 368 clockevent_delta2ns(0xF, &lapic_clockevent);
369 369
370 calibration_result = result / HZ; 370 calibration_result = result / HZ;
371
372 /*
373 * Do a sanity check on the APIC calibration result
374 */
375 if (calibration_result < (1000000 / HZ)) {
376 printk(KERN_WARNING
377 "APIC frequency too slow, disabling apic timer\n");
378 return -1;
379 }
380
381 return 0;
371} 382}
372 383
373/* 384/*
@@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void)
394 } 405 }
395 406
396 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 407 printk(KERN_INFO "Using local APIC timer interrupts.\n");
397 calibrate_APIC_clock(); 408 if (calibrate_APIC_clock()) {
398
399 /*
400 * Do a sanity check on the APIC calibration result
401 */
402 if (calibration_result < (1000000 / HZ)) {
403 printk(KERN_WARNING
404 "APIC frequency too slow, disabling apic timer\n");
405 /* No broadcast on UP ! */ 409 /* No broadcast on UP ! */
406 if (num_possible_cpus() > 1) 410 if (num_possible_cpus() > 1)
407 setup_APIC_timer(); 411 setup_APIC_timer();
@@ -1337,7 +1341,7 @@ early_param("apic", apic_set_verbosity);
1337static __init int setup_disableapic(char *str) 1341static __init int setup_disableapic(char *str)
1338{ 1342{
1339 disable_apic = 1; 1343 disable_apic = 1;
1340 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1344 setup_clear_cpu_cap(X86_FEATURE_APIC);
1341 return 0; 1345 return 0;
1342} 1346}
1343early_param("disableapic", setup_disableapic); 1347early_param("disableapic", setup_disableapic);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9b441331e9..9ee24e6bc4b0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -219,7 +219,6 @@
219#include <linux/time.h> 219#include <linux/time.h>
220#include <linux/sched.h> 220#include <linux/sched.h>
221#include <linux/pm.h> 221#include <linux/pm.h>
222#include <linux/pm_legacy.h>
223#include <linux/capability.h> 222#include <linux/capability.h>
224#include <linux/device.h> 223#include <linux/device.h>
225#include <linux/kernel.h> 224#include <linux/kernel.h>
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bacf5deeec2d..aa89387006fe 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,6 +18,8 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20 20
21#include <xen/interface/xen.h>
22
21#define __NO_STUBS 1 23#define __NO_STUBS 1
22#undef __SYSCALL 24#undef __SYSCALL
23#undef _ASM_X86_64_UNISTD_H_ 25#undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
131 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 133 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
132 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 134 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
133 OFFSET(BP_version, boot_params, hdr.version); 135 OFFSET(BP_version, boot_params, hdr.version);
136
137 BLANK();
138 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
139#ifdef CONFIG_XEN
140 BLANK();
141 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
142 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
143#undef ENTRY
144#endif
134 return 0; 145 return 0;
135} 146}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
new file mode 100644
index 000000000000..c639bd55391c
--- /dev/null
+++ b/arch/x86/kernel/bios_uv.c
@@ -0,0 +1,48 @@
1/*
2 * BIOS run time interface routines.
3 *
4 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <asm/uv/bios.h>
22
23const char *
24x86_bios_strerror(long status)
25{
26 const char *str;
27 switch (status) {
28 case 0: str = "Call completed without error"; break;
29 case -1: str = "Not implemented"; break;
30 case -2: str = "Invalid argument"; break;
31 case -3: str = "Call completed with error"; break;
32 default: str = "Unknown BIOS status code"; break;
33 }
34 return str;
35}
36
37long
38x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
39 unsigned long *drift_info)
40{
41 struct uv_bios_retval isrv;
42
43 BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
44 *ticks_per_second = isrv.v0;
45 *drift_info = isrv.v1;
46 return isrv.status;
47}
48EXPORT_SYMBOL_GPL(x86_bios_freq_base);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 81a07ca65d44..cae9cabc3031 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -24,8 +24,6 @@
24extern void vide(void); 24extern void vide(void);
25__asm__(".align 4\nvide: ret"); 25__asm__(".align 4\nvide: ret");
26 26
27int force_mwait __cpuinitdata;
28
29static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
30{ 28{
31 if (cpuid_eax(0x80000000) >= 0x80000007) { 29 if (cpuid_eax(0x80000000) >= 0x80000007) {
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 7c36fb8a28d4..d1692b2a41ff 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ 115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8)) 116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
118} 120}
119 121
120static void __cpuinit init_amd(struct cpuinfo_x86 *c) 122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1b1c56bb338f..c9b58a806e85 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -131,13 +131,7 @@ static void __init check_popad(void)
131 * (for due to lack of "invlpg" and working WP on a i386) 131 * (for due to lack of "invlpg" and working WP on a i386)
132 * - In order to run on anything without a TSC, we need to be 132 * - In order to run on anything without a TSC, we need to be
133 * compiled for a i486. 133 * compiled for a i486.
134 * - In order to support the local APIC on a buggy Pentium machine, 134 */
135 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
136 * which happens implicitly if compiled for a Pentium or lower
137 * (unless an advanced selection of CPU features is used) as an
138 * otherwise config implies a properly working local APIC without
139 * the need to do extra reads from the APIC.
140*/
141 135
142static void __init check_config(void) 136static void __init check_config(void)
143{ 137{
@@ -151,21 +145,6 @@ static void __init check_config(void)
151 if (boot_cpu_data.x86 == 3) 145 if (boot_cpu_data.x86 == 3)
152 panic("Kernel requires i486+ for 'invlpg' and other features"); 146 panic("Kernel requires i486+ for 'invlpg' and other features");
153#endif 147#endif
154
155/*
156 * If we were told we had a good local APIC, check for buggy Pentia,
157 * i.e. all B steppings and the C2 stepping of P54C when using their
158 * integrated APIC (see 11AP erratum in "Pentium Processor
159 * Specification Update").
160 */
161#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
162 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
163 && cpu_has_apic
164 && boot_cpu_data.x86 == 5
165 && boot_cpu_data.x86_model == 2
166 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
167 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
168#endif
169} 148}
170 149
171 150
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 7b8cc72feb40..dd6e3f15017e 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -7,15 +7,13 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/kgdb.h> 8#include <linux/kgdb.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/string.h>
11#include <linux/delay.h> 10#include <linux/delay.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
13#include <linux/module.h>
14#include <linux/percpu.h> 12#include <linux/percpu.h>
15#include <asm/processor.h>
16#include <asm/i387.h> 13#include <asm/i387.h>
17#include <asm/msr.h> 14#include <asm/msr.h>
18#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
19#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
20#include <asm/mtrr.h> 18#include <asm/mtrr.h>
21#include <asm/mce.h> 19#include <asm/mce.h>
@@ -305,7 +303,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
305 c->x86_capability[2] = cpuid_edx(0x80860001); 303 c->x86_capability[2] = cpuid_edx(0x80860001);
306 } 304 }
307 305
308 c->extended_cpuid_level = cpuid_eax(0x80000000);
309 if (c->extended_cpuid_level >= 0x80000007) 306 if (c->extended_cpuid_level >= 0x80000007)
310 c->x86_power = cpuid_edx(0x80000007); 307 c->x86_power = cpuid_edx(0x80000007);
311 308
@@ -316,18 +313,11 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
316 c->x86_phys_bits = eax & 0xff; 313 c->x86_phys_bits = eax & 0xff;
317 } 314 }
318 315
319 /* Assume all 64-bit CPUs support 32-bit syscall */
320 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
321
322 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 316 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
323 cpu_devs[c->x86_vendor]->c_early_init) 317 cpu_devs[c->x86_vendor]->c_early_init)
324 cpu_devs[c->x86_vendor]->c_early_init(c); 318 cpu_devs[c->x86_vendor]->c_early_init(c);
325 319
326 validate_pat_support(c); 320 validate_pat_support(c);
327
328 /* early_param could clear that, but recall get it set again */
329 if (disable_apic)
330 clear_cpu_cap(c, X86_FEATURE_APIC);
331} 321}
332 322
333/* 323/*
@@ -517,8 +507,7 @@ void pda_init(int cpu)
517} 507}
518 508
519char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 509char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
520 DEBUG_STKSZ] 510 DEBUG_STKSZ] __page_aligned_bss;
521__attribute__((section(".bss.page_aligned")));
522 511
523extern asmlinkage void ignore_sysret(void); 512extern asmlinkage void ignore_sysret(void);
524 513
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index b0c8208df9fa..ff2fff56f0a8 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -200,10 +200,12 @@ static void drv_read(struct drv_cmd *cmd)
200static void drv_write(struct drv_cmd *cmd) 200static void drv_write(struct drv_cmd *cmd)
201{ 201{
202 cpumask_t saved_mask = current->cpus_allowed; 202 cpumask_t saved_mask = current->cpus_allowed;
203 cpumask_of_cpu_ptr_declare(cpu_mask);
203 unsigned int i; 204 unsigned int i;
204 205
205 for_each_cpu_mask(i, cmd->mask) { 206 for_each_cpu_mask_nr(i, cmd->mask) {
206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); 207 cpumask_of_cpu_ptr_next(cpu_mask, i);
208 set_cpus_allowed_ptr(current, cpu_mask);
207 do_drv_write(cmd); 209 do_drv_write(cmd);
208 } 210 }
209 211
@@ -267,11 +269,12 @@ static unsigned int get_measured_perf(unsigned int cpu)
267 } aperf_cur, mperf_cur; 269 } aperf_cur, mperf_cur;
268 270
269 cpumask_t saved_mask; 271 cpumask_t saved_mask;
272 cpumask_of_cpu_ptr(cpu_mask, cpu);
270 unsigned int perf_percent; 273 unsigned int perf_percent;
271 unsigned int retval; 274 unsigned int retval;
272 275
273 saved_mask = current->cpus_allowed; 276 saved_mask = current->cpus_allowed;
274 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 277 set_cpus_allowed_ptr(current, cpu_mask);
275 if (get_cpu() != cpu) { 278 if (get_cpu() != cpu) {
276 /* We were not able to run on requested processor */ 279 /* We were not able to run on requested processor */
277 put_cpu(); 280 put_cpu();
@@ -337,6 +340,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
337 340
338static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 341static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
339{ 342{
343 cpumask_of_cpu_ptr(cpu_mask, cpu);
340 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); 344 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
341 unsigned int freq; 345 unsigned int freq;
342 unsigned int cached_freq; 346 unsigned int cached_freq;
@@ -349,7 +353,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
349 } 353 }
350 354
351 cached_freq = data->freq_table[data->acpi_data->state].frequency; 355 cached_freq = data->freq_table[data->acpi_data->state].frequency;
352 freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data); 356 freq = extract_freq(get_cur_val(cpu_mask), data);
353 if (freq != cached_freq) { 357 if (freq != cached_freq) {
354 /* 358 /*
355 * The dreaded BIOS frequency change behind our back. 359 * The dreaded BIOS frequency change behind our back.
@@ -451,7 +455,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
451 455
452 freqs.old = perf->states[perf->state].core_frequency * 1000; 456 freqs.old = perf->states[perf->state].core_frequency * 1000;
453 freqs.new = data->freq_table[next_state].frequency; 457 freqs.new = data->freq_table[next_state].frequency;
454 for_each_cpu_mask(i, cmd.mask) { 458 for_each_cpu_mask_nr(i, cmd.mask) {
455 freqs.cpu = i; 459 freqs.cpu = i;
456 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 460 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
457 } 461 }
@@ -466,7 +470,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
466 } 470 }
467 } 471 }
468 472
469 for_each_cpu_mask(i, cmd.mask) { 473 for_each_cpu_mask_nr(i, cmd.mask) {
470 freqs.cpu = i; 474 freqs.cpu = i;
471 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 475 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
472 } 476 }
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 199e4e05e5dc..f1685fb91fbd 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
122 return 0; 122 return 0;
123 123
124 /* notifiers */ 124 /* notifiers */
125 for_each_cpu_mask(i, policy->cpus) { 125 for_each_cpu_mask_nr(i, policy->cpus) {
126 freqs.cpu = i; 126 freqs.cpu = i;
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 } 128 }
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software 130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3 131 * Developer's Manual, Volume 3
132 */ 132 */
133 for_each_cpu_mask(i, policy->cpus) 133 for_each_cpu_mask_nr(i, policy->cpus)
134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); 134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
135 135
136 /* notifiers */ 136 /* notifiers */
137 for_each_cpu_mask(i, policy->cpus) { 137 for_each_cpu_mask_nr(i, policy->cpus) {
138 freqs.cpu = i; 138 freqs.cpu = i;
139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
140 } 140 }
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
index f8a63b3664e3..35fb4eaf6e1c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
@@ -1,5 +1,4 @@
1/* 1/*
2 * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $
3 * (C) 2003 Dave Jones. 2 * (C) 2003 Dave Jones.
4 * 3 *
5 * Licensed under the terms of the GNU GPL License version 2. 4 * Licensed under the terms of the GNU GPL License version 2.
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 206791eb46e3..53c7b6936973 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -479,11 +479,12 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi
479static int check_supported_cpu(unsigned int cpu) 479static int check_supported_cpu(unsigned int cpu)
480{ 480{
481 cpumask_t oldmask; 481 cpumask_t oldmask;
482 cpumask_of_cpu_ptr(cpu_mask, cpu);
482 u32 eax, ebx, ecx, edx; 483 u32 eax, ebx, ecx, edx;
483 unsigned int rc = 0; 484 unsigned int rc = 0;
484 485
485 oldmask = current->cpus_allowed; 486 oldmask = current->cpus_allowed;
486 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 487 set_cpus_allowed_ptr(current, cpu_mask);
487 488
488 if (smp_processor_id() != cpu) { 489 if (smp_processor_id() != cpu) {
489 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); 490 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
@@ -966,7 +967,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
966 freqs.old = find_khz_freq_from_fid(data->currfid); 967 freqs.old = find_khz_freq_from_fid(data->currfid);
967 freqs.new = find_khz_freq_from_fid(fid); 968 freqs.new = find_khz_freq_from_fid(fid);
968 969
969 for_each_cpu_mask(i, *(data->available_cores)) { 970 for_each_cpu_mask_nr(i, *(data->available_cores)) {
970 freqs.cpu = i; 971 freqs.cpu = i;
971 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 972 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
972 } 973 }
@@ -974,7 +975,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
974 res = transition_fid_vid(data, fid, vid); 975 res = transition_fid_vid(data, fid, vid);
975 freqs.new = find_khz_freq_from_fid(data->currfid); 976 freqs.new = find_khz_freq_from_fid(data->currfid);
976 977
977 for_each_cpu_mask(i, *(data->available_cores)) { 978 for_each_cpu_mask_nr(i, *(data->available_cores)) {
978 freqs.cpu = i; 979 freqs.cpu = i;
979 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 980 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
980 } 981 }
@@ -997,7 +998,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
997 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 998 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
998 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 999 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
999 1000
1000 for_each_cpu_mask(i, *(data->available_cores)) { 1001 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1001 freqs.cpu = i; 1002 freqs.cpu = i;
1002 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1003 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1003 } 1004 }
@@ -1005,7 +1006,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1005 res = transition_pstate(data, pstate); 1006 res = transition_pstate(data, pstate);
1006 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1007 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1007 1008
1008 for_each_cpu_mask(i, *(data->available_cores)) { 1009 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1009 freqs.cpu = i; 1010 freqs.cpu = i;
1010 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1011 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1011 } 1012 }
@@ -1016,6 +1017,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1016static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) 1017static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
1017{ 1018{
1018 cpumask_t oldmask; 1019 cpumask_t oldmask;
1020 cpumask_of_cpu_ptr(cpu_mask, pol->cpu);
1019 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1021 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1020 u32 checkfid; 1022 u32 checkfid;
1021 u32 checkvid; 1023 u32 checkvid;
@@ -1030,7 +1032,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
1030 1032
1031 /* only run on specific CPU from here on */ 1033 /* only run on specific CPU from here on */
1032 oldmask = current->cpus_allowed; 1034 oldmask = current->cpus_allowed;
1033 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1035 set_cpus_allowed_ptr(current, cpu_mask);
1034 1036
1035 if (smp_processor_id() != pol->cpu) { 1037 if (smp_processor_id() != pol->cpu) {
1036 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1038 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1105,6 +1107,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1105{ 1107{
1106 struct powernow_k8_data *data; 1108 struct powernow_k8_data *data;
1107 cpumask_t oldmask; 1109 cpumask_t oldmask;
1110 cpumask_of_cpu_ptr_declare(newmask);
1108 int rc; 1111 int rc;
1109 1112
1110 if (!cpu_online(pol->cpu)) 1113 if (!cpu_online(pol->cpu))
@@ -1156,7 +1159,8 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1156 1159
1157 /* only run on specific CPU from here on */ 1160 /* only run on specific CPU from here on */
1158 oldmask = current->cpus_allowed; 1161 oldmask = current->cpus_allowed;
1159 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1162 cpumask_of_cpu_ptr_next(newmask, pol->cpu);
1163 set_cpus_allowed_ptr(current, newmask);
1160 1164
1161 if (smp_processor_id() != pol->cpu) { 1165 if (smp_processor_id() != pol->cpu) {
1162 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1166 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1178,7 +1182,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1178 set_cpus_allowed_ptr(current, &oldmask); 1182 set_cpus_allowed_ptr(current, &oldmask);
1179 1183
1180 if (cpu_family == CPU_HW_PSTATE) 1184 if (cpu_family == CPU_HW_PSTATE)
1181 pol->cpus = cpumask_of_cpu(pol->cpu); 1185 pol->cpus = *newmask;
1182 else 1186 else
1183 pol->cpus = per_cpu(cpu_core_map, pol->cpu); 1187 pol->cpus = per_cpu(cpu_core_map, pol->cpu);
1184 data->available_cores = &(pol->cpus); 1188 data->available_cores = &(pol->cpus);
@@ -1244,6 +1248,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
1244{ 1248{
1245 struct powernow_k8_data *data; 1249 struct powernow_k8_data *data;
1246 cpumask_t oldmask = current->cpus_allowed; 1250 cpumask_t oldmask = current->cpus_allowed;
1251 cpumask_of_cpu_ptr(newmask, cpu);
1247 unsigned int khz = 0; 1252 unsigned int khz = 0;
1248 unsigned int first; 1253 unsigned int first;
1249 1254
@@ -1253,7 +1258,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
1253 if (!data) 1258 if (!data)
1254 return -EINVAL; 1259 return -EINVAL;
1255 1260
1256 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 1261 set_cpus_allowed_ptr(current, newmask);
1257 if (smp_processor_id() != cpu) { 1262 if (smp_processor_id() != cpu) {
1258 printk(KERN_ERR PFX 1263 printk(KERN_ERR PFX
1259 "limiting to CPU %d failed in powernowk8_get\n", cpu); 1264 "limiting to CPU %d failed in powernowk8_get\n", cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 908dd347c67e..ca2ac13b7af2 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -28,7 +28,8 @@
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@lists.linux.org.uk"
30 30
31#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
32 33
33#define INTEL_MSR_RANGE (0xffff) 34#define INTEL_MSR_RANGE (0xffff)
34 35
@@ -66,11 +67,12 @@ struct cpu_model
66 67
67 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ 68 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
68}; 69};
69static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); 70static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
71 const struct cpu_id *x);
70 72
71/* Operating points for current CPU */ 73/* Operating points for current CPU */
72static struct cpu_model *centrino_model[NR_CPUS]; 74static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
73static const struct cpu_id *centrino_cpu[NR_CPUS]; 75static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
74 76
75static struct cpufreq_driver centrino_driver; 77static struct cpufreq_driver centrino_driver;
76 78
@@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
255 return -ENOENT; 257 return -ENOENT;
256 } 258 }
257 259
258 centrino_model[policy->cpu] = model; 260 per_cpu(centrino_model, policy->cpu) = model;
259 261
260 dprintk("found \"%s\": max frequency: %dkHz\n", 262 dprintk("found \"%s\": max frequency: %dkHz\n",
261 model->model_name, model->max_freq); 263 model->model_name, model->max_freq);
@@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
264} 266}
265 267
266#else 268#else
267static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } 269static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
270{
271 return -ENODEV;
272}
268#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ 273#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
269 274
270static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) 275static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
276 const struct cpu_id *x)
271{ 277{
272 if ((c->x86 == x->x86) && 278 if ((c->x86 == x->x86) &&
273 (c->x86_model == x->x86_model) && 279 (c->x86_model == x->x86_model) &&
@@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
286 * for centrino, as some DSDTs are buggy. 292 * for centrino, as some DSDTs are buggy.
287 * Ideally, this can be done using the acpi_data structure. 293 * Ideally, this can be done using the acpi_data structure.
288 */ 294 */
289 if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || 295 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
290 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || 296 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
291 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { 297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
292 msr = (msr >> 8) & 0xff; 298 msr = (msr >> 8) & 0xff;
293 return msr * 100000; 299 return msr * 100000;
294 } 300 }
295 301
296 if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) 302 if ((!per_cpu(centrino_model, cpu)) ||
303 (!per_cpu(centrino_model, cpu)->op_points))
297 return 0; 304 return 0;
298 305
299 msr &= 0xffff; 306 msr &= 0xffff;
300 for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { 307 for (i = 0;
301 if (msr == centrino_model[cpu]->op_points[i].index) 308 per_cpu(centrino_model, cpu)->op_points[i].frequency
302 return centrino_model[cpu]->op_points[i].frequency; 309 != CPUFREQ_TABLE_END;
310 i++) {
311 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
312 return per_cpu(centrino_model, cpu)->
313 op_points[i].frequency;
303 } 314 }
304 if (failsafe) 315 if (failsafe)
305 return centrino_model[cpu]->op_points[i-1].frequency; 316 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
306 else 317 else
307 return 0; 318 return 0;
308} 319}
@@ -313,9 +324,10 @@ static unsigned int get_cur_freq(unsigned int cpu)
313 unsigned l, h; 324 unsigned l, h;
314 unsigned clock_freq; 325 unsigned clock_freq;
315 cpumask_t saved_mask; 326 cpumask_t saved_mask;
327 cpumask_of_cpu_ptr(new_mask, cpu);
316 328
317 saved_mask = current->cpus_allowed; 329 saved_mask = current->cpus_allowed;
318 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 330 set_cpus_allowed_ptr(current, new_mask);
319 if (smp_processor_id() != cpu) 331 if (smp_processor_id() != cpu)
320 return 0; 332 return 0;
321 333
@@ -347,7 +359,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
347 int i; 359 int i;
348 360
349 /* Only Intel makes Enhanced Speedstep-capable CPUs */ 361 /* Only Intel makes Enhanced Speedstep-capable CPUs */
350 if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) 362 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
363 !cpu_has(cpu, X86_FEATURE_EST))
351 return -ENODEV; 364 return -ENODEV;
352 365
353 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) 366 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
@@ -361,9 +374,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
361 break; 374 break;
362 375
363 if (i != N_IDS) 376 if (i != N_IDS)
364 centrino_cpu[policy->cpu] = &cpu_ids[i]; 377 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
365 378
366 if (!centrino_cpu[policy->cpu]) { 379 if (!per_cpu(centrino_cpu, policy->cpu)) {
367 dprintk("found unsupported CPU with " 380 dprintk("found unsupported CPU with "
368 "Enhanced SpeedStep: send /proc/cpuinfo to " 381 "Enhanced SpeedStep: send /proc/cpuinfo to "
369 MAINTAINER "\n"); 382 MAINTAINER "\n");
@@ -386,23 +399,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
386 /* check to see if it stuck */ 399 /* check to see if it stuck */
387 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 400 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
388 if (!(l & (1<<16))) { 401 if (!(l & (1<<16))) {
389 printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); 402 printk(KERN_INFO PFX
403 "couldn't enable Enhanced SpeedStep\n");
390 return -ENODEV; 404 return -ENODEV;
391 } 405 }
392 } 406 }
393 407
394 freq = get_cur_freq(policy->cpu); 408 freq = get_cur_freq(policy->cpu);
395 409 policy->cpuinfo.transition_latency = 10000;
396 policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ 410 /* 10uS transition latency */
397 policy->cur = freq; 411 policy->cur = freq;
398 412
399 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); 413 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
400 414
401 ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); 415 ret = cpufreq_frequency_table_cpuinfo(policy,
416 per_cpu(centrino_model, policy->cpu)->op_points);
402 if (ret) 417 if (ret)
403 return (ret); 418 return (ret);
404 419
405 cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); 420 cpufreq_frequency_table_get_attr(
421 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
406 422
407 return 0; 423 return 0;
408} 424}
@@ -411,12 +427,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
411{ 427{
412 unsigned int cpu = policy->cpu; 428 unsigned int cpu = policy->cpu;
413 429
414 if (!centrino_model[cpu]) 430 if (!per_cpu(centrino_model, cpu))
415 return -ENODEV; 431 return -ENODEV;
416 432
417 cpufreq_frequency_table_put_attr(cpu); 433 cpufreq_frequency_table_put_attr(cpu);
418 434
419 centrino_model[cpu] = NULL; 435 per_cpu(centrino_model, cpu) = NULL;
420 436
421 return 0; 437 return 0;
422} 438}
@@ -430,17 +446,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
430 */ 446 */
431static int centrino_verify (struct cpufreq_policy *policy) 447static int centrino_verify (struct cpufreq_policy *policy)
432{ 448{
433 return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); 449 return cpufreq_frequency_table_verify(policy,
450 per_cpu(centrino_model, policy->cpu)->op_points);
434} 451}
435 452
436/** 453/**
437 * centrino_setpolicy - set a new CPUFreq policy 454 * centrino_setpolicy - set a new CPUFreq policy
438 * @policy: new policy 455 * @policy: new policy
439 * @target_freq: the target frequency 456 * @target_freq: the target frequency
440 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 457 * @relation: how that frequency relates to achieved frequency
458 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
441 * 459 *
442 * Sets a new CPUFreq policy. 460 * Sets a new CPUFreq policy.
443 */ 461 */
462struct allmasks {
463 cpumask_t online_policy_cpus;
464 cpumask_t saved_mask;
465 cpumask_t set_mask;
466 cpumask_t covered_cpus;
467};
468
444static int centrino_target (struct cpufreq_policy *policy, 469static int centrino_target (struct cpufreq_policy *policy,
445 unsigned int target_freq, 470 unsigned int target_freq,
446 unsigned int relation) 471 unsigned int relation)
@@ -448,48 +473,55 @@ static int centrino_target (struct cpufreq_policy *policy,
448 unsigned int newstate = 0; 473 unsigned int newstate = 0;
449 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; 474 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
450 struct cpufreq_freqs freqs; 475 struct cpufreq_freqs freqs;
451 cpumask_t online_policy_cpus;
452 cpumask_t saved_mask;
453 cpumask_t set_mask;
454 cpumask_t covered_cpus;
455 int retval = 0; 476 int retval = 0;
456 unsigned int j, k, first_cpu, tmp; 477 unsigned int j, k, first_cpu, tmp;
457 478 CPUMASK_ALLOC(allmasks);
458 if (unlikely(centrino_model[cpu] == NULL)) 479 CPUMASK_PTR(online_policy_cpus, allmasks);
459 return -ENODEV; 480 CPUMASK_PTR(saved_mask, allmasks);
481 CPUMASK_PTR(set_mask, allmasks);
482 CPUMASK_PTR(covered_cpus, allmasks);
483
484 if (unlikely(allmasks == NULL))
485 return -ENOMEM;
486
487 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
488 retval = -ENODEV;
489 goto out;
490 }
460 491
461 if (unlikely(cpufreq_frequency_table_target(policy, 492 if (unlikely(cpufreq_frequency_table_target(policy,
462 centrino_model[cpu]->op_points, 493 per_cpu(centrino_model, cpu)->op_points,
463 target_freq, 494 target_freq,
464 relation, 495 relation,
465 &newstate))) { 496 &newstate))) {
466 return -EINVAL; 497 retval = -EINVAL;
498 goto out;
467 } 499 }
468 500
469#ifdef CONFIG_HOTPLUG_CPU 501#ifdef CONFIG_HOTPLUG_CPU
470 /* cpufreq holds the hotplug lock, so we are safe from here on */ 502 /* cpufreq holds the hotplug lock, so we are safe from here on */
471 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); 503 cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
472#else 504#else
473 online_policy_cpus = policy->cpus; 505 *online_policy_cpus = policy->cpus;
474#endif 506#endif
475 507
476 saved_mask = current->cpus_allowed; 508 *saved_mask = current->cpus_allowed;
477 first_cpu = 1; 509 first_cpu = 1;
478 cpus_clear(covered_cpus); 510 cpus_clear(*covered_cpus);
479 for_each_cpu_mask(j, online_policy_cpus) { 511 for_each_cpu_mask_nr(j, *online_policy_cpus) {
480 /* 512 /*
481 * Support for SMP systems. 513 * Support for SMP systems.
482 * Make sure we are running on CPU that wants to change freq 514 * Make sure we are running on CPU that wants to change freq
483 */ 515 */
484 cpus_clear(set_mask); 516 cpus_clear(*set_mask);
485 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 517 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
486 cpus_or(set_mask, set_mask, online_policy_cpus); 518 cpus_or(*set_mask, *set_mask, *online_policy_cpus);
487 else 519 else
488 cpu_set(j, set_mask); 520 cpu_set(j, *set_mask);
489 521
490 set_cpus_allowed_ptr(current, &set_mask); 522 set_cpus_allowed_ptr(current, set_mask);
491 preempt_disable(); 523 preempt_disable();
492 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { 524 if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) {
493 dprintk("couldn't limit to CPUs in this domain\n"); 525 dprintk("couldn't limit to CPUs in this domain\n");
494 retval = -EAGAIN; 526 retval = -EAGAIN;
495 if (first_cpu) { 527 if (first_cpu) {
@@ -500,7 +532,7 @@ static int centrino_target (struct cpufreq_policy *policy,
500 break; 532 break;
501 } 533 }
502 534
503 msr = centrino_model[cpu]->op_points[newstate].index; 535 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
504 536
505 if (first_cpu) { 537 if (first_cpu) {
506 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 538 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
@@ -517,7 +549,7 @@ static int centrino_target (struct cpufreq_policy *policy,
517 dprintk("target=%dkHz old=%d new=%d msr=%04x\n", 549 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
518 target_freq, freqs.old, freqs.new, msr); 550 target_freq, freqs.old, freqs.new, msr);
519 551
520 for_each_cpu_mask(k, online_policy_cpus) { 552 for_each_cpu_mask_nr(k, *online_policy_cpus) {
521 freqs.cpu = k; 553 freqs.cpu = k;
522 cpufreq_notify_transition(&freqs, 554 cpufreq_notify_transition(&freqs,
523 CPUFREQ_PRECHANGE); 555 CPUFREQ_PRECHANGE);
@@ -536,11 +568,11 @@ static int centrino_target (struct cpufreq_policy *policy,
536 break; 568 break;
537 } 569 }
538 570
539 cpu_set(j, covered_cpus); 571 cpu_set(j, *covered_cpus);
540 preempt_enable(); 572 preempt_enable();
541 } 573 }
542 574
543 for_each_cpu_mask(k, online_policy_cpus) { 575 for_each_cpu_mask_nr(k, *online_policy_cpus) {
544 freqs.cpu = k; 576 freqs.cpu = k;
545 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 577 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
546 } 578 }
@@ -553,10 +585,12 @@ static int centrino_target (struct cpufreq_policy *policy,
553 * Best effort undo.. 585 * Best effort undo..
554 */ 586 */
555 587
556 if (!cpus_empty(covered_cpus)) { 588 if (!cpus_empty(*covered_cpus)) {
557 for_each_cpu_mask(j, covered_cpus) { 589 cpumask_of_cpu_ptr_declare(new_mask);
558 set_cpus_allowed_ptr(current, 590
559 &cpumask_of_cpu(j)); 591 for_each_cpu_mask_nr(j, *covered_cpus) {
592 cpumask_of_cpu_ptr_next(new_mask, j);
593 set_cpus_allowed_ptr(current, new_mask);
560 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 594 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
561 } 595 }
562 } 596 }
@@ -564,19 +598,22 @@ static int centrino_target (struct cpufreq_policy *policy,
564 tmp = freqs.new; 598 tmp = freqs.new;
565 freqs.new = freqs.old; 599 freqs.new = freqs.old;
566 freqs.old = tmp; 600 freqs.old = tmp;
567 for_each_cpu_mask(j, online_policy_cpus) { 601 for_each_cpu_mask_nr(j, *online_policy_cpus) {
568 freqs.cpu = j; 602 freqs.cpu = j;
569 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 603 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 604 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
571 } 605 }
572 } 606 }
573 set_cpus_allowed_ptr(current, &saved_mask); 607 set_cpus_allowed_ptr(current, saved_mask);
574 return 0; 608 retval = 0;
609 goto out;
575 610
576migrate_end: 611migrate_end:
577 preempt_enable(); 612 preempt_enable();
578 set_cpus_allowed_ptr(current, &saved_mask); 613 set_cpus_allowed_ptr(current, saved_mask);
579 return 0; 614out:
615 CPUMASK_FREE(allmasks);
616 return retval;
580} 617}
581 618
582static struct freq_attr* centrino_attr[] = { 619static struct freq_attr* centrino_attr[] = {
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 1b50244b1fdf..2f3728dc24f6 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -244,7 +244,8 @@ static unsigned int _speedstep_get(const cpumask_t *cpus)
244 244
245static unsigned int speedstep_get(unsigned int cpu) 245static unsigned int speedstep_get(unsigned int cpu)
246{ 246{
247 return _speedstep_get(&cpumask_of_cpu(cpu)); 247 cpumask_of_cpu_ptr(newmask, cpu);
248 return _speedstep_get(newmask);
248} 249}
249 250
250/** 251/**
@@ -279,7 +280,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
279 280
280 cpus_allowed = current->cpus_allowed; 281 cpus_allowed = current->cpus_allowed;
281 282
282 for_each_cpu_mask(i, policy->cpus) { 283 for_each_cpu_mask_nr(i, policy->cpus) {
283 freqs.cpu = i; 284 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 285 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
285 } 286 }
@@ -292,7 +293,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
292 /* allow to be run on all CPUs */ 293 /* allow to be run on all CPUs */
293 set_cpus_allowed_ptr(current, &cpus_allowed); 294 set_cpus_allowed_ptr(current, &cpus_allowed);
294 295
295 for_each_cpu_mask(i, policy->cpus) { 296 for_each_cpu_mask_nr(i, policy->cpus) {
296 freqs.cpu = i; 297 freqs.cpu = i;
297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 298 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
298 } 299 }
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 70609efdf1da..b75f2569b8f8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
227 if (cpu_has_bts) 227 if (cpu_has_bts)
228 ds_init_intel(c); 228 ds_init_intel(c);
229 229
230 /*
231 * See if we have a good local APIC by checking for buggy Pentia,
232 * i.e. all B steppings and the C2 stepping of P54C when using their
233 * integrated APIC (see 11AP erratum in "Pentium Processor
234 * Specification Update").
235 */
236 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
237 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
238 set_cpu_cap(c, X86_FEATURE_11AP);
239
230#ifdef CONFIG_X86_NUMAQ 240#ifdef CONFIG_X86_NUMAQ
231 numaq_tsc_disable(); 241 numaq_tsc_disable();
232#endif 242#endif
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 2c8afafa18e8..650d40f7912b 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -489,7 +489,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
489 int sibling; 489 int sibling;
490 490
491 this_leaf = CPUID4_INFO_IDX(cpu, index); 491 this_leaf = CPUID4_INFO_IDX(cpu, index);
492 for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { 492 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
493 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 493 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
494 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 494 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
495 } 495 }
@@ -516,6 +516,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
516 unsigned long j; 516 unsigned long j;
517 int retval; 517 int retval;
518 cpumask_t oldmask; 518 cpumask_t oldmask;
519 cpumask_of_cpu_ptr(newmask, cpu);
519 520
520 if (num_cache_leaves == 0) 521 if (num_cache_leaves == 0)
521 return -ENOENT; 522 return -ENOENT;
@@ -526,7 +527,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
526 return -ENOMEM; 527 return -ENOMEM;
527 528
528 oldmask = current->cpus_allowed; 529 oldmask = current->cpus_allowed;
529 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 530 retval = set_cpus_allowed_ptr(current, newmask);
530 if (retval) 531 if (retval)
531 goto out; 532 goto out;
532 533
@@ -780,15 +781,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
780 } 781 }
781 kobject_put(per_cpu(cache_kobject, cpu)); 782 kobject_put(per_cpu(cache_kobject, cpu));
782 cpuid4_cache_sysfs_exit(cpu); 783 cpuid4_cache_sysfs_exit(cpu);
783 break; 784 return retval;
784 } 785 }
785 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 786 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
786 } 787 }
787 if (!retval) 788 cpu_set(cpu, cache_dev_map);
788 cpu_set(cpu, cache_dev_map);
789 789
790 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 790 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
791 return retval; 791 return 0;
792} 792}
793 793
794static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 794static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index c4a7ec31394c..65a339678ece 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
580 char __user *buf = ubuf; 580 char __user *buf = ubuf;
581 int i, err; 581 int i, err;
582 582
583 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); 583 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
584 if (!cpu_tsc) 584 if (!cpu_tsc)
585 return -ENOMEM; 585 return -ENOMEM;
586 586
@@ -762,10 +762,14 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
762 762
763/* Why are there no generic functions for this? */ 763/* Why are there no generic functions for this? */
764#define ACCESSOR(name, var, start) \ 764#define ACCESSOR(name, var, start) \
765 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ 765 static ssize_t show_ ## name(struct sys_device *s, \
766 struct sysdev_attribute *attr, \
767 char *buf) { \
766 return sprintf(buf, "%lx\n", (unsigned long)var); \ 768 return sprintf(buf, "%lx\n", (unsigned long)var); \
767 } \ 769 } \
768 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ 770 static ssize_t set_ ## name(struct sys_device *s, \
771 struct sysdev_attribute *attr, \
772 const char *buf, size_t siz) { \
769 char *end; \ 773 char *end; \
770 unsigned long new = simple_strtoul(buf, &end, 0); \ 774 unsigned long new = simple_strtoul(buf, &end, 0); \
771 if (end == buf) return -EINVAL; \ 775 if (end == buf) return -EINVAL; \
@@ -786,14 +790,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart())
786ACCESSOR(bank4ctl,bank[4],mce_restart()) 790ACCESSOR(bank4ctl,bank[4],mce_restart())
787ACCESSOR(bank5ctl,bank[5],mce_restart()) 791ACCESSOR(bank5ctl,bank[5],mce_restart())
788 792
789static ssize_t show_trigger(struct sys_device *s, char *buf) 793static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
794 char *buf)
790{ 795{
791 strcpy(buf, trigger); 796 strcpy(buf, trigger);
792 strcat(buf, "\n"); 797 strcat(buf, "\n");
793 return strlen(trigger) + 1; 798 return strlen(trigger) + 1;
794} 799}
795 800
796static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) 801static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
802 const char *buf,size_t siz)
797{ 803{
798 char *p; 804 char *p;
799 int len; 805 int len;
@@ -806,12 +812,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
806} 812}
807 813
808static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 814static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
809ACCESSOR(tolerant,tolerant,) 815static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
810ACCESSOR(check_interval,check_interval,mce_restart()) 816ACCESSOR(check_interval,check_interval,mce_restart())
811static struct sysdev_attribute *mce_attributes[] = { 817static struct sysdev_attribute *mce_attributes[] = {
812 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, 818 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
813 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, 819 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
814 &attr_tolerant, &attr_check_interval, &attr_trigger, 820 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
815 NULL 821 NULL
816}; 822};
817 823
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 7c9a813e1193..88736cadbaa6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 if (err) 527 if (err)
528 goto out_free; 528 goto out_free;
529 529
530 for_each_cpu_mask(i, b->cpus) { 530 for_each_cpu_mask_nr(i, b->cpus) {
531 if (i == cpu) 531 if (i == cpu)
532 continue; 532 continue;
533 533
@@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
617#endif 617#endif
618 618
619 /* remove all sibling symlinks before unregistering */ 619 /* remove all sibling symlinks before unregistering */
620 for_each_cpu_mask(i, b->cpus) { 620 for_each_cpu_mask_nr(i, b->cpus) {
621 if (i == cpu) 621 if (i == cpu)
622 continue; 622 continue;
623 623
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index eef001ad3bde..9b60fce09f75 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
102 /* The temperature transition interrupt handler setup */ 102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */ 103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ 104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write_around(APIC_LVTTHMR, h); 105 apic_write(APIC_LVTTHMR, h);
106 106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); 108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); 114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
115 115
116 l = apic_read(APIC_LVTTHMR); 116 l = apic_read(APIC_LVTTHMR);
117 apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); 118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119 119
120 /* enable thermal throttle processing */ 120 /* enable thermal throttle processing */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1f4cc48c14c6..d5ae2243f0b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0);
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \
38 char *buf) \ 39 char *buf) \
39{ \ 40{ \
40 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 6d4bdc02388a..de7439f82b92 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -250,7 +250,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr,
250 250
251 do_div(count, nmi_hz); 251 do_div(count, nmi_hz);
252 if(descr) 252 if(descr)
253 Dprintk("setting %s to -0x%08Lx\n", descr, count); 253 pr_debug("setting %s to -0x%08Lx\n", descr, count);
254 wrmsrl(perfctr_msr, 0 - count); 254 wrmsrl(perfctr_msr, 0 - count);
255} 255}
256 256
@@ -261,7 +261,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr,
261 261
262 do_div(count, nmi_hz); 262 do_div(count, nmi_hz);
263 if(descr) 263 if(descr)
264 Dprintk("setting %s to -0x%08Lx\n", descr, count); 264 pr_debug("setting %s to -0x%08Lx\n", descr, count);
265 wrmsr(perfctr_msr, (u32)(-count), 0); 265 wrmsr(perfctr_msr, (u32)(-count), 0);
266} 266}
267 267
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 0d0d9057e7c0..a26c480b9491 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,7 +160,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
160{ 160{
161 if (*pos == 0) /* just in case, cpu 0 is not the first */ 161 if (*pos == 0) /* just in case, cpu 0 is not the first */
162 *pos = first_cpu(cpu_online_map); 162 *pos = first_cpu(cpu_online_map);
163 if ((*pos) < NR_CPUS && cpu_online(*pos)) 163 if ((*pos) < nr_cpu_ids && cpu_online(*pos))
164 return &cpu_data(*pos); 164 return &cpu_data(*pos);
165 return NULL; 165 return NULL;
166} 166}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2de5fa2bbf77..14b11b3be31c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -141,8 +141,8 @@ static __cpuinit int cpuid_device_create(int cpu)
141{ 141{
142 struct device *dev; 142 struct device *dev;
143 143
144 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), 144 dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu),
145 "cpu%d", cpu); 145 NULL, "cpu%d", cpu);
146 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 146 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
147} 147}
148 148
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 28c29180b380..9af89078f7bb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
877 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) 877 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
878 count++; 878 count++;
879 879
880 printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); 880 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
881 count, start, end);
881 for (i = 0; i < count; i++) { 882 for (i = 0; i < count; i++) {
882 struct early_res *r = &early_res[i]; 883 struct early_res *r = &early_res[i];
883 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, 884 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
@@ -1298,11 +1299,6 @@ void __init e820_reserve_resources(void)
1298 } 1299 }
1299} 1300}
1300 1301
1301/*
1302 * Non-standard memory setup can be specified via this quirk:
1303 */
1304char * (*arch_memory_setup_quirk)(void);
1305
1306char *__init default_machine_specific_memory_setup(void) 1302char *__init default_machine_specific_memory_setup(void)
1307{ 1303{
1308 char *who = "BIOS-e820"; 1304 char *who = "BIOS-e820";
@@ -1343,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void)
1343 1339
1344char *__init __attribute__((weak)) machine_specific_memory_setup(void) 1340char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1345{ 1341{
1346 if (arch_memory_setup_quirk) { 1342 if (x86_quirks->arch_memory_setup) {
1347 char *who = arch_memory_setup_quirk(); 1343 char *who = x86_quirks->arch_memory_setup();
1348 1344
1349 if (who) 1345 if (who)
1350 return who; 1346 return who;
@@ -1367,24 +1363,3 @@ void __init setup_memory_map(void)
1367 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1363 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1368 e820_print_map(who); 1364 e820_print_map(who);
1369} 1365}
1370
1371#ifdef CONFIG_X86_64
1372int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
1373{
1374 int i;
1375
1376 if (slot < 0 || slot >= e820.nr_map)
1377 return -1;
1378 for (i = slot; i < e820.nr_map; i++) {
1379 if (e820.map[i].type != E820_RAM)
1380 continue;
1381 break;
1382 }
1383 if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
1384 return -1;
1385 *addr = e820.map[i].addr;
1386 *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
1387 max_pfn << PAGE_SHIFT) - *addr;
1388 return i + 1;
1389}
1390#endif
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a0e11c0cc872..4353cf5e6fac 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -16,10 +16,7 @@
16#include <asm/dma.h> 16#include <asm/dma.h>
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19 19#include <asm/iommu.h>
20#ifdef CONFIG_GART_IOMMU
21#include <asm/gart.h>
22#endif
23 20
24static void __init fix_hypertransport_config(int num, int slot, int func) 21static void __init fix_hypertransport_config(int num, int slot, int func)
25{ 22{
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 6bc07f0f1202..109792bc7cfa 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -54,6 +54,16 @@
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
56 56
57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
58#include <linux/elf-em.h>
59#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
60#define __AUDIT_ARCH_LE 0x40000000
61
62#ifndef CONFIG_AUDITSYSCALL
63#define sysenter_audit syscall_trace_entry
64#define sysexit_audit syscall_exit_work
65#endif
66
57/* 67/*
58 * We use macros for low-level operations which need to be overridden 68 * We use macros for low-level operations which need to be overridden
59 * for paravirtualization. The following will never clobber any registers: 69 * for paravirtualization. The following will never clobber any registers:
@@ -332,8 +342,9 @@ sysenter_past_esp:
332 GET_THREAD_INFO(%ebp) 342 GET_THREAD_INFO(%ebp)
333 343
334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 344 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
335 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 345 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
336 jnz syscall_trace_entry 346 jnz sysenter_audit
347sysenter_do_call:
337 cmpl $(nr_syscalls), %eax 348 cmpl $(nr_syscalls), %eax
338 jae syscall_badsys 349 jae syscall_badsys
339 call *sys_call_table(,%eax,4) 350 call *sys_call_table(,%eax,4)
@@ -343,7 +354,8 @@ sysenter_past_esp:
343 TRACE_IRQS_OFF 354 TRACE_IRQS_OFF
344 movl TI_flags(%ebp), %ecx 355 movl TI_flags(%ebp), %ecx
345 testw $_TIF_ALLWORK_MASK, %cx 356 testw $_TIF_ALLWORK_MASK, %cx
346 jne syscall_exit_work 357 jne sysexit_audit
358sysenter_exit:
347/* if something modifies registers it must also disable sysexit */ 359/* if something modifies registers it must also disable sysexit */
348 movl PT_EIP(%esp), %edx 360 movl PT_EIP(%esp), %edx
349 movl PT_OLDESP(%esp), %ecx 361 movl PT_OLDESP(%esp), %ecx
@@ -351,6 +363,45 @@ sysenter_past_esp:
351 TRACE_IRQS_ON 363 TRACE_IRQS_ON
3521: mov PT_FS(%esp), %fs 3641: mov PT_FS(%esp), %fs
353 ENABLE_INTERRUPTS_SYSEXIT 365 ENABLE_INTERRUPTS_SYSEXIT
366
367#ifdef CONFIG_AUDITSYSCALL
368sysenter_audit:
369 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
370 jnz syscall_trace_entry
371 addl $4,%esp
372 CFI_ADJUST_CFA_OFFSET -4
373 /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
374 /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
375 /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
376 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
377 movl %eax,%edx /* 2nd arg: syscall number */
378 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
379 call audit_syscall_entry
380 pushl %ebx
381 CFI_ADJUST_CFA_OFFSET 4
382 movl PT_EAX(%esp),%eax /* reload syscall number */
383 jmp sysenter_do_call
384
385sysexit_audit:
386 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
387 jne syscall_exit_work
388 TRACE_IRQS_ON
389 ENABLE_INTERRUPTS(CLBR_ANY)
390 movl %eax,%edx /* second arg, syscall return value */
391 cmpl $0,%eax /* is it < 0? */
392 setl %al /* 1 if so, 0 if not */
393 movzbl %al,%eax /* zero-extend that */
394 inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
395 call audit_syscall_exit
396 DISABLE_INTERRUPTS(CLBR_ANY)
397 TRACE_IRQS_OFF
398 movl TI_flags(%ebp), %ecx
399 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
400 jne syscall_exit_work
401 movl PT_EAX(%esp),%eax /* reload syscall return value */
402 jmp sysenter_exit
403#endif
404
354 CFI_ENDPROC 405 CFI_ENDPROC
355.pushsection .fixup,"ax" 406.pushsection .fixup,"ax"
3562: movl $0,PT_FS(%esp) 4072: movl $0,PT_FS(%esp)
@@ -370,7 +421,7 @@ ENTRY(system_call)
370 GET_THREAD_INFO(%ebp) 421 GET_THREAD_INFO(%ebp)
371 # system call tracing in operation / emulation 422 # system call tracing in operation / emulation
372 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 423 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
373 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 424 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
374 jnz syscall_trace_entry 425 jnz syscall_trace_entry
375 cmpl $(nr_syscalls), %eax 426 cmpl $(nr_syscalls), %eax
376 jae syscall_badsys 427 jae syscall_badsys
@@ -383,10 +434,6 @@ syscall_exit:
383 # setting need_resched or sigpending 434 # setting need_resched or sigpending
384 # between sampling and the iret 435 # between sampling and the iret
385 TRACE_IRQS_OFF 436 TRACE_IRQS_OFF
386 testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
387 jz no_singlestep
388 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
389no_singlestep:
390 movl TI_flags(%ebp), %ecx 437 movl TI_flags(%ebp), %ecx
391 testw $_TIF_ALLWORK_MASK, %cx # current->work 438 testw $_TIF_ALLWORK_MASK, %cx # current->work
392 jne syscall_exit_work 439 jne syscall_exit_work
@@ -514,12 +561,8 @@ END(work_pending)
514syscall_trace_entry: 561syscall_trace_entry:
515 movl $-ENOSYS,PT_EAX(%esp) 562 movl $-ENOSYS,PT_EAX(%esp)
516 movl %esp, %eax 563 movl %esp, %eax
517 xorl %edx,%edx 564 call syscall_trace_enter
518 call do_syscall_trace 565 /* What it returned is what we'll actually use. */
519 cmpl $0, %eax
520 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
521 # so must skip actual syscall
522 movl PT_ORIG_EAX(%esp), %eax
523 cmpl $(nr_syscalls), %eax 566 cmpl $(nr_syscalls), %eax
524 jnae syscall_call 567 jnae syscall_call
525 jmp syscall_exit 568 jmp syscall_exit
@@ -528,14 +571,13 @@ END(syscall_trace_entry)
528 # perform syscall exit tracing 571 # perform syscall exit tracing
529 ALIGN 572 ALIGN
530syscall_exit_work: 573syscall_exit_work:
531 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 574 testb $_TIF_WORK_SYSCALL_EXIT, %cl
532 jz work_pending 575 jz work_pending
533 TRACE_IRQS_ON 576 TRACE_IRQS_ON
534 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call 577 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
535 # schedule() instead 578 # schedule() instead
536 movl %esp, %eax 579 movl %esp, %eax
537 movl $1, %edx 580 call syscall_trace_leave
538 call do_syscall_trace
539 jmp resume_userspace 581 jmp resume_userspace
540END(syscall_exit_work) 582END(syscall_exit_work)
541 CFI_ENDPROC 583 CFI_ENDPROC
@@ -1024,6 +1066,7 @@ ENDPROC(kernel_thread_helper)
1024ENTRY(xen_sysenter_target) 1066ENTRY(xen_sysenter_target)
1025 RING0_INT_FRAME 1067 RING0_INT_FRAME
1026 addl $5*4, %esp /* remove xen-provided frame */ 1068 addl $5*4, %esp /* remove xen-provided frame */
1069 CFI_ADJUST_CFA_OFFSET -5*4
1027 jmp sysenter_past_esp 1070 jmp sysenter_past_esp
1028 CFI_ENDPROC 1071 CFI_ENDPROC
1029 1072
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ae63e584c340..89434d439605 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -53,6 +53,12 @@
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55 55
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h>
58#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
59#define __AUDIT_ARCH_64BIT 0x80000000
60#define __AUDIT_ARCH_LE 0x40000000
61
56 .code64 62 .code64
57 63
58#ifdef CONFIG_FTRACE 64#ifdef CONFIG_FTRACE
@@ -349,9 +355,9 @@ ENTRY(system_call_after_swapgs)
349 movq %rcx,RIP-ARGOFFSET(%rsp) 355 movq %rcx,RIP-ARGOFFSET(%rsp)
350 CFI_REL_OFFSET rip,RIP-ARGOFFSET 356 CFI_REL_OFFSET rip,RIP-ARGOFFSET
351 GET_THREAD_INFO(%rcx) 357 GET_THREAD_INFO(%rcx)
352 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 358 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
353 TI_flags(%rcx)
354 jnz tracesys 359 jnz tracesys
360system_call_fastpath:
355 cmpq $__NR_syscall_max,%rax 361 cmpq $__NR_syscall_max,%rax
356 ja badsys 362 ja badsys
357 movq %r10,%rcx 363 movq %r10,%rcx
@@ -403,16 +409,16 @@ sysret_careful:
403sysret_signal: 409sysret_signal:
404 TRACE_IRQS_ON 410 TRACE_IRQS_ON
405 ENABLE_INTERRUPTS(CLBR_NONE) 411 ENABLE_INTERRUPTS(CLBR_NONE)
406 testl $_TIF_DO_NOTIFY_MASK,%edx 412#ifdef CONFIG_AUDITSYSCALL
407 jz 1f 413 bt $TIF_SYSCALL_AUDIT,%edx
408 414 jc sysret_audit
409 /* Really a signal */ 415#endif
410 /* edx: work flags (arg3) */ 416 /* edx: work flags (arg3) */
411 leaq do_notify_resume(%rip),%rax 417 leaq do_notify_resume(%rip),%rax
412 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 418 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
413 xorl %esi,%esi # oldset -> arg2 419 xorl %esi,%esi # oldset -> arg2
414 call ptregscall_common 420 call ptregscall_common
4151: movl $_TIF_WORK_MASK,%edi 421 movl $_TIF_WORK_MASK,%edi
416 /* Use IRET because user could have changed frame. This 422 /* Use IRET because user could have changed frame. This
417 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 423 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
418 DISABLE_INTERRUPTS(CLBR_NONE) 424 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -423,14 +429,56 @@ badsys:
423 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 429 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
424 jmp ret_from_sys_call 430 jmp ret_from_sys_call
425 431
432#ifdef CONFIG_AUDITSYSCALL
433 /*
434 * Fast path for syscall audit without full syscall trace.
435 * We just call audit_syscall_entry() directly, and then
436 * jump back to the normal fast path.
437 */
438auditsys:
439 movq %r10,%r9 /* 6th arg: 4th syscall arg */
440 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
441 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
442 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
443 movq %rax,%rsi /* 2nd arg: syscall number */
444 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
445 call audit_syscall_entry
446 LOAD_ARGS 0 /* reload call-clobbered registers */
447 jmp system_call_fastpath
448
449 /*
450 * Return fast path for syscall audit. Call audit_syscall_exit()
451 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
452 * masked off.
453 */
454sysret_audit:
455 movq %rax,%rsi /* second arg, syscall return value */
456 cmpq $0,%rax /* is it < 0? */
457 setl %al /* 1 if so, 0 if not */
458 movzbl %al,%edi /* zero-extend that into %edi */
459 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
460 call audit_syscall_exit
461 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
462 jmp sysret_check
463#endif /* CONFIG_AUDITSYSCALL */
464
426 /* Do syscall tracing */ 465 /* Do syscall tracing */
427tracesys: 466tracesys:
467#ifdef CONFIG_AUDITSYSCALL
468 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
469 jz auditsys
470#endif
428 SAVE_REST 471 SAVE_REST
429 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 472 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
430 FIXUP_TOP_OF_STACK %rdi 473 FIXUP_TOP_OF_STACK %rdi
431 movq %rsp,%rdi 474 movq %rsp,%rdi
432 call syscall_trace_enter 475 call syscall_trace_enter
433 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ 476 /*
477 * Reload arg registers from stack in case ptrace changed them.
478 * We don't reload %rax because syscall_trace_enter() returned
479 * the value it wants us to use in the table lookup.
480 */
481 LOAD_ARGS ARGOFFSET, 1
434 RESTORE_REST 482 RESTORE_REST
435 cmpq $__NR_syscall_max,%rax 483 cmpq $__NR_syscall_max,%rax
436 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ 484 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
@@ -444,6 +492,7 @@ tracesys:
444 * Has correct top of stack, but partial stack frame. 492 * Has correct top of stack, but partial stack frame.
445 */ 493 */
446 .globl int_ret_from_sys_call 494 .globl int_ret_from_sys_call
495 .globl int_with_check
447int_ret_from_sys_call: 496int_ret_from_sys_call:
448 DISABLE_INTERRUPTS(CLBR_NONE) 497 DISABLE_INTERRUPTS(CLBR_NONE)
449 TRACE_IRQS_OFF 498 TRACE_IRQS_OFF
@@ -483,7 +532,7 @@ int_very_careful:
483 ENABLE_INTERRUPTS(CLBR_NONE) 532 ENABLE_INTERRUPTS(CLBR_NONE)
484 SAVE_REST 533 SAVE_REST
485 /* Check for syscall exit trace */ 534 /* Check for syscall exit trace */
486 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx 535 testl $_TIF_WORK_SYSCALL_EXIT,%edx
487 jz int_signal 536 jz int_signal
488 pushq %rdi 537 pushq %rdi
489 CFI_ADJUST_CFA_OFFSET 8 538 CFI_ADJUST_CFA_OFFSET 8
@@ -491,7 +540,7 @@ int_very_careful:
491 call syscall_trace_leave 540 call syscall_trace_leave
492 popq %rdi 541 popq %rdi
493 CFI_ADJUST_CFA_OFFSET -8 542 CFI_ADJUST_CFA_OFFSET -8
494 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi 543 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
495 jmp int_restore_rest 544 jmp int_restore_rest
496 545
497int_signal: 546int_signal:
@@ -1189,6 +1238,7 @@ END(device_not_available)
1189 /* runs on exception stack */ 1238 /* runs on exception stack */
1190KPROBE_ENTRY(debug) 1239KPROBE_ENTRY(debug)
1191 INTR_FRAME 1240 INTR_FRAME
1241 PARAVIRT_ADJUST_EXCEPTION_FRAME
1192 pushq $0 1242 pushq $0
1193 CFI_ADJUST_CFA_OFFSET 8 1243 CFI_ADJUST_CFA_OFFSET 8
1194 paranoidentry do_debug, DEBUG_STACK 1244 paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1248,7 @@ KPROBE_END(debug)
1198 /* runs on exception stack */ 1248 /* runs on exception stack */
1199KPROBE_ENTRY(nmi) 1249KPROBE_ENTRY(nmi)
1200 INTR_FRAME 1250 INTR_FRAME
1251 PARAVIRT_ADJUST_EXCEPTION_FRAME
1201 pushq $-1 1252 pushq $-1
1202 CFI_ADJUST_CFA_OFFSET 8 1253 CFI_ADJUST_CFA_OFFSET 8
1203 paranoidentry do_nmi, 0, 0 1254 paranoidentry do_nmi, 0, 0
@@ -1211,6 +1262,7 @@ KPROBE_END(nmi)
1211 1262
1212KPROBE_ENTRY(int3) 1263KPROBE_ENTRY(int3)
1213 INTR_FRAME 1264 INTR_FRAME
1265 PARAVIRT_ADJUST_EXCEPTION_FRAME
1214 pushq $0 1266 pushq $0
1215 CFI_ADJUST_CFA_OFFSET 8 1267 CFI_ADJUST_CFA_OFFSET 8
1216 paranoidentry do_int3, DEBUG_STACK 1268 paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1289,7 @@ END(coprocessor_segment_overrun)
1237 /* runs on exception stack */ 1289 /* runs on exception stack */
1238ENTRY(double_fault) 1290ENTRY(double_fault)
1239 XCPT_FRAME 1291 XCPT_FRAME
1292 PARAVIRT_ADJUST_EXCEPTION_FRAME
1240 paranoidentry do_double_fault 1293 paranoidentry do_double_fault
1241 jmp paranoid_exit1 1294 jmp paranoid_exit1
1242 CFI_ENDPROC 1295 CFI_ENDPROC
@@ -1253,6 +1306,7 @@ END(segment_not_present)
1253 /* runs on exception stack */ 1306 /* runs on exception stack */
1254ENTRY(stack_segment) 1307ENTRY(stack_segment)
1255 XCPT_FRAME 1308 XCPT_FRAME
1309 PARAVIRT_ADJUST_EXCEPTION_FRAME
1256 paranoidentry do_stack_segment 1310 paranoidentry do_stack_segment
1257 jmp paranoid_exit1 1311 jmp paranoid_exit1
1258 CFI_ENDPROC 1312 CFI_ENDPROC
@@ -1278,6 +1332,7 @@ END(spurious_interrupt_bug)
1278 /* runs on exception stack */ 1332 /* runs on exception stack */
1279ENTRY(machine_check) 1333ENTRY(machine_check)
1280 INTR_FRAME 1334 INTR_FRAME
1335 PARAVIRT_ADJUST_EXCEPTION_FRAME
1281 pushq $0 1336 pushq $0
1282 CFI_ADJUST_CFA_OFFSET 8 1337 CFI_ADJUST_CFA_OFFSET 8
1283 paranoidentry do_machine_check 1338 paranoidentry do_machine_check
@@ -1312,3 +1367,103 @@ KPROBE_ENTRY(ignore_sysret)
1312 sysret 1367 sysret
1313 CFI_ENDPROC 1368 CFI_ENDPROC
1314ENDPROC(ignore_sysret) 1369ENDPROC(ignore_sysret)
1370
1371#ifdef CONFIG_XEN
1372ENTRY(xen_hypervisor_callback)
1373 zeroentry xen_do_hypervisor_callback
1374END(xen_hypervisor_callback)
1375
1376/*
1377# A note on the "critical region" in our callback handler.
1378# We want to avoid stacking callback handlers due to events occurring
1379# during handling of the last event. To do this, we keep events disabled
1380# until we've done all processing. HOWEVER, we must enable events before
1381# popping the stack frame (can't be done atomically) and so it would still
1382# be possible to get enough handler activations to overflow the stack.
1383# Although unlikely, bugs of that kind are hard to track down, so we'd
1384# like to avoid the possibility.
1385# So, on entry to the handler we detect whether we interrupted an
1386# existing activation in its critical region -- if so, we pop the current
1387# activation and restart the handler using the previous one.
1388*/
1389ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1390 CFI_STARTPROC
1391/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
1392 see the correct pointer to the pt_regs */
1393 movq %rdi, %rsp # we don't return, adjust the stack frame
1394 CFI_ENDPROC
1395 CFI_DEFAULT_STACK
139611: incl %gs:pda_irqcount
1397 movq %rsp,%rbp
1398 CFI_DEF_CFA_REGISTER rbp
1399 cmovzq %gs:pda_irqstackptr,%rsp
1400 pushq %rbp # backlink for old unwinder
1401 call xen_evtchn_do_upcall
1402 popq %rsp
1403 CFI_DEF_CFA_REGISTER rsp
1404 decl %gs:pda_irqcount
1405 jmp error_exit
1406 CFI_ENDPROC
1407END(do_hypervisor_callback)
1408
1409/*
1410# Hypervisor uses this for application faults while it executes.
1411# We get here for two reasons:
1412# 1. Fault while reloading DS, ES, FS or GS
1413# 2. Fault while executing IRET
1414# Category 1 we do not need to fix up as Xen has already reloaded all segment
1415# registers that could be reloaded and zeroed the others.
1416# Category 2 we fix up by killing the current process. We cannot use the
1417# normal Linux return path in this case because if we use the IRET hypercall
1418# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1419# We distinguish between categories by comparing each saved segment register
1420# with its current contents: any discrepancy means we in category 1.
1421*/
1422ENTRY(xen_failsafe_callback)
1423 framesz = (RIP-0x30) /* workaround buggy gas */
1424 _frame framesz
1425 CFI_REL_OFFSET rcx, 0
1426 CFI_REL_OFFSET r11, 8
1427 movw %ds,%cx
1428 cmpw %cx,0x10(%rsp)
1429 CFI_REMEMBER_STATE
1430 jne 1f
1431 movw %es,%cx
1432 cmpw %cx,0x18(%rsp)
1433 jne 1f
1434 movw %fs,%cx
1435 cmpw %cx,0x20(%rsp)
1436 jne 1f
1437 movw %gs,%cx
1438 cmpw %cx,0x28(%rsp)
1439 jne 1f
1440 /* All segments match their saved values => Category 2 (Bad IRET). */
1441 movq (%rsp),%rcx
1442 CFI_RESTORE rcx
1443 movq 8(%rsp),%r11
1444 CFI_RESTORE r11
1445 addq $0x30,%rsp
1446 CFI_ADJUST_CFA_OFFSET -0x30
1447 pushq $0
1448 CFI_ADJUST_CFA_OFFSET 8
1449 pushq %r11
1450 CFI_ADJUST_CFA_OFFSET 8
1451 pushq %rcx
1452 CFI_ADJUST_CFA_OFFSET 8
1453 jmp general_protection
1454 CFI_RESTORE_STATE
14551: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1456 movq (%rsp),%rcx
1457 CFI_RESTORE rcx
1458 movq 8(%rsp),%r11
1459 CFI_RESTORE r11
1460 addq $0x30,%rsp
1461 CFI_ADJUST_CFA_OFFSET -0x30
1462 pushq $0
1463 CFI_ADJUST_CFA_OFFSET 8
1464 SAVE_ALL
1465 jmp error_exit
1466 CFI_ENDPROC
1467END(xen_failsafe_callback)
1468
1469#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 1a9c68845ee8..786548a62d38 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -168,7 +168,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
168 * May as well be the first. 168 * May as well be the first.
169 */ 169 */
170 cpu = first_cpu(cpumask); 170 cpu = first_cpu(cpumask);
171 if ((unsigned)cpu < NR_CPUS) 171 if ((unsigned)cpu < nr_cpu_ids)
172 return per_cpu(x86_cpu_to_apicid, cpu); 172 return per_cpu(x86_cpu_to_apicid, cpu);
173 else 173 else
174 return BAD_APICID; 174 return BAD_APICID;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 711f11c30b06..2cfcbded888a 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -24,6 +24,7 @@
24#include <asm/pgtable.h> 24#include <asm/pgtable.h>
25#include <asm/uv/uv_mmrs.h> 25#include <asm/uv/uv_mmrs.h>
26#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27#include <asm/uv/bios.h>
27 28
28DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 29DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
29EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 30EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
40short uv_possible_blades; 41short uv_possible_blades;
41EXPORT_SYMBOL_GPL(uv_possible_blades); 42EXPORT_SYMBOL_GPL(uv_possible_blades);
42 43
44unsigned long sn_rtc_cycles_per_second;
45EXPORT_SYMBOL(sn_rtc_cycles_per_second);
46
43/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 47/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
44 48
45static cpumask_t uv_target_cpus(void) 49static cpumask_t uv_target_cpus(void)
@@ -94,7 +98,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector)
94{ 98{
95 unsigned int cpu; 99 unsigned int cpu;
96 100
97 for (cpu = 0; cpu < NR_CPUS; ++cpu) 101 for_each_possible_cpu(cpu)
98 if (cpu_isset(cpu, mask)) 102 if (cpu_isset(cpu, mask))
99 uv_send_IPI_one(cpu, vector); 103 uv_send_IPI_one(cpu, vector);
100} 104}
@@ -128,7 +132,7 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
128 * May as well be the first. 132 * May as well be the first.
129 */ 133 */
130 cpu = first_cpu(cpumask); 134 cpu = first_cpu(cpumask);
131 if ((unsigned)cpu < NR_CPUS) 135 if ((unsigned)cpu < nr_cpu_ids)
132 return per_cpu(x86_cpu_to_apicid, cpu); 136 return per_cpu(x86_cpu_to_apicid, cpu);
133 else 137 else
134 return BAD_APICID; 138 return BAD_APICID;
@@ -272,6 +276,23 @@ static __init void map_mmioh_high(int max_pnode)
272 map_high("MMIOH", mmioh.s.base, shift, map_uc); 276 map_high("MMIOH", mmioh.s.base, shift, map_uc);
273} 277}
274 278
279static __init void uv_rtc_init(void)
280{
281 long status, ticks_per_sec, drift;
282
283 status =
284 x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
285 &drift);
286 if (status != 0 || ticks_per_sec < 100000) {
287 printk(KERN_WARNING
288 "unable to determine platform RTC clock frequency, "
289 "guessing.\n");
290 /* BIOS gives wrong value for clock freq. so guess */
291 sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
292 } else
293 sn_rtc_cycles_per_second = ticks_per_sec;
294}
295
275static __init void uv_system_init(void) 296static __init void uv_system_init(void)
276{ 297{
277 union uvh_si_addr_map_config_u m_n_config; 298 union uvh_si_addr_map_config_u m_n_config;
@@ -326,6 +347,8 @@ static __init void uv_system_init(void)
326 gnode_upper = (((unsigned long)node_id.s.node_id) & 347 gnode_upper = (((unsigned long)node_id.s.node_id) &
327 ~((1 << n_val) - 1)) << m_val; 348 ~((1 << n_val) - 1)) << m_val;
328 349
350 uv_rtc_init();
351
329 for_each_present_cpu(cpu) { 352 for_each_present_cpu(cpu) {
330 nid = cpu_to_node(cpu); 353 nid = cpu_to_node(cpu);
331 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 354 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c97819829146..1b318e903bf6 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; 39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif 40#endif
41 41
42void __init x86_64_init_pda(void)
43{
44 _cpu_pda = __cpu_pda;
45 cpu_pda(0) = &_boot_cpu_pda;
46 pda_init(0);
47}
48
42static void __init zap_identity_mappings(void) 49static void __init zap_identity_mappings(void)
43{ 50{
44 pgd_t *pgd = pgd_offset_k(0UL); 51 pgd_t *pgd = pgd_offset_k(0UL);
@@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
102 109
103 early_printk("Kernel alive\n"); 110 early_printk("Kernel alive\n");
104 111
105 _cpu_pda = __cpu_pda; 112 x86_64_init_pda();
106 cpu_pda(0) = &_boot_cpu_pda;
107 pda_init(0);
108 113
109 early_printk("Kernel really alive\n"); 114 early_printk("Kernel really alive\n");
110 115
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f67e93441caf..a7010c3a377a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -456,9 +456,6 @@ is386: movl $2,%ecx # set MP
4561: 4561:
457#endif /* CONFIG_SMP */ 457#endif /* CONFIG_SMP */
458 jmp *(initial_code) 458 jmp *(initial_code)
459.align 4
460ENTRY(initial_code)
461 .long i386_start_kernel
462 459
463/* 460/*
464 * We depend on ET to be correct. This checks for 287/387. 461 * We depend on ET to be correct. This checks for 287/387.
@@ -601,6 +598,11 @@ ignore_int:
601#endif 598#endif
602 iret 599 iret
603 600
601.section .cpuinit.data,"wa"
602.align 4
603ENTRY(initial_code)
604 .long i386_start_kernel
605
604.section .text 606.section .text
605/* 607/*
606 * Real beginning of normal "text" segment 608 * Real beginning of normal "text" segment
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b07ac7b217cb..db3280afe886 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -407,6 +407,7 @@ ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 407 /* This must match the first entry in level2_kernel_pgt */
408 .quad 0x0000000000000000 408 .quad 0x0000000000000000
409 409
410#include "../../x86/xen/xen-head.S"
410 411
411 .section .bss, "aw", @nobits 412 .section .bss, "aw", @nobits
412 .align L1_CACHE_BYTES 413 .align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 0ea6a19bfdfe..ad2b15a1334d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -468,7 +468,7 @@ void hpet_disable(void)
468#define RTC_NUM_INTS 1 468#define RTC_NUM_INTS 1
469 469
470static unsigned long hpet_rtc_flags; 470static unsigned long hpet_rtc_flags;
471static unsigned long hpet_prev_update_sec; 471static int hpet_prev_update_sec;
472static struct rtc_time hpet_alarm_time; 472static struct rtc_time hpet_alarm_time;
473static unsigned long hpet_pie_count; 473static unsigned long hpet_pie_count;
474static unsigned long hpet_t1_cmp; 474static unsigned long hpet_t1_cmp;
@@ -575,6 +575,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
575 575
576 hpet_rtc_flags |= bit_mask; 576 hpet_rtc_flags |= bit_mask;
577 577
578 if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE))
579 hpet_prev_update_sec = -1;
580
578 if (!oldbits) 581 if (!oldbits)
579 hpet_rtc_timer_init(); 582 hpet_rtc_timer_init();
580 583
@@ -652,7 +655,7 @@ static void hpet_rtc_timer_reinit(void)
652 if (hpet_rtc_flags & RTC_PIE) 655 if (hpet_rtc_flags & RTC_PIE)
653 hpet_pie_count += lost_ints; 656 hpet_pie_count += lost_ints;
654 if (printk_ratelimit()) 657 if (printk_ratelimit())
655 printk(KERN_WARNING "rtc: lost %d interrupts\n", 658 printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
656 lost_ints); 659 lost_ints);
657 } 660 }
658} 661}
@@ -670,7 +673,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
670 673
671 if (hpet_rtc_flags & RTC_UIE && 674 if (hpet_rtc_flags & RTC_UIE &&
672 curr_time.tm_sec != hpet_prev_update_sec) { 675 curr_time.tm_sec != hpet_prev_update_sec) {
673 rtc_int_flag = RTC_UF; 676 if (hpet_prev_update_sec >= 0)
677 rtc_int_flag = RTC_UF;
674 hpet_prev_update_sec = curr_time.tm_sec; 678 hpet_prev_update_sec = curr_time.tm_sec;
675 } 679 }
676 680
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 558abf4c796a..de9aa0e3a9c5 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -756,7 +756,7 @@ void send_IPI_self(int vector)
756 /* 756 /*
757 * Send the IPI. The write to APIC_ICR fires this off. 757 * Send the IPI. The write to APIC_ICR fires this off.
758 */ 758 */
759 apic_write_around(APIC_ICR, cfg); 759 apic_write(APIC_ICR, cfg);
760} 760}
761#endif /* !CONFIG_SMP */ 761#endif /* !CONFIG_SMP */
762 762
@@ -2030,7 +2030,7 @@ static void mask_lapic_irq(unsigned int irq)
2030 unsigned long v; 2030 unsigned long v;
2031 2031
2032 v = apic_read(APIC_LVT0); 2032 v = apic_read(APIC_LVT0);
2033 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 2033 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2034} 2034}
2035 2035
2036static void unmask_lapic_irq(unsigned int irq) 2036static void unmask_lapic_irq(unsigned int irq)
@@ -2038,7 +2038,7 @@ static void unmask_lapic_irq(unsigned int irq)
2038 unsigned long v; 2038 unsigned long v;
2039 2039
2040 v = apic_read(APIC_LVT0); 2040 v = apic_read(APIC_LVT0);
2041 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); 2041 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2042} 2042}
2043 2043
2044static struct irq_chip lapic_chip __read_mostly = { 2044static struct irq_chip lapic_chip __read_mostly = {
@@ -2168,7 +2168,7 @@ static inline void __init check_timer(void)
2168 * The AEOI mode will finish them in the 8259A 2168 * The AEOI mode will finish them in the 8259A
2169 * automatically. 2169 * automatically.
2170 */ 2170 */
2171 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2171 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2172 init_8259A(1); 2172 init_8259A(1);
2173 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); 2173 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2174 2174
@@ -2177,8 +2177,9 @@ static inline void __init check_timer(void)
2177 pin2 = ioapic_i8259.pin; 2177 pin2 = ioapic_i8259.pin;
2178 apic2 = ioapic_i8259.apic; 2178 apic2 = ioapic_i8259.apic;
2179 2179
2180 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 2180 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
2181 vector, apic1, pin1, apic2, pin2); 2181 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
2182 vector, apic1, pin1, apic2, pin2);
2182 2183
2183 /* 2184 /*
2184 * Some BIOS writers are clueless and report the ExtINTA 2185 * Some BIOS writers are clueless and report the ExtINTA
@@ -2216,12 +2217,13 @@ static inline void __init check_timer(void)
2216 } 2217 }
2217 clear_IO_APIC_pin(apic1, pin1); 2218 clear_IO_APIC_pin(apic1, pin1);
2218 if (!no_pin1) 2219 if (!no_pin1)
2219 printk(KERN_ERR "..MP-BIOS bug: " 2220 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
2220 "8254 timer not connected to IO-APIC\n"); 2221 "8254 timer not connected to IO-APIC\n");
2221 2222
2222 printk(KERN_INFO "...trying to set up timer (IRQ0) " 2223 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
2223 "through the 8259A ... "); 2224 "(IRQ0) through the 8259A ...\n");
2224 printk("\n..... (found pin %d) ...", pin2); 2225 apic_printk(APIC_QUIET, KERN_INFO
2226 "..... (found apic %d pin %d) ...\n", apic2, pin2);
2225 /* 2227 /*
2226 * legacy devices should be connected to IO APIC #0 2228 * legacy devices should be connected to IO APIC #0
2227 */ 2229 */
@@ -2230,7 +2232,7 @@ static inline void __init check_timer(void)
2230 unmask_IO_APIC_irq(0); 2232 unmask_IO_APIC_irq(0);
2231 enable_8259A_irq(0); 2233 enable_8259A_irq(0);
2232 if (timer_irq_works()) { 2234 if (timer_irq_works()) {
2233 printk("works.\n"); 2235 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2234 timer_through_8259 = 1; 2236 timer_through_8259 = 1;
2235 if (nmi_watchdog == NMI_IO_APIC) { 2237 if (nmi_watchdog == NMI_IO_APIC) {
2236 disable_8259A_irq(0); 2238 disable_8259A_irq(0);
@@ -2244,44 +2246,47 @@ static inline void __init check_timer(void)
2244 */ 2246 */
2245 disable_8259A_irq(0); 2247 disable_8259A_irq(0);
2246 clear_IO_APIC_pin(apic2, pin2); 2248 clear_IO_APIC_pin(apic2, pin2);
2247 printk(" failed.\n"); 2249 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2248 } 2250 }
2249 2251
2250 if (nmi_watchdog == NMI_IO_APIC) { 2252 if (nmi_watchdog == NMI_IO_APIC) {
2251 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 2253 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2254 "through the IO-APIC - disabling NMI Watchdog!\n");
2252 nmi_watchdog = NMI_NONE; 2255 nmi_watchdog = NMI_NONE;
2253 } 2256 }
2254 timer_ack = 0; 2257 timer_ack = 0;
2255 2258
2256 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2259 apic_printk(APIC_QUIET, KERN_INFO
2260 "...trying to set up timer as Virtual Wire IRQ...\n");
2257 2261
2258 lapic_register_intr(0, vector); 2262 lapic_register_intr(0, vector);
2259 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ 2263 apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2260 enable_8259A_irq(0); 2264 enable_8259A_irq(0);
2261 2265
2262 if (timer_irq_works()) { 2266 if (timer_irq_works()) {
2263 printk(" works.\n"); 2267 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2264 goto out; 2268 goto out;
2265 } 2269 }
2266 disable_8259A_irq(0); 2270 disable_8259A_irq(0);
2267 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); 2271 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2268 printk(" failed.\n"); 2272 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
2269 2273
2270 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 2274 apic_printk(APIC_QUIET, KERN_INFO
2275 "...trying to set up timer as ExtINT IRQ...\n");
2271 2276
2272 init_8259A(0); 2277 init_8259A(0);
2273 make_8259A_irq(0); 2278 make_8259A_irq(0);
2274 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 2279 apic_write(APIC_LVT0, APIC_DM_EXTINT);
2275 2280
2276 unlock_ExtINT_logic(); 2281 unlock_ExtINT_logic();
2277 2282
2278 if (timer_irq_works()) { 2283 if (timer_irq_works()) {
2279 printk(" works.\n"); 2284 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2280 goto out; 2285 goto out;
2281 } 2286 }
2282 printk(" failed :(.\n"); 2287 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2283 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 2288 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2284 "report. Then try booting with the 'noapic' option"); 2289 "report. Then try booting with the 'noapic' option.\n");
2285out: 2290out:
2286 local_irq_restore(flags); 2291 local_irq_restore(flags);
2287} 2292}
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 6510cde36b35..8269434d1707 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -45,6 +45,7 @@
45#include <asm/proto.h> 45#include <asm/proto.h>
46#include <asm/acpi.h> 46#include <asm/acpi.h>
47#include <asm/dma.h> 47#include <asm/dma.h>
48#include <asm/i8259.h>
48#include <asm/nmi.h> 49#include <asm/nmi.h>
49#include <asm/msidef.h> 50#include <asm/msidef.h>
50#include <asm/hypertransport.h> 51#include <asm/hypertransport.h>
@@ -731,7 +732,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
731 return 0; 732 return 0;
732 } 733 }
733 734
734 for_each_cpu_mask(cpu, mask) { 735 for_each_cpu_mask_nr(cpu, mask) {
735 cpumask_t domain, new_mask; 736 cpumask_t domain, new_mask;
736 int new_cpu; 737 int new_cpu;
737 int vector, offset; 738 int vector, offset;
@@ -752,7 +753,7 @@ next:
752 continue; 753 continue;
753 if (vector == IA32_SYSCALL_VECTOR) 754 if (vector == IA32_SYSCALL_VECTOR)
754 goto next; 755 goto next;
755 for_each_cpu_mask(new_cpu, new_mask) 756 for_each_cpu_mask_nr(new_cpu, new_mask)
756 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 757 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
757 goto next; 758 goto next;
758 /* Found one! */ 759 /* Found one! */
@@ -762,7 +763,7 @@ next:
762 cfg->move_in_progress = 1; 763 cfg->move_in_progress = 1;
763 cfg->old_domain = cfg->domain; 764 cfg->old_domain = cfg->domain;
764 } 765 }
765 for_each_cpu_mask(new_cpu, new_mask) 766 for_each_cpu_mask_nr(new_cpu, new_mask)
766 per_cpu(vector_irq, new_cpu)[vector] = irq; 767 per_cpu(vector_irq, new_cpu)[vector] = irq;
767 cfg->vector = vector; 768 cfg->vector = vector;
768 cfg->domain = domain; 769 cfg->domain = domain;
@@ -794,7 +795,7 @@ static void __clear_irq_vector(int irq)
794 795
795 vector = cfg->vector; 796 vector = cfg->vector;
796 cpus_and(mask, cfg->domain, cpu_online_map); 797 cpus_and(mask, cfg->domain, cpu_online_map);
797 for_each_cpu_mask(cpu, mask) 798 for_each_cpu_mask_nr(cpu, mask)
798 per_cpu(vector_irq, cpu)[vector] = -1; 799 per_cpu(vector_irq, cpu)[vector] = -1;
799 800
800 cfg->vector = 0; 801 cfg->vector = 0;
@@ -1372,12 +1373,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
1372static int ioapic_retrigger_irq(unsigned int irq) 1373static int ioapic_retrigger_irq(unsigned int irq)
1373{ 1374{
1374 struct irq_cfg *cfg = &irq_cfg[irq]; 1375 struct irq_cfg *cfg = &irq_cfg[irq];
1375 cpumask_t mask;
1376 unsigned long flags; 1376 unsigned long flags;
1377 1377
1378 spin_lock_irqsave(&vector_lock, flags); 1378 spin_lock_irqsave(&vector_lock, flags);
1379 mask = cpumask_of_cpu(first_cpu(cfg->domain)); 1379 send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
1380 send_IPI_mask(mask, cfg->vector);
1381 spin_unlock_irqrestore(&vector_lock, flags); 1380 spin_unlock_irqrestore(&vector_lock, flags);
1382 1381
1383 return 1; 1382 return 1;
@@ -1696,8 +1695,9 @@ static inline void __init check_timer(void)
1696 pin2 = ioapic_i8259.pin; 1695 pin2 = ioapic_i8259.pin;
1697 apic2 = ioapic_i8259.apic; 1696 apic2 = ioapic_i8259.apic;
1698 1697
1699 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 1698 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
1700 cfg->vector, apic1, pin1, apic2, pin2); 1699 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
1700 cfg->vector, apic1, pin1, apic2, pin2);
1701 1701
1702 /* 1702 /*
1703 * Some BIOS writers are clueless and report the ExtINTA 1703 * Some BIOS writers are clueless and report the ExtINTA
@@ -1735,14 +1735,13 @@ static inline void __init check_timer(void)
1735 } 1735 }
1736 clear_IO_APIC_pin(apic1, pin1); 1736 clear_IO_APIC_pin(apic1, pin1);
1737 if (!no_pin1) 1737 if (!no_pin1)
1738 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " 1738 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
1739 "8254 timer not connected to IO-APIC\n"); 1739 "8254 timer not connected to IO-APIC\n");
1740 1740
1741 apic_printk(APIC_VERBOSE,KERN_INFO 1741 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
1742 "...trying to set up timer (IRQ0) " 1742 "(IRQ0) through the 8259A ...\n");
1743 "through the 8259A ... "); 1743 apic_printk(APIC_QUIET, KERN_INFO
1744 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", 1744 "..... (found apic %d pin %d) ...\n", apic2, pin2);
1745 apic2, pin2);
1746 /* 1745 /*
1747 * legacy devices should be connected to IO APIC #0 1746 * legacy devices should be connected to IO APIC #0
1748 */ 1747 */
@@ -1751,7 +1750,7 @@ static inline void __init check_timer(void)
1751 unmask_IO_APIC_irq(0); 1750 unmask_IO_APIC_irq(0);
1752 enable_8259A_irq(0); 1751 enable_8259A_irq(0);
1753 if (timer_irq_works()) { 1752 if (timer_irq_works()) {
1754 apic_printk(APIC_VERBOSE," works.\n"); 1753 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
1755 timer_through_8259 = 1; 1754 timer_through_8259 = 1;
1756 if (nmi_watchdog == NMI_IO_APIC) { 1755 if (nmi_watchdog == NMI_IO_APIC) {
1757 disable_8259A_irq(0); 1756 disable_8259A_irq(0);
@@ -1765,29 +1764,32 @@ static inline void __init check_timer(void)
1765 */ 1764 */
1766 disable_8259A_irq(0); 1765 disable_8259A_irq(0);
1767 clear_IO_APIC_pin(apic2, pin2); 1766 clear_IO_APIC_pin(apic2, pin2);
1768 apic_printk(APIC_VERBOSE," failed.\n"); 1767 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
1769 } 1768 }
1770 1769
1771 if (nmi_watchdog == NMI_IO_APIC) { 1770 if (nmi_watchdog == NMI_IO_APIC) {
1772 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 1771 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
1772 "through the IO-APIC - disabling NMI Watchdog!\n");
1773 nmi_watchdog = NMI_NONE; 1773 nmi_watchdog = NMI_NONE;
1774 } 1774 }
1775 1775
1776 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 1776 apic_printk(APIC_QUIET, KERN_INFO
1777 "...trying to set up timer as Virtual Wire IRQ...\n");
1777 1778
1778 lapic_register_intr(0); 1779 lapic_register_intr(0);
1779 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 1780 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1780 enable_8259A_irq(0); 1781 enable_8259A_irq(0);
1781 1782
1782 if (timer_irq_works()) { 1783 if (timer_irq_works()) {
1783 apic_printk(APIC_VERBOSE," works.\n"); 1784 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1784 goto out; 1785 goto out;
1785 } 1786 }
1786 disable_8259A_irq(0); 1787 disable_8259A_irq(0);
1787 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 1788 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
1788 apic_printk(APIC_VERBOSE," failed.\n"); 1789 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
1789 1790
1790 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 1791 apic_printk(APIC_QUIET, KERN_INFO
1792 "...trying to set up timer as ExtINT IRQ...\n");
1791 1793
1792 init_8259A(0); 1794 init_8259A(0);
1793 make_8259A_irq(0); 1795 make_8259A_irq(0);
@@ -1796,11 +1798,12 @@ static inline void __init check_timer(void)
1796 unlock_ExtINT_logic(); 1798 unlock_ExtINT_logic();
1797 1799
1798 if (timer_irq_works()) { 1800 if (timer_irq_works()) {
1799 apic_printk(APIC_VERBOSE," works.\n"); 1801 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1800 goto out; 1802 goto out;
1801 } 1803 }
1802 apic_printk(APIC_VERBOSE," failed :(.\n"); 1804 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
1803 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); 1805 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
1806 "report. Then try booting with the 'noapic' option.\n");
1804out: 1807out:
1805 local_irq_restore(flags); 1808 local_irq_restore(flags);
1806} 1809}
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 5921e5f0a640..1c3a66a67f83 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -103,6 +103,9 @@ void __init io_delay_init(void)
103 103
104static int __init io_delay_param(char *s) 104static int __init io_delay_param(char *s)
105{ 105{
106 if (!s)
107 return -EINVAL;
108
106 if (!strcmp(s, "0x80")) 109 if (!strcmp(s, "0x80"))
107 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; 110 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
108 else if (!strcmp(s, "0xed")) 111 else if (!strcmp(s, "0xed"))
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 9d98cda39ad9..3f7537b669d3 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
70 /* 70 /*
71 * Send the IPI. The write to APIC_ICR fires this off. 71 * Send the IPI. The write to APIC_ICR fires this off.
72 */ 72 */
73 apic_write_around(APIC_ICR, cfg); 73 apic_write(APIC_ICR, cfg);
74} 74}
75 75
76void send_IPI_self(int vector) 76void send_IPI_self(int vector)
@@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
98 * prepare target chip field 98 * prepare target chip field
99 */ 99 */
100 cfg = __prepare_ICR2(mask); 100 cfg = __prepare_ICR2(mask);
101 apic_write_around(APIC_ICR2, cfg); 101 apic_write(APIC_ICR2, cfg);
102 102
103 /* 103 /*
104 * program the ICR 104 * program the ICR
@@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
108 /* 108 /*
109 * Send the IPI. The write to APIC_ICR fires this off. 109 * Send the IPI. The write to APIC_ICR fires this off.
110 */ 110 */
111 apic_write_around(APIC_ICR, cfg); 111 apic_write(APIC_ICR, cfg);
112} 112}
113 113
114/* 114/*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 47a6f6f12478..1cf8c1fcc088 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -83,11 +83,8 @@ union irq_ctx {
83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
85 85
86static char softirq_stack[NR_CPUS * THREAD_SIZE] 86static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
87 __attribute__((__section__(".bss.page_aligned"))); 87static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
88
89static char hardirq_stack[NR_CPUS * THREAD_SIZE]
90 __attribute__((__section__(".bss.page_aligned")));
91 88
92static void call_on_stack(void *func, void *stack) 89static void call_on_stack(void *func, void *stack)
93{ 90{
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 0373e88de95a..1f26fd9ec4f4 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -43,10 +43,11 @@
43 43
44#define BUILD_IRQ(nr) \ 44#define BUILD_IRQ(nr) \
45 asmlinkage void IRQ_NAME(nr); \ 45 asmlinkage void IRQ_NAME(nr); \
46 asm("\n.p2align\n" \ 46 asm("\n.text\n.p2align\n" \
47 "IRQ" #nr "_interrupt:\n\t" \ 47 "IRQ" #nr "_interrupt:\n\t" \
48 "push $~(" #nr ") ; " \ 48 "push $~(" #nr ") ; " \
49 "jmp common_interrupt"); 49 "jmp common_interrupt\n" \
50 ".previous");
50 51
51#define BI(x,y) \ 52#define BI(x,y) \
52 BUILD_IRQ(x##y) 53 BUILD_IRQ(x##y)
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index c03205991718..f2d43bc75514 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -12,9 +12,13 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/io.h> 13#include <linux/io.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h>
15 16
16#include <asm/setup.h> 17#include <asm/setup.h>
17 18
19struct dentry *arch_debugfs_dir;
20EXPORT_SYMBOL(arch_debugfs_dir);
21
18#ifdef CONFIG_DEBUG_BOOT_PARAMS 22#ifdef CONFIG_DEBUG_BOOT_PARAMS
19struct setup_data_node { 23struct setup_data_node {
20 u64 paddr; 24 u64 paddr;
@@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void)
209{ 213{
210 int error = 0; 214 int error = 0;
211 215
216 arch_debugfs_dir = debugfs_create_dir("x86", NULL);
217 if (!arch_debugfs_dir)
218 return -ENOMEM;
219
212#ifdef CONFIG_DEBUG_BOOT_PARAMS 220#ifdef CONFIG_DEBUG_BOOT_PARAMS
213 error = boot_params_kdebugfs_init(); 221 error = boot_params_kdebugfs_init();
214#endif 222#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b8c6743a13da..6c27679ec6aa 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
431 regs->ip = (unsigned long)p->ainsn.insn; 431 regs->ip = (unsigned long)p->ainsn.insn;
432} 432}
433 433
434/* Called with kretprobe_lock held */
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 434void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
436 struct pt_regs *regs) 435 struct pt_regs *regs)
437{ 436{
@@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
682 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; 681 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
683 682
684 INIT_HLIST_HEAD(&empty_rp); 683 INIT_HLIST_HEAD(&empty_rp);
685 spin_lock_irqsave(&kretprobe_lock, flags); 684 kretprobe_hash_lock(current, &head, &flags);
686 head = kretprobe_inst_table_head(current);
687 /* fixup registers */ 685 /* fixup registers */
688#ifdef CONFIG_X86_64 686#ifdef CONFIG_X86_64
689 regs->cs = __KERNEL_CS; 687 regs->cs = __KERNEL_CS;
@@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
732 730
733 kretprobe_assert(ri, orig_ret_address, trampoline_address); 731 kretprobe_assert(ri, orig_ret_address, trampoline_address);
734 732
735 spin_unlock_irqrestore(&kretprobe_lock, flags); 733 kretprobe_hash_unlock(current, &flags);
736 734
737 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 735 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
738 hlist_del(&ri->hlist); 736 hlist_del(&ri->hlist);
@@ -860,7 +858,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
860 858
861 resume_execution(cur, regs, kcb); 859 resume_execution(cur, regs, kcb);
862 regs->flags |= kcb->kprobe_saved_flags; 860 regs->flags |= kcb->kprobe_saved_flags;
863 trace_hardirqs_fixup_flags(regs->flags);
864 861
865 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { 862 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
866 kcb->kprobe_status = KPROBE_HIT_SSDONE; 863 kcb->kprobe_status = KPROBE_HIT_SSDONE;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 87edf1ceb1df..d02def06ca91 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -113,7 +113,7 @@ static void kvm_setup_secondary_clock(void)
113#endif 113#endif
114 114
115#ifdef CONFIG_SMP 115#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void) 116static void __init kvm_smp_prepare_boot_cpu(void)
117{ 117{
118 WARN_ON(kvm_register_clock("primary cpu clock")); 118 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu(); 119 native_smp_prepare_boot_cpu();
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a8449571858a..3fee2aa50f3f 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -62,12 +62,12 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
62 62
63 if (reload) { 63 if (reload) {
64#ifdef CONFIG_SMP 64#ifdef CONFIG_SMP
65 cpumask_t mask; 65 cpumask_of_cpu_ptr_declare(mask);
66 66
67 preempt_disable(); 67 preempt_disable();
68 load_LDT(pc); 68 load_LDT(pc);
69 mask = cpumask_of_cpu(smp_processor_id()); 69 cpumask_of_cpu_ptr_next(mask, smp_processor_id());
70 if (!cpus_equal(current->mm->cpu_vm_mask, mask)) 70 if (!cpus_equal(current->mm->cpu_vm_mask, *mask))
71 smp_call_function(flush_ldt, current->mm, 1); 71 smp_call_function(flush_ldt, current->mm, 1);
72 preempt_enable(); 72 preempt_enable();
73#else 73#else
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55af..9fe478d98406 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -22,6 +22,7 @@
22#include <asm/cpufeature.h> 22#include <asm/cpufeature.h>
23#include <asm/desc.h> 23#include <asm/desc.h>
24#include <asm/system.h> 24#include <asm/system.h>
25#include <asm/cacheflush.h>
25 26
26#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 27#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
27static u32 kexec_pgd[1024] PAGE_ALIGNED; 28static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -85,10 +86,12 @@ static void load_segments(void)
85 * reboot code buffer to allow us to avoid allocations 86 * reboot code buffer to allow us to avoid allocations
86 * later. 87 * later.
87 * 88 *
88 * Currently nothing. 89 * Make control page executable.
89 */ 90 */
90int machine_kexec_prepare(struct kimage *image) 91int machine_kexec_prepare(struct kimage *image)
91{ 92{
93 if (nx_enabled)
94 set_pages_x(image->control_code_page, 1);
92 return 0; 95 return 0;
93} 96}
94 97
@@ -98,27 +101,48 @@ int machine_kexec_prepare(struct kimage *image)
98 */ 101 */
99void machine_kexec_cleanup(struct kimage *image) 102void machine_kexec_cleanup(struct kimage *image)
100{ 103{
104 if (nx_enabled)
105 set_pages_nx(image->control_code_page, 1);
101} 106}
102 107
103/* 108/*
104 * Do not allocate memory (or fail in any way) in machine_kexec(). 109 * Do not allocate memory (or fail in any way) in machine_kexec().
105 * We are past the point of no return, committed to rebooting now. 110 * We are past the point of no return, committed to rebooting now.
106 */ 111 */
107NORET_TYPE void machine_kexec(struct kimage *image) 112void machine_kexec(struct kimage *image)
108{ 113{
109 unsigned long page_list[PAGES_NR]; 114 unsigned long page_list[PAGES_NR];
110 void *control_page; 115 void *control_page;
116 asmlinkage unsigned long
117 (*relocate_kernel_ptr)(unsigned long indirection_page,
118 unsigned long control_page,
119 unsigned long start_address,
120 unsigned int has_pae,
121 unsigned int preserve_context);
111 122
112 tracer_disable(); 123 tracer_disable();
113 124
114 /* Interrupts aren't acceptable while we reboot */ 125 /* Interrupts aren't acceptable while we reboot */
115 local_irq_disable(); 126 local_irq_disable();
116 127
128 if (image->preserve_context) {
129#ifdef CONFIG_X86_IO_APIC
130 /* We need to put APICs in legacy mode so that we can
131 * get timer interrupts in second kernel. kexec/kdump
132 * paths already have calls to disable_IO_APIC() in
133 * one form or other. kexec jump path also need
134 * one.
135 */
136 disable_IO_APIC();
137#endif
138 }
139
117 control_page = page_address(image->control_code_page); 140 control_page = page_address(image->control_code_page);
118 memcpy(control_page, relocate_kernel, PAGE_SIZE); 141 memcpy(control_page, relocate_kernel, PAGE_SIZE/2);
119 142
143 relocate_kernel_ptr = control_page;
120 page_list[PA_CONTROL_PAGE] = __pa(control_page); 144 page_list[PA_CONTROL_PAGE] = __pa(control_page);
121 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 145 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
122 page_list[PA_PGD] = __pa(kexec_pgd); 146 page_list[PA_PGD] = __pa(kexec_pgd);
123 page_list[VA_PGD] = (unsigned long)kexec_pgd; 147 page_list[VA_PGD] = (unsigned long)kexec_pgd;
124#ifdef CONFIG_X86_PAE 148#ifdef CONFIG_X86_PAE
@@ -131,6 +155,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
131 page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 155 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
132 page_list[PA_PTE_1] = __pa(kexec_pte1); 156 page_list[PA_PTE_1] = __pa(kexec_pte1);
133 page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 157 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
158 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
134 159
135 /* The segment registers are funny things, they have both a 160 /* The segment registers are funny things, they have both a
136 * visible and an invisible part. Whenever the visible part is 161 * visible and an invisible part. Whenever the visible part is
@@ -149,8 +174,10 @@ NORET_TYPE void machine_kexec(struct kimage *image)
149 set_idt(phys_to_virt(0),0); 174 set_idt(phys_to_virt(0),0);
150 175
151 /* now call it */ 176 /* now call it */
152 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 177 image->start = relocate_kernel_ptr((unsigned long)image->head,
153 image->start, cpu_has_pae); 178 (unsigned long)page_list,
179 image->start, cpu_has_pae,
180 image->preserve_context);
154} 181}
155 182
156void arch_crash_save_vmcoreinfo(void) 183void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a3..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
181 * Do not allocate memory (or fail in any way) in machine_kexec(). 181 * Do not allocate memory (or fail in any way) in machine_kexec().
182 * We are past the point of no return, committed to rebooting now. 182 * We are past the point of no return, committed to rebooting now.
183 */ 183 */
184NORET_TYPE void machine_kexec(struct kimage *image) 184void machine_kexec(struct kimage *image)
185{ 185{
186 unsigned long page_list[PAGES_NR]; 186 unsigned long page_list[PAGES_NR];
187 void *control_page; 187 void *control_page;
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 56b933119a04..6994c751590e 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -388,6 +388,7 @@ static int do_microcode_update (void)
388 void *new_mc = NULL; 388 void *new_mc = NULL;
389 int cpu; 389 int cpu;
390 cpumask_t old; 390 cpumask_t old;
391 cpumask_of_cpu_ptr_declare(newmask);
391 392
392 old = current->cpus_allowed; 393 old = current->cpus_allowed;
393 394
@@ -404,7 +405,8 @@ static int do_microcode_update (void)
404 405
405 if (!uci->valid) 406 if (!uci->valid)
406 continue; 407 continue;
407 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 408 cpumask_of_cpu_ptr_next(newmask, cpu);
409 set_cpus_allowed_ptr(current, newmask);
408 error = get_maching_microcode(new_mc, cpu); 410 error = get_maching_microcode(new_mc, cpu);
409 if (error < 0) 411 if (error < 0)
410 goto out; 412 goto out;
@@ -574,6 +576,7 @@ static int apply_microcode_check_cpu(int cpu)
574 struct cpuinfo_x86 *c = &cpu_data(cpu); 576 struct cpuinfo_x86 *c = &cpu_data(cpu);
575 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 577 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
576 cpumask_t old; 578 cpumask_t old;
579 cpumask_of_cpu_ptr(newmask, cpu);
577 unsigned int val[2]; 580 unsigned int val[2];
578 int err = 0; 581 int err = 0;
579 582
@@ -582,7 +585,7 @@ static int apply_microcode_check_cpu(int cpu)
582 return 0; 585 return 0;
583 586
584 old = current->cpus_allowed; 587 old = current->cpus_allowed;
585 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 588 set_cpus_allowed_ptr(current, newmask);
586 589
587 /* Check if the microcode we have in memory matches the CPU */ 590 /* Check if the microcode we have in memory matches the CPU */
588 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || 591 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
@@ -620,11 +623,12 @@ static int apply_microcode_check_cpu(int cpu)
620static void microcode_init_cpu(int cpu, int resume) 623static void microcode_init_cpu(int cpu, int resume)
621{ 624{
622 cpumask_t old; 625 cpumask_t old;
626 cpumask_of_cpu_ptr(newmask, cpu);
623 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 627 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
624 628
625 old = current->cpus_allowed; 629 old = current->cpus_allowed;
626 630
627 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 631 set_cpus_allowed_ptr(current, newmask);
628 mutex_lock(&microcode_mutex); 632 mutex_lock(&microcode_mutex);
629 collect_cpu_info(cpu); 633 collect_cpu_info(cpu);
630 if (uci->valid && system_state == SYSTEM_RUNNING && !resume) 634 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
@@ -644,7 +648,9 @@ static void microcode_fini_cpu(int cpu)
644 mutex_unlock(&microcode_mutex); 648 mutex_unlock(&microcode_mutex);
645} 649}
646 650
647static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) 651static ssize_t reload_store(struct sys_device *dev,
652 struct sysdev_attribute *attr,
653 const char *buf, size_t sz)
648{ 654{
649 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 655 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
650 char *end; 656 char *end;
@@ -656,11 +662,12 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
656 return -EINVAL; 662 return -EINVAL;
657 if (val == 1) { 663 if (val == 1) {
658 cpumask_t old; 664 cpumask_t old;
665 cpumask_of_cpu_ptr(newmask, cpu);
659 666
660 old = current->cpus_allowed; 667 old = current->cpus_allowed;
661 668
662 get_online_cpus(); 669 get_online_cpus();
663 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 670 set_cpus_allowed_ptr(current, newmask);
664 671
665 mutex_lock(&microcode_mutex); 672 mutex_lock(&microcode_mutex);
666 if (uci->valid) 673 if (uci->valid)
@@ -674,14 +681,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
674 return sz; 681 return sz;
675} 682}
676 683
677static ssize_t version_show(struct sys_device *dev, char *buf) 684static ssize_t version_show(struct sys_device *dev,
685 struct sysdev_attribute *attr, char *buf)
678{ 686{
679 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 687 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
680 688
681 return sprintf(buf, "0x%x\n", uci->rev); 689 return sprintf(buf, "0x%x\n", uci->rev);
682} 690}
683 691
684static ssize_t pf_show(struct sys_device *dev, char *buf) 692static ssize_t pf_show(struct sys_device *dev,
693 struct sysdev_attribute *attr, char *buf)
685{ 694{
686 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 695 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
687 696
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index a888e67f5874..6ba87830d4b1 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/mm.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/bug.h> 27#include <linux/bug.h>
27 28
@@ -150,7 +151,8 @@ int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
151 struct module *me) 152 struct module *me)
152{ 153{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 156 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155 157
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 158 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +162,8 @@ int module_finalize(const Elf_Ehdr *hdr,
160 alt = s; 162 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162 locks= s; 164 locks= s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s;
163 } 167 }
164 168
165 if (alt) { 169 if (alt) {
@@ -175,6 +179,11 @@ int module_finalize(const Elf_Ehdr *hdr,
175 tseg, tseg + text->sh_size); 179 tseg, tseg + text->sh_size);
176 } 180 }
177 181
182 if (para) {
183 void *pseg = (void *)para->sh_addr;
184 apply_paravirt(pseg, pseg + para->sh_size);
185 }
186
178 return module_bug_finalize(hdr, sechdrs, me); 187 return module_bug_finalize(hdr, sechdrs, me);
179} 188}
180 189
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 3b25e49380c6..6ae005ccaed8 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/e820.h> 28#include <asm/e820.h>
29#include <asm/trampoline.h> 29#include <asm/trampoline.h>
30#include <asm/setup.h>
30 31
31#include <mach_apic.h> 32#include <mach_apic.h>
32#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
@@ -48,76 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len)
48 return sum & 0xFF; 49 return sum & 0xFF;
49} 50}
50 51
51#ifdef CONFIG_X86_NUMAQ
52int found_numaq;
53/*
54 * Have to match translation table entries to main table entries by counter
55 * hence the mpc_record variable .... can't see a less disgusting way of
56 * doing this ....
57 */
58struct mpc_config_translation {
59 unsigned char mpc_type;
60 unsigned char trans_len;
61 unsigned char trans_type;
62 unsigned char trans_quad;
63 unsigned char trans_global;
64 unsigned char trans_local;
65 unsigned short trans_reserved;
66};
67
68
69static int mpc_record;
70static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
71 __cpuinitdata;
72
73static inline int generate_logical_apicid(int quad, int phys_apicid)
74{
75 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
76}
77
78
79static inline int mpc_apic_id(struct mpc_config_processor *m,
80 struct mpc_config_translation *translation_record)
81{
82 int quad = translation_record->trans_quad;
83 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
84
85 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
86 m->mpc_apicid,
87 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
88 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
89 m->mpc_apicver, quad, logical_apicid);
90 return logical_apicid;
91}
92
93int mp_bus_id_to_node[MAX_MP_BUSSES];
94
95int mp_bus_id_to_local[MAX_MP_BUSSES];
96
97static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
98 struct mpc_config_translation *translation)
99{
100 int quad = translation->trans_quad;
101 int local = translation->trans_local;
102
103 mp_bus_id_to_node[m->mpc_busid] = quad;
104 mp_bus_id_to_local[m->mpc_busid] = local;
105 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
106 m->mpc_busid, name, quad);
107}
108
109int quad_local_to_mp_bus_id [NR_CPUS/4][4];
110static void mpc_oem_pci_bus(struct mpc_config_bus *m,
111 struct mpc_config_translation *translation)
112{
113 int quad = translation->trans_quad;
114 int local = translation->trans_local;
115
116 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
117}
118
119#endif
120
121static void __cpuinit MP_processor_info(struct mpc_config_processor *m) 52static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
122{ 53{
123 int apicid; 54 int apicid;
@@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
127 disabled_cpus++; 58 disabled_cpus++;
128 return; 59 return;
129 } 60 }
130#ifdef CONFIG_X86_NUMAQ 61
131 if (found_numaq) 62 if (x86_quirks->mpc_apic_id)
132 apicid = mpc_apic_id(m, translation_table[mpc_record]); 63 apicid = x86_quirks->mpc_apic_id(m);
133 else 64 else
134 apicid = m->mpc_apicid; 65 apicid = m->mpc_apicid;
135#else 66
136 apicid = m->mpc_apicid;
137#endif
138 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 67 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
139 bootup_cpu = " (Bootup-CPU)"; 68 bootup_cpu = " (Bootup-CPU)";
140 boot_cpu_physical_apicid = m->mpc_apicid; 69 boot_cpu_physical_apicid = m->mpc_apicid;
@@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
151 memcpy(str, m->mpc_bustype, 6); 80 memcpy(str, m->mpc_bustype, 6);
152 str[6] = 0; 81 str[6] = 0;
153 82
154#ifdef CONFIG_X86_NUMAQ 83 if (x86_quirks->mpc_oem_bus_info)
155 if (found_numaq) 84 x86_quirks->mpc_oem_bus_info(m, str);
156 mpc_oem_bus_info(m, str, translation_table[mpc_record]); 85 else
157#else 86 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
158 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
159#endif
160 87
161#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
162 if (m->mpc_busid >= MAX_MP_BUSSES) { 89 if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
173 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
174#endif 101#endif
175 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
176#ifdef CONFIG_X86_NUMAQ 103 if (x86_quirks->mpc_oem_pci_bus)
177 if (found_numaq) 104 x86_quirks->mpc_oem_pci_bus(m);
178 mpc_oem_pci_bus(m, translation_table[mpc_record]); 105
179#endif
180 clear_bit(m->mpc_busid, mp_bus_not_pci); 106 clear_bit(m->mpc_busid, mp_bus_not_pci);
181#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
182 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@@ -316,83 +242,6 @@ static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
316 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); 242 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
317} 243}
318 244
319#ifdef CONFIG_X86_NUMAQ
320static void __init MP_translation_info(struct mpc_config_translation *m)
321{
322 printk(KERN_INFO
323 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
324 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
325 m->trans_local);
326
327 if (mpc_record >= MAX_MPC_ENTRY)
328 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
329 else
330 translation_table[mpc_record] = m; /* stash this for later */
331 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
332 node_set_online(m->trans_quad);
333}
334
335/*
336 * Read/parse the MPC oem tables
337 */
338
339static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
340 unsigned short oemsize)
341{
342 int count = sizeof(*oemtable); /* the header size */
343 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
344
345 mpc_record = 0;
346 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
347 oemtable);
348 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
349 printk(KERN_WARNING
350 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
351 oemtable->oem_signature[0], oemtable->oem_signature[1],
352 oemtable->oem_signature[2], oemtable->oem_signature[3]);
353 return;
354 }
355 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
356 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
357 return;
358 }
359 while (count < oemtable->oem_length) {
360 switch (*oemptr) {
361 case MP_TRANSLATION:
362 {
363 struct mpc_config_translation *m =
364 (struct mpc_config_translation *)oemptr;
365 MP_translation_info(m);
366 oemptr += sizeof(*m);
367 count += sizeof(*m);
368 ++mpc_record;
369 break;
370 }
371 default:
372 {
373 printk(KERN_WARNING
374 "Unrecognised OEM table entry type! - %d\n",
375 (int)*oemptr);
376 return;
377 }
378 }
379 }
380}
381
382void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
383 char *productid)
384{
385 if (strncmp(oem, "IBM NUMA", 8))
386 printk("Warning! Not a NUMA-Q system!\n");
387 else
388 found_numaq = 1;
389
390 if (mpc->mpc_oemptr)
391 smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
392 mpc->mpc_oemsize);
393}
394#endif /* CONFIG_X86_NUMAQ */
395
396/* 245/*
397 * Read/parse the MPC 246 * Read/parse the MPC
398 */ 247 */
@@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
457 } else 306 } else
458 mps_oem_check(mpc, oem, str); 307 mps_oem_check(mpc, oem, str);
459#endif 308#endif
460
461 /* save the local APIC address, it might be non-default */ 309 /* save the local APIC address, it might be non-default */
462 if (!acpi_lapic) 310 if (!acpi_lapic)
463 mp_lapic_addr = mpc->mpc_lapic; 311 mp_lapic_addr = mpc->mpc_lapic;
@@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
465 if (early) 313 if (early)
466 return 1; 314 return 1;
467 315
316 if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
317 struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
318 x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
319 }
320
468 /* 321 /*
469 * Now process the configuration blocks. 322 * Now process the configuration blocks.
470 */ 323 */
471#ifdef CONFIG_X86_NUMAQ 324 if (x86_quirks->mpc_record)
472 mpc_record = 0; 325 *x86_quirks->mpc_record = 0;
473#endif 326
474 while (count < mpc->mpc_length) { 327 while (count < mpc->mpc_length) {
475 switch (*mpt) { 328 switch (*mpt) {
476 case MP_PROCESSOR: 329 case MP_PROCESSOR:
@@ -536,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
536 count = mpc->mpc_length; 389 count = mpc->mpc_length;
537 break; 390 break;
538 } 391 }
539#ifdef CONFIG_X86_NUMAQ 392 if (x86_quirks->mpc_record)
540 ++mpc_record; 393 (*x86_quirks->mpc_record)++;
541#endif
542 } 394 }
543 395
544#ifdef CONFIG_X86_GENERICARCH 396#ifdef CONFIG_X86_GENERICARCH
@@ -726,20 +578,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
726static struct intel_mp_floating *mpf_found; 578static struct intel_mp_floating *mpf_found;
727 579
728/* 580/*
729 * Machine specific quirk for finding the SMP config before other setup
730 * activities destroy the table:
731 */
732int (*mach_get_smp_config_quirk)(unsigned int early);
733
734/*
735 * Scan the memory blocks for an SMP configuration block. 581 * Scan the memory blocks for an SMP configuration block.
736 */ 582 */
737static void __init __get_smp_config(unsigned int early) 583static void __init __get_smp_config(unsigned int early)
738{ 584{
739 struct intel_mp_floating *mpf = mpf_found; 585 struct intel_mp_floating *mpf = mpf_found;
740 586
741 if (mach_get_smp_config_quirk) { 587 if (x86_quirks->mach_get_smp_config) {
742 if (mach_get_smp_config_quirk(early)) 588 if (x86_quirks->mach_get_smp_config(early))
743 return; 589 return;
744 } 590 }
745 if (acpi_lapic && early) 591 if (acpi_lapic && early)
@@ -899,14 +745,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
899 return 0; 745 return 0;
900} 746}
901 747
902int (*mach_find_smp_config_quirk)(unsigned int reserve);
903
904static void __init __find_smp_config(unsigned int reserve) 748static void __init __find_smp_config(unsigned int reserve)
905{ 749{
906 unsigned int address; 750 unsigned int address;
907 751
908 if (mach_find_smp_config_quirk) { 752 if (x86_quirks->mach_find_smp_config) {
909 if (mach_find_smp_config_quirk(reserve)) 753 if (x86_quirks->mach_find_smp_config(reserve))
910 return; 754 return;
911 } 755 }
912 /* 756 /*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index a153b3905f60..9fd809552447 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -149,8 +149,8 @@ static int __cpuinit msr_device_create(int cpu)
149{ 149{
150 struct device *dev; 150 struct device *dev;
151 151
152 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), 152 dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu),
153 "msr%d", cpu); 153 NULL, "msr%d", cpu);
154 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 154 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
155} 155}
156 156
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ec024b3baad0..ac6d51222e7d 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -263,7 +263,7 @@ late_initcall(init_lapic_nmi_sysfs);
263 263
264static void __acpi_nmi_enable(void *__unused) 264static void __acpi_nmi_enable(void *__unused)
265{ 265{
266 apic_write_around(APIC_LVT0, APIC_DM_NMI); 266 apic_write(APIC_LVT0, APIC_DM_NMI);
267} 267}
268 268
269/* 269/*
@@ -277,7 +277,7 @@ void acpi_nmi_enable(void)
277 277
278static void __acpi_nmi_disable(void *__unused) 278static void __acpi_nmi_disable(void *__unused)
279{ 279{
280 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); 280 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
281} 281}
282 282
283/* 283/*
@@ -448,6 +448,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
448 448
449#ifdef CONFIG_SYSCTL 449#ifdef CONFIG_SYSCTL
450 450
451static int __init setup_unknown_nmi_panic(char *str)
452{
453 unknown_nmi_panic = 1;
454 return 1;
455}
456__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
457
451static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) 458static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
452{ 459{
453 unsigned char reason = get_nmi_reason(); 460 unsigned char reason = get_nmi_reason();
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index a23e8233b9ac..b8c45610b20a 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -33,6 +33,7 @@
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h> 34#include <asm/mpspec.h>
35#include <asm/e820.h> 35#include <asm/e820.h>
36#include <asm/setup.h>
36 37
37#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) 38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
38 39
@@ -71,6 +72,188 @@ static void __init smp_dump_qct(void)
71 } 72 }
72} 73}
73 74
75
76void __init numaq_tsc_disable(void)
77{
78 if (!found_numaq)
79 return;
80
81 if (num_online_nodes() > 1) {
82 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
83 setup_clear_cpu_cap(X86_FEATURE_TSC);
84 }
85}
86
87static int __init numaq_pre_time_init(void)
88{
89 numaq_tsc_disable();
90 return 0;
91}
92
93int found_numaq;
94/*
95 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this ....
98 */
99struct mpc_config_translation {
100 unsigned char mpc_type;
101 unsigned char trans_len;
102 unsigned char trans_type;
103 unsigned char trans_quad;
104 unsigned char trans_global;
105 unsigned char trans_local;
106 unsigned short trans_reserved;
107};
108
109/* x86_quirks member */
110static int mpc_record;
111static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
112 __cpuinitdata;
113
114static inline int generate_logical_apicid(int quad, int phys_apicid)
115{
116 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
117}
118
119/* x86_quirks member */
120static int mpc_apic_id(struct mpc_config_processor *m)
121{
122 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
124
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->mpc_apicid,
127 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
128 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
129 m->mpc_apicver, quad, logical_apicid);
130 return logical_apicid;
131}
132
133int mp_bus_id_to_node[MAX_MP_BUSSES];
134
135int mp_bus_id_to_local[MAX_MP_BUSSES];
136
137/* x86_quirks member */
138static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
139{
140 int quad = translation_table[mpc_record]->trans_quad;
141 int local = translation_table[mpc_record]->trans_local;
142
143 mp_bus_id_to_node[m->mpc_busid] = quad;
144 mp_bus_id_to_local[m->mpc_busid] = local;
145 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
146 m->mpc_busid, name, quad);
147}
148
149int quad_local_to_mp_bus_id [NR_CPUS/4][4];
150
151/* x86_quirks member */
152static void mpc_oem_pci_bus(struct mpc_config_bus *m)
153{
154 int quad = translation_table[mpc_record]->trans_quad;
155 int local = translation_table[mpc_record]->trans_local;
156
157 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
158}
159
160static void __init MP_translation_info(struct mpc_config_translation *m)
161{
162 printk(KERN_INFO
163 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
164 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
165 m->trans_local);
166
167 if (mpc_record >= MAX_MPC_ENTRY)
168 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
169 else
170 translation_table[mpc_record] = m; /* stash this for later */
171 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
172 node_set_online(m->trans_quad);
173}
174
175static int __init mpf_checksum(unsigned char *mp, int len)
176{
177 int sum = 0;
178
179 while (len--)
180 sum += *mp++;
181
182 return sum & 0xFF;
183}
184
185/*
186 * Read/parse the MPC oem tables
187 */
188
189static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
190 unsigned short oemsize)
191{
192 int count = sizeof(*oemtable); /* the header size */
193 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
194
195 mpc_record = 0;
196 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
197 oemtable);
198 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
199 printk(KERN_WARNING
200 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
201 oemtable->oem_signature[0], oemtable->oem_signature[1],
202 oemtable->oem_signature[2], oemtable->oem_signature[3]);
203 return;
204 }
205 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
206 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
207 return;
208 }
209 while (count < oemtable->oem_length) {
210 switch (*oemptr) {
211 case MP_TRANSLATION:
212 {
213 struct mpc_config_translation *m =
214 (struct mpc_config_translation *)oemptr;
215 MP_translation_info(m);
216 oemptr += sizeof(*m);
217 count += sizeof(*m);
218 ++mpc_record;
219 break;
220 }
221 default:
222 {
223 printk(KERN_WARNING
224 "Unrecognised OEM table entry type! - %d\n",
225 (int)*oemptr);
226 return;
227 }
228 }
229 }
230}
231
232static struct x86_quirks numaq_x86_quirks __initdata = {
233 .arch_pre_time_init = numaq_pre_time_init,
234 .arch_time_init = NULL,
235 .arch_pre_intr_init = NULL,
236 .arch_memory_setup = NULL,
237 .arch_intr_init = NULL,
238 .arch_trap_init = NULL,
239 .mach_get_smp_config = NULL,
240 .mach_find_smp_config = NULL,
241 .mpc_record = &mpc_record,
242 .mpc_apic_id = mpc_apic_id,
243 .mpc_oem_bus_info = mpc_oem_bus_info,
244 .mpc_oem_pci_bus = mpc_oem_pci_bus,
245 .smp_read_mpc_oem = smp_read_mpc_oem,
246};
247
248void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
249 char *productid)
250{
251 if (strncmp(oem, "IBM NUMA", 8))
252 printk("Warning! Not a NUMA-Q system!\n");
253 else
254 found_numaq = 1;
255}
256
74static __init void early_check_numaq(void) 257static __init void early_check_numaq(void)
75{ 258{
76 /* 259 /*
@@ -82,6 +265,9 @@ static __init void early_check_numaq(void)
82 */ 265 */
83 if (smp_found_config) 266 if (smp_found_config)
84 early_get_smp_config(); 267 early_get_smp_config();
268
269 if (found_numaq)
270 x86_quirks = &numaq_x86_quirks;
85} 271}
86 272
87int __init get_memcfg_numaq(void) 273int __init get_memcfg_numaq(void)
@@ -92,14 +278,3 @@ int __init get_memcfg_numaq(void)
92 smp_dump_qct(); 278 smp_dump_qct();
93 return 1; 279 return 1;
94} 280}
95
96void __init numaq_tsc_disable(void)
97{
98 if (!found_numaq)
99 return;
100
101 if (num_online_nodes() > 1) {
102 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
103 setup_clear_cpu_cap(X86_FEATURE_TSC);
104 }
105}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e0f571d58c19..94da4d52d798 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h> 31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h>
32#include <asm/time.h> 33#include <asm/time.h>
33#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
34#include <asm/irq.h> 35#include <asm/irq.h>
@@ -123,6 +124,7 @@ static void *get_call_destination(u8 type)
123 .pv_irq_ops = pv_irq_ops, 124 .pv_irq_ops = pv_irq_ops,
124 .pv_apic_ops = pv_apic_ops, 125 .pv_apic_ops = pv_apic_ops,
125 .pv_mmu_ops = pv_mmu_ops, 126 .pv_mmu_ops = pv_mmu_ops,
127 .pv_lock_ops = pv_lock_ops,
126 }; 128 };
127 return *((void **)&tmpl + type); 129 return *((void **)&tmpl + type);
128} 130}
@@ -266,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
266 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
267} 269}
268 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
269struct pv_info pv_info = { 282struct pv_info pv_info = {
270 .name = "bare hardware", 283 .name = "bare hardware",
271 .paravirt_enabled = 0, 284 .paravirt_enabled = 0,
@@ -361,7 +374,6 @@ struct pv_cpu_ops pv_cpu_ops = {
361struct pv_apic_ops pv_apic_ops = { 374struct pv_apic_ops pv_apic_ops = {
362#ifdef CONFIG_X86_LOCAL_APIC 375#ifdef CONFIG_X86_LOCAL_APIC
363 .apic_write = native_apic_write, 376 .apic_write = native_apic_write,
364 .apic_write_atomic = native_apic_write_atomic,
365 .apic_read = native_apic_read, 377 .apic_read = native_apic_read,
366 .setup_boot_clock = setup_boot_APIC_clock, 378 .setup_boot_clock = setup_boot_APIC_clock,
367 .setup_secondary_clock = setup_secondary_APIC_clock, 379 .setup_secondary_clock = setup_secondary_APIC_clock,
@@ -373,6 +385,9 @@ struct pv_mmu_ops pv_mmu_ops = {
373#ifndef CONFIG_X86_64 385#ifndef CONFIG_X86_64
374 .pagetable_setup_start = native_pagetable_setup_start, 386 .pagetable_setup_start = native_pagetable_setup_start,
375 .pagetable_setup_done = native_pagetable_setup_done, 387 .pagetable_setup_done = native_pagetable_setup_done,
388#else
389 .pagetable_setup_start = paravirt_nop,
390 .pagetable_setup_done = paravirt_nop,
376#endif 391#endif
377 392
378 .read_cr2 = native_read_cr2, 393 .read_cr2 = native_read_cr2,
@@ -428,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
428#endif /* PAGETABLE_LEVELS >= 3 */ 443#endif /* PAGETABLE_LEVELS >= 3 */
429 444
430 .pte_val = native_pte_val, 445 .pte_val = native_pte_val,
431 .pte_flags = native_pte_val, 446 .pte_flags = native_pte_flags,
432 .pgd_val = native_pgd_val, 447 .pgd_val = native_pgd_val,
433 448
434 .make_pte = native_make_pte, 449 .make_pte = native_make_pte,
@@ -446,6 +461,18 @@ struct pv_mmu_ops pv_mmu_ops = {
446 .set_fixmap = native_set_fixmap, 461 .set_fixmap = native_set_fixmap,
447}; 462};
448 463
464struct pv_lock_ops pv_lock_ops = {
465#ifdef CONFIG_SMP
466 .spin_is_locked = __ticket_spin_is_locked,
467 .spin_is_contended = __ticket_spin_is_contended,
468
469 .spin_lock = __ticket_spin_lock,
470 .spin_trylock = __ticket_spin_trylock,
471 .spin_unlock = __ticket_spin_unlock,
472#endif
473};
474EXPORT_SYMBOL_GPL(pv_lock_ops);
475
449EXPORT_SYMBOL_GPL(pv_time_ops); 476EXPORT_SYMBOL_GPL(pv_time_ops);
450EXPORT_SYMBOL (pv_cpu_ops); 477EXPORT_SYMBOL (pv_cpu_ops);
451EXPORT_SYMBOL (pv_mmu_ops); 478EXPORT_SYMBOL (pv_mmu_ops);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6959b5c45df4..b67a4b1d4eae 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -29,6 +29,7 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h>
32#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
33#include <linux/bitops.h> 34#include <linux/bitops.h>
34#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
@@ -36,7 +37,8 @@
36#include <linux/delay.h> 37#include <linux/delay.h>
37#include <linux/scatterlist.h> 38#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h> 39#include <linux/iommu-helper.h>
39#include <asm/gart.h> 40
41#include <asm/iommu.h>
40#include <asm/calgary.h> 42#include <asm/calgary.h>
41#include <asm/tce.h> 43#include <asm/tce.h>
42#include <asm/pci-direct.h> 44#include <asm/pci-direct.h>
@@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl);
167static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); 169static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
168static void calioc2_tce_cache_blast(struct iommu_table *tbl); 170static void calioc2_tce_cache_blast(struct iommu_table *tbl);
169static void calioc2_dump_error_regs(struct iommu_table *tbl); 171static void calioc2_dump_error_regs(struct iommu_table *tbl);
172static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
173static void get_tce_space_from_tar(void);
170 174
171static struct cal_chipset_ops calgary_chip_ops = { 175static struct cal_chipset_ops calgary_chip_ops = {
172 .handle_quirks = calgary_handle_quirks, 176 .handle_quirks = calgary_handle_quirks,
@@ -410,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev,
410 } 414 }
411} 415}
412 416
413static int calgary_nontranslate_map_sg(struct device* dev,
414 struct scatterlist *sg, int nelems, int direction)
415{
416 struct scatterlist *s;
417 int i;
418
419 for_each_sg(sg, s, nelems, i) {
420 struct page *p = sg_page(s);
421
422 BUG_ON(!p);
423 s->dma_address = virt_to_bus(sg_virt(s));
424 s->dma_length = s->length;
425 }
426 return nelems;
427}
428
429static int calgary_map_sg(struct device *dev, struct scatterlist *sg, 417static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
430 int nelems, int direction) 418 int nelems, int direction)
431{ 419{
@@ -436,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
436 unsigned long entry; 424 unsigned long entry;
437 int i; 425 int i;
438 426
439 if (!translation_enabled(tbl))
440 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
441
442 for_each_sg(sg, s, nelems, i) { 427 for_each_sg(sg, s, nelems, i) {
443 BUG_ON(!sg_page(s)); 428 BUG_ON(!sg_page(s));
444 429
@@ -474,7 +459,6 @@ error:
474static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, 459static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
475 size_t size, int direction) 460 size_t size, int direction)
476{ 461{
477 dma_addr_t dma_handle = bad_dma_address;
478 void *vaddr = phys_to_virt(paddr); 462 void *vaddr = phys_to_virt(paddr);
479 unsigned long uaddr; 463 unsigned long uaddr;
480 unsigned int npages; 464 unsigned int npages;
@@ -483,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
483 uaddr = (unsigned long)vaddr; 467 uaddr = (unsigned long)vaddr;
484 npages = num_dma_pages(uaddr, size); 468 npages = num_dma_pages(uaddr, size);
485 469
486 if (translation_enabled(tbl)) 470 return iommu_alloc(dev, tbl, vaddr, npages, direction);
487 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
488 else
489 dma_handle = virt_to_bus(vaddr);
490
491 return dma_handle;
492} 471}
493 472
494static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, 473static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -497,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
497 struct iommu_table *tbl = find_iommu_table(dev); 476 struct iommu_table *tbl = find_iommu_table(dev);
498 unsigned int npages; 477 unsigned int npages;
499 478
500 if (!translation_enabled(tbl))
501 return;
502
503 npages = num_dma_pages(dma_handle, size); 479 npages = num_dma_pages(dma_handle, size);
504 iommu_free(tbl, dma_handle, npages); 480 iommu_free(tbl, dma_handle, npages);
505} 481}
@@ -522,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
522 goto error; 498 goto error;
523 memset(ret, 0, size); 499 memset(ret, 0, size);
524 500
525 if (translation_enabled(tbl)) { 501 /* set up tces to cover the allocated range */
526 /* set up tces to cover the allocated range */ 502 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
527 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 503 if (mapping == bad_dma_address)
528 if (mapping == bad_dma_address) 504 goto free;
529 goto free; 505 *dma_handle = mapping;
530
531 *dma_handle = mapping;
532 } else /* non translated slot */
533 *dma_handle = virt_to_bus(ret);
534
535 return ret; 506 return ret;
536
537free: 507free:
538 free_pages((unsigned long)ret, get_order(size)); 508 free_pages((unsigned long)ret, get_order(size));
539 ret = NULL; 509 ret = NULL;
@@ -541,7 +511,7 @@ error:
541 return ret; 511 return ret;
542} 512}
543 513
544static const struct dma_mapping_ops calgary_dma_ops = { 514static struct dma_mapping_ops calgary_dma_ops = {
545 .alloc_coherent = calgary_alloc_coherent, 515 .alloc_coherent = calgary_alloc_coherent,
546 .map_single = calgary_map_single, 516 .map_single = calgary_map_single,
547 .unmap_single = calgary_unmap_single, 517 .unmap_single = calgary_unmap_single,
@@ -830,7 +800,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
830 800
831 tbl = pci_iommu(dev->bus); 801 tbl = pci_iommu(dev->bus);
832 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; 802 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
833 tce_free(tbl, 0, tbl->it_size); 803
804 if (is_kdump_kernel())
805 calgary_init_bitmap_from_tce_table(tbl);
806 else
807 tce_free(tbl, 0, tbl->it_size);
834 808
835 if (is_calgary(dev->device)) 809 if (is_calgary(dev->device))
836 tbl->chip_ops = &calgary_chip_ops; 810 tbl->chip_ops = &calgary_chip_ops;
@@ -1209,6 +1183,10 @@ static int __init calgary_init(void)
1209 if (ret) 1183 if (ret)
1210 return ret; 1184 return ret;
1211 1185
1186 /* Purely for kdump kernel case */
1187 if (is_kdump_kernel())
1188 get_tce_space_from_tar();
1189
1212 do { 1190 do {
1213 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); 1191 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1214 if (!dev) 1192 if (!dev)
@@ -1230,6 +1208,16 @@ static int __init calgary_init(void)
1230 goto error; 1208 goto error;
1231 } while (1); 1209 } while (1);
1232 1210
1211 dev = NULL;
1212 for_each_pci_dev(dev) {
1213 struct iommu_table *tbl;
1214
1215 tbl = find_iommu_table(&dev->dev);
1216
1217 if (translation_enabled(tbl))
1218 dev->dev.archdata.dma_ops = &calgary_dma_ops;
1219 }
1220
1233 return ret; 1221 return ret;
1234 1222
1235error: 1223error:
@@ -1251,6 +1239,7 @@ error:
1251 calgary_disable_translation(dev); 1239 calgary_disable_translation(dev);
1252 calgary_free_bus(dev); 1240 calgary_free_bus(dev);
1253 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ 1241 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1242 dev->dev.archdata.dma_ops = NULL;
1254 } while (1); 1243 } while (1);
1255 1244
1256 return ret; 1245 return ret;
@@ -1339,6 +1328,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1339 return (val != 0xffffffff); 1328 return (val != 0xffffffff);
1340} 1329}
1341 1330
1331/*
1332 * calgary_init_bitmap_from_tce_table():
1333 * Funtion for kdump case. In the second/kdump kernel initialize
1334 * the bitmap based on the tce table entries obtained from first kernel
1335 */
1336static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1337{
1338 u64 *tp;
1339 unsigned int index;
1340 tp = ((u64 *)tbl->it_base);
1341 for (index = 0 ; index < tbl->it_size; index++) {
1342 if (*tp != 0x0)
1343 set_bit(index, tbl->it_map);
1344 tp++;
1345 }
1346}
1347
1348/*
1349 * get_tce_space_from_tar():
1350 * Function for kdump case. Get the tce tables from first kernel
1351 * by reading the contents of the base adress register of calgary iommu
1352 */
1353static void get_tce_space_from_tar()
1354{
1355 int bus;
1356 void __iomem *target;
1357 unsigned long tce_space;
1358
1359 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1360 struct calgary_bus_info *info = &bus_info[bus];
1361 unsigned short pci_device;
1362 u32 val;
1363
1364 val = read_pci_config(bus, 0, 0, 0);
1365 pci_device = (val & 0xFFFF0000) >> 16;
1366
1367 if (!is_cal_pci_dev(pci_device))
1368 continue;
1369 if (info->translation_disabled)
1370 continue;
1371
1372 if (calgary_bus_has_devices(bus, pci_device) ||
1373 translate_empty_slots) {
1374 target = calgary_reg(bus_info[bus].bbar,
1375 tar_offset(bus));
1376 tce_space = be64_to_cpu(readq(target));
1377 tce_space = tce_space & TAR_SW_BITS;
1378
1379 tce_space = tce_space & (~specified_table_size);
1380 info->tce_space = (u64 *)__va(tce_space);
1381 }
1382 }
1383 return;
1384}
1385
1342void __init detect_calgary(void) 1386void __init detect_calgary(void)
1343{ 1387{
1344 int bus; 1388 int bus;
@@ -1394,7 +1438,8 @@ void __init detect_calgary(void)
1394 return; 1438 return;
1395 } 1439 }
1396 1440
1397 specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); 1441 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
1442 saved_max_pfn : max_pfn) * PAGE_SIZE);
1398 1443
1399 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 1444 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1400 struct calgary_bus_info *info = &bus_info[bus]; 1445 struct calgary_bus_info *info = &bus_info[bus];
@@ -1412,10 +1457,16 @@ void __init detect_calgary(void)
1412 1457
1413 if (calgary_bus_has_devices(bus, pci_device) || 1458 if (calgary_bus_has_devices(bus, pci_device) ||
1414 translate_empty_slots) { 1459 translate_empty_slots) {
1415 tbl = alloc_tce_table(); 1460 /*
1416 if (!tbl) 1461 * If it is kdump kernel, find and use tce tables
1417 goto cleanup; 1462 * from first kernel, else allocate tce tables here
1418 info->tce_space = tbl; 1463 */
1464 if (!is_kdump_kernel()) {
1465 tbl = alloc_tce_table();
1466 if (!tbl)
1467 goto cleanup;
1468 info->tce_space = tbl;
1469 }
1419 calgary_found = 1; 1470 calgary_found = 1;
1420 } 1471 }
1421 } 1472 }
@@ -1430,6 +1481,10 @@ void __init detect_calgary(void)
1430 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1481 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1431 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1482 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1432 debugging ? "enabled" : "disabled"); 1483 debugging ? "enabled" : "disabled");
1484
1485 /* swiotlb for devices that aren't behind the Calgary. */
1486 if (max_pfn > MAX_DMA32_PFN)
1487 swiotlb = 1;
1433 } 1488 }
1434 return; 1489 return;
1435 1490
@@ -1446,7 +1501,7 @@ int __init calgary_iommu_init(void)
1446{ 1501{
1447 int ret; 1502 int ret;
1448 1503
1449 if (no_iommu || swiotlb) 1504 if (no_iommu || (swiotlb && !calgary_detected))
1450 return -ENODEV; 1505 return -ENODEV;
1451 1506
1452 if (!calgary_detected) 1507 if (!calgary_detected)
@@ -1459,15 +1514,14 @@ int __init calgary_iommu_init(void)
1459 if (ret) { 1514 if (ret) {
1460 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " 1515 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1461 "falling back to no_iommu\n", ret); 1516 "falling back to no_iommu\n", ret);
1462 if (max_pfn > MAX_DMA32_PFN)
1463 printk(KERN_ERR "WARNING more than 4GB of memory, "
1464 "32bit PCI may malfunction.\n");
1465 return ret; 1517 return ret;
1466 } 1518 }
1467 1519
1468 force_iommu = 1; 1520 force_iommu = 1;
1469 bad_dma_address = 0x0; 1521 bad_dma_address = 0x0;
1470 dma_ops = &calgary_dma_ops; 1522 /* dma_ops is set to swiotlb or nommu */
1523 if (!dma_ops)
1524 dma_ops = &nommu_dma_ops;
1471 1525
1472 return 0; 1526 return 0;
1473} 1527}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index b7dd70fda031..8dbffb846de9 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -5,14 +5,13 @@
5 5
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/calgary.h> 9#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 10#include <asm/amd_iommu.h>
11 11
12int forbid_dac __read_mostly; 12static int forbid_dac __read_mostly;
13EXPORT_SYMBOL(forbid_dac);
14 13
15const struct dma_mapping_ops *dma_ops; 14struct dma_mapping_ops *dma_ops;
16EXPORT_SYMBOL(dma_ops); 15EXPORT_SYMBOL(dma_ops);
17 16
18static int iommu_sac_force __read_mostly; 17static int iommu_sac_force __read_mostly;
@@ -114,21 +113,15 @@ void __init pci_iommu_alloc(void)
114 * The order of these functions is important for 113 * The order of these functions is important for
115 * fall-back/fail-over reasons 114 * fall-back/fail-over reasons
116 */ 115 */
117#ifdef CONFIG_GART_IOMMU
118 gart_iommu_hole_init(); 116 gart_iommu_hole_init();
119#endif
120 117
121#ifdef CONFIG_CALGARY_IOMMU
122 detect_calgary(); 118 detect_calgary();
123#endif
124 119
125 detect_intel_iommu(); 120 detect_intel_iommu();
126 121
127 amd_iommu_detect(); 122 amd_iommu_detect();
128 123
129#ifdef CONFIG_SWIOTLB
130 pci_swiotlb_init(); 124 pci_swiotlb_init();
131#endif
132} 125}
133#endif 126#endif
134 127
@@ -184,9 +177,7 @@ static __init int iommu_setup(char *p)
184 swiotlb = 1; 177 swiotlb = 1;
185#endif 178#endif
186 179
187#ifdef CONFIG_GART_IOMMU
188 gart_parse_options(p); 180 gart_parse_options(p);
189#endif
190 181
191#ifdef CONFIG_CALGARY_IOMMU 182#ifdef CONFIG_CALGARY_IOMMU
192 if (!strncmp(p, "calgary", 7)) 183 if (!strncmp(p, "calgary", 7))
@@ -203,16 +194,17 @@ early_param("iommu", iommu_setup);
203 194
204int dma_supported(struct device *dev, u64 mask) 195int dma_supported(struct device *dev, u64 mask)
205{ 196{
197 struct dma_mapping_ops *ops = get_dma_ops(dev);
198
206#ifdef CONFIG_PCI 199#ifdef CONFIG_PCI
207 if (mask > 0xffffffff && forbid_dac > 0) { 200 if (mask > 0xffffffff && forbid_dac > 0) {
208 printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", 201 dev_info(dev, "PCI: Disallowing DAC for device\n");
209 dev->bus_id);
210 return 0; 202 return 0;
211 } 203 }
212#endif 204#endif
213 205
214 if (dma_ops->dma_supported) 206 if (ops->dma_supported)
215 return dma_ops->dma_supported(dev, mask); 207 return ops->dma_supported(dev, mask);
216 208
217 /* Copied from i386. Doesn't make much sense, because it will 209 /* Copied from i386. Doesn't make much sense, because it will
218 only work for pci_alloc_coherent. 210 only work for pci_alloc_coherent.
@@ -233,8 +225,7 @@ int dma_supported(struct device *dev, u64 mask)
233 type. Normally this doesn't make any difference, but gives 225 type. Normally this doesn't make any difference, but gives
234 more gentle handling of IOMMU overflow. */ 226 more gentle handling of IOMMU overflow. */
235 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { 227 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
236 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", 228 dev_info(dev, "Force SAC with mask %Lx\n", mask);
237 dev->bus_id, mask);
238 return 0; 229 return 0;
239 } 230 }
240 231
@@ -260,6 +251,7 @@ void *
260dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, 251dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
261 gfp_t gfp) 252 gfp_t gfp)
262{ 253{
254 struct dma_mapping_ops *ops = get_dma_ops(dev);
263 void *memory = NULL; 255 void *memory = NULL;
264 struct page *page; 256 struct page *page;
265 unsigned long dma_mask = 0; 257 unsigned long dma_mask = 0;
@@ -328,8 +320,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
328 /* Let low level make its own zone decisions */ 320 /* Let low level make its own zone decisions */
329 gfp &= ~(GFP_DMA32|GFP_DMA); 321 gfp &= ~(GFP_DMA32|GFP_DMA);
330 322
331 if (dma_ops->alloc_coherent) 323 if (ops->alloc_coherent)
332 return dma_ops->alloc_coherent(dev, size, 324 return ops->alloc_coherent(dev, size,
333 dma_handle, gfp); 325 dma_handle, gfp);
334 return NULL; 326 return NULL;
335 } 327 }
@@ -341,14 +333,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
341 } 333 }
342 } 334 }
343 335
344 if (dma_ops->alloc_coherent) { 336 if (ops->alloc_coherent) {
345 free_pages((unsigned long)memory, get_order(size)); 337 free_pages((unsigned long)memory, get_order(size));
346 gfp &= ~(GFP_DMA|GFP_DMA32); 338 gfp &= ~(GFP_DMA|GFP_DMA32);
347 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); 339 return ops->alloc_coherent(dev, size, dma_handle, gfp);
348 } 340 }
349 341
350 if (dma_ops->map_simple) { 342 if (ops->map_simple) {
351 *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), 343 *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
352 size, 344 size,
353 PCI_DMA_BIDIRECTIONAL); 345 PCI_DMA_BIDIRECTIONAL);
354 if (*dma_handle != bad_dma_address) 346 if (*dma_handle != bad_dma_address)
@@ -370,29 +362,27 @@ EXPORT_SYMBOL(dma_alloc_coherent);
370void dma_free_coherent(struct device *dev, size_t size, 362void dma_free_coherent(struct device *dev, size_t size,
371 void *vaddr, dma_addr_t bus) 363 void *vaddr, dma_addr_t bus)
372{ 364{
365 struct dma_mapping_ops *ops = get_dma_ops(dev);
366
373 int order = get_order(size); 367 int order = get_order(size);
374 WARN_ON(irqs_disabled()); /* for portability */ 368 WARN_ON(irqs_disabled()); /* for portability */
375 if (dma_release_from_coherent(dev, order, vaddr)) 369 if (dma_release_from_coherent(dev, order, vaddr))
376 return; 370 return;
377 if (dma_ops->unmap_single) 371 if (ops->unmap_single)
378 dma_ops->unmap_single(dev, bus, size, 0); 372 ops->unmap_single(dev, bus, size, 0);
379 free_pages((unsigned long)vaddr, order); 373 free_pages((unsigned long)vaddr, order);
380} 374}
381EXPORT_SYMBOL(dma_free_coherent); 375EXPORT_SYMBOL(dma_free_coherent);
382 376
383static int __init pci_iommu_init(void) 377static int __init pci_iommu_init(void)
384{ 378{
385#ifdef CONFIG_CALGARY_IOMMU
386 calgary_iommu_init(); 379 calgary_iommu_init();
387#endif
388 380
389 intel_iommu_init(); 381 intel_iommu_init();
390 382
391 amd_iommu_init(); 383 amd_iommu_init();
392 384
393#ifdef CONFIG_GART_IOMMU
394 gart_iommu_init(); 385 gart_iommu_init();
395#endif
396 386
397 no_iommu_init(); 387 no_iommu_init();
398 return 0; 388 return 0;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c3fe78406d18..744126e64950 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -32,6 +32,7 @@
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/iommu.h>
35#include <asm/gart.h> 36#include <asm/gart.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
37#include <asm/swiotlb.h> 38#include <asm/swiotlb.h>
@@ -197,9 +198,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
197 * out. Hopefully no network devices use single mappings that big. 198 * out. Hopefully no network devices use single mappings that big.
198 */ 199 */
199 200
200 printk(KERN_ERR 201 dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
201 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
202 size, dev->bus_id);
203 202
204 if (size > PAGE_SIZE*EMERGENCY_PAGES) { 203 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
205 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 204 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
@@ -693,8 +692,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
693 692
694extern int agp_amd64_init(void); 693extern int agp_amd64_init(void);
695 694
696static const struct dma_mapping_ops gart_dma_ops = { 695static struct dma_mapping_ops gart_dma_ops = {
697 .mapping_error = NULL,
698 .map_single = gart_map_single, 696 .map_single = gart_map_single,
699 .map_simple = gart_map_simple, 697 .map_simple = gart_map_simple,
700 .unmap_single = gart_unmap_single, 698 .unmap_single = gart_unmap_single,
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index aec43d56f49c..3f91f71cdc3e 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -7,7 +7,7 @@
7#include <linux/dma-mapping.h> 7#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 8#include <linux/scatterlist.h>
9 9
10#include <asm/gart.h> 10#include <asm/iommu.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
@@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75/* Make sure we keep the same behaviour */ 75struct dma_mapping_ops nommu_dma_ops = {
76static int nommu_mapping_error(dma_addr_t dma_addr)
77{
78#ifdef CONFIG_X86_32
79 return 0;
80#else
81 return (dma_addr == bad_dma_address);
82#endif
83}
84
85
86const struct dma_mapping_ops nommu_dma_ops = {
87 .map_single = nommu_map_single, 76 .map_single = nommu_map_single,
88 .map_sg = nommu_map_sg, 77 .map_sg = nommu_map_sg,
89 .mapping_error = nommu_mapping_error,
90 .is_phys = 1, 78 .is_phys = 1,
91}; 79};
92 80
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 82299cd1d04d..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -5,7 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/dma-mapping.h> 6#include <linux/dma-mapping.h>
7 7
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/swiotlb.h> 9#include <asm/swiotlb.h>
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); 18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
19} 19}
20 20
21const struct dma_mapping_ops swiotlb_dma_ops = { 21struct dma_mapping_ops swiotlb_dma_ops = {
22 .mapping_error = swiotlb_dma_mapping_error, 22 .mapping_error = swiotlb_dma_mapping_error,
23 .alloc_coherent = swiotlb_alloc_coherent, 23 .alloc_coherent = swiotlb_alloc_coherent,
24 .free_coherent = swiotlb_free_coherent, 24 .free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4d629c62f4f8..7fc4d5b0a6a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,6 +15,7 @@ unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait); 15EXPORT_SYMBOL(idle_nomwait);
16 16
17struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
18static int force_mwait __cpuinitdata;
18 19
19int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 20int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
20{ 21{
@@ -199,6 +200,7 @@ static void poll_idle(void)
199 * 200 *
200 * idle=mwait overrides this decision and forces the usage of mwait. 201 * idle=mwait overrides this decision and forces the usage of mwait.
201 */ 202 */
203static int __cpuinitdata force_mwait;
202 204
203#define MWAIT_INFO 0x05 205#define MWAIT_INFO 0x05
204#define MWAIT_ECX_EXTENDED_INFO 0x01 206#define MWAIT_ECX_EXTENDED_INFO 0x01
@@ -326,6 +328,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
326 328
327static int __init idle_setup(char *str) 329static int __init idle_setup(char *str)
328{ 330{
331 if (!str)
332 return -EINVAL;
333
329 if (!strcmp(str, "poll")) { 334 if (!strcmp(str, "poll")) {
330 printk("using polling idle threads.\n"); 335 printk("using polling idle threads.\n");
331 pm_idle = poll_idle; 336 pm_idle = poll_idle;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0c3927accb00..53bc653ed5ca 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -128,7 +128,7 @@ void cpu_idle(void)
128 128
129 /* endless idle loop with no priority at all */ 129 /* endless idle loop with no priority at all */
130 while (1) { 130 while (1) {
131 tick_nohz_stop_sched_tick(); 131 tick_nohz_stop_sched_tick(1);
132 while (!need_resched()) { 132 while (!need_resched()) {
133 133
134 check_pgt_cache(); 134 check_pgt_cache();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a8e53626ac9a..3fb62a7d9a16 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -120,7 +120,7 @@ void cpu_idle(void)
120 current_thread_info()->status |= TS_POLLING; 120 current_thread_info()->status |= TS_POLLING;
121 /* endless idle loop with no priority at all */ 121 /* endless idle loop with no priority at all */
122 while (1) { 122 while (1) {
123 tick_nohz_stop_sched_tick(); 123 tick_nohz_stop_sched_tick(1);
124 while (!need_resched()) { 124 while (!need_resched()) {
125 125
126 rmb(); 126 rmb();
@@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
537struct task_struct * 537struct task_struct *
538__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 538__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
539{ 539{
540 struct thread_struct *prev = &prev_p->thread, 540 struct thread_struct *prev = &prev_p->thread;
541 *next = &next_p->thread; 541 struct thread_struct *next = &next_p->thread;
542 int cpu = smp_processor_id(); 542 int cpu = smp_processor_id();
543 struct tss_struct *tss = &per_cpu(init_tss, cpu); 543 struct tss_struct *tss = &per_cpu(init_tss, cpu);
544 unsigned fsindex, gsindex; 544 unsigned fsindex, gsindex;
@@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
586 586
587 /* 587 /*
588 * Switch FS and GS. 588 * Switch FS and GS.
589 *
590 * Segment register != 0 always requires a reload. Also
591 * reload when it has changed. When prev process used 64bit
592 * base always reload to avoid an information leak.
589 */ 593 */
590 { 594 if (unlikely(fsindex | next->fsindex | prev->fs)) {
591 /* segment register != 0 always requires a reload. 595 loadsegment(fs, next->fsindex);
592 also reload when it has changed. 596 /*
593 when prev process used 64bit base always reload 597 * Check if the user used a selector != 0; if yes
594 to avoid an information leak. */ 598 * clear 64bit base, since overloaded base is always
595 if (unlikely(fsindex | next->fsindex | prev->fs)) { 599 * mapped to the Null selector
596 loadsegment(fs, next->fsindex); 600 */
597 /* check if the user used a selector != 0 601 if (fsindex)
598 * if yes clear 64bit base, since overloaded base
599 * is always mapped to the Null selector
600 */
601 if (fsindex)
602 prev->fs = 0; 602 prev->fs = 0;
603 } 603 }
604 /* when next process has a 64bit base use it */ 604 /* when next process has a 64bit base use it */
605 if (next->fs) 605 if (next->fs)
606 wrmsrl(MSR_FS_BASE, next->fs); 606 wrmsrl(MSR_FS_BASE, next->fs);
607 prev->fsindex = fsindex; 607 prev->fsindex = fsindex;
608 608
609 if (unlikely(gsindex | next->gsindex | prev->gs)) { 609 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex); 610 load_gs_index(next->gsindex);
611 if (gsindex) 611 if (gsindex)
612 prev->gs = 0; 612 prev->gs = 0;
613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
617 } 613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
618 617
619 /* Must be after DS reload */ 618 /* Must be after DS reload */
620 unlazy_fpu(prev_p); 619 unlazy_fpu(prev_p);
@@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
627 write_pda(pcurrent, next_p); 626 write_pda(pcurrent, next_p);
628 627
629 write_pda(kernelstack, 628 write_pda(kernelstack,
630 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 629 (unsigned long)task_stack_page(next_p) +
630 THREAD_SIZE - PDA_STACKOFFSET);
631#ifdef CONFIG_CC_STACKPROTECTOR 631#ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary, next_p->stack_canary); 632 write_pda(stack_canary, next_p->stack_canary);
633 /* 633 /*
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 77040b6070e1..e37dccce85db 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1357#endif 1357#endif
1358} 1358}
1359 1359
1360#ifdef CONFIG_X86_32
1361
1362void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) 1360void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1363{ 1361{
1364 struct siginfo info; 1362 struct siginfo info;
@@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1377 force_sig_info(SIGTRAP, &info, tsk); 1375 force_sig_info(SIGTRAP, &info, tsk);
1378} 1376}
1379 1377
1380/* notification of system call entry/exit
1381 * - triggered by current->work.syscall_trace
1382 */
1383int do_syscall_trace(struct pt_regs *regs, int entryexit)
1384{
1385 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1386 /*
1387 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1388 * interception
1389 */
1390 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1391 int ret = 0;
1392
1393 /* do the secure computing check first */
1394 if (!entryexit)
1395 secure_computing(regs->orig_ax);
1396
1397 if (unlikely(current->audit_context)) {
1398 if (entryexit)
1399 audit_syscall_exit(AUDITSC_RESULT(regs->ax),
1400 regs->ax);
1401 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1402 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1403 * not used, entry.S will call us only on syscall exit, not
1404 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1405 * calling send_sigtrap() on syscall entry.
1406 *
1407 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1408 * is_singlestep is false, despite his name, so we will still do
1409 * the correct thing.
1410 */
1411 else if (is_singlestep)
1412 goto out;
1413 }
1414
1415 if (!(current->ptrace & PT_PTRACED))
1416 goto out;
1417
1418 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1419 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1420 * here. We have to check this and return */
1421 if (is_sysemu && entryexit)
1422 return 0;
1423
1424 /* Fake a debug trap */
1425 if (is_singlestep)
1426 send_sigtrap(current, regs, 0);
1427
1428 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1429 goto out;
1430
1431 /* the 0x80 provides a way for the tracing parent to distinguish
1432 between a syscall stop and SIGTRAP delivery */
1433 /* Note that the debugger could change the result of test_thread_flag!*/
1434 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1435
1436 /*
1437 * this isn't the same as continuing with a signal, but it will do
1438 * for normal use. strace only continues with a signal if the
1439 * stopping signal is not SIGTRAP. -brl
1440 */
1441 if (current->exit_code) {
1442 send_sig(current->exit_code, current, 1);
1443 current->exit_code = 0;
1444 }
1445 ret = is_sysemu;
1446out:
1447 if (unlikely(current->audit_context) && !entryexit)
1448 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1449 regs->bx, regs->cx, regs->dx, regs->si);
1450 if (ret == 0)
1451 return 0;
1452
1453 regs->orig_ax = -1; /* force skip of syscall restarting */
1454 if (unlikely(current->audit_context))
1455 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1456 return 1;
1457}
1458
1459#else /* CONFIG_X86_64 */
1460
1461static void syscall_trace(struct pt_regs *regs) 1378static void syscall_trace(struct pt_regs *regs)
1462{ 1379{
1380 if (!(current->ptrace & PT_PTRACED))
1381 return;
1463 1382
1464#if 0 1383#if 0
1465 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", 1384 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
@@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs)
1481 } 1400 }
1482} 1401}
1483 1402
1484asmlinkage void syscall_trace_enter(struct pt_regs *regs) 1403#ifdef CONFIG_X86_32
1404# define IS_IA32 1
1405#elif defined CONFIG_IA32_EMULATION
1406# define IS_IA32 test_thread_flag(TIF_IA32)
1407#else
1408# define IS_IA32 0
1409#endif
1410
1411/*
1412 * We must return the syscall number to actually look up in the table.
1413 * This can be -1L to skip running any syscall at all.
1414 */
1415asmregparm long syscall_trace_enter(struct pt_regs *regs)
1485{ 1416{
1417 long ret = 0;
1418
1419 /*
1420 * If we stepped into a sysenter/syscall insn, it trapped in
1421 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
1422 * If user-mode had set TF itself, then it's still clear from
1423 * do_debug() and we need to set it again to restore the user
1424 * state. If we entered on the slow path, TF was already set.
1425 */
1426 if (test_thread_flag(TIF_SINGLESTEP))
1427 regs->flags |= X86_EFLAGS_TF;
1428
1486 /* do the secure computing check first */ 1429 /* do the secure computing check first */
1487 secure_computing(regs->orig_ax); 1430 secure_computing(regs->orig_ax);
1488 1431
1489 if (test_thread_flag(TIF_SYSCALL_TRACE) 1432 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1490 && (current->ptrace & PT_PTRACED)) 1433 ret = -1L;
1434
1435 if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
1491 syscall_trace(regs); 1436 syscall_trace(regs);
1492 1437
1493 if (unlikely(current->audit_context)) { 1438 if (unlikely(current->audit_context)) {
1494 if (test_thread_flag(TIF_IA32)) { 1439 if (IS_IA32)
1495 audit_syscall_entry(AUDIT_ARCH_I386, 1440 audit_syscall_entry(AUDIT_ARCH_I386,
1496 regs->orig_ax, 1441 regs->orig_ax,
1497 regs->bx, regs->cx, 1442 regs->bx, regs->cx,
1498 regs->dx, regs->si); 1443 regs->dx, regs->si);
1499 } else { 1444#ifdef CONFIG_X86_64
1445 else
1500 audit_syscall_entry(AUDIT_ARCH_X86_64, 1446 audit_syscall_entry(AUDIT_ARCH_X86_64,
1501 regs->orig_ax, 1447 regs->orig_ax,
1502 regs->di, regs->si, 1448 regs->di, regs->si,
1503 regs->dx, regs->r10); 1449 regs->dx, regs->r10);
1504 } 1450#endif
1505 } 1451 }
1452
1453 return ret ?: regs->orig_ax;
1506} 1454}
1507 1455
1508asmlinkage void syscall_trace_leave(struct pt_regs *regs) 1456asmregparm void syscall_trace_leave(struct pt_regs *regs)
1509{ 1457{
1510 if (unlikely(current->audit_context)) 1458 if (unlikely(current->audit_context))
1511 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1459 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1512 1460
1513 if ((test_thread_flag(TIF_SYSCALL_TRACE) 1461 if (test_thread_flag(TIF_SYSCALL_TRACE))
1514 || test_thread_flag(TIF_SINGLESTEP))
1515 && (current->ptrace & PT_PTRACED))
1516 syscall_trace(regs); 1462 syscall_trace(regs);
1517}
1518 1463
1519#endif /* CONFIG_X86_32 */ 1464 /*
1465 * If TIF_SYSCALL_EMU is set, we only get here because of
1466 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1467 * We already reported this syscall instruction in
1468 * syscall_trace_enter(), so don't do any more now.
1469 */
1470 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1471 return;
1472
1473 /*
1474 * If we are single-stepping, synthesize a trap to follow the
1475 * system call instruction.
1476 */
1477 if (test_thread_flag(TIF_SINGLESTEP) &&
1478 (current->ptrace & PT_PTRACED))
1479 send_sigtrap(current, regs, 0);
1480}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f8a62160e151..06a9f643817e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), 177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
178 }, 178 },
179 }, 179 },
180 { /* Handle problems with rebooting on Dell T5400's */
181 .callback = set_bios_reboot,
182 .ident = "Dell Precision T5400",
183 .matches = {
184 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
185 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
186 },
187 },
180 { /* Handle problems with rebooting on HP laptops */ 188 { /* Handle problems with rebooting on HP laptops */
181 .callback = set_bios_reboot, 189 .callback = set_bios_reboot,
182 .ident = "HP Compaq Laptop", 190 .ident = "HP Compaq Laptop",
@@ -403,24 +411,28 @@ void native_machine_shutdown(void)
403{ 411{
404 /* Stop the cpus and apics */ 412 /* Stop the cpus and apics */
405#ifdef CONFIG_SMP 413#ifdef CONFIG_SMP
406 int reboot_cpu_id;
407 414
408 /* The boot cpu is always logical cpu 0 */ 415 /* The boot cpu is always logical cpu 0 */
409 reboot_cpu_id = 0; 416 int reboot_cpu_id = 0;
417 cpumask_of_cpu_ptr(newmask, reboot_cpu_id);
410 418
411#ifdef CONFIG_X86_32 419#ifdef CONFIG_X86_32
412 /* See if there has been given a command line override */ 420 /* See if there has been given a command line override */
413 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && 421 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
414 cpu_online(reboot_cpu)) 422 cpu_online(reboot_cpu)) {
415 reboot_cpu_id = reboot_cpu; 423 reboot_cpu_id = reboot_cpu;
424 cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
425 }
416#endif 426#endif
417 427
418 /* Make certain the cpu I'm about to reboot on is online */ 428 /* Make certain the cpu I'm about to reboot on is online */
419 if (!cpu_online(reboot_cpu_id)) 429 if (!cpu_online(reboot_cpu_id)) {
420 reboot_cpu_id = smp_processor_id(); 430 reboot_cpu_id = smp_processor_id();
431 cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
432 }
421 433
422 /* Make certain I only run on the appropriate processor */ 434 /* Make certain I only run on the appropriate processor */
423 set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id)); 435 set_cpus_allowed_ptr(current, newmask);
424 436
425 /* O.K Now that I'm on the appropriate processor, 437 /* O.K Now that I'm on the appropriate processor,
426 * stop all of the others. 438 * stop all of the others.
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..703310a99023 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,44 @@
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21#define PAE_PGD_ATTR (_PAGE_PRESENT) 21#define PAE_PGD_ATTR (_PAGE_PRESENT)
22 22
23/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are
24 * used to save some data for jumping back
25 */
26#define DATA(offset) (PAGE_SIZE/2+(offset))
27
28/* Minimal CPU state */
29#define ESP DATA(0x0)
30#define CR0 DATA(0x4)
31#define CR3 DATA(0x8)
32#define CR4 DATA(0xc)
33
34/* other data */
35#define CP_VA_CONTROL_PAGE DATA(0x10)
36#define CP_PA_PGD DATA(0x14)
37#define CP_PA_SWAP_PAGE DATA(0x18)
38#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
39
23 .text 40 .text
24 .align PAGE_SIZE 41 .align PAGE_SIZE
25 .globl relocate_kernel 42 .globl relocate_kernel
26relocate_kernel: 43relocate_kernel:
27 movl 8(%esp), %ebp /* list of pages */ 44 /* Save the CPU context, used for jumping back */
45
46 pushl %ebx
47 pushl %esi
48 pushl %edi
49 pushl %ebp
50 pushf
51
52 movl 20+8(%esp), %ebp /* list of pages */
53 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
54 movl %esp, ESP(%edi)
55 movl %cr0, %eax
56 movl %eax, CR0(%edi)
57 movl %cr3, %eax
58 movl %eax, CR3(%edi)
59 movl %cr4, %eax
60 movl %eax, CR4(%edi)
28 61
29#ifdef CONFIG_X86_PAE 62#ifdef CONFIG_X86_PAE
30 /* map the control page at its virtual address */ 63 /* map the control page at its virtual address */
@@ -138,15 +171,25 @@ relocate_kernel:
138 171
139relocate_new_kernel: 172relocate_new_kernel:
140 /* read the arguments and say goodbye to the stack */ 173 /* read the arguments and say goodbye to the stack */
141 movl 4(%esp), %ebx /* page_list */ 174 movl 20+4(%esp), %ebx /* page_list */
142 movl 8(%esp), %ebp /* list of pages */ 175 movl 20+8(%esp), %ebp /* list of pages */
143 movl 12(%esp), %edx /* start address */ 176 movl 20+12(%esp), %edx /* start address */
144 movl 16(%esp), %ecx /* cpu_has_pae */ 177 movl 20+16(%esp), %ecx /* cpu_has_pae */
178 movl 20+20(%esp), %esi /* preserve_context */
145 179
146 /* zero out flags, and disable interrupts */ 180 /* zero out flags, and disable interrupts */
147 pushl $0 181 pushl $0
148 popfl 182 popfl
149 183
184 /* save some information for jumping back */
185 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
186 movl %edi, CP_VA_CONTROL_PAGE(%edi)
187 movl PTR(PA_PGD)(%ebp), %eax
188 movl %eax, CP_PA_PGD(%edi)
189 movl PTR(PA_SWAP_PAGE)(%ebp), %eax
190 movl %eax, CP_PA_SWAP_PAGE(%edi)
191 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
192
150 /* get physical address of control page now */ 193 /* get physical address of control page now */
151 /* this is impossible after page table switch */ 194 /* this is impossible after page table switch */
152 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 195 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +240,90 @@ identity_mapped:
197 xorl %eax, %eax 240 xorl %eax, %eax
198 movl %eax, %cr3 241 movl %eax, %cr3
199 242
243 movl CP_PA_SWAP_PAGE(%edi), %eax
244 pushl %eax
245 pushl %ebx
246 call swap_pages
247 addl $8, %esp
248
249 /* To be certain of avoiding problems with self-modifying code
250 * I need to execute a serializing instruction here.
251 * So I flush the TLB, it's handy, and not processor dependent.
252 */
253 xorl %eax, %eax
254 movl %eax, %cr3
255
256 /* set all of the registers to known values */
257 /* leave %esp alone */
258
259 testl %esi, %esi
260 jnz 1f
261 xorl %edi, %edi
262 xorl %eax, %eax
263 xorl %ebx, %ebx
264 xorl %ecx, %ecx
265 xorl %edx, %edx
266 xorl %esi, %esi
267 xorl %ebp, %ebp
268 ret
2691:
270 popl %edx
271 movl CP_PA_SWAP_PAGE(%edi), %esp
272 addl $PAGE_SIZE, %esp
2732:
274 call *%edx
275
276 /* get the re-entry point of the peer system */
277 movl 0(%esp), %ebp
278 call 1f
2791:
280 popl %ebx
281 subl $(1b - relocate_kernel), %ebx
282 movl CP_VA_CONTROL_PAGE(%ebx), %edi
283 lea PAGE_SIZE(%ebx), %esp
284 movl CP_PA_SWAP_PAGE(%ebx), %eax
285 movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
286 pushl %eax
287 pushl %edx
288 call swap_pages
289 addl $8, %esp
290 movl CP_PA_PGD(%ebx), %eax
291 movl %eax, %cr3
292 movl %cr0, %eax
293 orl $(1<<31), %eax
294 movl %eax, %cr0
295 lea PAGE_SIZE(%edi), %esp
296 movl %edi, %eax
297 addl $(virtual_mapped - relocate_kernel), %eax
298 pushl %eax
299 ret
300
301virtual_mapped:
302 movl CR4(%edi), %eax
303 movl %eax, %cr4
304 movl CR3(%edi), %eax
305 movl %eax, %cr3
306 movl CR0(%edi), %eax
307 movl %eax, %cr0
308 movl ESP(%edi), %esp
309 movl %ebp, %eax
310
311 popf
312 popl %ebp
313 popl %edi
314 popl %esi
315 popl %ebx
316 ret
317
200 /* Do the copies */ 318 /* Do the copies */
201 movl %ebx, %ecx 319swap_pages:
320 movl 8(%esp), %edx
321 movl 4(%esp), %ecx
322 pushl %ebp
323 pushl %ebx
324 pushl %edi
325 pushl %esi
326 movl %ecx, %ebx
202 jmp 1f 327 jmp 1f
203 328
2040: /* top, read another word from the indirection page */ 3290: /* top, read another word from the indirection page */
@@ -226,27 +351,28 @@ identity_mapped:
226 movl %ecx, %esi /* For every source page do a copy */ 351 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi 352 andl $0xfffff000, %esi
228 353
354 movl %edi, %eax
355 movl %esi, %ebp
356
357 movl %edx, %edi
229 movl $1024, %ecx 358 movl $1024, %ecx
230 rep ; movsl 359 rep ; movsl
231 jmp 0b
232 360
2333: 361 movl %ebp, %edi
234 362 movl %eax, %esi
235 /* To be certain of avoiding problems with self-modifying code 363 movl $1024, %ecx
236 * I need to execute a serializing instruction here. 364 rep ; movsl
237 * So I flush the TLB, it's handy, and not processor dependent.
238 */
239 xorl %eax, %eax
240 movl %eax, %cr3
241 365
242 /* set all of the registers to known values */ 366 movl %eax, %edi
243 /* leave %esp alone */ 367 movl %edx, %esi
368 movl $1024, %ecx
369 rep ; movsl
244 370
245 xorl %eax, %eax 371 lea PAGE_SIZE(%ebp), %esi
246 xorl %ebx, %ebx 372 jmp 0b
247 xorl %ecx, %ecx 3733:
248 xorl %edx, %edx 374 popl %esi
249 xorl %esi, %esi 375 popl %edi
250 xorl %edi, %edi 376 popl %ebx
251 xorl %ebp, %ebp 377 popl %ebp
252 ret 378 ret
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 531b55b8e81a..b520dae02bf4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -57,12 +57,8 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/user.h> 58#include <linux/user.h>
59#include <linux/delay.h> 59#include <linux/delay.h>
60#include <linux/highmem.h>
61 60
62#include <linux/kallsyms.h> 61#include <linux/kallsyms.h>
63#include <linux/edd.h>
64#include <linux/iscsi_ibft.h>
65#include <linux/kexec.h>
66#include <linux/cpufreq.h> 62#include <linux/cpufreq.h>
67#include <linux/dma-mapping.h> 63#include <linux/dma-mapping.h>
68#include <linux/ctype.h> 64#include <linux/ctype.h>
@@ -96,7 +92,7 @@
96#include <asm/smp.h> 92#include <asm/smp.h>
97#include <asm/desc.h> 93#include <asm/desc.h>
98#include <asm/dma.h> 94#include <asm/dma.h>
99#include <asm/gart.h> 95#include <asm/iommu.h>
100#include <asm/mmu_context.h> 96#include <asm/mmu_context.h>
101#include <asm/proto.h> 97#include <asm/proto.h>
102 98
@@ -104,7 +100,6 @@
104#include <asm/paravirt.h> 100#include <asm/paravirt.h>
105 101
106#include <asm/percpu.h> 102#include <asm/percpu.h>
107#include <asm/sections.h>
108#include <asm/topology.h> 103#include <asm/topology.h>
109#include <asm/apicdef.h> 104#include <asm/apicdef.h>
110#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
@@ -579,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg)
579early_param("elfcorehdr", setup_elfcorehdr); 574early_param("elfcorehdr", setup_elfcorehdr);
580#endif 575#endif
581 576
577static struct x86_quirks default_x86_quirks __initdata;
578
579struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
580
582/* 581/*
583 * Determine if we were loaded by an EFI loader. If so, then we have also been 582 * Determine if we were loaded by an EFI loader. If so, then we have also been
584 * passed the efi memmap, systab, etc., so we should use these data structures 583 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -598,11 +597,11 @@ void __init setup_arch(char **cmdline_p)
598 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 597 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
599 visws_early_detect(); 598 visws_early_detect();
600 pre_setup_arch_hook(); 599 pre_setup_arch_hook();
601 early_cpu_init();
602#else 600#else
603 printk(KERN_INFO "Command line: %s\n", boot_command_line); 601 printk(KERN_INFO "Command line: %s\n", boot_command_line);
604#endif 602#endif
605 603
604 early_cpu_init();
606 early_ioremap_init(); 605 early_ioremap_init();
607 606
608 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 607 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -666,9 +665,6 @@ void __init setup_arch(char **cmdline_p)
666 bss_resource.start = virt_to_phys(&__bss_start); 665 bss_resource.start = virt_to_phys(&__bss_start);
667 bss_resource.end = virt_to_phys(&__bss_stop)-1; 666 bss_resource.end = virt_to_phys(&__bss_stop)-1;
668 667
669#ifdef CONFIG_X86_64
670 early_cpu_init();
671#endif
672 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 668 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
673 *cmdline_p = command_line; 669 *cmdline_p = command_line;
674 670
@@ -681,7 +677,7 @@ void __init setup_arch(char **cmdline_p)
681#ifdef CONFIG_X86_LOCAL_APIC 677#ifdef CONFIG_X86_LOCAL_APIC
682 disable_apic = 1; 678 disable_apic = 1;
683#endif 679#endif
684 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 680 setup_clear_cpu_cap(X86_FEATURE_APIC);
685 } 681 }
686 682
687#ifdef CONFIG_PCI 683#ifdef CONFIG_PCI
@@ -824,7 +820,10 @@ void __init setup_arch(char **cmdline_p)
824 vmi_init(); 820 vmi_init();
825#endif 821#endif
826 822
823 paravirt_pagetable_setup_start(swapper_pg_dir);
827 paging_init(); 824 paging_init();
825 paravirt_pagetable_setup_done(swapper_pg_dir);
826 paravirt_post_allocator_init();
828 827
829#ifdef CONFIG_X86_64 828#ifdef CONFIG_X86_64
830 map_vsyscall(); 829 map_vsyscall();
@@ -854,14 +853,6 @@ void __init setup_arch(char **cmdline_p)
854 init_cpu_to_node(); 853 init_cpu_to_node();
855#endif 854#endif
856 855
857#ifdef CONFIG_X86_NUMAQ
858 /*
859 * need to check online nodes num, call it
860 * here before time_init/tsc_init
861 */
862 numaq_tsc_disable();
863#endif
864
865 init_apic_mappings(); 856 init_apic_mappings();
866 ioapic_init_mappings(); 857 ioapic_init_mappings();
867 858
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index cac68430d31f..f7745f94c006 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -227,8 +227,8 @@ static void __init setup_node_to_cpumask_map(void)
227 /* allocate the map */ 227 /* allocate the map */
228 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); 228 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
229 229
230 Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", 230 pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
231 map, nr_node_ids); 231 map, nr_node_ids);
232 232
233 /* node_to_cpumask() will now work */ 233 /* node_to_cpumask() will now work */
234 node_to_cpumask_map = map; 234 node_to_cpumask_map = map;
@@ -248,7 +248,7 @@ void __cpuinit numa_set_node(int cpu, int node)
248 per_cpu(x86_cpu_to_node_map, cpu) = node; 248 per_cpu(x86_cpu_to_node_map, cpu) = node;
249 249
250 else 250 else
251 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); 251 pr_debug("Setting node for non-present cpu %d\n", cpu);
252} 252}
253 253
254void __cpuinit numa_clear_node(int cpu) 254void __cpuinit numa_clear_node(int cpu)
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d92373630963..6fb5bcdd8933 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
212 212
213badframe: 213badframe:
214 if (show_unhandled_signals && printk_ratelimit()) { 214 if (show_unhandled_signals && printk_ratelimit()) {
215 printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" 215 printk("%s%s[%d] bad frame in sigreturn frame:"
216 "%p ip:%lx sp:%lx oeax:%lx", 216 "%p ip:%lx sp:%lx oeax:%lx",
217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
218 current->comm, task_pid_nr(current), frame, regs->ip, 218 current->comm, task_pid_nr(current), frame, regs->ip,
@@ -657,18 +657,9 @@ static void do_signal(struct pt_regs *regs)
657void 657void
658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659{ 659{
660 /* Pending single-step? */
661 if (thread_info_flags & _TIF_SINGLESTEP) {
662 regs->flags |= X86_EFLAGS_TF;
663 clear_thread_flag(TIF_SINGLESTEP);
664 }
665
666 /* deal with pending signal delivery */ 660 /* deal with pending signal delivery */
667 if (thread_info_flags & _TIF_SIGPENDING) 661 if (thread_info_flags & _TIF_SIGPENDING)
668 do_signal(regs); 662 do_signal(regs);
669 663
670 if (thread_info_flags & _TIF_HRTICK_RESCHED)
671 hrtick_resched();
672
673 clear_thread_flag(TIF_IRET); 664 clear_thread_flag(TIF_IRET);
674} 665}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e7..b45ef8ddd651 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -53,6 +53,59 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
53 return do_sigaltstack(uss, uoss, regs->sp); 53 return do_sigaltstack(uss, uoss, regs->sp);
54} 54}
55 55
56/*
57 * Signal frame handlers.
58 */
59
60static inline int save_i387(struct _fpstate __user *buf)
61{
62 struct task_struct *tsk = current;
63 int err = 0;
64
65 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
66 sizeof(tsk->thread.xstate->fxsave));
67
68 if ((unsigned long)buf % 16)
69 printk("save_i387: bad fpstate %p\n", buf);
70
71 if (!used_math())
72 return 0;
73 clear_used_math(); /* trigger finit */
74 if (task_thread_info(tsk)->status & TS_USEDFPU) {
75 err = save_i387_checking((struct i387_fxsave_struct __user *)
76 buf);
77 if (err)
78 return err;
79 task_thread_info(tsk)->status &= ~TS_USEDFPU;
80 stts();
81 } else {
82 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
83 sizeof(struct i387_fxsave_struct)))
84 return -1;
85 }
86 return 1;
87}
88
89/*
90 * This restores directly out of user space. Exceptions are handled.
91 */
92static inline int restore_i387(struct _fpstate __user *buf)
93{
94 struct task_struct *tsk = current;
95 int err;
96
97 if (!used_math()) {
98 err = init_fpu(tsk);
99 if (err)
100 return err;
101 }
102
103 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
104 clts();
105 task_thread_info(current)->status |= TS_USEDFPU;
106 }
107 return restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
108}
56 109
57/* 110/*
58 * Do a signal return; undo the signal stack. 111 * Do a signal return; undo the signal stack.
@@ -487,12 +540,6 @@ static void do_signal(struct pt_regs *regs)
487void do_notify_resume(struct pt_regs *regs, void *unused, 540void do_notify_resume(struct pt_regs *regs, void *unused,
488 __u32 thread_info_flags) 541 __u32 thread_info_flags)
489{ 542{
490 /* Pending single-step? */
491 if (thread_info_flags & _TIF_SINGLESTEP) {
492 regs->flags |= X86_EFLAGS_TF;
493 clear_thread_flag(TIF_SINGLESTEP);
494 }
495
496#ifdef CONFIG_X86_MCE 543#ifdef CONFIG_X86_MCE
497 /* notify userspace of pending MCEs */ 544 /* notify userspace of pending MCEs */
498 if (thread_info_flags & _TIF_MCE_NOTIFY) 545 if (thread_info_flags & _TIF_MCE_NOTIFY)
@@ -502,9 +549,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
502 /* deal with pending signal delivery */ 549 /* deal with pending signal delivery */
503 if (thread_info_flags & _TIF_SIGPENDING) 550 if (thread_info_flags & _TIF_SIGPENDING)
504 do_signal(regs); 551 do_signal(regs);
505
506 if (thread_info_flags & _TIF_HRTICK_RESCHED)
507 hrtick_resched();
508} 552}
509 553
510void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 554void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 687376ab07e8..332512767f4f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -216,7 +216,7 @@ static void __cpuinit smp_callin(void)
216 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 216 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
217 phys_id, cpuid); 217 phys_id, cpuid);
218 } 218 }
219 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); 219 pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
220 220
221 /* 221 /*
222 * STARTUP IPIs are fragile beasts as they might sometimes 222 * STARTUP IPIs are fragile beasts as they might sometimes
@@ -251,7 +251,7 @@ static void __cpuinit smp_callin(void)
251 * boards) 251 * boards)
252 */ 252 */
253 253
254 Dprintk("CALLIN, before setup_local_APIC().\n"); 254 pr_debug("CALLIN, before setup_local_APIC().\n");
255 smp_callin_clear_local_apic(); 255 smp_callin_clear_local_apic();
256 setup_local_APIC(); 256 setup_local_APIC();
257 end_local_APIC_setup(); 257 end_local_APIC_setup();
@@ -266,7 +266,7 @@ static void __cpuinit smp_callin(void)
266 local_irq_enable(); 266 local_irq_enable();
267 calibrate_delay(); 267 calibrate_delay();
268 local_irq_disable(); 268 local_irq_disable();
269 Dprintk("Stack at about %p\n", &cpuid); 269 pr_debug("Stack at about %p\n", &cpuid);
270 270
271 /* 271 /*
272 * Save our processor parameters 272 * Save our processor parameters
@@ -438,7 +438,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
438 cpu_set(cpu, cpu_sibling_setup_map); 438 cpu_set(cpu, cpu_sibling_setup_map);
439 439
440 if (smp_num_siblings > 1) { 440 if (smp_num_siblings > 1) {
441 for_each_cpu_mask(i, cpu_sibling_setup_map) { 441 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
442 if (c->phys_proc_id == cpu_data(i).phys_proc_id && 442 if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
443 c->cpu_core_id == cpu_data(i).cpu_core_id) { 443 c->cpu_core_id == cpu_data(i).cpu_core_id) {
444 cpu_set(i, per_cpu(cpu_sibling_map, cpu)); 444 cpu_set(i, per_cpu(cpu_sibling_map, cpu));
@@ -461,7 +461,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
461 return; 461 return;
462 } 462 }
463 463
464 for_each_cpu_mask(i, cpu_sibling_setup_map) { 464 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
465 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 465 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
466 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 466 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
467 cpu_set(i, c->llc_shared_map); 467 cpu_set(i, c->llc_shared_map);
@@ -513,7 +513,7 @@ static void impress_friends(void)
513 /* 513 /*
514 * Allow the user to impress friends. 514 * Allow the user to impress friends.
515 */ 515 */
516 Dprintk("Before bogomips.\n"); 516 pr_debug("Before bogomips.\n");
517 for_each_possible_cpu(cpu) 517 for_each_possible_cpu(cpu)
518 if (cpu_isset(cpu, cpu_callout_map)) 518 if (cpu_isset(cpu, cpu_callout_map))
519 bogosum += cpu_data(cpu).loops_per_jiffy; 519 bogosum += cpu_data(cpu).loops_per_jiffy;
@@ -523,7 +523,7 @@ static void impress_friends(void)
523 bogosum/(500000/HZ), 523 bogosum/(500000/HZ),
524 (bogosum/(5000/HZ))%100); 524 (bogosum/(5000/HZ))%100);
525 525
526 Dprintk("Before bogocount - setting activated=1.\n"); 526 pr_debug("Before bogocount - setting activated=1.\n");
527} 527}
528 528
529static inline void __inquire_remote_apic(int apicid) 529static inline void __inquire_remote_apic(int apicid)
@@ -546,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid)
546 printk(KERN_CONT 546 printk(KERN_CONT
547 "a previous APIC delivery may have failed\n"); 547 "a previous APIC delivery may have failed\n");
548 548
549 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 549 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
550 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); 550 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
551 551
552 timeout = 0; 552 timeout = 0;
553 do { 553 do {
@@ -579,29 +579,24 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
579 int maxlvt; 579 int maxlvt;
580 580
581 /* Target chip */ 581 /* Target chip */
582 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); 582 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
583 583
584 /* Boot on the stack */ 584 /* Boot on the stack */
585 /* Kick the second */ 585 /* Kick the second */
586 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 586 apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
587 587
588 Dprintk("Waiting for send to finish...\n"); 588 pr_debug("Waiting for send to finish...\n");
589 send_status = safe_apic_wait_icr_idle(); 589 send_status = safe_apic_wait_icr_idle();
590 590
591 /* 591 /*
592 * Give the other CPU some time to accept the IPI. 592 * Give the other CPU some time to accept the IPI.
593 */ 593 */
594 udelay(200); 594 udelay(200);
595 /*
596 * Due to the Pentium erratum 3AP.
597 */
598 maxlvt = lapic_get_maxlvt(); 595 maxlvt = lapic_get_maxlvt();
599 if (maxlvt > 3) { 596 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
600 apic_read_around(APIC_SPIV);
601 apic_write(APIC_ESR, 0); 597 apic_write(APIC_ESR, 0);
602 }
603 accept_status = (apic_read(APIC_ESR) & 0xEF); 598 accept_status = (apic_read(APIC_ESR) & 0xEF);
604 Dprintk("NMI sent.\n"); 599 pr_debug("NMI sent.\n");
605 600
606 if (send_status) 601 if (send_status)
607 printk(KERN_ERR "APIC never delivered???\n"); 602 printk(KERN_ERR "APIC never delivered???\n");
@@ -625,42 +620,44 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
625 return send_status; 620 return send_status;
626 } 621 }
627 622
623 maxlvt = lapic_get_maxlvt();
624
628 /* 625 /*
629 * Be paranoid about clearing APIC errors. 626 * Be paranoid about clearing APIC errors.
630 */ 627 */
631 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 628 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
632 apic_read_around(APIC_SPIV); 629 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
633 apic_write(APIC_ESR, 0); 630 apic_write(APIC_ESR, 0);
634 apic_read(APIC_ESR); 631 apic_read(APIC_ESR);
635 } 632 }
636 633
637 Dprintk("Asserting INIT.\n"); 634 pr_debug("Asserting INIT.\n");
638 635
639 /* 636 /*
640 * Turn INIT on target chip 637 * Turn INIT on target chip
641 */ 638 */
642 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 639 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
643 640
644 /* 641 /*
645 * Send IPI 642 * Send IPI
646 */ 643 */
647 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT 644 apic_write(APIC_ICR,
648 | APIC_DM_INIT); 645 APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
649 646
650 Dprintk("Waiting for send to finish...\n"); 647 pr_debug("Waiting for send to finish...\n");
651 send_status = safe_apic_wait_icr_idle(); 648 send_status = safe_apic_wait_icr_idle();
652 649
653 mdelay(10); 650 mdelay(10);
654 651
655 Dprintk("Deasserting INIT.\n"); 652 pr_debug("Deasserting INIT.\n");
656 653
657 /* Target chip */ 654 /* Target chip */
658 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 655 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
659 656
660 /* Send IPI */ 657 /* Send IPI */
661 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 658 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
662 659
663 Dprintk("Waiting for send to finish...\n"); 660 pr_debug("Waiting for send to finish...\n");
664 send_status = safe_apic_wait_icr_idle(); 661 send_status = safe_apic_wait_icr_idle();
665 662
666 mb(); 663 mb();
@@ -687,55 +684,47 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
687 /* 684 /*
688 * Run STARTUP IPI loop. 685 * Run STARTUP IPI loop.
689 */ 686 */
690 Dprintk("#startup loops: %d.\n", num_starts); 687 pr_debug("#startup loops: %d.\n", num_starts);
691
692 maxlvt = lapic_get_maxlvt();
693 688
694 for (j = 1; j <= num_starts; j++) { 689 for (j = 1; j <= num_starts; j++) {
695 Dprintk("Sending STARTUP #%d.\n", j); 690 pr_debug("Sending STARTUP #%d.\n", j);
696 apic_read_around(APIC_SPIV); 691 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
697 apic_write(APIC_ESR, 0); 692 apic_write(APIC_ESR, 0);
698 apic_read(APIC_ESR); 693 apic_read(APIC_ESR);
699 Dprintk("After apic_write.\n"); 694 pr_debug("After apic_write.\n");
700 695
701 /* 696 /*
702 * STARTUP IPI 697 * STARTUP IPI
703 */ 698 */
704 699
705 /* Target chip */ 700 /* Target chip */
706 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 701 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
707 702
708 /* Boot on the stack */ 703 /* Boot on the stack */
709 /* Kick the second */ 704 /* Kick the second */
710 apic_write_around(APIC_ICR, APIC_DM_STARTUP 705 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
711 | (start_eip >> 12));
712 706
713 /* 707 /*
714 * Give the other CPU some time to accept the IPI. 708 * Give the other CPU some time to accept the IPI.
715 */ 709 */
716 udelay(300); 710 udelay(300);
717 711
718 Dprintk("Startup point 1.\n"); 712 pr_debug("Startup point 1.\n");
719 713
720 Dprintk("Waiting for send to finish...\n"); 714 pr_debug("Waiting for send to finish...\n");
721 send_status = safe_apic_wait_icr_idle(); 715 send_status = safe_apic_wait_icr_idle();
722 716
723 /* 717 /*
724 * Give the other CPU some time to accept the IPI. 718 * Give the other CPU some time to accept the IPI.
725 */ 719 */
726 udelay(200); 720 udelay(200);
727 /* 721 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
728 * Due to the Pentium erratum 3AP.
729 */
730 if (maxlvt > 3) {
731 apic_read_around(APIC_SPIV);
732 apic_write(APIC_ESR, 0); 722 apic_write(APIC_ESR, 0);
733 }
734 accept_status = (apic_read(APIC_ESR) & 0xEF); 723 accept_status = (apic_read(APIC_ESR) & 0xEF);
735 if (send_status || accept_status) 724 if (send_status || accept_status)
736 break; 725 break;
737 } 726 }
738 Dprintk("After Startup.\n"); 727 pr_debug("After Startup.\n");
739 728
740 if (send_status) 729 if (send_status)
741 printk(KERN_ERR "APIC never delivered???\n"); 730 printk(KERN_ERR "APIC never delivered???\n");
@@ -768,7 +757,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
768 * 757 *
769 * Must be called after the _cpu_pda pointer table is initialized. 758 * Must be called after the _cpu_pda pointer table is initialized.
770 */ 759 */
771static int __cpuinit get_local_pda(int cpu) 760int __cpuinit get_local_pda(int cpu)
772{ 761{
773 struct x8664_pda *oldpda, *newpda; 762 struct x8664_pda *oldpda, *newpda;
774 unsigned long size = sizeof(struct x8664_pda); 763 unsigned long size = sizeof(struct x8664_pda);
@@ -886,7 +875,7 @@ do_rest:
886 875
887 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 876 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
888 877
889 Dprintk("Setting warm reset code and vector.\n"); 878 pr_debug("Setting warm reset code and vector.\n");
890 879
891 store_NMI_vector(&nmi_high, &nmi_low); 880 store_NMI_vector(&nmi_high, &nmi_low);
892 881
@@ -907,9 +896,9 @@ do_rest:
907 /* 896 /*
908 * allow APs to start initializing. 897 * allow APs to start initializing.
909 */ 898 */
910 Dprintk("Before Callout %d.\n", cpu); 899 pr_debug("Before Callout %d.\n", cpu);
911 cpu_set(cpu, cpu_callout_map); 900 cpu_set(cpu, cpu_callout_map);
912 Dprintk("After Callout %d.\n", cpu); 901 pr_debug("After Callout %d.\n", cpu);
913 902
914 /* 903 /*
915 * Wait 5s total for a response 904 * Wait 5s total for a response
@@ -922,10 +911,10 @@ do_rest:
922 911
923 if (cpu_isset(cpu, cpu_callin_map)) { 912 if (cpu_isset(cpu, cpu_callin_map)) {
924 /* number CPUs logically, starting from 1 (BSP is 0) */ 913 /* number CPUs logically, starting from 1 (BSP is 0) */
925 Dprintk("OK.\n"); 914 pr_debug("OK.\n");
926 printk(KERN_INFO "CPU%d: ", cpu); 915 printk(KERN_INFO "CPU%d: ", cpu);
927 print_cpu_info(&cpu_data(cpu)); 916 print_cpu_info(&cpu_data(cpu));
928 Dprintk("CPU has booted.\n"); 917 pr_debug("CPU has booted.\n");
929 } else { 918 } else {
930 boot_error = 1; 919 boot_error = 1;
931 if (*((volatile unsigned char *)trampoline_base) 920 if (*((volatile unsigned char *)trampoline_base)
@@ -970,7 +959,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
970 959
971 WARN_ON(irqs_disabled()); 960 WARN_ON(irqs_disabled());
972 961
973 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); 962 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
974 963
975 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 964 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
976 !physid_isset(apicid, phys_cpu_present_map)) { 965 !physid_isset(apicid, phys_cpu_present_map)) {
@@ -982,7 +971,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
982 * Already booted CPU? 971 * Already booted CPU?
983 */ 972 */
984 if (cpu_isset(cpu, cpu_callin_map)) { 973 if (cpu_isset(cpu, cpu_callin_map)) {
985 Dprintk("do_boot_cpu %d Already started\n", cpu); 974 pr_debug("do_boot_cpu %d Already started\n", cpu);
986 return -ENOSYS; 975 return -ENOSYS;
987 } 976 }
988 977
@@ -1009,7 +998,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1009 err = do_boot_cpu(apicid, cpu); 998 err = do_boot_cpu(apicid, cpu);
1010#endif 999#endif
1011 if (err) { 1000 if (err) {
1012 Dprintk("do_boot_cpu failed %d\n", err); 1001 pr_debug("do_boot_cpu failed %d\n", err);
1013 return -EIO; 1002 return -EIO;
1014 } 1003 }
1015 1004
@@ -1213,7 +1202,7 @@ void __init native_smp_prepare_boot_cpu(void)
1213 1202
1214void __init native_smp_cpus_done(unsigned int max_cpus) 1203void __init native_smp_cpus_done(unsigned int max_cpus)
1215{ 1204{
1216 Dprintk("Boot done.\n"); 1205 pr_debug("Boot done.\n");
1217 1206
1218 impress_friends(); 1207 impress_friends();
1219 smp_checks(); 1208 smp_checks();
@@ -1230,7 +1219,7 @@ static void remove_siblinginfo(int cpu)
1230 int sibling; 1219 int sibling;
1231 struct cpuinfo_x86 *c = &cpu_data(cpu); 1220 struct cpuinfo_x86 *c = &cpu_data(cpu);
1232 1221
1233 for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { 1222 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1234 cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); 1223 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1235 /*/ 1224 /*/
1236 * last thread sibling in this cpu core going down 1225 * last thread sibling in this cpu core going down
@@ -1239,7 +1228,7 @@ static void remove_siblinginfo(int cpu)
1239 cpu_data(sibling).booted_cores--; 1228 cpu_data(sibling).booted_cores--;
1240 } 1229 }
1241 1230
1242 for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) 1231 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1243 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); 1232 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1244 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 1233 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1245 cpus_clear(per_cpu(cpu_core_map, cpu)); 1234 cpus_clear(per_cpu(cpu_core_map, cpu));
@@ -1311,7 +1300,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1311 cpu_clear(cpu, cpu_callout_map); 1300 cpu_clear(cpu, cpu_callout_map);
1312 cpu_clear(cpu, cpu_callin_map); 1301 cpu_clear(cpu, cpu_callin_map);
1313 /* was set by cpu_init() */ 1302 /* was set by cpu_init() */
1314 clear_bit(cpu, (unsigned long *)&cpu_initialized); 1303 cpu_clear(cpu, cpu_initialized);
1315 numa_remove_cpu(cpu); 1304 numa_remove_cpu(cpu);
1316} 1305}
1317 1306
@@ -1390,7 +1379,8 @@ static int __init parse_maxcpus(char *arg)
1390{ 1379{
1391 extern unsigned int maxcpus; 1380 extern unsigned int maxcpus;
1392 1381
1393 maxcpus = simple_strtoul(arg, NULL, 0); 1382 if (arg)
1383 maxcpus = simple_strtoul(arg, NULL, 0);
1394 return 0; 1384 return 0;
1395} 1385}
1396early_param("maxcpus", parse_maxcpus); 1386early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
deleted file mode 100644
index 8b137891791f..000000000000
--- a/arch/x86/kernel/smpcommon_32.c
+++ /dev/null
@@ -1 +0,0 @@
1
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 92c20fee6781..e8b9863ef8c4 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
105static int enable_single_step(struct task_struct *child) 105static int enable_single_step(struct task_struct *child)
106{ 106{
107 struct pt_regs *regs = task_pt_regs(child); 107 struct pt_regs *regs = task_pt_regs(child);
108 unsigned long oflags;
109
110 /*
111 * If we stepped into a sysenter/syscall insn, it trapped in
112 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
113 * If user-mode had set TF itself, then it's still clear from
114 * do_debug() and we need to set it again to restore the user
115 * state so we don't wrongly set TIF_FORCED_TF below.
116 * If enable_single_step() was used last and that is what
117 * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
118 * already set and our bookkeeping is fine.
119 */
120 if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
121 regs->flags |= X86_EFLAGS_TF;
108 122
109 /* 123 /*
110 * Always set TIF_SINGLESTEP - this guarantees that 124 * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
113 */ 127 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP); 128 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115 129
116 /* 130 oflags = regs->flags;
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121 131
122 /* Set TF on the kernel stack.. */ 132 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF; 133 regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
126 * ..but if TF is changed by the instruction we will trace, 136 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we 137 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later. 138 * won't clear it by hand later.
139 *
140 * Note that if we don't actually execute the popf because
141 * of a signal arriving right now or suchlike, we will lose
142 * track of the fact that it really was "us" that set it.
129 */ 143 */
130 if (is_setting_trap_flag(child, regs)) 144 if (is_setting_trap_flag(child, regs)) {
145 clear_tsk_thread_flag(child, TIF_FORCED_TF);
131 return 0; 146 return 0;
147 }
148
149 /*
150 * If TF was already set, check whether it was us who set it.
151 * If not, we should never attempt a block step.
152 */
153 if (oflags & X86_EFLAGS_TF)
154 return test_tsk_thread_flag(child, TIF_FORCED_TF);
132 155
133 set_tsk_thread_flag(child, TIF_FORCED_TF); 156 set_tsk_thread_flag(child, TIF_FORCED_TF);
134 157
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index adff5562f5fd..d44395ff34c3 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -326,3 +326,9 @@ ENTRY(sys_call_table)
326 .long sys_fallocate 326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */ 327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime 328 .long sys_timerfd_gettime
329 .long sys_signalfd4
330 .long sys_eventfd2
331 .long sys_epoll_create1
332 .long sys_dup3 /* 330 */
333 .long sys_pipe2
334 .long sys_inotify_init1
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 059ca6ee59b4..ffe3c664afc0 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -129,6 +129,7 @@ void __init hpet_time_init(void)
129 */ 129 */
130void __init time_init(void) 130void __init time_init(void)
131{ 131{
132 pre_time_init_hook();
132 tsc_init(); 133 tsc_init();
133 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
134} 135}
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 8a768973c4f0..03df8e45e5a1 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -58,6 +58,7 @@
58#include <asm/nmi.h> 58#include <asm/nmi.h>
59#include <asm/smp.h> 59#include <asm/smp.h>
60#include <asm/io.h> 60#include <asm/io.h>
61#include <asm/traps.h>
61 62
62#include "mach_traps.h" 63#include "mach_traps.h"
63 64
@@ -77,26 +78,6 @@ char ignore_fpu_irq;
77gate_desc idt_table[256] 78gate_desc idt_table[256]
78 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
79 80
80asmlinkage void divide_error(void);
81asmlinkage void debug(void);
82asmlinkage void nmi(void);
83asmlinkage void int3(void);
84asmlinkage void overflow(void);
85asmlinkage void bounds(void);
86asmlinkage void invalid_op(void);
87asmlinkage void device_not_available(void);
88asmlinkage void coprocessor_segment_overrun(void);
89asmlinkage void invalid_TSS(void);
90asmlinkage void segment_not_present(void);
91asmlinkage void stack_segment(void);
92asmlinkage void general_protection(void);
93asmlinkage void page_fault(void);
94asmlinkage void coprocessor_error(void);
95asmlinkage void simd_coprocessor_error(void);
96asmlinkage void alignment_check(void);
97asmlinkage void spurious_interrupt_bug(void);
98asmlinkage void machine_check(void);
99
100int panic_on_unrecovered_nmi; 81int panic_on_unrecovered_nmi;
101int kstack_depth_to_print = 24; 82int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64; 83static unsigned int code_bytes = 64;
@@ -256,7 +237,7 @@ static const struct stacktrace_ops print_trace_ops = {
256 237
257static void 238static void
258show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 239show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
259 unsigned long *stack, unsigned long bp, char *log_lvl) 240 unsigned long *stack, unsigned long bp, char *log_lvl)
260{ 241{
261 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 242 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
262 printk("%s =======================\n", log_lvl); 243 printk("%s =======================\n", log_lvl);
@@ -383,6 +364,54 @@ int is_valid_bugaddr(unsigned long ip)
383 return ud2 == 0x0b0f; 364 return ud2 == 0x0b0f;
384} 365}
385 366
367static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
368static int die_owner = -1;
369static unsigned int die_nest_count;
370
371unsigned __kprobes long oops_begin(void)
372{
373 unsigned long flags;
374
375 oops_enter();
376
377 if (die_owner != raw_smp_processor_id()) {
378 console_verbose();
379 raw_local_irq_save(flags);
380 __raw_spin_lock(&die_lock);
381 die_owner = smp_processor_id();
382 die_nest_count = 0;
383 bust_spinlocks(1);
384 } else {
385 raw_local_irq_save(flags);
386 }
387 die_nest_count++;
388 return flags;
389}
390
391void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
392{
393 bust_spinlocks(0);
394 die_owner = -1;
395 add_taint(TAINT_DIE);
396 __raw_spin_unlock(&die_lock);
397 raw_local_irq_restore(flags);
398
399 if (!regs)
400 return;
401
402 if (kexec_should_crash(current))
403 crash_kexec(regs);
404
405 if (in_interrupt())
406 panic("Fatal exception in interrupt");
407
408 if (panic_on_oops)
409 panic("Fatal exception");
410
411 oops_exit();
412 do_exit(signr);
413}
414
386int __kprobes __die(const char *str, struct pt_regs *regs, long err) 415int __kprobes __die(const char *str, struct pt_regs *regs, long err)
387{ 416{
388 unsigned short ss; 417 unsigned short ss;
@@ -423,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
423 */ 452 */
424void die(const char *str, struct pt_regs *regs, long err) 453void die(const char *str, struct pt_regs *regs, long err)
425{ 454{
426 static struct { 455 unsigned long flags = oops_begin();
427 raw_spinlock_t lock;
428 u32 lock_owner;
429 int lock_owner_depth;
430 } die = {
431 .lock = __RAW_SPIN_LOCK_UNLOCKED,
432 .lock_owner = -1,
433 .lock_owner_depth = 0
434 };
435 unsigned long flags;
436
437 oops_enter();
438
439 if (die.lock_owner != raw_smp_processor_id()) {
440 console_verbose();
441 raw_local_irq_save(flags);
442 __raw_spin_lock(&die.lock);
443 die.lock_owner = smp_processor_id();
444 die.lock_owner_depth = 0;
445 bust_spinlocks(1);
446 } else {
447 raw_local_irq_save(flags);
448 }
449 456
450 if (++die.lock_owner_depth < 3) { 457 if (die_nest_count < 3) {
451 report_bug(regs->ip, regs); 458 report_bug(regs->ip, regs);
452 459
453 if (__die(str, regs, err)) 460 if (__die(str, regs, err))
@@ -456,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err)
456 printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); 463 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
457 } 464 }
458 465
459 bust_spinlocks(0); 466 oops_end(flags, regs, SIGSEGV);
460 die.lock_owner = -1;
461 add_taint(TAINT_DIE);
462 __raw_spin_unlock(&die.lock);
463 raw_local_irq_restore(flags);
464
465 if (!regs)
466 return;
467
468 if (kexec_should_crash(current))
469 crash_kexec(regs);
470
471 if (in_interrupt())
472 panic("Fatal exception in interrupt");
473
474 if (panic_on_oops)
475 panic("Fatal exception");
476
477 oops_exit();
478 do_exit(SIGSEGV);
479} 467}
480 468
481static inline void 469static inline void
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 2696a6837782..3f18d73f420c 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -51,30 +51,10 @@
51#include <asm/pgalloc.h> 51#include <asm/pgalloc.h>
52#include <asm/proto.h> 52#include <asm/proto.h>
53#include <asm/pda.h> 53#include <asm/pda.h>
54#include <asm/traps.h>
54 55
55#include <mach_traps.h> 56#include <mach_traps.h>
56 57
57asmlinkage void divide_error(void);
58asmlinkage void debug(void);
59asmlinkage void nmi(void);
60asmlinkage void int3(void);
61asmlinkage void overflow(void);
62asmlinkage void bounds(void);
63asmlinkage void invalid_op(void);
64asmlinkage void device_not_available(void);
65asmlinkage void double_fault(void);
66asmlinkage void coprocessor_segment_overrun(void);
67asmlinkage void invalid_TSS(void);
68asmlinkage void segment_not_present(void);
69asmlinkage void stack_segment(void);
70asmlinkage void general_protection(void);
71asmlinkage void page_fault(void);
72asmlinkage void coprocessor_error(void);
73asmlinkage void simd_coprocessor_error(void);
74asmlinkage void alignment_check(void);
75asmlinkage void spurious_interrupt_bug(void);
76asmlinkage void machine_check(void);
77
78int panic_on_unrecovered_nmi; 58int panic_on_unrecovered_nmi;
79int kstack_depth_to_print = 12; 59int kstack_depth_to_print = 12;
80static unsigned int code_bytes = 64; 60static unsigned int code_bytes = 64;
@@ -355,17 +335,24 @@ static const struct stacktrace_ops print_trace_ops = {
355 .address = print_trace_address, 335 .address = print_trace_address,
356}; 336};
357 337
358void show_trace(struct task_struct *task, struct pt_regs *regs, 338static void
359 unsigned long *stack, unsigned long bp) 339show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
340 unsigned long *stack, unsigned long bp, char *log_lvl)
360{ 341{
361 printk("\nCall Trace:\n"); 342 printk("\nCall Trace:\n");
362 dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); 343 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
363 printk("\n"); 344 printk("\n");
364} 345}
365 346
347void show_trace(struct task_struct *task, struct pt_regs *regs,
348 unsigned long *stack, unsigned long bp)
349{
350 show_trace_log_lvl(task, regs, stack, bp, "");
351}
352
366static void 353static void
367_show_stack(struct task_struct *task, struct pt_regs *regs, 354show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
368 unsigned long *sp, unsigned long bp) 355 unsigned long *sp, unsigned long bp, char *log_lvl)
369{ 356{
370 unsigned long *stack; 357 unsigned long *stack;
371 int i; 358 int i;
@@ -399,12 +386,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs,
399 printk(" %016lx", *stack++); 386 printk(" %016lx", *stack++);
400 touch_nmi_watchdog(); 387 touch_nmi_watchdog();
401 } 388 }
402 show_trace(task, regs, sp, bp); 389 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
403} 390}
404 391
405void show_stack(struct task_struct *task, unsigned long *sp) 392void show_stack(struct task_struct *task, unsigned long *sp)
406{ 393{
407 _show_stack(task, NULL, sp, 0); 394 show_stack_log_lvl(task, NULL, sp, 0, "");
408} 395}
409 396
410/* 397/*
@@ -454,7 +441,8 @@ void show_registers(struct pt_regs *regs)
454 u8 *ip; 441 u8 *ip;
455 442
456 printk("Stack: "); 443 printk("Stack: ");
457 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); 444 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
445 regs->bp, "");
458 printk("\n"); 446 printk("\n");
459 447
460 printk(KERN_EMERG "Code: "); 448 printk(KERN_EMERG "Code: ");
@@ -518,7 +506,7 @@ unsigned __kprobes long oops_begin(void)
518} 506}
519 507
520void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 508void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
521{ 509{
522 die_owner = -1; 510 die_owner = -1;
523 bust_spinlocks(0); 511 bust_spinlocks(0);
524 die_nest_count--; 512 die_nest_count--;
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index e94bdb6add1d..41e01b145c48 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -73,7 +73,7 @@ int is_visws_box(void)
73 return visws_board_type >= 0; 73 return visws_board_type >= 0;
74} 74}
75 75
76static int __init visws_time_init_quirk(void) 76static int __init visws_time_init(void)
77{ 77{
78 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 78 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
79 79
@@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void)
93 return 0; 93 return 0;
94} 94}
95 95
96static int __init visws_pre_intr_init_quirk(void) 96static int __init visws_pre_intr_init(void)
97{ 97{
98 init_VISWS_APIC_irqs(); 98 init_VISWS_APIC_irqs();
99 99
@@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
114 114
115long long mem_size __initdata = 0; 115long long mem_size __initdata = 0;
116 116
117static char * __init visws_memory_setup_quirk(void) 117static char * __init visws_memory_setup(void)
118{ 118{
119 long long gfx_mem_size = 8 * MB; 119 long long gfx_mem_size = 8 * MB;
120 120
@@ -176,7 +176,7 @@ static void visws_machine_power_off(void)
176 outl(PIIX_SPECIAL_STOP, 0xCFC); 176 outl(PIIX_SPECIAL_STOP, 0xCFC);
177} 177}
178 178
179static int __init visws_get_smp_config_quirk(unsigned int early) 179static int __init visws_get_smp_config(unsigned int early)
180{ 180{
181 /* 181 /*
182 * Prevent MP-table parsing by the generic code: 182 * Prevent MP-table parsing by the generic code:
@@ -192,7 +192,7 @@ extern unsigned int __cpuinitdata maxcpus;
192 * No problem for Linux. 192 * No problem for Linux.
193 */ 193 */
194 194
195static void __init MP_processor_info (struct mpc_config_processor *m) 195static void __init MP_processor_info(struct mpc_config_processor *m)
196{ 196{
197 int ver, logical_apicid; 197 int ver, logical_apicid;
198 physid_mask_t apic_cpus; 198 physid_mask_t apic_cpus;
@@ -232,7 +232,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
232 apic_version[m->mpc_apicid] = ver; 232 apic_version[m->mpc_apicid] = ver;
233} 233}
234 234
235int __init visws_find_smp_config_quirk(unsigned int reserve) 235static int __init visws_find_smp_config(unsigned int reserve)
236{ 236{
237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); 237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -258,7 +258,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
258 return 1; 258 return 1;
259} 259}
260 260
261extern int visws_trap_init_quirk(void); 261static int visws_trap_init(void);
262
263static struct x86_quirks visws_x86_quirks __initdata = {
264 .arch_time_init = visws_time_init,
265 .arch_pre_intr_init = visws_pre_intr_init,
266 .arch_memory_setup = visws_memory_setup,
267 .arch_intr_init = NULL,
268 .arch_trap_init = visws_trap_init,
269 .mach_get_smp_config = visws_get_smp_config,
270 .mach_find_smp_config = visws_find_smp_config,
271};
262 272
263void __init visws_early_detect(void) 273void __init visws_early_detect(void)
264{ 274{
@@ -272,16 +282,10 @@ void __init visws_early_detect(void)
272 282
273 /* 283 /*
274 * Install special quirks for timer, interrupt and memory setup: 284 * Install special quirks for timer, interrupt and memory setup:
275 */
276 arch_time_init_quirk = visws_time_init_quirk;
277 arch_pre_intr_init_quirk = visws_pre_intr_init_quirk;
278 arch_memory_setup_quirk = visws_memory_setup_quirk;
279
280 /*
281 * Fall back to generic behavior for traps: 285 * Fall back to generic behavior for traps:
286 * Override generic MP-table parsing:
282 */ 287 */
283 arch_intr_init_quirk = NULL; 288 x86_quirks = &visws_x86_quirks;
284 arch_trap_init_quirk = visws_trap_init_quirk;
285 289
286 /* 290 /*
287 * Install reboot quirks: 291 * Install reboot quirks:
@@ -294,12 +298,6 @@ void __init visws_early_detect(void)
294 */ 298 */
295 no_broadcast = 0; 299 no_broadcast = 0;
296 300
297 /*
298 * Override generic MP-table parsing:
299 */
300 mach_get_smp_config_quirk = visws_get_smp_config_quirk;
301 mach_find_smp_config_quirk = visws_find_smp_config_quirk;
302
303#ifdef CONFIG_X86_IO_APIC 301#ifdef CONFIG_X86_IO_APIC
304 /* 302 /*
305 * Turn off IO-APIC detection and initialization: 303 * Turn off IO-APIC detection and initialization:
@@ -426,7 +424,7 @@ static __init void cobalt_init(void)
426 co_apic_read(CO_APIC_ID)); 424 co_apic_read(CO_APIC_ID));
427} 425}
428 426
429int __init visws_trap_init_quirk(void) 427static int __init visws_trap_init(void)
430{ 428{
431 lithium_init(); 429 lithium_init();
432 cobalt_init(); 430 cobalt_init();
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b15346092b7b..0a1b1a9d922d 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -906,7 +906,6 @@ static inline int __init activate_vmi(void)
906#ifdef CONFIG_X86_LOCAL_APIC 906#ifdef CONFIG_X86_LOCAL_APIC
907 para_fill(pv_apic_ops.apic_read, APICRead); 907 para_fill(pv_apic_ops.apic_read, APICRead);
908 para_fill(pv_apic_ops.apic_write, APICWrite); 908 para_fill(pv_apic_ops.apic_write, APICWrite);
909 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
910#endif 909#endif
911 910
912 /* 911 /*
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c97d35c218db..d0e940bb6f40 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,7 +2,8 @@
2# Makefile for Kernel-based Virtual Machine module 2# Makefile for Kernel-based Virtual Machine module
3# 3#
4 4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
6 coalesced_mmio.o)
6ifeq ($(CONFIG_KVM_TRACE),y) 7ifeq ($(CONFIG_KVM_TRACE),y)
7common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) 8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
8endif 9endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 3829aa7b663f..c0f7872a9124 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -91,7 +91,7 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
91 c->gate = val; 91 c->gate = val;
92} 92}
93 93
94int pit_get_gate(struct kvm *kvm, int channel) 94static int pit_get_gate(struct kvm *kvm, int channel)
95{ 95{
96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
97 97
@@ -193,19 +193,16 @@ static void pit_latch_status(struct kvm *kvm, int channel)
193 } 193 }
194} 194}
195 195
196int __pit_timer_fn(struct kvm_kpit_state *ps) 196static int __pit_timer_fn(struct kvm_kpit_state *ps)
197{ 197{
198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; 198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
199 struct kvm_kpit_timer *pt = &ps->pit_timer; 199 struct kvm_kpit_timer *pt = &ps->pit_timer;
200 200
201 atomic_inc(&pt->pending); 201 if (!atomic_inc_and_test(&pt->pending))
202 smp_mb__after_atomic_inc();
203 if (vcpu0) {
204 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); 202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
205 if (waitqueue_active(&vcpu0->wq)) { 203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
206 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
207 wake_up_interruptible(&vcpu0->wq); 205 wake_up_interruptible(&vcpu0->wq);
208 }
209 } 206 }
210 207
211 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
@@ -308,6 +305,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
308 create_pit_timer(&ps->pit_timer, val, 0); 305 create_pit_timer(&ps->pit_timer, val, 0);
309 break; 306 break;
310 case 2: 307 case 2:
308 case 3:
311 create_pit_timer(&ps->pit_timer, val, 1); 309 create_pit_timer(&ps->pit_timer, val, 1);
312 break; 310 break;
313 default: 311 default:
@@ -459,7 +457,8 @@ static void pit_ioport_read(struct kvm_io_device *this,
459 mutex_unlock(&pit_state->lock); 457 mutex_unlock(&pit_state->lock);
460} 458}
461 459
462static int pit_in_range(struct kvm_io_device *this, gpa_t addr) 460static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
461 int len, int is_write)
463{ 462{
464 return ((addr >= KVM_PIT_BASE_ADDRESS) && 463 return ((addr >= KVM_PIT_BASE_ADDRESS) &&
465 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); 464 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
@@ -500,7 +499,8 @@ static void speaker_ioport_read(struct kvm_io_device *this,
500 mutex_unlock(&pit_state->lock); 499 mutex_unlock(&pit_state->lock);
501} 500}
502 501
503static int speaker_in_range(struct kvm_io_device *this, gpa_t addr) 502static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
503 int len, int is_write)
504{ 504{
505 return (addr == KVM_SPEAKER_BASE_ADDRESS); 505 return (addr == KVM_SPEAKER_BASE_ADDRESS);
506} 506}
@@ -575,7 +575,7 @@ void kvm_free_pit(struct kvm *kvm)
575 } 575 }
576} 576}
577 577
578void __inject_pit_timer_intr(struct kvm *kvm) 578static void __inject_pit_timer_intr(struct kvm *kvm)
579{ 579{
580 mutex_lock(&kvm->lock); 580 mutex_lock(&kvm->lock);
581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); 581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index ab29cf2def47..c31164e8aa46 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -130,8 +130,10 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
130{ 130{
131 struct kvm_pic *s = opaque; 131 struct kvm_pic *s = opaque;
132 132
133 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 133 if (irq >= 0 && irq < PIC_NUM_PINS) {
134 pic_update_irq(s); 134 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
135 pic_update_irq(s);
136 }
135} 137}
136 138
137/* 139/*
@@ -346,7 +348,8 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
346 return s->elcr; 348 return s->elcr;
347} 349}
348 350
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) 351static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
352 int len, int is_write)
350{ 353{
351 switch (addr) { 354 switch (addr) {
352 case 0x20: 355 case 0x20:
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2a15be2275c0..7ca47cbb48bb 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -30,6 +30,8 @@
30#include "ioapic.h" 30#include "ioapic.h"
31#include "lapic.h" 31#include "lapic.h"
32 32
33#define PIC_NUM_PINS 16
34
33struct kvm; 35struct kvm;
34struct kvm_vcpu; 36struct kvm_vcpu;
35 37
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ebc03f5ae162..73f43de69f67 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -356,8 +356,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
356 case APIC_DM_SMI: 356 case APIC_DM_SMI:
357 printk(KERN_DEBUG "Ignoring guest SMI\n"); 357 printk(KERN_DEBUG "Ignoring guest SMI\n");
358 break; 358 break;
359
359 case APIC_DM_NMI: 360 case APIC_DM_NMI:
360 printk(KERN_DEBUG "Ignoring guest NMI\n"); 361 kvm_inject_nmi(vcpu);
361 break; 362 break;
362 363
363 case APIC_DM_INIT: 364 case APIC_DM_INIT:
@@ -572,6 +573,8 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
572{ 573{
573 u32 val = 0; 574 u32 val = 0;
574 575
576 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
577
575 if (offset >= LAPIC_MMIO_LENGTH) 578 if (offset >= LAPIC_MMIO_LENGTH)
576 return 0; 579 return 0;
577 580
@@ -695,6 +698,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
695 698
696 offset &= 0xff0; 699 offset &= 0xff0;
697 700
701 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
702
698 switch (offset) { 703 switch (offset) {
699 case APIC_ID: /* Local APIC ID */ 704 case APIC_ID: /* Local APIC ID */
700 apic_set_reg(apic, APIC_ID, val); 705 apic_set_reg(apic, APIC_ID, val);
@@ -780,7 +785,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
780 785
781} 786}
782 787
783static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) 788static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
789 int len, int size)
784{ 790{
785 struct kvm_lapic *apic = (struct kvm_lapic *)this->private; 791 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
786 int ret = 0; 792 int ret = 0;
@@ -939,8 +945,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
939 int result = 0; 945 int result = 0;
940 wait_queue_head_t *q = &apic->vcpu->wq; 946 wait_queue_head_t *q = &apic->vcpu->wq;
941 947
942 atomic_inc(&apic->timer.pending); 948 if(!atomic_inc_and_test(&apic->timer.pending))
943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); 949 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
944 if (waitqueue_active(q)) { 950 if (waitqueue_active(q)) {
945 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 951 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
946 wake_up_interruptible(q); 952 wake_up_interruptible(q);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 676c396c9cee..81858881287e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,6 +31,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu);
31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
34u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
34 35
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 36int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 37int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7e7c3969f7a2..2fa231923cf7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -66,7 +66,8 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
66#endif 66#endif
67 67
68#if defined(MMU_DEBUG) || defined(AUDIT) 68#if defined(MMU_DEBUG) || defined(AUDIT)
69static int dbg = 1; 69static int dbg = 0;
70module_param(dbg, bool, 0644);
70#endif 71#endif
71 72
72#ifndef MMU_DEBUG 73#ifndef MMU_DEBUG
@@ -776,6 +777,15 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
776 BUG(); 777 BUG();
777} 778}
778 779
780static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
781 struct kvm_mmu_page *sp)
782{
783 int i;
784
785 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
786 sp->spt[i] = shadow_trap_nonpresent_pte;
787}
788
779static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 789static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
780{ 790{
781 unsigned index; 791 unsigned index;
@@ -841,7 +851,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
841 hlist_add_head(&sp->hash_link, bucket); 851 hlist_add_head(&sp->hash_link, bucket);
842 if (!metaphysical) 852 if (!metaphysical)
843 rmap_write_protect(vcpu->kvm, gfn); 853 rmap_write_protect(vcpu->kvm, gfn);
844 vcpu->arch.mmu.prefetch_page(vcpu, sp); 854 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
855 vcpu->arch.mmu.prefetch_page(vcpu, sp);
856 else
857 nonpaging_prefetch_page(vcpu, sp);
845 return sp; 858 return sp;
846} 859}
847 860
@@ -917,14 +930,17 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
917 } 930 }
918 kvm_mmu_page_unlink_children(kvm, sp); 931 kvm_mmu_page_unlink_children(kvm, sp);
919 if (!sp->root_count) { 932 if (!sp->root_count) {
920 if (!sp->role.metaphysical) 933 if (!sp->role.metaphysical && !sp->role.invalid)
921 unaccount_shadowed(kvm, sp->gfn); 934 unaccount_shadowed(kvm, sp->gfn);
922 hlist_del(&sp->hash_link); 935 hlist_del(&sp->hash_link);
923 kvm_mmu_free_page(kvm, sp); 936 kvm_mmu_free_page(kvm, sp);
924 } else { 937 } else {
938 int invalid = sp->role.invalid;
925 list_move(&sp->link, &kvm->arch.active_mmu_pages); 939 list_move(&sp->link, &kvm->arch.active_mmu_pages);
926 sp->role.invalid = 1; 940 sp->role.invalid = 1;
927 kvm_reload_remote_mmus(kvm); 941 kvm_reload_remote_mmus(kvm);
942 if (!sp->role.metaphysical && !invalid)
943 unaccount_shadowed(kvm, sp->gfn);
928 } 944 }
929 kvm_mmu_reset_last_pte_updated(kvm); 945 kvm_mmu_reset_last_pte_updated(kvm);
930} 946}
@@ -1103,7 +1119,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1103 mark_page_dirty(vcpu->kvm, gfn); 1119 mark_page_dirty(vcpu->kvm, gfn);
1104 1120
1105 pgprintk("%s: setting spte %llx\n", __func__, spte); 1121 pgprintk("%s: setting spte %llx\n", __func__, spte);
1106 pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n", 1122 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1107 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", 1123 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1108 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); 1124 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
1109 set_shadow_pte(shadow_pte, spte); 1125 set_shadow_pte(shadow_pte, spte);
@@ -1122,8 +1138,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1122 else 1138 else
1123 kvm_release_pfn_clean(pfn); 1139 kvm_release_pfn_clean(pfn);
1124 } 1140 }
1125 if (!ptwrite || !*ptwrite) 1141 if (speculative) {
1126 vcpu->arch.last_pte_updated = shadow_pte; 1142 vcpu->arch.last_pte_updated = shadow_pte;
1143 vcpu->arch.last_pte_gfn = gfn;
1144 }
1127} 1145}
1128 1146
1129static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 1147static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -1171,9 +1189,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1171 return -ENOMEM; 1189 return -ENOMEM;
1172 } 1190 }
1173 1191
1174 table[index] = __pa(new_table->spt) 1192 set_shadow_pte(&table[index],
1175 | PT_PRESENT_MASK | PT_WRITABLE_MASK 1193 __pa(new_table->spt)
1176 | shadow_user_mask | shadow_x_mask; 1194 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1195 | shadow_user_mask | shadow_x_mask);
1177 } 1196 }
1178 table_addr = table[index] & PT64_BASE_ADDR_MASK; 1197 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1179 } 1198 }
@@ -1211,15 +1230,6 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1211} 1230}
1212 1231
1213 1232
1214static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1215 struct kvm_mmu_page *sp)
1216{
1217 int i;
1218
1219 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1220 sp->spt[i] = shadow_trap_nonpresent_pte;
1221}
1222
1223static void mmu_free_roots(struct kvm_vcpu *vcpu) 1233static void mmu_free_roots(struct kvm_vcpu *vcpu)
1224{ 1234{
1225 int i; 1235 int i;
@@ -1671,6 +1681,18 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1671 vcpu->arch.update_pte.pfn = pfn; 1681 vcpu->arch.update_pte.pfn = pfn;
1672} 1682}
1673 1683
1684static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
1685{
1686 u64 *spte = vcpu->arch.last_pte_updated;
1687
1688 if (spte
1689 && vcpu->arch.last_pte_gfn == gfn
1690 && shadow_accessed_mask
1691 && !(*spte & shadow_accessed_mask)
1692 && is_shadow_present_pte(*spte))
1693 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
1694}
1695
1674void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 1696void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1675 const u8 *new, int bytes) 1697 const u8 *new, int bytes)
1676{ 1698{
@@ -1694,6 +1716,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1694 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 1716 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
1695 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 1717 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1696 spin_lock(&vcpu->kvm->mmu_lock); 1718 spin_lock(&vcpu->kvm->mmu_lock);
1719 kvm_mmu_access_page(vcpu, gfn);
1697 kvm_mmu_free_some_pages(vcpu); 1720 kvm_mmu_free_some_pages(vcpu);
1698 ++vcpu->kvm->stat.mmu_pte_write; 1721 ++vcpu->kvm->stat.mmu_pte_write;
1699 kvm_mmu_audit(vcpu, "pre pte write"); 1722 kvm_mmu_audit(vcpu, "pre pte write");
@@ -1791,6 +1814,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1791 spin_unlock(&vcpu->kvm->mmu_lock); 1814 spin_unlock(&vcpu->kvm->mmu_lock);
1792 return r; 1815 return r;
1793} 1816}
1817EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
1794 1818
1795void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 1819void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1796{ 1820{
@@ -1847,6 +1871,12 @@ void kvm_enable_tdp(void)
1847} 1871}
1848EXPORT_SYMBOL_GPL(kvm_enable_tdp); 1872EXPORT_SYMBOL_GPL(kvm_enable_tdp);
1849 1873
1874void kvm_disable_tdp(void)
1875{
1876 tdp_enabled = false;
1877}
1878EXPORT_SYMBOL_GPL(kvm_disable_tdp);
1879
1850static void free_mmu_pages(struct kvm_vcpu *vcpu) 1880static void free_mmu_pages(struct kvm_vcpu *vcpu)
1851{ 1881{
1852 struct kvm_mmu_page *sp; 1882 struct kvm_mmu_page *sp;
@@ -1948,7 +1978,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
1948 kvm_flush_remote_tlbs(kvm); 1978 kvm_flush_remote_tlbs(kvm);
1949} 1979}
1950 1980
1951void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) 1981static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
1952{ 1982{
1953 struct kvm_mmu_page *page; 1983 struct kvm_mmu_page *page;
1954 1984
@@ -1968,6 +1998,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1968 list_for_each_entry(kvm, &vm_list, vm_list) { 1998 list_for_each_entry(kvm, &vm_list, vm_list) {
1969 int npages; 1999 int npages;
1970 2000
2001 if (!down_read_trylock(&kvm->slots_lock))
2002 continue;
1971 spin_lock(&kvm->mmu_lock); 2003 spin_lock(&kvm->mmu_lock);
1972 npages = kvm->arch.n_alloc_mmu_pages - 2004 npages = kvm->arch.n_alloc_mmu_pages -
1973 kvm->arch.n_free_mmu_pages; 2005 kvm->arch.n_free_mmu_pages;
@@ -1980,6 +2012,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1980 nr_to_scan--; 2012 nr_to_scan--;
1981 2013
1982 spin_unlock(&kvm->mmu_lock); 2014 spin_unlock(&kvm->mmu_lock);
2015 up_read(&kvm->slots_lock);
1983 } 2016 }
1984 if (kvm_freed) 2017 if (kvm_freed)
1985 list_move_tail(&kvm_freed->vm_list, &vm_list); 2018 list_move_tail(&kvm_freed->vm_list, &vm_list);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1730757bbc7a..258e5d56298e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -15,7 +15,8 @@
15#define PT_USER_MASK (1ULL << 2) 15#define PT_USER_MASK (1ULL << 2)
16#define PT_PWT_MASK (1ULL << 3) 16#define PT_PWT_MASK (1ULL << 3)
17#define PT_PCD_MASK (1ULL << 4) 17#define PT_PCD_MASK (1ULL << 4)
18#define PT_ACCESSED_MASK (1ULL << 5) 18#define PT_ACCESSED_SHIFT 5
19#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
19#define PT_DIRTY_MASK (1ULL << 6) 20#define PT_DIRTY_MASK (1ULL << 6)
20#define PT_PAGE_SIZE_MASK (1ULL << 7) 21#define PT_PAGE_SIZE_MASK (1ULL << 7)
21#define PT_PAT_MASK (1ULL << 7) 22#define PT_PAT_MASK (1ULL << 7)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 934c7b619396..4d918220baeb 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -460,8 +460,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
460static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, 460static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
461 struct kvm_mmu_page *sp) 461 struct kvm_mmu_page *sp)
462{ 462{
463 int i, offset = 0, r = 0; 463 int i, j, offset, r;
464 pt_element_t pt; 464 pt_element_t pt[256 / sizeof(pt_element_t)];
465 gpa_t pte_gpa;
465 466
466 if (sp->role.metaphysical 467 if (sp->role.metaphysical
467 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { 468 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
@@ -469,19 +470,20 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
469 return; 470 return;
470 } 471 }
471 472
472 if (PTTYPE == 32) 473 pte_gpa = gfn_to_gpa(sp->gfn);
474 if (PTTYPE == 32) {
473 offset = sp->role.quadrant << PT64_LEVEL_BITS; 475 offset = sp->role.quadrant << PT64_LEVEL_BITS;
476 pte_gpa += offset * sizeof(pt_element_t);
477 }
474 478
475 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 479 for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
476 gpa_t pte_gpa = gfn_to_gpa(sp->gfn); 480 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
477 pte_gpa += (i+offset) * sizeof(pt_element_t); 481 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
478 482 for (j = 0; j < ARRAY_SIZE(pt); ++j)
479 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, 483 if (r || is_present_pte(pt[j]))
480 sizeof(pt_element_t)); 484 sp->spt[i+j] = shadow_trap_nonpresent_pte;
481 if (r || is_present_pte(pt)) 485 else
482 sp->spt[i] = shadow_trap_nonpresent_pte; 486 sp->spt[i+j] = shadow_notrap_nonpresent_pte;
483 else
484 sp->spt[i] = shadow_notrap_nonpresent_pte;
485 } 487 }
486} 488}
487 489
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6b0d5fa5bab3..e2ee264740c7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -27,6 +27,8 @@
27 27
28#include <asm/desc.h> 28#include <asm/desc.h>
29 29
30#define __ex(x) __kvm_handle_fault_on_reboot(x)
31
30MODULE_AUTHOR("Qumranet"); 32MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL"); 33MODULE_LICENSE("GPL");
32 34
@@ -129,17 +131,17 @@ static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
129 131
130static inline void clgi(void) 132static inline void clgi(void)
131{ 133{
132 asm volatile (SVM_CLGI); 134 asm volatile (__ex(SVM_CLGI));
133} 135}
134 136
135static inline void stgi(void) 137static inline void stgi(void)
136{ 138{
137 asm volatile (SVM_STGI); 139 asm volatile (__ex(SVM_STGI));
138} 140}
139 141
140static inline void invlpga(unsigned long addr, u32 asid) 142static inline void invlpga(unsigned long addr, u32 asid)
141{ 143{
142 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); 144 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
143} 145}
144 146
145static inline unsigned long kvm_read_cr2(void) 147static inline unsigned long kvm_read_cr2(void)
@@ -270,19 +272,11 @@ static int has_svm(void)
270 272
271static void svm_hardware_disable(void *garbage) 273static void svm_hardware_disable(void *garbage)
272{ 274{
273 struct svm_cpu_data *svm_data 275 uint64_t efer;
274 = per_cpu(svm_data, raw_smp_processor_id());
275
276 if (svm_data) {
277 uint64_t efer;
278 276
279 wrmsrl(MSR_VM_HSAVE_PA, 0); 277 wrmsrl(MSR_VM_HSAVE_PA, 0);
280 rdmsrl(MSR_EFER, efer); 278 rdmsrl(MSR_EFER, efer);
281 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); 279 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
282 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
283 __free_page(svm_data->save_area);
284 kfree(svm_data);
285 }
286} 280}
287 281
288static void svm_hardware_enable(void *garbage) 282static void svm_hardware_enable(void *garbage)
@@ -321,6 +315,19 @@ static void svm_hardware_enable(void *garbage)
321 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 315 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
322} 316}
323 317
318static void svm_cpu_uninit(int cpu)
319{
320 struct svm_cpu_data *svm_data
321 = per_cpu(svm_data, raw_smp_processor_id());
322
323 if (!svm_data)
324 return;
325
326 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
327 __free_page(svm_data->save_area);
328 kfree(svm_data);
329}
330
324static int svm_cpu_init(int cpu) 331static int svm_cpu_init(int cpu)
325{ 332{
326 struct svm_cpu_data *svm_data; 333 struct svm_cpu_data *svm_data;
@@ -446,7 +453,8 @@ static __init int svm_hardware_setup(void)
446 if (npt_enabled) { 453 if (npt_enabled) {
447 printk(KERN_INFO "kvm: Nested Paging enabled\n"); 454 printk(KERN_INFO "kvm: Nested Paging enabled\n");
448 kvm_enable_tdp(); 455 kvm_enable_tdp();
449 } 456 } else
457 kvm_disable_tdp();
450 458
451 return 0; 459 return 0;
452 460
@@ -458,6 +466,11 @@ err:
458 466
459static __exit void svm_hardware_unsetup(void) 467static __exit void svm_hardware_unsetup(void)
460{ 468{
469 int cpu;
470
471 for_each_online_cpu(cpu)
472 svm_cpu_uninit(cpu);
473
461 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 474 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
462 iopm_base = 0; 475 iopm_base = 0;
463} 476}
@@ -707,10 +720,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
707 rdtscll(vcpu->arch.host_tsc); 720 rdtscll(vcpu->arch.host_tsc);
708} 721}
709 722
710static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
711{
712}
713
714static void svm_cache_regs(struct kvm_vcpu *vcpu) 723static void svm_cache_regs(struct kvm_vcpu *vcpu)
715{ 724{
716 struct vcpu_svm *svm = to_svm(vcpu); 725 struct vcpu_svm *svm = to_svm(vcpu);
@@ -949,7 +958,9 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
949 958
950static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 959static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
951{ 960{
952 return to_svm(vcpu)->db_regs[dr]; 961 unsigned long val = to_svm(vcpu)->db_regs[dr];
962 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
963 return val;
953} 964}
954 965
955static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 966static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
@@ -997,13 +1008,28 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
997 struct kvm *kvm = svm->vcpu.kvm; 1008 struct kvm *kvm = svm->vcpu.kvm;
998 u64 fault_address; 1009 u64 fault_address;
999 u32 error_code; 1010 u32 error_code;
1011 bool event_injection = false;
1000 1012
1001 if (!irqchip_in_kernel(kvm) && 1013 if (!irqchip_in_kernel(kvm) &&
1002 is_external_interrupt(exit_int_info)) 1014 is_external_interrupt(exit_int_info)) {
1015 event_injection = true;
1003 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); 1016 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
1017 }
1004 1018
1005 fault_address = svm->vmcb->control.exit_info_2; 1019 fault_address = svm->vmcb->control.exit_info_2;
1006 error_code = svm->vmcb->control.exit_info_1; 1020 error_code = svm->vmcb->control.exit_info_1;
1021
1022 if (!npt_enabled)
1023 KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
1024 (u32)fault_address, (u32)(fault_address >> 32),
1025 handler);
1026 else
1027 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
1028 (u32)fault_address, (u32)(fault_address >> 32),
1029 handler);
1030
1031 if (event_injection)
1032 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1007 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1033 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1008} 1034}
1009 1035
@@ -1081,6 +1107,19 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1081 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1107 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1082} 1108}
1083 1109
1110static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1111{
1112 KVMTRACE_0D(NMI, &svm->vcpu, handler);
1113 return 1;
1114}
1115
1116static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1117{
1118 ++svm->vcpu.stat.irq_exits;
1119 KVMTRACE_0D(INTR, &svm->vcpu, handler);
1120 return 1;
1121}
1122
1084static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1123static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1085{ 1124{
1086 return 1; 1125 return 1;
@@ -1219,6 +1258,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1219 if (svm_get_msr(&svm->vcpu, ecx, &data)) 1258 if (svm_get_msr(&svm->vcpu, ecx, &data))
1220 kvm_inject_gp(&svm->vcpu, 0); 1259 kvm_inject_gp(&svm->vcpu, 0);
1221 else { 1260 else {
1261 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
1262 (u32)(data >> 32), handler);
1263
1222 svm->vmcb->save.rax = data & 0xffffffff; 1264 svm->vmcb->save.rax = data & 0xffffffff;
1223 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 1265 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1224 svm->next_rip = svm->vmcb->save.rip + 2; 1266 svm->next_rip = svm->vmcb->save.rip + 2;
@@ -1284,16 +1326,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1284 case MSR_K7_EVNTSEL1: 1326 case MSR_K7_EVNTSEL1:
1285 case MSR_K7_EVNTSEL2: 1327 case MSR_K7_EVNTSEL2:
1286 case MSR_K7_EVNTSEL3: 1328 case MSR_K7_EVNTSEL3:
1329 case MSR_K7_PERFCTR0:
1330 case MSR_K7_PERFCTR1:
1331 case MSR_K7_PERFCTR2:
1332 case MSR_K7_PERFCTR3:
1287 /* 1333 /*
1288 * only support writing 0 to the performance counters for now 1334 * Just discard all writes to the performance counters; this
1289 * to make Windows happy. Should be replaced by a real 1335 * should keep both older linux and windows 64-bit guests
1290 * performance counter emulation later. 1336 * happy
1291 */ 1337 */
1292 if (data != 0) 1338 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
1293 goto unhandled; 1339
1294 break; 1340 break;
1295 default: 1341 default:
1296 unhandled:
1297 return kvm_set_msr_common(vcpu, ecx, data); 1342 return kvm_set_msr_common(vcpu, ecx, data);
1298 } 1343 }
1299 return 0; 1344 return 0;
@@ -1304,6 +1349,10 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1304 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1349 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1305 u64 data = (svm->vmcb->save.rax & -1u) 1350 u64 data = (svm->vmcb->save.rax & -1u)
1306 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 1351 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1352
1353 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
1354 handler);
1355
1307 svm->next_rip = svm->vmcb->save.rip + 2; 1356 svm->next_rip = svm->vmcb->save.rip + 2;
1308 if (svm_set_msr(&svm->vcpu, ecx, data)) 1357 if (svm_set_msr(&svm->vcpu, ecx, data))
1309 kvm_inject_gp(&svm->vcpu, 0); 1358 kvm_inject_gp(&svm->vcpu, 0);
@@ -1323,6 +1372,8 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1323static int interrupt_window_interception(struct vcpu_svm *svm, 1372static int interrupt_window_interception(struct vcpu_svm *svm,
1324 struct kvm_run *kvm_run) 1373 struct kvm_run *kvm_run)
1325{ 1374{
1375 KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
1376
1326 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1377 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1327 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 1378 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1328 /* 1379 /*
@@ -1364,8 +1415,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1364 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 1415 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1365 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 1416 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1366 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 1417 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
1367 [SVM_EXIT_INTR] = nop_on_interception, 1418 [SVM_EXIT_INTR] = intr_interception,
1368 [SVM_EXIT_NMI] = nop_on_interception, 1419 [SVM_EXIT_NMI] = nmi_interception,
1369 [SVM_EXIT_SMI] = nop_on_interception, 1420 [SVM_EXIT_SMI] = nop_on_interception,
1370 [SVM_EXIT_INIT] = nop_on_interception, 1421 [SVM_EXIT_INIT] = nop_on_interception,
1371 [SVM_EXIT_VINTR] = interrupt_window_interception, 1422 [SVM_EXIT_VINTR] = interrupt_window_interception,
@@ -1397,6 +1448,9 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1397 struct vcpu_svm *svm = to_svm(vcpu); 1448 struct vcpu_svm *svm = to_svm(vcpu);
1398 u32 exit_code = svm->vmcb->control.exit_code; 1449 u32 exit_code = svm->vmcb->control.exit_code;
1399 1450
1451 KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
1452 (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
1453
1400 if (npt_enabled) { 1454 if (npt_enabled) {
1401 int mmu_reload = 0; 1455 int mmu_reload = 0;
1402 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { 1456 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -1470,6 +1524,8 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1470{ 1524{
1471 struct vmcb_control_area *control; 1525 struct vmcb_control_area *control;
1472 1526
1527 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
1528
1473 control = &svm->vmcb->control; 1529 control = &svm->vmcb->control;
1474 control->int_vector = irq; 1530 control->int_vector = irq;
1475 control->int_ctl &= ~V_INTR_PRIO_MASK; 1531 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1660,9 +1716,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1660 sync_lapic_to_cr8(vcpu); 1716 sync_lapic_to_cr8(vcpu);
1661 1717
1662 save_host_msrs(vcpu); 1718 save_host_msrs(vcpu);
1663 fs_selector = read_fs(); 1719 fs_selector = kvm_read_fs();
1664 gs_selector = read_gs(); 1720 gs_selector = kvm_read_gs();
1665 ldt_selector = read_ldt(); 1721 ldt_selector = kvm_read_ldt();
1666 svm->host_cr2 = kvm_read_cr2(); 1722 svm->host_cr2 = kvm_read_cr2();
1667 svm->host_dr6 = read_dr6(); 1723 svm->host_dr6 = read_dr6();
1668 svm->host_dr7 = read_dr7(); 1724 svm->host_dr7 = read_dr7();
@@ -1716,17 +1772,17 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1716 /* Enter guest mode */ 1772 /* Enter guest mode */
1717 "push %%rax \n\t" 1773 "push %%rax \n\t"
1718 "mov %c[vmcb](%[svm]), %%rax \n\t" 1774 "mov %c[vmcb](%[svm]), %%rax \n\t"
1719 SVM_VMLOAD "\n\t" 1775 __ex(SVM_VMLOAD) "\n\t"
1720 SVM_VMRUN "\n\t" 1776 __ex(SVM_VMRUN) "\n\t"
1721 SVM_VMSAVE "\n\t" 1777 __ex(SVM_VMSAVE) "\n\t"
1722 "pop %%rax \n\t" 1778 "pop %%rax \n\t"
1723#else 1779#else
1724 /* Enter guest mode */ 1780 /* Enter guest mode */
1725 "push %%eax \n\t" 1781 "push %%eax \n\t"
1726 "mov %c[vmcb](%[svm]), %%eax \n\t" 1782 "mov %c[vmcb](%[svm]), %%eax \n\t"
1727 SVM_VMLOAD "\n\t" 1783 __ex(SVM_VMLOAD) "\n\t"
1728 SVM_VMRUN "\n\t" 1784 __ex(SVM_VMRUN) "\n\t"
1729 SVM_VMSAVE "\n\t" 1785 __ex(SVM_VMSAVE) "\n\t"
1730 "pop %%eax \n\t" 1786 "pop %%eax \n\t"
1731#endif 1787#endif
1732 1788
@@ -1795,9 +1851,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1795 write_dr7(svm->host_dr7); 1851 write_dr7(svm->host_dr7);
1796 kvm_write_cr2(svm->host_cr2); 1852 kvm_write_cr2(svm->host_cr2);
1797 1853
1798 load_fs(fs_selector); 1854 kvm_load_fs(fs_selector);
1799 load_gs(gs_selector); 1855 kvm_load_gs(gs_selector);
1800 load_ldt(ldt_selector); 1856 kvm_load_ldt(ldt_selector);
1801 load_host_msrs(vcpu); 1857 load_host_msrs(vcpu);
1802 1858
1803 reload_tss(vcpu); 1859 reload_tss(vcpu);
@@ -1889,7 +1945,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1889 .prepare_guest_switch = svm_prepare_guest_switch, 1945 .prepare_guest_switch = svm_prepare_guest_switch,
1890 .vcpu_load = svm_vcpu_load, 1946 .vcpu_load = svm_vcpu_load,
1891 .vcpu_put = svm_vcpu_put, 1947 .vcpu_put = svm_vcpu_put,
1892 .vcpu_decache = svm_vcpu_decache,
1893 1948
1894 .set_guest_debug = svm_guest_debug, 1949 .set_guest_debug = svm_guest_debug,
1895 .get_msr = svm_get_msr, 1950 .get_msr = svm_get_msr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 10ce6ee4c491..2a69773e3b26 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -30,6 +30,8 @@
30#include <asm/io.h> 30#include <asm/io.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32 32
33#define __ex(x) __kvm_handle_fault_on_reboot(x)
34
33MODULE_AUTHOR("Qumranet"); 35MODULE_AUTHOR("Qumranet");
34MODULE_LICENSE("GPL"); 36MODULE_LICENSE("GPL");
35 37
@@ -53,6 +55,7 @@ struct vmcs {
53 55
54struct vcpu_vmx { 56struct vcpu_vmx {
55 struct kvm_vcpu vcpu; 57 struct kvm_vcpu vcpu;
58 struct list_head local_vcpus_link;
56 int launched; 59 int launched;
57 u8 fail; 60 u8 fail;
58 u32 idt_vectoring_info; 61 u32 idt_vectoring_info;
@@ -88,9 +91,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
88} 91}
89 92
90static int init_rmode(struct kvm *kvm); 93static int init_rmode(struct kvm *kvm);
94static u64 construct_eptp(unsigned long root_hpa);
91 95
92static DEFINE_PER_CPU(struct vmcs *, vmxarea); 96static DEFINE_PER_CPU(struct vmcs *, vmxarea);
93static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 97static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
98static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
94 99
95static struct page *vmx_io_bitmap_a; 100static struct page *vmx_io_bitmap_a;
96static struct page *vmx_io_bitmap_b; 101static struct page *vmx_io_bitmap_b;
@@ -260,6 +265,11 @@ static inline int cpu_has_vmx_vpid(void)
260 SECONDARY_EXEC_ENABLE_VPID); 265 SECONDARY_EXEC_ENABLE_VPID);
261} 266}
262 267
268static inline int cpu_has_virtual_nmis(void)
269{
270 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
271}
272
263static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 273static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
264{ 274{
265 int i; 275 int i;
@@ -278,7 +288,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
278 u64 gva; 288 u64 gva;
279 } operand = { vpid, 0, gva }; 289 } operand = { vpid, 0, gva };
280 290
281 asm volatile (ASM_VMX_INVVPID 291 asm volatile (__ex(ASM_VMX_INVVPID)
282 /* CF==1 or ZF==1 --> rc = -1 */ 292 /* CF==1 or ZF==1 --> rc = -1 */
283 "; ja 1f ; ud2 ; 1:" 293 "; ja 1f ; ud2 ; 1:"
284 : : "a"(&operand), "c"(ext) : "cc", "memory"); 294 : : "a"(&operand), "c"(ext) : "cc", "memory");
@@ -290,7 +300,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
290 u64 eptp, gpa; 300 u64 eptp, gpa;
291 } operand = {eptp, gpa}; 301 } operand = {eptp, gpa};
292 302
293 asm volatile (ASM_VMX_INVEPT 303 asm volatile (__ex(ASM_VMX_INVEPT)
294 /* CF==1 or ZF==1 --> rc = -1 */ 304 /* CF==1 or ZF==1 --> rc = -1 */
295 "; ja 1f ; ud2 ; 1:\n" 305 "; ja 1f ; ud2 ; 1:\n"
296 : : "a" (&operand), "c" (ext) : "cc", "memory"); 306 : : "a" (&operand), "c" (ext) : "cc", "memory");
@@ -311,7 +321,7 @@ static void vmcs_clear(struct vmcs *vmcs)
311 u64 phys_addr = __pa(vmcs); 321 u64 phys_addr = __pa(vmcs);
312 u8 error; 322 u8 error;
313 323
314 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" 324 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
315 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 325 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
316 : "cc", "memory"); 326 : "cc", "memory");
317 if (error) 327 if (error)
@@ -329,6 +339,9 @@ static void __vcpu_clear(void *arg)
329 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 339 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
330 per_cpu(current_vmcs, cpu) = NULL; 340 per_cpu(current_vmcs, cpu) = NULL;
331 rdtscll(vmx->vcpu.arch.host_tsc); 341 rdtscll(vmx->vcpu.arch.host_tsc);
342 list_del(&vmx->local_vcpus_link);
343 vmx->vcpu.cpu = -1;
344 vmx->launched = 0;
332} 345}
333 346
334static void vcpu_clear(struct vcpu_vmx *vmx) 347static void vcpu_clear(struct vcpu_vmx *vmx)
@@ -336,7 +349,6 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
336 if (vmx->vcpu.cpu == -1) 349 if (vmx->vcpu.cpu == -1)
337 return; 350 return;
338 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 351 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
339 vmx->launched = 0;
340} 352}
341 353
342static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 354static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
@@ -378,7 +390,7 @@ static unsigned long vmcs_readl(unsigned long field)
378{ 390{
379 unsigned long value; 391 unsigned long value;
380 392
381 asm volatile (ASM_VMX_VMREAD_RDX_RAX 393 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
382 : "=a"(value) : "d"(field) : "cc"); 394 : "=a"(value) : "d"(field) : "cc");
383 return value; 395 return value;
384} 396}
@@ -413,7 +425,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
413{ 425{
414 u8 error; 426 u8 error;
415 427
416 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" 428 asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
417 : "=q"(error) : "a"(value), "d"(field) : "cc"); 429 : "=q"(error) : "a"(value), "d"(field) : "cc");
418 if (unlikely(error)) 430 if (unlikely(error))
419 vmwrite_error(field, value); 431 vmwrite_error(field, value);
@@ -431,10 +443,8 @@ static void vmcs_write32(unsigned long field, u32 value)
431 443
432static void vmcs_write64(unsigned long field, u64 value) 444static void vmcs_write64(unsigned long field, u64 value)
433{ 445{
434#ifdef CONFIG_X86_64
435 vmcs_writel(field, value);
436#else
437 vmcs_writel(field, value); 446 vmcs_writel(field, value);
447#ifndef CONFIG_X86_64
438 asm volatile (""); 448 asm volatile ("");
439 vmcs_writel(field+1, value >> 32); 449 vmcs_writel(field+1, value >> 32);
440#endif 450#endif
@@ -474,7 +484,7 @@ static void reload_tss(void)
474 struct descriptor_table gdt; 484 struct descriptor_table gdt;
475 struct desc_struct *descs; 485 struct desc_struct *descs;
476 486
477 get_gdt(&gdt); 487 kvm_get_gdt(&gdt);
478 descs = (void *)gdt.base; 488 descs = (void *)gdt.base;
479 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 489 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
480 load_TR_desc(); 490 load_TR_desc();
@@ -530,9 +540,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
530 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 540 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
531 * allow segment selectors with cpl > 0 or ti == 1. 541 * allow segment selectors with cpl > 0 or ti == 1.
532 */ 542 */
533 vmx->host_state.ldt_sel = read_ldt(); 543 vmx->host_state.ldt_sel = kvm_read_ldt();
534 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; 544 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
535 vmx->host_state.fs_sel = read_fs(); 545 vmx->host_state.fs_sel = kvm_read_fs();
536 if (!(vmx->host_state.fs_sel & 7)) { 546 if (!(vmx->host_state.fs_sel & 7)) {
537 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); 547 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
538 vmx->host_state.fs_reload_needed = 0; 548 vmx->host_state.fs_reload_needed = 0;
@@ -540,7 +550,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
540 vmcs_write16(HOST_FS_SELECTOR, 0); 550 vmcs_write16(HOST_FS_SELECTOR, 0);
541 vmx->host_state.fs_reload_needed = 1; 551 vmx->host_state.fs_reload_needed = 1;
542 } 552 }
543 vmx->host_state.gs_sel = read_gs(); 553 vmx->host_state.gs_sel = kvm_read_gs();
544 if (!(vmx->host_state.gs_sel & 7)) 554 if (!(vmx->host_state.gs_sel & 7))
545 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); 555 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
546 else { 556 else {
@@ -576,15 +586,15 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
576 ++vmx->vcpu.stat.host_state_reload; 586 ++vmx->vcpu.stat.host_state_reload;
577 vmx->host_state.loaded = 0; 587 vmx->host_state.loaded = 0;
578 if (vmx->host_state.fs_reload_needed) 588 if (vmx->host_state.fs_reload_needed)
579 load_fs(vmx->host_state.fs_sel); 589 kvm_load_fs(vmx->host_state.fs_sel);
580 if (vmx->host_state.gs_ldt_reload_needed) { 590 if (vmx->host_state.gs_ldt_reload_needed) {
581 load_ldt(vmx->host_state.ldt_sel); 591 kvm_load_ldt(vmx->host_state.ldt_sel);
582 /* 592 /*
583 * If we have to reload gs, we must take care to 593 * If we have to reload gs, we must take care to
584 * preserve our gs base. 594 * preserve our gs base.
585 */ 595 */
586 local_irq_save(flags); 596 local_irq_save(flags);
587 load_gs(vmx->host_state.gs_sel); 597 kvm_load_gs(vmx->host_state.gs_sel);
588#ifdef CONFIG_X86_64 598#ifdef CONFIG_X86_64
589 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); 599 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
590#endif 600#endif
@@ -617,13 +627,17 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
617 vcpu_clear(vmx); 627 vcpu_clear(vmx);
618 kvm_migrate_timers(vcpu); 628 kvm_migrate_timers(vcpu);
619 vpid_sync_vcpu_all(vmx); 629 vpid_sync_vcpu_all(vmx);
630 local_irq_disable();
631 list_add(&vmx->local_vcpus_link,
632 &per_cpu(vcpus_on_cpu, cpu));
633 local_irq_enable();
620 } 634 }
621 635
622 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 636 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
623 u8 error; 637 u8 error;
624 638
625 per_cpu(current_vmcs, cpu) = vmx->vmcs; 639 per_cpu(current_vmcs, cpu) = vmx->vmcs;
626 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" 640 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
627 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 641 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
628 : "cc"); 642 : "cc");
629 if (error) 643 if (error)
@@ -640,8 +654,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
640 * Linux uses per-cpu TSS and GDT, so set these when switching 654 * Linux uses per-cpu TSS and GDT, so set these when switching
641 * processors. 655 * processors.
642 */ 656 */
643 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ 657 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
644 get_gdt(&dt); 658 kvm_get_gdt(&dt);
645 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ 659 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
646 660
647 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 661 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
@@ -684,11 +698,6 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
684 update_exception_bitmap(vcpu); 698 update_exception_bitmap(vcpu);
685} 699}
686 700
687static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
688{
689 vcpu_clear(to_vmx(vcpu));
690}
691
692static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 701static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
693{ 702{
694 return vmcs_readl(GUEST_RFLAGS); 703 return vmcs_readl(GUEST_RFLAGS);
@@ -913,6 +922,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
913 case MSR_IA32_TIME_STAMP_COUNTER: 922 case MSR_IA32_TIME_STAMP_COUNTER:
914 guest_write_tsc(data); 923 guest_write_tsc(data);
915 break; 924 break;
925 case MSR_P6_PERFCTR0:
926 case MSR_P6_PERFCTR1:
927 case MSR_P6_EVNTSEL0:
928 case MSR_P6_EVNTSEL1:
929 /*
930 * Just discard all writes to the performance counters; this
931 * should keep both older linux and windows 64-bit guests
932 * happy
933 */
934 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
935
936 break;
916 default: 937 default:
917 vmx_load_host_state(vmx); 938 vmx_load_host_state(vmx);
918 msr = find_msr_entry(vmx, msr_index); 939 msr = find_msr_entry(vmx, msr_index);
@@ -1022,6 +1043,7 @@ static void hardware_enable(void *garbage)
1022 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1043 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1023 u64 old; 1044 u64 old;
1024 1045
1046 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1025 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1047 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1026 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1048 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
1027 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1049 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
@@ -1032,13 +1054,25 @@ static void hardware_enable(void *garbage)
1032 MSR_IA32_FEATURE_CONTROL_LOCKED | 1054 MSR_IA32_FEATURE_CONTROL_LOCKED |
1033 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); 1055 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
1034 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1056 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1035 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) 1057 asm volatile (ASM_VMX_VMXON_RAX
1058 : : "a"(&phys_addr), "m"(phys_addr)
1036 : "memory", "cc"); 1059 : "memory", "cc");
1037} 1060}
1038 1061
1062static void vmclear_local_vcpus(void)
1063{
1064 int cpu = raw_smp_processor_id();
1065 struct vcpu_vmx *vmx, *n;
1066
1067 list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
1068 local_vcpus_link)
1069 __vcpu_clear(vmx);
1070}
1071
1039static void hardware_disable(void *garbage) 1072static void hardware_disable(void *garbage)
1040{ 1073{
1041 asm volatile (ASM_VMX_VMXOFF : : : "cc"); 1074 vmclear_local_vcpus();
1075 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1042 write_cr4(read_cr4() & ~X86_CR4_VMXE); 1076 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1043} 1077}
1044 1078
@@ -1072,7 +1106,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1072 u32 _vmentry_control = 0; 1106 u32 _vmentry_control = 0;
1073 1107
1074 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 1108 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
1075 opt = 0; 1109 opt = PIN_BASED_VIRTUAL_NMIS;
1076 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 1110 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
1077 &_pin_based_exec_control) < 0) 1111 &_pin_based_exec_control) < 0)
1078 return -EIO; 1112 return -EIO;
@@ -1389,6 +1423,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1389static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1423static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1390{ 1424{
1391 vpid_sync_vcpu_all(to_vmx(vcpu)); 1425 vpid_sync_vcpu_all(to_vmx(vcpu));
1426 if (vm_need_ept())
1427 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1392} 1428}
1393 1429
1394static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1430static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
@@ -1420,7 +1456,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1420 if (!(cr0 & X86_CR0_PG)) { 1456 if (!(cr0 & X86_CR0_PG)) {
1421 /* From paging/starting to nonpaging */ 1457 /* From paging/starting to nonpaging */
1422 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1458 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1423 vmcs_config.cpu_based_exec_ctrl | 1459 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1424 (CPU_BASED_CR3_LOAD_EXITING | 1460 (CPU_BASED_CR3_LOAD_EXITING |
1425 CPU_BASED_CR3_STORE_EXITING)); 1461 CPU_BASED_CR3_STORE_EXITING));
1426 vcpu->arch.cr0 = cr0; 1462 vcpu->arch.cr0 = cr0;
@@ -1430,7 +1466,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1430 } else if (!is_paging(vcpu)) { 1466 } else if (!is_paging(vcpu)) {
1431 /* From nonpaging to paging */ 1467 /* From nonpaging to paging */
1432 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1468 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1433 vmcs_config.cpu_based_exec_ctrl & 1469 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1434 ~(CPU_BASED_CR3_LOAD_EXITING | 1470 ~(CPU_BASED_CR3_LOAD_EXITING |
1435 CPU_BASED_CR3_STORE_EXITING)); 1471 CPU_BASED_CR3_STORE_EXITING));
1436 vcpu->arch.cr0 = cr0; 1472 vcpu->arch.cr0 = cr0;
@@ -1821,7 +1857,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
1821 spin_unlock(&vmx_vpid_lock); 1857 spin_unlock(&vmx_vpid_lock);
1822} 1858}
1823 1859
1824void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) 1860static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
1825{ 1861{
1826 void *va; 1862 void *va;
1827 1863
@@ -1907,8 +1943,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1907 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 1943 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1908 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1944 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1909 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1945 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1910 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ 1946 vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
1911 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ 1947 vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
1912 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1948 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1913#ifdef CONFIG_X86_64 1949#ifdef CONFIG_X86_64
1914 rdmsrl(MSR_FS_BASE, a); 1950 rdmsrl(MSR_FS_BASE, a);
@@ -1922,7 +1958,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1922 1958
1923 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 1959 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1924 1960
1925 get_idt(&dt); 1961 kvm_get_idt(&dt);
1926 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 1962 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1927 1963
1928 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 1964 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
@@ -2114,6 +2150,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2114 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 2150 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2115} 2151}
2116 2152
2153static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2154{
2155 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2156 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2157 vcpu->arch.nmi_pending = 0;
2158}
2159
2117static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2160static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2118{ 2161{
2119 int word_index = __ffs(vcpu->arch.irq_summary); 2162 int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2255,6 +2298,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2255 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2298 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2256 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2299 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2257 (u32)((u64)cr2 >> 32), handler); 2300 (u32)((u64)cr2 >> 32), handler);
2301 if (vect_info & VECTORING_INFO_VALID_MASK)
2302 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2258 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2303 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2259 } 2304 }
2260 2305
@@ -2554,8 +2599,6 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2554 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 2599 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2555 offset = exit_qualification & 0xffful; 2600 offset = exit_qualification & 0xffful;
2556 2601
2557 KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler);
2558
2559 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 2602 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2560 2603
2561 if (er != EMULATE_DONE) { 2604 if (er != EMULATE_DONE) {
@@ -2639,6 +2682,19 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2639 return 1; 2682 return 1;
2640} 2683}
2641 2684
2685static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2686{
2687 u32 cpu_based_vm_exec_control;
2688
2689 /* clear pending NMI */
2690 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2691 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2692 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2693 ++vcpu->stat.nmi_window_exits;
2694
2695 return 1;
2696}
2697
2642/* 2698/*
2643 * The exit handlers return 1 if the exit was handled fully and guest execution 2699 * The exit handlers return 1 if the exit was handled fully and guest execution
2644 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 2700 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2649,6 +2705,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2649 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 2705 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2650 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 2706 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2651 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 2707 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
2708 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
2652 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 2709 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2653 [EXIT_REASON_CR_ACCESS] = handle_cr, 2710 [EXIT_REASON_CR_ACCESS] = handle_cr,
2654 [EXIT_REASON_DR_ACCESS] = handle_dr, 2711 [EXIT_REASON_DR_ACCESS] = handle_dr,
@@ -2736,17 +2793,52 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2736 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2793 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2737} 2794}
2738 2795
2796static void enable_nmi_window(struct kvm_vcpu *vcpu)
2797{
2798 u32 cpu_based_vm_exec_control;
2799
2800 if (!cpu_has_virtual_nmis())
2801 return;
2802
2803 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2804 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2805 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2806}
2807
2808static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
2809{
2810 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2811 return !(guest_intr & (GUEST_INTR_STATE_NMI |
2812 GUEST_INTR_STATE_MOV_SS |
2813 GUEST_INTR_STATE_STI));
2814}
2815
2816static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
2817{
2818 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2819 return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
2820 GUEST_INTR_STATE_STI)) &&
2821 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
2822}
2823
2824static void enable_intr_window(struct kvm_vcpu *vcpu)
2825{
2826 if (vcpu->arch.nmi_pending)
2827 enable_nmi_window(vcpu);
2828 else if (kvm_cpu_has_interrupt(vcpu))
2829 enable_irq_window(vcpu);
2830}
2831
2739static void vmx_intr_assist(struct kvm_vcpu *vcpu) 2832static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2740{ 2833{
2741 struct vcpu_vmx *vmx = to_vmx(vcpu); 2834 struct vcpu_vmx *vmx = to_vmx(vcpu);
2742 u32 idtv_info_field, intr_info_field; 2835 u32 idtv_info_field, intr_info_field, exit_intr_info_field;
2743 int has_ext_irq, interrupt_window_open;
2744 int vector; 2836 int vector;
2745 2837
2746 update_tpr_threshold(vcpu); 2838 update_tpr_threshold(vcpu);
2747 2839
2748 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2749 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 2840 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2841 exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO);
2750 idtv_info_field = vmx->idt_vectoring_info; 2842 idtv_info_field = vmx->idt_vectoring_info;
2751 if (intr_info_field & INTR_INFO_VALID_MASK) { 2843 if (intr_info_field & INTR_INFO_VALID_MASK) {
2752 if (idtv_info_field & INTR_INFO_VALID_MASK) { 2844 if (idtv_info_field & INTR_INFO_VALID_MASK) {
@@ -2754,8 +2846,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2754 if (printk_ratelimit()) 2846 if (printk_ratelimit())
2755 printk(KERN_ERR "Fault when IDT_Vectoring\n"); 2847 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2756 } 2848 }
2757 if (has_ext_irq) 2849 enable_intr_window(vcpu);
2758 enable_irq_window(vcpu);
2759 return; 2850 return;
2760 } 2851 }
2761 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { 2852 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
@@ -2765,30 +2856,56 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2765 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; 2856 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2766 2857
2767 vmx_inject_irq(vcpu, vect); 2858 vmx_inject_irq(vcpu, vect);
2768 if (unlikely(has_ext_irq)) 2859 enable_intr_window(vcpu);
2769 enable_irq_window(vcpu);
2770 return; 2860 return;
2771 } 2861 }
2772 2862
2773 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); 2863 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
2774 2864
2775 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); 2865 /*
2866 * SDM 3: 25.7.1.2
2867 * Clear bit "block by NMI" before VM entry if a NMI delivery
2868 * faulted.
2869 */
2870 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2871 == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis())
2872 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2873 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2874 ~GUEST_INTR_STATE_NMI);
2875
2876 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
2877 & ~INTR_INFO_RESVD_BITS_MASK);
2776 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2878 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2777 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 2879 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2778 2880
2779 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) 2881 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
2780 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2882 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2781 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 2883 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2782 if (unlikely(has_ext_irq)) 2884 enable_intr_window(vcpu);
2783 enable_irq_window(vcpu);
2784 return; 2885 return;
2785 } 2886 }
2786 if (!has_ext_irq) 2887 if (cpu_has_virtual_nmis()) {
2888 /*
2889 * SDM 3: 25.7.1.2
2890 * Re-set bit "block by NMI" before VM entry if vmexit caused by
2891 * a guest IRET fault.
2892 */
2893 if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) &&
2894 (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8)
2895 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2896 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) |
2897 GUEST_INTR_STATE_NMI);
2898 else if (vcpu->arch.nmi_pending) {
2899 if (vmx_nmi_enabled(vcpu))
2900 vmx_inject_nmi(vcpu);
2901 enable_intr_window(vcpu);
2902 return;
2903 }
2904
2905 }
2906 if (!kvm_cpu_has_interrupt(vcpu))
2787 return; 2907 return;
2788 interrupt_window_open = 2908 if (vmx_irq_enabled(vcpu)) {
2789 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2790 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2791 if (interrupt_window_open) {
2792 vector = kvm_cpu_get_interrupt(vcpu); 2909 vector = kvm_cpu_get_interrupt(vcpu);
2793 vmx_inject_irq(vcpu, vector); 2910 vmx_inject_irq(vcpu, vector);
2794 kvm_timer_intr_post(vcpu, vector); 2911 kvm_timer_intr_post(vcpu, vector);
@@ -2838,7 +2955,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2838 "push %%edx; push %%ebp;" 2955 "push %%edx; push %%ebp;"
2839 "push %%ecx \n\t" 2956 "push %%ecx \n\t"
2840#endif 2957#endif
2841 ASM_VMX_VMWRITE_RSP_RDX "\n\t" 2958 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
2842 /* Check if vmlaunch of vmresume is needed */ 2959 /* Check if vmlaunch of vmresume is needed */
2843 "cmpl $0, %c[launched](%0) \n\t" 2960 "cmpl $0, %c[launched](%0) \n\t"
2844 /* Load guest registers. Don't clobber flags. */ 2961 /* Load guest registers. Don't clobber flags. */
@@ -2873,9 +2990,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2873#endif 2990#endif
2874 /* Enter guest mode */ 2991 /* Enter guest mode */
2875 "jne .Llaunched \n\t" 2992 "jne .Llaunched \n\t"
2876 ASM_VMX_VMLAUNCH "\n\t" 2993 __ex(ASM_VMX_VMLAUNCH) "\n\t"
2877 "jmp .Lkvm_vmx_return \n\t" 2994 "jmp .Lkvm_vmx_return \n\t"
2878 ".Llaunched: " ASM_VMX_VMRESUME "\n\t" 2995 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
2879 ".Lkvm_vmx_return: " 2996 ".Lkvm_vmx_return: "
2880 /* Save guest registers, load host registers, keep flags */ 2997 /* Save guest registers, load host registers, keep flags */
2881#ifdef CONFIG_X86_64 2998#ifdef CONFIG_X86_64
@@ -2949,7 +3066,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2949 fixup_rmode_irq(vmx); 3066 fixup_rmode_irq(vmx);
2950 3067
2951 vcpu->arch.interrupt_window_open = 3068 vcpu->arch.interrupt_window_open =
2952 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 3069 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3070 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
2953 3071
2954 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3072 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2955 vmx->launched = 1; 3073 vmx->launched = 1;
@@ -2957,7 +3075,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2957 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3075 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2958 3076
2959 /* We need to handle NMIs before interrupts are enabled */ 3077 /* We need to handle NMIs before interrupts are enabled */
2960 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ 3078 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
3079 (intr_info & INTR_INFO_VALID_MASK)) {
2961 KVMTRACE_0D(NMI, vcpu, handler); 3080 KVMTRACE_0D(NMI, vcpu, handler);
2962 asm("int $2"); 3081 asm("int $2");
2963 } 3082 }
@@ -2968,7 +3087,7 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2968 struct vcpu_vmx *vmx = to_vmx(vcpu); 3087 struct vcpu_vmx *vmx = to_vmx(vcpu);
2969 3088
2970 if (vmx->vmcs) { 3089 if (vmx->vmcs) {
2971 on_each_cpu(__vcpu_clear, vmx, 1); 3090 vcpu_clear(vmx);
2972 free_vmcs(vmx->vmcs); 3091 free_vmcs(vmx->vmcs);
2973 vmx->vmcs = NULL; 3092 vmx->vmcs = NULL;
2974 } 3093 }
@@ -2999,15 +3118,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2999 return ERR_PTR(-ENOMEM); 3118 return ERR_PTR(-ENOMEM);
3000 3119
3001 allocate_vpid(vmx); 3120 allocate_vpid(vmx);
3002 if (id == 0 && vm_need_ept()) {
3003 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3004 VMX_EPT_WRITABLE_MASK |
3005 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3006 kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
3007 VMX_EPT_FAKE_DIRTY_MASK, 0ull,
3008 VMX_EPT_EXECUTABLE_MASK);
3009 kvm_enable_tdp();
3010 }
3011 3121
3012 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 3122 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
3013 if (err) 3123 if (err)
@@ -3095,7 +3205,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
3095 .prepare_guest_switch = vmx_save_host_state, 3205 .prepare_guest_switch = vmx_save_host_state,
3096 .vcpu_load = vmx_vcpu_load, 3206 .vcpu_load = vmx_vcpu_load,
3097 .vcpu_put = vmx_vcpu_put, 3207 .vcpu_put = vmx_vcpu_put,
3098 .vcpu_decache = vmx_vcpu_decache,
3099 3208
3100 .set_guest_debug = set_guest_debug, 3209 .set_guest_debug = set_guest_debug,
3101 .guest_debug_pre = kvm_guest_debug_pre, 3210 .guest_debug_pre = kvm_guest_debug_pre,
@@ -3187,8 +3296,17 @@ static int __init vmx_init(void)
3187 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); 3296 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
3188 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); 3297 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
3189 3298
3190 if (cpu_has_vmx_ept()) 3299 if (vm_need_ept()) {
3191 bypass_guest_pf = 0; 3300 bypass_guest_pf = 0;
3301 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3302 VMX_EPT_WRITABLE_MASK |
3303 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3304 kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
3305 VMX_EPT_FAKE_DIRTY_MASK, 0ull,
3306 VMX_EPT_EXECUTABLE_MASK);
3307 kvm_enable_tdp();
3308 } else
3309 kvm_disable_tdp();
3192 3310
3193 if (bypass_guest_pf) 3311 if (bypass_guest_pf)
3194 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 3312 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 79d94c610dfe..425a13436b3f 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -40,6 +40,7 @@
40#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 40#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
41#define CPU_BASED_CR8_STORE_EXITING 0x00100000 41#define CPU_BASED_CR8_STORE_EXITING 0x00100000
42#define CPU_BASED_TPR_SHADOW 0x00200000 42#define CPU_BASED_TPR_SHADOW 0x00200000
43#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000
43#define CPU_BASED_MOV_DR_EXITING 0x00800000 44#define CPU_BASED_MOV_DR_EXITING 0x00800000
44#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 45#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
45#define CPU_BASED_USE_IO_BITMAPS 0x02000000 46#define CPU_BASED_USE_IO_BITMAPS 0x02000000
@@ -216,7 +217,7 @@ enum vmcs_field {
216#define EXIT_REASON_TRIPLE_FAULT 2 217#define EXIT_REASON_TRIPLE_FAULT 2
217 218
218#define EXIT_REASON_PENDING_INTERRUPT 7 219#define EXIT_REASON_PENDING_INTERRUPT 7
219 220#define EXIT_REASON_NMI_WINDOW 8
220#define EXIT_REASON_TASK_SWITCH 9 221#define EXIT_REASON_TASK_SWITCH 9
221#define EXIT_REASON_CPUID 10 222#define EXIT_REASON_CPUID 10
222#define EXIT_REASON_HLT 12 223#define EXIT_REASON_HLT 12
@@ -251,7 +252,9 @@ enum vmcs_field {
251#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ 252#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
252#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ 253#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
253#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ 254#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */
255#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */
254#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ 256#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
257#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000
255 258
256#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK 259#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
257#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK 260#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
@@ -259,9 +262,16 @@ enum vmcs_field {
259#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK 262#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
260 263
261#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ 264#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
265#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */
262#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ 266#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
263#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ 267#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
264 268
269/* GUEST_INTERRUPTIBILITY_INFO flags. */
270#define GUEST_INTR_STATE_STI 0x00000001
271#define GUEST_INTR_STATE_MOV_SS 0x00000002
272#define GUEST_INTR_STATE_SMI 0x00000004
273#define GUEST_INTR_STATE_NMI 0x00000008
274
265/* 275/*
266 * Exit Qualifications for MOV for Control Register Access 276 * Exit Qualifications for MOV for Control Register Access
267 */ 277 */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0faa2546b1cd..5916191420c7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -72,6 +72,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
72 { "mmio_exits", VCPU_STAT(mmio_exits) }, 72 { "mmio_exits", VCPU_STAT(mmio_exits) },
73 { "signal_exits", VCPU_STAT(signal_exits) }, 73 { "signal_exits", VCPU_STAT(signal_exits) },
74 { "irq_window", VCPU_STAT(irq_window_exits) }, 74 { "irq_window", VCPU_STAT(irq_window_exits) },
75 { "nmi_window", VCPU_STAT(nmi_window_exits) },
75 { "halt_exits", VCPU_STAT(halt_exits) }, 76 { "halt_exits", VCPU_STAT(halt_exits) },
76 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 77 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
77 { "hypercalls", VCPU_STAT(hypercalls) }, 78 { "hypercalls", VCPU_STAT(hypercalls) },
@@ -173,6 +174,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
173 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 174 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
174} 175}
175 176
177void kvm_inject_nmi(struct kvm_vcpu *vcpu)
178{
179 vcpu->arch.nmi_pending = 1;
180}
181EXPORT_SYMBOL_GPL(kvm_inject_nmi);
182
176void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 183void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
177{ 184{
178 WARN_ON(vcpu->arch.exception.pending); 185 WARN_ON(vcpu->arch.exception.pending);
@@ -604,6 +611,38 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
604 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 611 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
605} 612}
606 613
614static bool msr_mtrr_valid(unsigned msr)
615{
616 switch (msr) {
617 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
618 case MSR_MTRRfix64K_00000:
619 case MSR_MTRRfix16K_80000:
620 case MSR_MTRRfix16K_A0000:
621 case MSR_MTRRfix4K_C0000:
622 case MSR_MTRRfix4K_C8000:
623 case MSR_MTRRfix4K_D0000:
624 case MSR_MTRRfix4K_D8000:
625 case MSR_MTRRfix4K_E0000:
626 case MSR_MTRRfix4K_E8000:
627 case MSR_MTRRfix4K_F0000:
628 case MSR_MTRRfix4K_F8000:
629 case MSR_MTRRdefType:
630 case MSR_IA32_CR_PAT:
631 return true;
632 case 0x2f8:
633 return true;
634 }
635 return false;
636}
637
638static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
639{
640 if (!msr_mtrr_valid(msr))
641 return 1;
642
643 vcpu->arch.mtrr[msr - 0x200] = data;
644 return 0;
645}
607 646
608int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 647int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
609{ 648{
@@ -625,8 +664,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
625 break; 664 break;
626 case MSR_IA32_UCODE_REV: 665 case MSR_IA32_UCODE_REV:
627 case MSR_IA32_UCODE_WRITE: 666 case MSR_IA32_UCODE_WRITE:
628 case 0x200 ... 0x2ff: /* MTRRs */
629 break; 667 break;
668 case 0x200 ... 0x2ff:
669 return set_msr_mtrr(vcpu, msr, data);
630 case MSR_IA32_APICBASE: 670 case MSR_IA32_APICBASE:
631 kvm_set_apic_base(vcpu, data); 671 kvm_set_apic_base(vcpu, data);
632 break; 672 break;
@@ -684,6 +724,15 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
684 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 724 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
685} 725}
686 726
727static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
728{
729 if (!msr_mtrr_valid(msr))
730 return 1;
731
732 *pdata = vcpu->arch.mtrr[msr - 0x200];
733 return 0;
734}
735
687int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 736int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
688{ 737{
689 u64 data; 738 u64 data;
@@ -705,11 +754,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
705 case MSR_IA32_MC0_MISC+16: 754 case MSR_IA32_MC0_MISC+16:
706 case MSR_IA32_UCODE_REV: 755 case MSR_IA32_UCODE_REV:
707 case MSR_IA32_EBL_CR_POWERON: 756 case MSR_IA32_EBL_CR_POWERON:
708 /* MTRR registers */
709 case 0xfe:
710 case 0x200 ... 0x2ff:
711 data = 0; 757 data = 0;
712 break; 758 break;
759 case MSR_MTRRcap:
760 data = 0x500 | KVM_NR_VAR_MTRR;
761 break;
762 case 0x200 ... 0x2ff:
763 return get_msr_mtrr(vcpu, msr, pdata);
713 case 0xcd: /* fsb frequency */ 764 case 0xcd: /* fsb frequency */
714 data = 3; 765 data = 3;
715 break; 766 break;
@@ -817,41 +868,6 @@ out:
817 return r; 868 return r;
818} 869}
819 870
820/*
821 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
822 * cached on it.
823 */
824void decache_vcpus_on_cpu(int cpu)
825{
826 struct kvm *vm;
827 struct kvm_vcpu *vcpu;
828 int i;
829
830 spin_lock(&kvm_lock);
831 list_for_each_entry(vm, &vm_list, vm_list)
832 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
833 vcpu = vm->vcpus[i];
834 if (!vcpu)
835 continue;
836 /*
837 * If the vcpu is locked, then it is running on some
838 * other cpu and therefore it is not cached on the
839 * cpu in question.
840 *
841 * If it's not locked, check the last cpu it executed
842 * on.
843 */
844 if (mutex_trylock(&vcpu->mutex)) {
845 if (vcpu->cpu == cpu) {
846 kvm_x86_ops->vcpu_decache(vcpu);
847 vcpu->cpu = -1;
848 }
849 mutex_unlock(&vcpu->mutex);
850 }
851 }
852 spin_unlock(&kvm_lock);
853}
854
855int kvm_dev_ioctl_check_extension(long ext) 871int kvm_dev_ioctl_check_extension(long ext)
856{ 872{
857 int r; 873 int r;
@@ -869,6 +885,9 @@ int kvm_dev_ioctl_check_extension(long ext)
869 case KVM_CAP_MP_STATE: 885 case KVM_CAP_MP_STATE:
870 r = 1; 886 r = 1;
871 break; 887 break;
888 case KVM_CAP_COALESCED_MMIO:
889 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
890 break;
872 case KVM_CAP_VAPIC: 891 case KVM_CAP_VAPIC:
873 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 892 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
874 break; 893 break;
@@ -1781,13 +1800,14 @@ static void kvm_init_msr_list(void)
1781 * Only apic need an MMIO device hook, so shortcut now.. 1800 * Only apic need an MMIO device hook, so shortcut now..
1782 */ 1801 */
1783static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, 1802static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1784 gpa_t addr) 1803 gpa_t addr, int len,
1804 int is_write)
1785{ 1805{
1786 struct kvm_io_device *dev; 1806 struct kvm_io_device *dev;
1787 1807
1788 if (vcpu->arch.apic) { 1808 if (vcpu->arch.apic) {
1789 dev = &vcpu->arch.apic->dev; 1809 dev = &vcpu->arch.apic->dev;
1790 if (dev->in_range(dev, addr)) 1810 if (dev->in_range(dev, addr, len, is_write))
1791 return dev; 1811 return dev;
1792 } 1812 }
1793 return NULL; 1813 return NULL;
@@ -1795,13 +1815,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1795 1815
1796 1816
1797static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 1817static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1798 gpa_t addr) 1818 gpa_t addr, int len,
1819 int is_write)
1799{ 1820{
1800 struct kvm_io_device *dev; 1821 struct kvm_io_device *dev;
1801 1822
1802 dev = vcpu_find_pervcpu_dev(vcpu, addr); 1823 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
1803 if (dev == NULL) 1824 if (dev == NULL)
1804 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); 1825 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
1826 is_write);
1805 return dev; 1827 return dev;
1806} 1828}
1807 1829
@@ -1869,7 +1891,7 @@ mmio:
1869 * Is this MMIO handled locally? 1891 * Is this MMIO handled locally?
1870 */ 1892 */
1871 mutex_lock(&vcpu->kvm->lock); 1893 mutex_lock(&vcpu->kvm->lock);
1872 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1894 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
1873 if (mmio_dev) { 1895 if (mmio_dev) {
1874 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 1896 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1875 mutex_unlock(&vcpu->kvm->lock); 1897 mutex_unlock(&vcpu->kvm->lock);
@@ -1924,7 +1946,7 @@ mmio:
1924 * Is this MMIO handled locally? 1946 * Is this MMIO handled locally?
1925 */ 1947 */
1926 mutex_lock(&vcpu->kvm->lock); 1948 mutex_lock(&vcpu->kvm->lock);
1927 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1949 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
1928 if (mmio_dev) { 1950 if (mmio_dev) {
1929 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 1951 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1930 mutex_unlock(&vcpu->kvm->lock); 1952 mutex_unlock(&vcpu->kvm->lock);
@@ -2020,6 +2042,7 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2020 2042
2021int emulate_clts(struct kvm_vcpu *vcpu) 2043int emulate_clts(struct kvm_vcpu *vcpu)
2022{ 2044{
2045 KVMTRACE_0D(CLTS, vcpu, handler);
2023 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2046 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2024 return X86EMUL_CONTINUE; 2047 return X86EMUL_CONTINUE;
2025} 2048}
@@ -2053,21 +2076,19 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2053 2076
2054void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2077void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2055{ 2078{
2056 static int reported;
2057 u8 opcodes[4]; 2079 u8 opcodes[4];
2058 unsigned long rip = vcpu->arch.rip; 2080 unsigned long rip = vcpu->arch.rip;
2059 unsigned long rip_linear; 2081 unsigned long rip_linear;
2060 2082
2061 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 2083 if (!printk_ratelimit())
2062
2063 if (reported)
2064 return; 2084 return;
2065 2085
2086 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2087
2066 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); 2088 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
2067 2089
2068 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 2090 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2069 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 2091 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2070 reported = 1;
2071} 2092}
2072EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2093EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2073 2094
@@ -2105,27 +2126,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2105 ? X86EMUL_MODE_PROT64 : cs_db 2126 ? X86EMUL_MODE_PROT64 : cs_db
2106 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2127 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2107 2128
2108 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2109 vcpu->arch.emulate_ctxt.cs_base = 0;
2110 vcpu->arch.emulate_ctxt.ds_base = 0;
2111 vcpu->arch.emulate_ctxt.es_base = 0;
2112 vcpu->arch.emulate_ctxt.ss_base = 0;
2113 } else {
2114 vcpu->arch.emulate_ctxt.cs_base =
2115 get_segment_base(vcpu, VCPU_SREG_CS);
2116 vcpu->arch.emulate_ctxt.ds_base =
2117 get_segment_base(vcpu, VCPU_SREG_DS);
2118 vcpu->arch.emulate_ctxt.es_base =
2119 get_segment_base(vcpu, VCPU_SREG_ES);
2120 vcpu->arch.emulate_ctxt.ss_base =
2121 get_segment_base(vcpu, VCPU_SREG_SS);
2122 }
2123
2124 vcpu->arch.emulate_ctxt.gs_base =
2125 get_segment_base(vcpu, VCPU_SREG_GS);
2126 vcpu->arch.emulate_ctxt.fs_base =
2127 get_segment_base(vcpu, VCPU_SREG_FS);
2128
2129 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2129 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2130 2130
2131 /* Reject the instructions other than VMCALL/VMMCALL when 2131 /* Reject the instructions other than VMCALL/VMMCALL when
@@ -2300,9 +2300,10 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
2300} 2300}
2301 2301
2302static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2302static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2303 gpa_t addr) 2303 gpa_t addr, int len,
2304 int is_write)
2304{ 2305{
2305 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); 2306 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2306} 2307}
2307 2308
2308int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2309int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
@@ -2331,11 +2332,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2331 2332
2332 kvm_x86_ops->cache_regs(vcpu); 2333 kvm_x86_ops->cache_regs(vcpu);
2333 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); 2334 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2334 kvm_x86_ops->decache_regs(vcpu);
2335 2335
2336 kvm_x86_ops->skip_emulated_instruction(vcpu); 2336 kvm_x86_ops->skip_emulated_instruction(vcpu);
2337 2337
2338 pio_dev = vcpu_find_pio_dev(vcpu, port); 2338 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2339 if (pio_dev) { 2339 if (pio_dev) {
2340 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2340 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2341 complete_pio(vcpu); 2341 complete_pio(vcpu);
@@ -2417,7 +2417,9 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2417 } 2417 }
2418 } 2418 }
2419 2419
2420 pio_dev = vcpu_find_pio_dev(vcpu, port); 2420 pio_dev = vcpu_find_pio_dev(vcpu, port,
2421 vcpu->arch.pio.cur_count,
2422 !vcpu->arch.pio.in);
2421 if (!vcpu->arch.pio.in) { 2423 if (!vcpu->arch.pio.in) {
2422 /* string PIO write */ 2424 /* string PIO write */
2423 ret = pio_copy_data(vcpu); 2425 ret = pio_copy_data(vcpu);
@@ -2600,27 +2602,41 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2600 2602
2601unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2603unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2602{ 2604{
2605 unsigned long value;
2606
2603 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2607 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2604 switch (cr) { 2608 switch (cr) {
2605 case 0: 2609 case 0:
2606 return vcpu->arch.cr0; 2610 value = vcpu->arch.cr0;
2611 break;
2607 case 2: 2612 case 2:
2608 return vcpu->arch.cr2; 2613 value = vcpu->arch.cr2;
2614 break;
2609 case 3: 2615 case 3:
2610 return vcpu->arch.cr3; 2616 value = vcpu->arch.cr3;
2617 break;
2611 case 4: 2618 case 4:
2612 return vcpu->arch.cr4; 2619 value = vcpu->arch.cr4;
2620 break;
2613 case 8: 2621 case 8:
2614 return kvm_get_cr8(vcpu); 2622 value = kvm_get_cr8(vcpu);
2623 break;
2615 default: 2624 default:
2616 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2625 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2617 return 0; 2626 return 0;
2618 } 2627 }
2628 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2629 (u32)((u64)value >> 32), handler);
2630
2631 return value;
2619} 2632}
2620 2633
2621void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2634void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2622 unsigned long *rflags) 2635 unsigned long *rflags)
2623{ 2636{
2637 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2638 (u32)((u64)val >> 32), handler);
2639
2624 switch (cr) { 2640 switch (cr) {
2625 case 0: 2641 case 0:
2626 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2642 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -2771,8 +2787,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
2771 if (!apic || !apic->vapic_addr) 2787 if (!apic || !apic->vapic_addr)
2772 return; 2788 return;
2773 2789
2790 down_read(&vcpu->kvm->slots_lock);
2774 kvm_release_page_dirty(apic->vapic_page); 2791 kvm_release_page_dirty(apic->vapic_page);
2775 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2792 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2793 up_read(&vcpu->kvm->slots_lock);
2776} 2794}
2777 2795
2778static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2796static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -2928,9 +2946,7 @@ out:
2928 2946
2929 post_kvm_run_save(vcpu, kvm_run); 2947 post_kvm_run_save(vcpu, kvm_run);
2930 2948
2931 down_read(&vcpu->kvm->slots_lock);
2932 vapic_exit(vcpu); 2949 vapic_exit(vcpu);
2933 up_read(&vcpu->kvm->slots_lock);
2934 2950
2935 return r; 2951 return r;
2936} 2952}
@@ -2942,15 +2958,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2942 2958
2943 vcpu_load(vcpu); 2959 vcpu_load(vcpu);
2944 2960
2961 if (vcpu->sigset_active)
2962 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2963
2945 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 2964 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2946 kvm_vcpu_block(vcpu); 2965 kvm_vcpu_block(vcpu);
2947 vcpu_put(vcpu); 2966 r = -EAGAIN;
2948 return -EAGAIN; 2967 goto out;
2949 } 2968 }
2950 2969
2951 if (vcpu->sigset_active)
2952 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2953
2954 /* re-sync apic's tpr */ 2970 /* re-sync apic's tpr */
2955 if (!irqchip_in_kernel(vcpu->kvm)) 2971 if (!irqchip_in_kernel(vcpu->kvm))
2956 kvm_set_cr8(vcpu, kvm_run->cr8); 2972 kvm_set_cr8(vcpu, kvm_run->cr8);
@@ -3070,8 +3086,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3070 return 0; 3086 return 0;
3071} 3087}
3072 3088
3073static void get_segment(struct kvm_vcpu *vcpu, 3089void kvm_get_segment(struct kvm_vcpu *vcpu,
3074 struct kvm_segment *var, int seg) 3090 struct kvm_segment *var, int seg)
3075{ 3091{
3076 kvm_x86_ops->get_segment(vcpu, var, seg); 3092 kvm_x86_ops->get_segment(vcpu, var, seg);
3077} 3093}
@@ -3080,7 +3096,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3080{ 3096{
3081 struct kvm_segment cs; 3097 struct kvm_segment cs;
3082 3098
3083 get_segment(vcpu, &cs, VCPU_SREG_CS); 3099 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3084 *db = cs.db; 3100 *db = cs.db;
3085 *l = cs.l; 3101 *l = cs.l;
3086} 3102}
@@ -3094,15 +3110,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3094 3110
3095 vcpu_load(vcpu); 3111 vcpu_load(vcpu);
3096 3112
3097 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3113 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3098 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3114 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3099 get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3115 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3100 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3116 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3101 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3117 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3102 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3118 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3103 3119
3104 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3120 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3105 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3121 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3106 3122
3107 kvm_x86_ops->get_idt(vcpu, &dt); 3123 kvm_x86_ops->get_idt(vcpu, &dt);
3108 sregs->idt.limit = dt.limit; 3124 sregs->idt.limit = dt.limit;
@@ -3154,7 +3170,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3154 return 0; 3170 return 0;
3155} 3171}
3156 3172
3157static void set_segment(struct kvm_vcpu *vcpu, 3173static void kvm_set_segment(struct kvm_vcpu *vcpu,
3158 struct kvm_segment *var, int seg) 3174 struct kvm_segment *var, int seg)
3159{ 3175{
3160 kvm_x86_ops->set_segment(vcpu, var, seg); 3176 kvm_x86_ops->set_segment(vcpu, var, seg);
@@ -3168,6 +3184,10 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3168 kvm_desct->base |= seg_desc->base2 << 24; 3184 kvm_desct->base |= seg_desc->base2 << 24;
3169 kvm_desct->limit = seg_desc->limit0; 3185 kvm_desct->limit = seg_desc->limit0;
3170 kvm_desct->limit |= seg_desc->limit << 16; 3186 kvm_desct->limit |= seg_desc->limit << 16;
3187 if (seg_desc->g) {
3188 kvm_desct->limit <<= 12;
3189 kvm_desct->limit |= 0xfff;
3190 }
3171 kvm_desct->selector = selector; 3191 kvm_desct->selector = selector;
3172 kvm_desct->type = seg_desc->type; 3192 kvm_desct->type = seg_desc->type;
3173 kvm_desct->present = seg_desc->p; 3193 kvm_desct->present = seg_desc->p;
@@ -3191,7 +3211,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3191 if (selector & 1 << 2) { 3211 if (selector & 1 << 2) {
3192 struct kvm_segment kvm_seg; 3212 struct kvm_segment kvm_seg;
3193 3213
3194 get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3214 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3195 3215
3196 if (kvm_seg.unusable) 3216 if (kvm_seg.unusable)
3197 dtable->limit = 0; 3217 dtable->limit = 0;
@@ -3207,6 +3227,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3207static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3227static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3208 struct desc_struct *seg_desc) 3228 struct desc_struct *seg_desc)
3209{ 3229{
3230 gpa_t gpa;
3210 struct descriptor_table dtable; 3231 struct descriptor_table dtable;
3211 u16 index = selector >> 3; 3232 u16 index = selector >> 3;
3212 3233
@@ -3216,13 +3237,16 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3216 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3237 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3217 return 1; 3238 return 1;
3218 } 3239 }
3219 return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3240 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3241 gpa += index * 8;
3242 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3220} 3243}
3221 3244
3222/* allowed just for 8 bytes segments */ 3245/* allowed just for 8 bytes segments */
3223static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3246static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3224 struct desc_struct *seg_desc) 3247 struct desc_struct *seg_desc)
3225{ 3248{
3249 gpa_t gpa;
3226 struct descriptor_table dtable; 3250 struct descriptor_table dtable;
3227 u16 index = selector >> 3; 3251 u16 index = selector >> 3;
3228 3252
@@ -3230,7 +3254,9 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3230 3254
3231 if (dtable.limit < index * 8 + 7) 3255 if (dtable.limit < index * 8 + 7)
3232 return 1; 3256 return 1;
3233 return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3257 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3258 gpa += index * 8;
3259 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3234} 3260}
3235 3261
3236static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3262static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
@@ -3242,62 +3268,14 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3242 base_addr |= (seg_desc->base1 << 16); 3268 base_addr |= (seg_desc->base1 << 16);
3243 base_addr |= (seg_desc->base2 << 24); 3269 base_addr |= (seg_desc->base2 << 24);
3244 3270
3245 return base_addr; 3271 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3246}
3247
3248static int load_tss_segment32(struct kvm_vcpu *vcpu,
3249 struct desc_struct *seg_desc,
3250 struct tss_segment_32 *tss)
3251{
3252 u32 base_addr;
3253
3254 base_addr = get_tss_base_addr(vcpu, seg_desc);
3255
3256 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3257 sizeof(struct tss_segment_32));
3258}
3259
3260static int save_tss_segment32(struct kvm_vcpu *vcpu,
3261 struct desc_struct *seg_desc,
3262 struct tss_segment_32 *tss)
3263{
3264 u32 base_addr;
3265
3266 base_addr = get_tss_base_addr(vcpu, seg_desc);
3267
3268 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3269 sizeof(struct tss_segment_32));
3270}
3271
3272static int load_tss_segment16(struct kvm_vcpu *vcpu,
3273 struct desc_struct *seg_desc,
3274 struct tss_segment_16 *tss)
3275{
3276 u32 base_addr;
3277
3278 base_addr = get_tss_base_addr(vcpu, seg_desc);
3279
3280 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3281 sizeof(struct tss_segment_16));
3282}
3283
3284static int save_tss_segment16(struct kvm_vcpu *vcpu,
3285 struct desc_struct *seg_desc,
3286 struct tss_segment_16 *tss)
3287{
3288 u32 base_addr;
3289
3290 base_addr = get_tss_base_addr(vcpu, seg_desc);
3291
3292 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3293 sizeof(struct tss_segment_16));
3294} 3272}
3295 3273
3296static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3274static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3297{ 3275{
3298 struct kvm_segment kvm_seg; 3276 struct kvm_segment kvm_seg;
3299 3277
3300 get_segment(vcpu, &kvm_seg, seg); 3278 kvm_get_segment(vcpu, &kvm_seg, seg);
3301 return kvm_seg.selector; 3279 return kvm_seg.selector;
3302} 3280}
3303 3281
@@ -3313,8 +3291,8 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3313 return 0; 3291 return 0;
3314} 3292}
3315 3293
3316static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3294int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3317 int type_bits, int seg) 3295 int type_bits, int seg)
3318{ 3296{
3319 struct kvm_segment kvm_seg; 3297 struct kvm_segment kvm_seg;
3320 3298
@@ -3327,7 +3305,7 @@ static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3327 if (!kvm_seg.s) 3305 if (!kvm_seg.s)
3328 kvm_seg.unusable = 1; 3306 kvm_seg.unusable = 1;
3329 3307
3330 set_segment(vcpu, &kvm_seg, seg); 3308 kvm_set_segment(vcpu, &kvm_seg, seg);
3331 return 0; 3309 return 0;
3332} 3310}
3333 3311
@@ -3373,25 +3351,25 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3373 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; 3351 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
3374 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; 3352 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
3375 3353
3376 if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3354 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3377 return 1; 3355 return 1;
3378 3356
3379 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3357 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3380 return 1; 3358 return 1;
3381 3359
3382 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3360 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3383 return 1; 3361 return 1;
3384 3362
3385 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3363 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3386 return 1; 3364 return 1;
3387 3365
3388 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3366 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3389 return 1; 3367 return 1;
3390 3368
3391 if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3369 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3392 return 1; 3370 return 1;
3393 3371
3394 if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3372 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3395 return 1; 3373 return 1;
3396 return 0; 3374 return 0;
3397} 3375}
@@ -3432,38 +3410,44 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3432 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; 3410 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
3433 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; 3411 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
3434 3412
3435 if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3413 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3436 return 1; 3414 return 1;
3437 3415
3438 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3416 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3439 return 1; 3417 return 1;
3440 3418
3441 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3419 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3442 return 1; 3420 return 1;
3443 3421
3444 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3422 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3445 return 1; 3423 return 1;
3446 3424
3447 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3425 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3448 return 1; 3426 return 1;
3449 return 0; 3427 return 0;
3450} 3428}
3451 3429
3452int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3430static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3453 struct desc_struct *cseg_desc, 3431 u32 old_tss_base,
3454 struct desc_struct *nseg_desc) 3432 struct desc_struct *nseg_desc)
3455{ 3433{
3456 struct tss_segment_16 tss_segment_16; 3434 struct tss_segment_16 tss_segment_16;
3457 int ret = 0; 3435 int ret = 0;
3458 3436
3459 if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) 3437 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3438 sizeof tss_segment_16))
3460 goto out; 3439 goto out;
3461 3440
3462 save_state_to_tss16(vcpu, &tss_segment_16); 3441 save_state_to_tss16(vcpu, &tss_segment_16);
3463 save_tss_segment16(vcpu, cseg_desc, &tss_segment_16);
3464 3442
3465 if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) 3443 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3444 sizeof tss_segment_16))
3466 goto out; 3445 goto out;
3446
3447 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3448 &tss_segment_16, sizeof tss_segment_16))
3449 goto out;
3450
3467 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3451 if (load_state_from_tss16(vcpu, &tss_segment_16))
3468 goto out; 3452 goto out;
3469 3453
@@ -3472,21 +3456,27 @@ out:
3472 return ret; 3456 return ret;
3473} 3457}
3474 3458
3475int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3459static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3476 struct desc_struct *cseg_desc, 3460 u32 old_tss_base,
3477 struct desc_struct *nseg_desc) 3461 struct desc_struct *nseg_desc)
3478{ 3462{
3479 struct tss_segment_32 tss_segment_32; 3463 struct tss_segment_32 tss_segment_32;
3480 int ret = 0; 3464 int ret = 0;
3481 3465
3482 if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) 3466 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3467 sizeof tss_segment_32))
3483 goto out; 3468 goto out;
3484 3469
3485 save_state_to_tss32(vcpu, &tss_segment_32); 3470 save_state_to_tss32(vcpu, &tss_segment_32);
3486 save_tss_segment32(vcpu, cseg_desc, &tss_segment_32);
3487 3471
3488 if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) 3472 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3473 sizeof tss_segment_32))
3489 goto out; 3474 goto out;
3475
3476 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3477 &tss_segment_32, sizeof tss_segment_32))
3478 goto out;
3479
3490 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3480 if (load_state_from_tss32(vcpu, &tss_segment_32))
3491 goto out; 3481 goto out;
3492 3482
@@ -3501,16 +3491,20 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3501 struct desc_struct cseg_desc; 3491 struct desc_struct cseg_desc;
3502 struct desc_struct nseg_desc; 3492 struct desc_struct nseg_desc;
3503 int ret = 0; 3493 int ret = 0;
3494 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3495 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
3504 3496
3505 get_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3497 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
3506 3498
3499 /* FIXME: Handle errors. Failure to read either TSS or their
3500 * descriptors should generate a pagefault.
3501 */
3507 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3502 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3508 goto out; 3503 goto out;
3509 3504
3510 if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) 3505 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
3511 goto out; 3506 goto out;
3512 3507
3513
3514 if (reason != TASK_SWITCH_IRET) { 3508 if (reason != TASK_SWITCH_IRET) {
3515 int cpl; 3509 int cpl;
3516 3510
@@ -3528,8 +3522,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3528 3522
3529 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 3523 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3530 cseg_desc.type &= ~(1 << 1); //clear the B flag 3524 cseg_desc.type &= ~(1 << 1); //clear the B flag
3531 save_guest_segment_descriptor(vcpu, tr_seg.selector, 3525 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
3532 &cseg_desc);
3533 } 3526 }
3534 3527
3535 if (reason == TASK_SWITCH_IRET) { 3528 if (reason == TASK_SWITCH_IRET) {
@@ -3541,10 +3534,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3541 kvm_x86_ops->cache_regs(vcpu); 3534 kvm_x86_ops->cache_regs(vcpu);
3542 3535
3543 if (nseg_desc.type & 8) 3536 if (nseg_desc.type & 8)
3544 ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, 3537 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
3545 &nseg_desc); 3538 &nseg_desc);
3546 else 3539 else
3547 ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, 3540 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
3548 &nseg_desc); 3541 &nseg_desc);
3549 3542
3550 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 3543 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
@@ -3561,7 +3554,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3561 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 3554 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3562 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 3555 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3563 tr_seg.type = 11; 3556 tr_seg.type = 11;
3564 set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3557 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3565out: 3558out:
3566 kvm_x86_ops->decache_regs(vcpu); 3559 kvm_x86_ops->decache_regs(vcpu);
3567 return ret; 3560 return ret;
@@ -3628,15 +3621,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3628 } 3621 }
3629 } 3622 }
3630 3623
3631 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3624 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3632 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3625 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3633 set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3626 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3634 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3627 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3635 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3628 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3636 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3629 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3637 3630
3638 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3631 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3639 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3632 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3640 3633
3641 vcpu_put(vcpu); 3634 vcpu_put(vcpu);
3642 3635
@@ -3751,14 +3744,14 @@ void fx_init(struct kvm_vcpu *vcpu)
3751 * allocate ram with GFP_KERNEL. 3744 * allocate ram with GFP_KERNEL.
3752 */ 3745 */
3753 if (!used_math()) 3746 if (!used_math())
3754 fx_save(&vcpu->arch.host_fx_image); 3747 kvm_fx_save(&vcpu->arch.host_fx_image);
3755 3748
3756 /* Initialize guest FPU by resetting ours and saving into guest's */ 3749 /* Initialize guest FPU by resetting ours and saving into guest's */
3757 preempt_disable(); 3750 preempt_disable();
3758 fx_save(&vcpu->arch.host_fx_image); 3751 kvm_fx_save(&vcpu->arch.host_fx_image);
3759 fx_finit(); 3752 kvm_fx_finit();
3760 fx_save(&vcpu->arch.guest_fx_image); 3753 kvm_fx_save(&vcpu->arch.guest_fx_image);
3761 fx_restore(&vcpu->arch.host_fx_image); 3754 kvm_fx_restore(&vcpu->arch.host_fx_image);
3762 preempt_enable(); 3755 preempt_enable();
3763 3756
3764 vcpu->arch.cr0 |= X86_CR0_ET; 3757 vcpu->arch.cr0 |= X86_CR0_ET;
@@ -3775,8 +3768,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3775 return; 3768 return;
3776 3769
3777 vcpu->guest_fpu_loaded = 1; 3770 vcpu->guest_fpu_loaded = 1;
3778 fx_save(&vcpu->arch.host_fx_image); 3771 kvm_fx_save(&vcpu->arch.host_fx_image);
3779 fx_restore(&vcpu->arch.guest_fx_image); 3772 kvm_fx_restore(&vcpu->arch.guest_fx_image);
3780} 3773}
3781EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 3774EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3782 3775
@@ -3786,8 +3779,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3786 return; 3779 return;
3787 3780
3788 vcpu->guest_fpu_loaded = 0; 3781 vcpu->guest_fpu_loaded = 0;
3789 fx_save(&vcpu->arch.guest_fx_image); 3782 kvm_fx_save(&vcpu->arch.guest_fx_image);
3790 fx_restore(&vcpu->arch.host_fx_image); 3783 kvm_fx_restore(&vcpu->arch.host_fx_image);
3791 ++vcpu->stat.fpu_reload; 3784 ++vcpu->stat.fpu_reload;
3792} 3785}
3793EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 3786EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
@@ -4016,6 +4009,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4016 return 0; 4009 return 0;
4017} 4010}
4018 4011
4012void kvm_arch_flush_shadow(struct kvm *kvm)
4013{
4014 kvm_mmu_zap_all(kvm);
4015}
4016
4019int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4017int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4020{ 4018{
4021 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4019 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 932f216d890c..f2f90468f8b1 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -121,7 +121,7 @@ static u16 opcode_table[256] = {
121 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 121 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
122 0, 0, 0, 0, 122 0, 0, 0, 0,
123 /* 0x68 - 0x6F */ 123 /* 0x68 - 0x6F */
124 0, 0, ImplicitOps | Mov | Stack, 0, 124 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
125 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 125 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
126 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 126 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
127 /* 0x70 - 0x77 */ 127 /* 0x70 - 0x77 */
@@ -138,9 +138,11 @@ static u16 opcode_table[256] = {
138 /* 0x88 - 0x8F */ 138 /* 0x88 - 0x8F */
139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
141 0, ModRM | DstReg, 0, Group | Group1A, 141 DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
142 /* 0x90 - 0x9F */ 142 DstReg | SrcMem | ModRM | Mov, Group | Group1A,
143 0, 0, 0, 0, 0, 0, 0, 0, 143 /* 0x90 - 0x97 */
144 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
145 /* 0x98 - 0x9F */
144 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 146 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
145 /* 0xA0 - 0xA7 */ 147 /* 0xA0 - 0xA7 */
146 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 148 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -152,7 +154,8 @@ static u16 opcode_table[256] = {
152 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 154 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
153 ByteOp | ImplicitOps | String, ImplicitOps | String, 155 ByteOp | ImplicitOps | String, ImplicitOps | String,
154 /* 0xB0 - 0xBF */ 156 /* 0xB0 - 0xBF */
155 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 157 0, 0, 0, 0, 0, 0, 0, 0,
158 DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
156 /* 0xC0 - 0xC7 */ 159 /* 0xC0 - 0xC7 */
157 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 160 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
158 0, ImplicitOps | Stack, 0, 0, 161 0, ImplicitOps | Stack, 0, 0,
@@ -168,7 +171,8 @@ static u16 opcode_table[256] = {
168 /* 0xE0 - 0xE7 */ 171 /* 0xE0 - 0xE7 */
169 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0,
170 /* 0xE8 - 0xEF */ 173 /* 0xE8 - 0xEF */
171 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 174 ImplicitOps | Stack, SrcImm | ImplicitOps,
175 ImplicitOps, SrcImmByte | ImplicitOps,
172 0, 0, 0, 0, 176 0, 0, 0, 0,
173 /* 0xF0 - 0xF7 */ 177 /* 0xF0 - 0xF7 */
174 0, 0, 0, 0, 178 0, 0, 0, 0,
@@ -215,7 +219,7 @@ static u16 twobyte_table[256] = {
215 /* 0xA0 - 0xA7 */ 219 /* 0xA0 - 0xA7 */
216 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 220 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
217 /* 0xA8 - 0xAF */ 221 /* 0xA8 - 0xAF */
218 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 222 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
219 /* 0xB0 - 0xB7 */ 223 /* 0xB0 - 0xB7 */
220 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 224 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
221 DstMem | SrcReg | ModRM | BitOp, 225 DstMem | SrcReg | ModRM | BitOp,
@@ -518,6 +522,39 @@ static inline void jmp_rel(struct decode_cache *c, int rel)
518 register_address_increment(c, &c->eip, rel); 522 register_address_increment(c, &c->eip, rel);
519} 523}
520 524
525static void set_seg_override(struct decode_cache *c, int seg)
526{
527 c->has_seg_override = true;
528 c->seg_override = seg;
529}
530
531static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
532{
533 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
534 return 0;
535
536 return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
537}
538
539static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
540 struct decode_cache *c)
541{
542 if (!c->has_seg_override)
543 return 0;
544
545 return seg_base(ctxt, c->seg_override);
546}
547
548static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
549{
550 return seg_base(ctxt, VCPU_SREG_ES);
551}
552
553static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
554{
555 return seg_base(ctxt, VCPU_SREG_SS);
556}
557
521static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 558static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
522 struct x86_emulate_ops *ops, 559 struct x86_emulate_ops *ops,
523 unsigned long linear, u8 *dest) 560 unsigned long linear, u8 *dest)
@@ -660,7 +697,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
660{ 697{
661 struct decode_cache *c = &ctxt->decode; 698 struct decode_cache *c = &ctxt->decode;
662 u8 sib; 699 u8 sib;
663 int index_reg = 0, base_reg = 0, scale, rip_relative = 0; 700 int index_reg = 0, base_reg = 0, scale;
664 int rc = 0; 701 int rc = 0;
665 702
666 if (c->rex_prefix) { 703 if (c->rex_prefix) {
@@ -731,47 +768,28 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
731 } 768 }
732 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 769 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
733 (c->modrm_rm == 6 && c->modrm_mod != 0)) 770 (c->modrm_rm == 6 && c->modrm_mod != 0))
734 if (!c->override_base) 771 if (!c->has_seg_override)
735 c->override_base = &ctxt->ss_base; 772 set_seg_override(c, VCPU_SREG_SS);
736 c->modrm_ea = (u16)c->modrm_ea; 773 c->modrm_ea = (u16)c->modrm_ea;
737 } else { 774 } else {
738 /* 32/64-bit ModR/M decode. */ 775 /* 32/64-bit ModR/M decode. */
739 switch (c->modrm_rm) { 776 if ((c->modrm_rm & 7) == 4) {
740 case 4:
741 case 12:
742 sib = insn_fetch(u8, 1, c->eip); 777 sib = insn_fetch(u8, 1, c->eip);
743 index_reg |= (sib >> 3) & 7; 778 index_reg |= (sib >> 3) & 7;
744 base_reg |= sib & 7; 779 base_reg |= sib & 7;
745 scale = sib >> 6; 780 scale = sib >> 6;
746 781
747 switch (base_reg) { 782 if ((base_reg & 7) == 5 && c->modrm_mod == 0)
748 case 5: 783 c->modrm_ea += insn_fetch(s32, 4, c->eip);
749 if (c->modrm_mod != 0) 784 else
750 c->modrm_ea += c->regs[base_reg];
751 else
752 c->modrm_ea +=
753 insn_fetch(s32, 4, c->eip);
754 break;
755 default:
756 c->modrm_ea += c->regs[base_reg]; 785 c->modrm_ea += c->regs[base_reg];
757 } 786 if (index_reg != 4)
758 switch (index_reg) {
759 case 4:
760 break;
761 default:
762 c->modrm_ea += c->regs[index_reg] << scale; 787 c->modrm_ea += c->regs[index_reg] << scale;
763 } 788 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
764 break; 789 if (ctxt->mode == X86EMUL_MODE_PROT64)
765 case 5: 790 c->rip_relative = 1;
766 if (c->modrm_mod != 0) 791 } else
767 c->modrm_ea += c->regs[c->modrm_rm];
768 else if (ctxt->mode == X86EMUL_MODE_PROT64)
769 rip_relative = 1;
770 break;
771 default:
772 c->modrm_ea += c->regs[c->modrm_rm]; 792 c->modrm_ea += c->regs[c->modrm_rm];
773 break;
774 }
775 switch (c->modrm_mod) { 793 switch (c->modrm_mod) {
776 case 0: 794 case 0:
777 if (c->modrm_rm == 5) 795 if (c->modrm_rm == 5)
@@ -785,22 +803,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
785 break; 803 break;
786 } 804 }
787 } 805 }
788 if (rip_relative) {
789 c->modrm_ea += c->eip;
790 switch (c->d & SrcMask) {
791 case SrcImmByte:
792 c->modrm_ea += 1;
793 break;
794 case SrcImm:
795 if (c->d & ByteOp)
796 c->modrm_ea += 1;
797 else
798 if (c->op_bytes == 8)
799 c->modrm_ea += 4;
800 else
801 c->modrm_ea += c->op_bytes;
802 }
803 }
804done: 806done:
805 return rc; 807 return rc;
806} 808}
@@ -838,6 +840,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
838 840
839 memset(c, 0, sizeof(struct decode_cache)); 841 memset(c, 0, sizeof(struct decode_cache));
840 c->eip = ctxt->vcpu->arch.rip; 842 c->eip = ctxt->vcpu->arch.rip;
843 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
841 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 844 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
842 845
843 switch (mode) { 846 switch (mode) {
@@ -876,23 +879,15 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
876 /* switch between 2/4 bytes */ 879 /* switch between 2/4 bytes */
877 c->ad_bytes = def_ad_bytes ^ 6; 880 c->ad_bytes = def_ad_bytes ^ 6;
878 break; 881 break;
882 case 0x26: /* ES override */
879 case 0x2e: /* CS override */ 883 case 0x2e: /* CS override */
880 c->override_base = &ctxt->cs_base; 884 case 0x36: /* SS override */
881 break;
882 case 0x3e: /* DS override */ 885 case 0x3e: /* DS override */
883 c->override_base = &ctxt->ds_base; 886 set_seg_override(c, (c->b >> 3) & 3);
884 break;
885 case 0x26: /* ES override */
886 c->override_base = &ctxt->es_base;
887 break; 887 break;
888 case 0x64: /* FS override */ 888 case 0x64: /* FS override */
889 c->override_base = &ctxt->fs_base;
890 break;
891 case 0x65: /* GS override */ 889 case 0x65: /* GS override */
892 c->override_base = &ctxt->gs_base; 890 set_seg_override(c, c->b & 7);
893 break;
894 case 0x36: /* SS override */
895 c->override_base = &ctxt->ss_base;
896 break; 891 break;
897 case 0x40 ... 0x4f: /* REX */ 892 case 0x40 ... 0x4f: /* REX */
898 if (mode != X86EMUL_MODE_PROT64) 893 if (mode != X86EMUL_MODE_PROT64)
@@ -964,15 +959,11 @@ done_prefixes:
964 if (rc) 959 if (rc)
965 goto done; 960 goto done;
966 961
967 if (!c->override_base) 962 if (!c->has_seg_override)
968 c->override_base = &ctxt->ds_base; 963 set_seg_override(c, VCPU_SREG_DS);
969 if (mode == X86EMUL_MODE_PROT64 &&
970 c->override_base != &ctxt->fs_base &&
971 c->override_base != &ctxt->gs_base)
972 c->override_base = NULL;
973 964
974 if (c->override_base) 965 if (!(!c->twobyte && c->b == 0x8d))
975 c->modrm_ea += *c->override_base; 966 c->modrm_ea += seg_override_base(ctxt, c);
976 967
977 if (c->ad_bytes != 8) 968 if (c->ad_bytes != 8)
978 c->modrm_ea = (u32)c->modrm_ea; 969 c->modrm_ea = (u32)c->modrm_ea;
@@ -1049,6 +1040,7 @@ done_prefixes:
1049 break; 1040 break;
1050 case DstMem: 1041 case DstMem:
1051 if ((c->d & ModRM) && c->modrm_mod == 3) { 1042 if ((c->d & ModRM) && c->modrm_mod == 3) {
1043 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1052 c->dst.type = OP_REG; 1044 c->dst.type = OP_REG;
1053 c->dst.val = c->dst.orig_val = c->modrm_val; 1045 c->dst.val = c->dst.orig_val = c->modrm_val;
1054 c->dst.ptr = c->modrm_ptr; 1046 c->dst.ptr = c->modrm_ptr;
@@ -1058,6 +1050,9 @@ done_prefixes:
1058 break; 1050 break;
1059 } 1051 }
1060 1052
1053 if (c->rip_relative)
1054 c->modrm_ea += c->eip;
1055
1061done: 1056done:
1062 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1057 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1063} 1058}
@@ -1070,7 +1065,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1070 c->dst.bytes = c->op_bytes; 1065 c->dst.bytes = c->op_bytes;
1071 c->dst.val = c->src.val; 1066 c->dst.val = c->src.val;
1072 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1067 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1073 c->dst.ptr = (void *) register_address(c, ctxt->ss_base, 1068 c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
1074 c->regs[VCPU_REGS_RSP]); 1069 c->regs[VCPU_REGS_RSP]);
1075} 1070}
1076 1071
@@ -1080,7 +1075,7 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1080 struct decode_cache *c = &ctxt->decode; 1075 struct decode_cache *c = &ctxt->decode;
1081 int rc; 1076 int rc;
1082 1077
1083 rc = ops->read_std(register_address(c, ctxt->ss_base, 1078 rc = ops->read_std(register_address(c, ss_base(ctxt),
1084 c->regs[VCPU_REGS_RSP]), 1079 c->regs[VCPU_REGS_RSP]),
1085 &c->dst.val, c->dst.bytes, ctxt->vcpu); 1080 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1086 if (rc != 0) 1081 if (rc != 0)
@@ -1402,11 +1397,11 @@ special_insn:
1402 register_address_increment(c, &c->regs[VCPU_REGS_RSP], 1397 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1403 -c->op_bytes); 1398 -c->op_bytes);
1404 c->dst.ptr = (void *) register_address( 1399 c->dst.ptr = (void *) register_address(
1405 c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]); 1400 c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
1406 break; 1401 break;
1407 case 0x58 ... 0x5f: /* pop reg */ 1402 case 0x58 ... 0x5f: /* pop reg */
1408 pop_instruction: 1403 pop_instruction:
1409 if ((rc = ops->read_std(register_address(c, ctxt->ss_base, 1404 if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
1410 c->regs[VCPU_REGS_RSP]), c->dst.ptr, 1405 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1411 c->op_bytes, ctxt->vcpu)) != 0) 1406 c->op_bytes, ctxt->vcpu)) != 0)
1412 goto done; 1407 goto done;
@@ -1420,9 +1415,8 @@ special_insn:
1420 goto cannot_emulate; 1415 goto cannot_emulate;
1421 c->dst.val = (s32) c->src.val; 1416 c->dst.val = (s32) c->src.val;
1422 break; 1417 break;
1418 case 0x68: /* push imm */
1423 case 0x6a: /* push imm8 */ 1419 case 0x6a: /* push imm8 */
1424 c->src.val = 0L;
1425 c->src.val = insn_fetch(s8, 1, c->eip);
1426 emulate_push(ctxt); 1420 emulate_push(ctxt);
1427 break; 1421 break;
1428 case 0x6c: /* insb */ 1422 case 0x6c: /* insb */
@@ -1433,7 +1427,7 @@ special_insn:
1433 c->rep_prefix ? 1427 c->rep_prefix ?
1434 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1428 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1435 (ctxt->eflags & EFLG_DF), 1429 (ctxt->eflags & EFLG_DF),
1436 register_address(c, ctxt->es_base, 1430 register_address(c, es_base(ctxt),
1437 c->regs[VCPU_REGS_RDI]), 1431 c->regs[VCPU_REGS_RDI]),
1438 c->rep_prefix, 1432 c->rep_prefix,
1439 c->regs[VCPU_REGS_RDX]) == 0) { 1433 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1449,9 +1443,8 @@ special_insn:
1449 c->rep_prefix ? 1443 c->rep_prefix ?
1450 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1444 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1451 (ctxt->eflags & EFLG_DF), 1445 (ctxt->eflags & EFLG_DF),
1452 register_address(c, c->override_base ? 1446 register_address(c,
1453 *c->override_base : 1447 seg_override_base(ctxt, c),
1454 ctxt->ds_base,
1455 c->regs[VCPU_REGS_RSI]), 1448 c->regs[VCPU_REGS_RSI]),
1456 c->rep_prefix, 1449 c->rep_prefix,
1457 c->regs[VCPU_REGS_RDX]) == 0) { 1450 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1490,6 +1483,7 @@ special_insn:
1490 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 1483 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1491 break; 1484 break;
1492 case 0x86 ... 0x87: /* xchg */ 1485 case 0x86 ... 0x87: /* xchg */
1486 xchg:
1493 /* Write back the register source. */ 1487 /* Write back the register source. */
1494 switch (c->dst.bytes) { 1488 switch (c->dst.bytes) {
1495 case 1: 1489 case 1:
@@ -1514,14 +1508,60 @@ special_insn:
1514 break; 1508 break;
1515 case 0x88 ... 0x8b: /* mov */ 1509 case 0x88 ... 0x8b: /* mov */
1516 goto mov; 1510 goto mov;
1511 case 0x8c: { /* mov r/m, sreg */
1512 struct kvm_segment segreg;
1513
1514 if (c->modrm_reg <= 5)
1515 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
1516 else {
1517 printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
1518 c->modrm);
1519 goto cannot_emulate;
1520 }
1521 c->dst.val = segreg.selector;
1522 break;
1523 }
1517 case 0x8d: /* lea r16/r32, m */ 1524 case 0x8d: /* lea r16/r32, m */
1518 c->dst.val = c->modrm_ea; 1525 c->dst.val = c->modrm_ea;
1519 break; 1526 break;
1527 case 0x8e: { /* mov seg, r/m16 */
1528 uint16_t sel;
1529 int type_bits;
1530 int err;
1531
1532 sel = c->src.val;
1533 if (c->modrm_reg <= 5) {
1534 type_bits = (c->modrm_reg == 1) ? 9 : 1;
1535 err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
1536 type_bits, c->modrm_reg);
1537 } else {
1538 printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
1539 c->modrm);
1540 goto cannot_emulate;
1541 }
1542
1543 if (err < 0)
1544 goto cannot_emulate;
1545
1546 c->dst.type = OP_NONE; /* Disable writeback. */
1547 break;
1548 }
1520 case 0x8f: /* pop (sole member of Grp1a) */ 1549 case 0x8f: /* pop (sole member of Grp1a) */
1521 rc = emulate_grp1a(ctxt, ops); 1550 rc = emulate_grp1a(ctxt, ops);
1522 if (rc != 0) 1551 if (rc != 0)
1523 goto done; 1552 goto done;
1524 break; 1553 break;
1554 case 0x90: /* nop / xchg r8,rax */
1555 if (!(c->rex_prefix & 1)) { /* nop */
1556 c->dst.type = OP_NONE;
1557 break;
1558 }
1559 case 0x91 ... 0x97: /* xchg reg,rax */
1560 c->src.type = c->dst.type = OP_REG;
1561 c->src.bytes = c->dst.bytes = c->op_bytes;
1562 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
1563 c->src.val = *(c->src.ptr);
1564 goto xchg;
1525 case 0x9c: /* pushf */ 1565 case 0x9c: /* pushf */
1526 c->src.val = (unsigned long) ctxt->eflags; 1566 c->src.val = (unsigned long) ctxt->eflags;
1527 emulate_push(ctxt); 1567 emulate_push(ctxt);
@@ -1540,11 +1580,10 @@ special_insn:
1540 c->dst.type = OP_MEM; 1580 c->dst.type = OP_MEM;
1541 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1581 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1542 c->dst.ptr = (unsigned long *)register_address(c, 1582 c->dst.ptr = (unsigned long *)register_address(c,
1543 ctxt->es_base, 1583 es_base(ctxt),
1544 c->regs[VCPU_REGS_RDI]); 1584 c->regs[VCPU_REGS_RDI]);
1545 if ((rc = ops->read_emulated(register_address(c, 1585 if ((rc = ops->read_emulated(register_address(c,
1546 c->override_base ? *c->override_base : 1586 seg_override_base(ctxt, c),
1547 ctxt->ds_base,
1548 c->regs[VCPU_REGS_RSI]), 1587 c->regs[VCPU_REGS_RSI]),
1549 &c->dst.val, 1588 &c->dst.val,
1550 c->dst.bytes, ctxt->vcpu)) != 0) 1589 c->dst.bytes, ctxt->vcpu)) != 0)
@@ -1560,8 +1599,7 @@ special_insn:
1560 c->src.type = OP_NONE; /* Disable writeback. */ 1599 c->src.type = OP_NONE; /* Disable writeback. */
1561 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1562 c->src.ptr = (unsigned long *)register_address(c, 1601 c->src.ptr = (unsigned long *)register_address(c,
1563 c->override_base ? *c->override_base : 1602 seg_override_base(ctxt, c),
1564 ctxt->ds_base,
1565 c->regs[VCPU_REGS_RSI]); 1603 c->regs[VCPU_REGS_RSI]);
1566 if ((rc = ops->read_emulated((unsigned long)c->src.ptr, 1604 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1567 &c->src.val, 1605 &c->src.val,
@@ -1572,7 +1610,7 @@ special_insn:
1572 c->dst.type = OP_NONE; /* Disable writeback. */ 1610 c->dst.type = OP_NONE; /* Disable writeback. */
1573 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1611 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1574 c->dst.ptr = (unsigned long *)register_address(c, 1612 c->dst.ptr = (unsigned long *)register_address(c,
1575 ctxt->es_base, 1613 es_base(ctxt),
1576 c->regs[VCPU_REGS_RDI]); 1614 c->regs[VCPU_REGS_RDI]);
1577 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1615 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1578 &c->dst.val, 1616 &c->dst.val,
@@ -1596,7 +1634,7 @@ special_insn:
1596 c->dst.type = OP_MEM; 1634 c->dst.type = OP_MEM;
1597 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1635 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1598 c->dst.ptr = (unsigned long *)register_address(c, 1636 c->dst.ptr = (unsigned long *)register_address(c,
1599 ctxt->es_base, 1637 es_base(ctxt),
1600 c->regs[VCPU_REGS_RDI]); 1638 c->regs[VCPU_REGS_RDI]);
1601 c->dst.val = c->regs[VCPU_REGS_RAX]; 1639 c->dst.val = c->regs[VCPU_REGS_RAX];
1602 register_address_increment(c, &c->regs[VCPU_REGS_RDI], 1640 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
@@ -1608,8 +1646,7 @@ special_insn:
1608 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1646 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1609 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 1647 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1610 if ((rc = ops->read_emulated(register_address(c, 1648 if ((rc = ops->read_emulated(register_address(c,
1611 c->override_base ? *c->override_base : 1649 seg_override_base(ctxt, c),
1612 ctxt->ds_base,
1613 c->regs[VCPU_REGS_RSI]), 1650 c->regs[VCPU_REGS_RSI]),
1614 &c->dst.val, 1651 &c->dst.val,
1615 c->dst.bytes, 1652 c->dst.bytes,
@@ -1622,6 +1659,8 @@ special_insn:
1622 case 0xae ... 0xaf: /* scas */ 1659 case 0xae ... 0xaf: /* scas */
1623 DPRINTF("Urk! I don't handle SCAS.\n"); 1660 DPRINTF("Urk! I don't handle SCAS.\n");
1624 goto cannot_emulate; 1661 goto cannot_emulate;
1662 case 0xb8: /* mov r, imm */
1663 goto mov;
1625 case 0xc0 ... 0xc1: 1664 case 0xc0 ... 0xc1:
1626 emulate_grp2(ctxt); 1665 emulate_grp2(ctxt);
1627 break; 1666 break;
@@ -1660,13 +1699,39 @@ special_insn:
1660 break; 1699 break;
1661 } 1700 }
1662 case 0xe9: /* jmp rel */ 1701 case 0xe9: /* jmp rel */
1663 case 0xeb: /* jmp rel short */ 1702 goto jmp;
1703 case 0xea: /* jmp far */ {
1704 uint32_t eip;
1705 uint16_t sel;
1706
1707 switch (c->op_bytes) {
1708 case 2:
1709 eip = insn_fetch(u16, 2, c->eip);
1710 break;
1711 case 4:
1712 eip = insn_fetch(u32, 4, c->eip);
1713 break;
1714 default:
1715 DPRINTF("jmp far: Invalid op_bytes\n");
1716 goto cannot_emulate;
1717 }
1718 sel = insn_fetch(u16, 2, c->eip);
1719 if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
1720 DPRINTF("jmp far: Failed to load CS descriptor\n");
1721 goto cannot_emulate;
1722 }
1723
1724 c->eip = eip;
1725 break;
1726 }
1727 case 0xeb:
1728 jmp: /* jmp rel short */
1664 jmp_rel(c, c->src.val); 1729 jmp_rel(c, c->src.val);
1665 c->dst.type = OP_NONE; /* Disable writeback. */ 1730 c->dst.type = OP_NONE; /* Disable writeback. */
1666 break; 1731 break;
1667 case 0xf4: /* hlt */ 1732 case 0xf4: /* hlt */
1668 ctxt->vcpu->arch.halt_request = 1; 1733 ctxt->vcpu->arch.halt_request = 1;
1669 goto done; 1734 break;
1670 case 0xf5: /* cmc */ 1735 case 0xf5: /* cmc */
1671 /* complement carry flag from eflags reg */ 1736 /* complement carry flag from eflags reg */
1672 ctxt->eflags ^= EFLG_CF; 1737 ctxt->eflags ^= EFLG_CF;
@@ -1882,6 +1947,8 @@ twobyte_insn:
1882 c->src.val &= (c->dst.bytes << 3) - 1; 1947 c->src.val &= (c->dst.bytes << 3) - 1;
1883 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 1948 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1884 break; 1949 break;
1950 case 0xae: /* clflush */
1951 break;
1885 case 0xb0 ... 0xb1: /* cmpxchg */ 1952 case 0xb0 ... 0xb1: /* cmpxchg */
1886 /* 1953 /*
1887 * Save real source value, then compare EAX against 1954 * Save real source value, then compare EAX against
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 50dad44fb542..0313a5eec412 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -991,7 +991,6 @@ __init void lguest_init(void)
991#ifdef CONFIG_X86_LOCAL_APIC 991#ifdef CONFIG_X86_LOCAL_APIC
992 /* apic read/write intercepts */ 992 /* apic read/write intercepts */
993 pv_apic_ops.apic_write = lguest_apic_write; 993 pv_apic_ops.apic_write = lguest_apic_write;
994 pv_apic_ops.apic_write_atomic = lguest_apic_write;
995 pv_apic_ops.apic_read = lguest_apic_read; 994 pv_apic_ops.apic_read = lguest_apic_read;
996#endif 995#endif
997 996
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 48278fa7d3de..3d317836be9e 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -10,14 +10,6 @@
10#include <asm/e820.h> 10#include <asm/e820.h>
11#include <asm/setup.h> 11#include <asm/setup.h>
12 12
13/*
14 * Any quirks to be performed to initialize timers/irqs/etc?
15 */
16int (*arch_time_init_quirk)(void);
17int (*arch_pre_intr_init_quirk)(void);
18int (*arch_intr_init_quirk)(void);
19int (*arch_trap_init_quirk)(void);
20
21#ifdef CONFIG_HOTPLUG_CPU 13#ifdef CONFIG_HOTPLUG_CPU
22#define DEFAULT_SEND_IPI (1) 14#define DEFAULT_SEND_IPI (1)
23#else 15#else
@@ -37,8 +29,8 @@ int no_broadcast=DEFAULT_SEND_IPI;
37 **/ 29 **/
38void __init pre_intr_init_hook(void) 30void __init pre_intr_init_hook(void)
39{ 31{
40 if (arch_pre_intr_init_quirk) { 32 if (x86_quirks->arch_pre_intr_init) {
41 if (arch_pre_intr_init_quirk()) 33 if (x86_quirks->arch_pre_intr_init())
42 return; 34 return;
43 } 35 }
44 init_ISA_irqs(); 36 init_ISA_irqs();
@@ -64,8 +56,8 @@ static struct irqaction irq2 = {
64 **/ 56 **/
65void __init intr_init_hook(void) 57void __init intr_init_hook(void)
66{ 58{
67 if (arch_intr_init_quirk) { 59 if (x86_quirks->arch_intr_init) {
68 if (arch_intr_init_quirk()) 60 if (x86_quirks->arch_intr_init())
69 return; 61 return;
70 } 62 }
71#ifdef CONFIG_X86_LOCAL_APIC 63#ifdef CONFIG_X86_LOCAL_APIC
@@ -97,8 +89,8 @@ void __init pre_setup_arch_hook(void)
97 **/ 89 **/
98void __init trap_init_hook(void) 90void __init trap_init_hook(void)
99{ 91{
100 if (arch_trap_init_quirk) { 92 if (x86_quirks->arch_trap_init) {
101 if (arch_trap_init_quirk()) 93 if (x86_quirks->arch_trap_init())
102 return; 94 return;
103 } 95 }
104} 96}
@@ -111,6 +103,16 @@ static struct irqaction irq0 = {
111}; 103};
112 104
113/** 105/**
106 * pre_time_init_hook - do any specific initialisations before.
107 *
108 **/
109void __init pre_time_init_hook(void)
110{
111 if (x86_quirks->arch_pre_time_init)
112 x86_quirks->arch_pre_time_init();
113}
114
115/**
114 * time_init_hook - do any specific initialisations for the system timer. 116 * time_init_hook - do any specific initialisations for the system timer.
115 * 117 *
116 * Description: 118 * Description:
@@ -119,13 +121,13 @@ static struct irqaction irq0 = {
119 **/ 121 **/
120void __init time_init_hook(void) 122void __init time_init_hook(void)
121{ 123{
122 if (arch_time_init_quirk) { 124 if (x86_quirks->arch_time_init) {
123 /* 125 /*
124 * A nonzero return code does not mean failure, it means 126 * A nonzero return code does not mean failure, it means
125 * that the architecture quirk does not want any 127 * that the architecture quirk does not want any
126 * generic (timer) setup to be performed after this: 128 * generic (timer) setup to be performed after this:
127 */ 129 */
128 if (arch_time_init_quirk()) 130 if (x86_quirks->arch_time_init())
129 return; 131 return;
130 } 132 }
131 133
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c
index 4354ce804889..50189af14b85 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/mach-es7000/es7000plat.c
@@ -130,10 +130,10 @@ parse_unisys_oem (char *oemptr)
130 mip_addr = val; 130 mip_addr = val;
131 mip = (struct mip_reg *)val; 131 mip = (struct mip_reg *)val;
132 mip_reg = __va(mip); 132 mip_reg = __va(mip);
133 Dprintk("es7000_mipcfg: host_reg = 0x%lx \n", 133 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
134 (unsigned long)host_reg); 134 (unsigned long)host_reg);
135 Dprintk("es7000_mipcfg: mip_reg = 0x%lx \n", 135 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
136 (unsigned long)mip_reg); 136 (unsigned long)mip_reg);
137 success++; 137 success++;
138 break; 138 break;
139 case MIP_PSAI_REG: 139 case MIP_PSAI_REG:
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 9873716e9f76..2977ea37791f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,7 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o 2 pat.o pgtable.o
3 3
4obj-$(CONFIG_HAVE_GET_USER_PAGES_FAST) += gup.o
4obj-$(CONFIG_X86_32) += pgtable_32.o 5obj-$(CONFIG_X86_32) += pgtable_32.o
5 6
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 7obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
@@ -21,3 +22,4 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64.o
21endif 22endif
22obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 23obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
23 24
25obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 5dfef9fa061a..62fa440678d8 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -42,7 +42,6 @@
42 42
43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
44EXPORT_SYMBOL(node_data); 44EXPORT_SYMBOL(node_data);
45static bootmem_data_t node0_bdata;
46 45
47/* 46/*
48 * numa interface - we expect the numa architecture specific code to have 47 * numa interface - we expect the numa architecture specific code to have
@@ -385,7 +384,7 @@ void __init initmem_init(unsigned long start_pfn,
385 for_each_online_node(nid) 384 for_each_online_node(nid)
386 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 385 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
387 386
388 NODE_DATA(0)->bdata = &node0_bdata; 387 NODE_DATA(0)->bdata = &bootmem_node_data[0];
389 setup_bootmem_allocator(); 388 setup_bootmem_allocator();
390} 389}
391 390
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0bb0caed8971..a20d1fa64b4e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
148 * we have now. "break" is either changing perms, levels or 148 * we have now. "break" is either changing perms, levels or
149 * address space marker. 149 * address space marker.
150 */ 150 */
151 prot = pgprot_val(new_prot) & ~(PTE_MASK); 151 prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
152 cur = pgprot_val(st->current_prot) & ~(PTE_MASK); 152 cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
153 153
154 if (!st->level) { 154 if (!st->level) {
155 /* First entry */ 155 /* First entry */
@@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
221 for (i = 0; i < PTRS_PER_PMD; i++) { 221 for (i = 0; i < PTRS_PER_PMD; i++) {
222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
223 if (!pmd_none(*start)) { 223 if (!pmd_none(*start)) {
224 pgprotval_t prot = pmd_val(*start) & ~PTE_MASK; 224 pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
225 225
226 if (pmd_large(*start) || !pmd_present(*start)) 226 if (pmd_large(*start) || !pmd_present(*start))
227 note_page(m, st, __pgprot(prot), 3); 227 note_page(m, st, __pgprot(prot), 3);
@@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
253 for (i = 0; i < PTRS_PER_PUD; i++) { 253 for (i = 0; i < PTRS_PER_PUD; i++) {
254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
255 if (!pud_none(*start)) { 255 if (!pud_none(*start)) {
256 pgprotval_t prot = pud_val(*start) & ~PTE_MASK; 256 pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
257 257
258 if (pud_large(*start) || !pud_present(*start)) 258 if (pud_large(*start) || !pud_present(*start))
259 note_page(m, st, __pgprot(prot), 2); 259 note_page(m, st, __pgprot(prot), 2);
@@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m)
288 for (i = 0; i < PTRS_PER_PGD; i++) { 288 for (i = 0; i < PTRS_PER_PGD; i++) {
289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT); 289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
290 if (!pgd_none(*start)) { 290 if (!pgd_none(*start)) {
291 pgprotval_t prot = pgd_val(*start) & ~PTE_MASK; 291 pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
292 292
293 if (pgd_large(*start) || !pgd_present(*start)) 293 if (pgd_large(*start) || !pgd_present(*start))
294 note_page(m, &st, __pgprot(prot), 1); 294 note_page(m, &st, __pgprot(prot), 1);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..3085f25b4355
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,295 @@
1/*
2 * Lockless get_user_pages_fast for x86
3 *
4 * Copyright (C) 2008 Nick Piggin
5 * Copyright (C) 2008 Novell Inc.
6 */
7#include <linux/sched.h>
8#include <linux/mm.h>
9#include <linux/vmstat.h>
10#include <linux/highmem.h>
11
12#include <asm/pgtable.h>
13
14static inline pte_t gup_get_pte(pte_t *ptep)
15{
16#ifndef CONFIG_X86_PAE
17 return *ptep;
18#else
19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking
21 * any locks. For this we would like to load the pointers atoimcally,
22 * but that is not possible (without expensive cmpxchg8b) on PAE. What
23 * we do have is the guarantee that a pte will only either go from not
24 * present to present, or present to not present or both -- it will not
25 * switch to a completely different present page without a TLB flush in
26 * between; something that we are blocking by holding interrupts off.
27 *
28 * Setting ptes from not present to present goes:
29 * ptep->pte_high = h;
30 * smp_wmb();
31 * ptep->pte_low = l;
32 *
33 * And present to not present goes:
34 * ptep->pte_low = 0;
35 * smp_wmb();
36 * ptep->pte_high = 0;
37 *
38 * We must ensure here that the load of pte_low sees l iff pte_high
39 * sees h. We load pte_high *after* loading pte_low, which ensures we
40 * don't see an older value of pte_high. *Then* we recheck pte_low,
41 * which ensures that we haven't picked up a changed pte high. We might
42 * have got rubbish values from pte_low and pte_high, but we are
43 * guaranteed that pte_low will not have the present bit set *unless*
44 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
45 * we're safe.
46 *
47 * gup_get_pte should not be used or copied outside gup.c without being
48 * very careful -- it does not atomically load the pte or anything that
49 * is likely to be useful for you.
50 */
51 pte_t pte;
52
53retry:
54 pte.pte_low = ptep->pte_low;
55 smp_rmb();
56 pte.pte_high = ptep->pte_high;
57 smp_rmb();
58 if (unlikely(pte.pte_low != ptep->pte_low))
59 goto retry;
60
61 return pte;
62#endif
63}
64
65/*
66 * The performance critical leaf functions are made noinline otherwise gcc
67 * inlines everything into a single function which results in too much
68 * register pressure.
69 */
70static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
71 unsigned long end, int write, struct page **pages, int *nr)
72{
73 unsigned long mask;
74 pte_t *ptep;
75
76 mask = _PAGE_PRESENT|_PAGE_USER;
77 if (write)
78 mask |= _PAGE_RW;
79
80 ptep = pte_offset_map(&pmd, addr);
81 do {
82 pte_t pte = gup_get_pte(ptep);
83 struct page *page;
84
85 if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
86 pte_unmap(ptep);
87 return 0;
88 }
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte);
91 get_page(page);
92 pages[*nr] = page;
93 (*nr)++;
94
95 } while (ptep++, addr += PAGE_SIZE, addr != end);
96 pte_unmap(ptep - 1);
97
98 return 1;
99}
100
101static inline void get_head_page_multiple(struct page *page, int nr)
102{
103 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count);
106}
107
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
109 unsigned long end, int write, struct page **pages, int *nr)
110{
111 unsigned long mask;
112 pte_t pte = *(pte_t *)&pmd;
113 struct page *head, *page;
114 int refs;
115
116 mask = _PAGE_PRESENT|_PAGE_USER;
117 if (write)
118 mask |= _PAGE_RW;
119 if ((pte_val(pte) & mask) != mask)
120 return 0;
121 /* hugepages are never "special" */
122 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
123 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
124
125 refs = 0;
126 head = pte_page(pte);
127 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
128 do {
129 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page;
131 (*nr)++;
132 page++;
133 refs++;
134 } while (addr += PAGE_SIZE, addr != end);
135 get_head_page_multiple(head, refs);
136
137 return 1;
138}
139
140static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
141 int write, struct page **pages, int *nr)
142{
143 unsigned long next;
144 pmd_t *pmdp;
145
146 pmdp = pmd_offset(&pud, addr);
147 do {
148 pmd_t pmd = *pmdp;
149
150 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd))
152 return 0;
153 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
155 return 0;
156 } else {
157 if (!gup_pte_range(pmd, addr, next, write, pages, nr))
158 return 0;
159 }
160 } while (pmdp++, addr = next, addr != end);
161
162 return 1;
163}
164
165static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
166 unsigned long end, int write, struct page **pages, int *nr)
167{
168 unsigned long mask;
169 pte_t pte = *(pte_t *)&pud;
170 struct page *head, *page;
171 int refs;
172
173 mask = _PAGE_PRESENT|_PAGE_USER;
174 if (write)
175 mask |= _PAGE_RW;
176 if ((pte_val(pte) & mask) != mask)
177 return 0;
178 /* hugepages are never "special" */
179 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
180 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
181
182 refs = 0;
183 head = pte_page(pte);
184 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
185 do {
186 VM_BUG_ON(compound_head(page) != head);
187 pages[*nr] = page;
188 (*nr)++;
189 page++;
190 refs++;
191 } while (addr += PAGE_SIZE, addr != end);
192 get_head_page_multiple(head, refs);
193
194 return 1;
195}
196
197static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
198 int write, struct page **pages, int *nr)
199{
200 unsigned long next;
201 pud_t *pudp;
202
203 pudp = pud_offset(&pgd, addr);
204 do {
205 pud_t pud = *pudp;
206
207 next = pud_addr_end(addr, end);
208 if (pud_none(pud))
209 return 0;
210 if (unlikely(pud_large(pud))) {
211 if (!gup_huge_pud(pud, addr, next, write, pages, nr))
212 return 0;
213 } else {
214 if (!gup_pmd_range(pud, addr, next, write, pages, nr))
215 return 0;
216 }
217 } while (pudp++, addr = next, addr != end);
218
219 return 1;
220}
221
222int get_user_pages_fast(unsigned long start, int nr_pages, int write,
223 struct page **pages)
224{
225 struct mm_struct *mm = current->mm;
226 unsigned long end = start + (nr_pages << PAGE_SHIFT);
227 unsigned long addr = start;
228 unsigned long next;
229 pgd_t *pgdp;
230 int nr = 0;
231
232 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
233 start, nr_pages*PAGE_SIZE)))
234 goto slow_irqon;
235
236 /*
237 * XXX: batch / limit 'nr', to avoid large irq off latency
238 * needs some instrumenting to determine the common sizes used by
239 * important workloads (eg. DB2), and whether limiting the batch size
240 * will decrease performance.
241 *
242 * It seems like we're in the clear for the moment. Direct-IO is
243 * the main guy that batches up lots of get_user_pages, and even
244 * they are limited to 64-at-a-time which is not so many.
245 */
246 /*
247 * This doesn't prevent pagetable teardown, but does prevent
248 * the pagetables and pages from being freed on x86.
249 *
250 * So long as we atomically load page table pointers versus teardown
251 * (which we do on x86, with the above PAE exception), we can follow the
252 * address down to the the page and take a ref on it.
253 */
254 local_irq_disable();
255 pgdp = pgd_offset(mm, addr);
256 do {
257 pgd_t pgd = *pgdp;
258
259 next = pgd_addr_end(addr, end);
260 if (pgd_none(pgd))
261 goto slow;
262 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
263 goto slow;
264 } while (pgdp++, addr = next, addr != end);
265 local_irq_enable();
266
267 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
268 return nr;
269
270 {
271 int ret;
272
273slow:
274 local_irq_enable();
275slow_irqon:
276 /* Try to get the remaining pages with get_user_pages */
277 start += nr << PAGE_SHIFT;
278 pages += nr;
279
280 down_read(&mm->mmap_sem);
281 ret = get_user_pages(current, mm, start,
282 (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
283 up_read(&mm->mmap_sem);
284
285 /* Have to be a bit careful with return values */
286 if (nr > 0) {
287 if (ret < 0)
288 ret = nr;
289 else
290 ret += nr;
291 }
292
293 return ret;
294 }
295}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 0b3d567e686d..8f307d914c2e 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
124 return 1; 124 return 1;
125} 125}
126 126
127pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) 127pte_t *huge_pte_alloc(struct mm_struct *mm,
128 unsigned long addr, unsigned long sz)
128{ 129{
129 pgd_t *pgd; 130 pgd_t *pgd;
130 pud_t *pud; 131 pud_t *pud;
@@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
133 pgd = pgd_offset(mm, addr); 134 pgd = pgd_offset(mm, addr);
134 pud = pud_alloc(mm, pgd, addr); 135 pud = pud_alloc(mm, pgd, addr);
135 if (pud) { 136 if (pud) {
136 if (pud_none(*pud)) 137 if (sz == PUD_SIZE) {
137 huge_pmd_share(mm, addr, pud); 138 pte = (pte_t *)pud;
138 pte = (pte_t *) pmd_alloc(mm, pud, addr); 139 } else {
140 BUG_ON(sz != PMD_SIZE);
141 if (pud_none(*pud))
142 huge_pmd_share(mm, addr, pud);
143 pte = (pte_t *) pmd_alloc(mm, pud, addr);
144 }
139 } 145 }
140 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 146 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
141 147
@@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
151 pgd = pgd_offset(mm, addr); 157 pgd = pgd_offset(mm, addr);
152 if (pgd_present(*pgd)) { 158 if (pgd_present(*pgd)) {
153 pud = pud_offset(pgd, addr); 159 pud = pud_offset(pgd, addr);
154 if (pud_present(*pud)) 160 if (pud_present(*pud)) {
161 if (pud_large(*pud))
162 return (pte_t *)pud;
155 pmd = pmd_offset(pud, addr); 163 pmd = pmd_offset(pud, addr);
164 }
156 } 165 }
157 return (pte_t *) pmd; 166 return (pte_t *) pmd;
158} 167}
@@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd)
188 return 0; 197 return 0;
189} 198}
190 199
200int pud_huge(pud_t pud)
201{
202 return 0;
203}
204
191struct page * 205struct page *
192follow_huge_pmd(struct mm_struct *mm, unsigned long address, 206follow_huge_pmd(struct mm_struct *mm, unsigned long address,
193 pmd_t *pmd, int write) 207 pmd_t *pmd, int write)
@@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd)
208 return !!(pmd_val(pmd) & _PAGE_PSE); 222 return !!(pmd_val(pmd) & _PAGE_PSE);
209} 223}
210 224
225int pud_huge(pud_t pud)
226{
227 return !!(pud_val(pud) & _PAGE_PSE);
228}
229
211struct page * 230struct page *
212follow_huge_pmd(struct mm_struct *mm, unsigned long address, 231follow_huge_pmd(struct mm_struct *mm, unsigned long address,
213 pmd_t *pmd, int write) 232 pmd_t *pmd, int write)
@@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
216 235
217 page = pte_page(*(pte_t *)pmd); 236 page = pte_page(*(pte_t *)pmd);
218 if (page) 237 if (page)
219 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); 238 page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
239 return page;
240}
241
242struct page *
243follow_huge_pud(struct mm_struct *mm, unsigned long address,
244 pud_t *pud, int write)
245{
246 struct page *page;
247
248 page = pte_page(*(pte_t *)pud);
249 if (page)
250 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
220 return page; 251 return page;
221} 252}
253
222#endif 254#endif
223 255
224/* x86_64 also uses this file */ 256/* x86_64 also uses this file */
@@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
228 unsigned long addr, unsigned long len, 260 unsigned long addr, unsigned long len,
229 unsigned long pgoff, unsigned long flags) 261 unsigned long pgoff, unsigned long flags)
230{ 262{
263 struct hstate *h = hstate_file(file);
231 struct mm_struct *mm = current->mm; 264 struct mm_struct *mm = current->mm;
232 struct vm_area_struct *vma; 265 struct vm_area_struct *vma;
233 unsigned long start_addr; 266 unsigned long start_addr;
@@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
240 } 273 }
241 274
242full_search: 275full_search:
243 addr = ALIGN(start_addr, HPAGE_SIZE); 276 addr = ALIGN(start_addr, huge_page_size(h));
244 277
245 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 278 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
246 /* At this point: (!vma || addr < vma->vm_end). */ 279 /* At this point: (!vma || addr < vma->vm_end). */
@@ -262,7 +295,7 @@ full_search:
262 } 295 }
263 if (addr + mm->cached_hole_size < vma->vm_start) 296 if (addr + mm->cached_hole_size < vma->vm_start)
264 mm->cached_hole_size = vma->vm_start - addr; 297 mm->cached_hole_size = vma->vm_start - addr;
265 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 298 addr = ALIGN(vma->vm_end, huge_page_size(h));
266 } 299 }
267} 300}
268 301
@@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
270 unsigned long addr0, unsigned long len, 303 unsigned long addr0, unsigned long len,
271 unsigned long pgoff, unsigned long flags) 304 unsigned long pgoff, unsigned long flags)
272{ 305{
306 struct hstate *h = hstate_file(file);
273 struct mm_struct *mm = current->mm; 307 struct mm_struct *mm = current->mm;
274 struct vm_area_struct *vma, *prev_vma; 308 struct vm_area_struct *vma, *prev_vma;
275 unsigned long base = mm->mmap_base, addr = addr0; 309 unsigned long base = mm->mmap_base, addr = addr0;
@@ -290,7 +324,7 @@ try_again:
290 goto fail; 324 goto fail;
291 325
292 /* either no address requested or cant fit in requested address hole */ 326 /* either no address requested or cant fit in requested address hole */
293 addr = (mm->free_area_cache - len) & HPAGE_MASK; 327 addr = (mm->free_area_cache - len) & huge_page_mask(h);
294 do { 328 do {
295 /* 329 /*
296 * Lookup failure means no vma is above this address, 330 * Lookup failure means no vma is above this address,
@@ -321,7 +355,7 @@ try_again:
321 largest_hole = vma->vm_start - addr; 355 largest_hole = vma->vm_start - addr;
322 356
323 /* try just below the current vma->vm_start */ 357 /* try just below the current vma->vm_start */
324 addr = (vma->vm_start - len) & HPAGE_MASK; 358 addr = (vma->vm_start - len) & huge_page_mask(h);
325 } while (len <= vma->vm_start); 359 } while (len <= vma->vm_start);
326 360
327fail: 361fail:
@@ -359,22 +393,23 @@ unsigned long
359hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 393hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
360 unsigned long len, unsigned long pgoff, unsigned long flags) 394 unsigned long len, unsigned long pgoff, unsigned long flags)
361{ 395{
396 struct hstate *h = hstate_file(file);
362 struct mm_struct *mm = current->mm; 397 struct mm_struct *mm = current->mm;
363 struct vm_area_struct *vma; 398 struct vm_area_struct *vma;
364 399
365 if (len & ~HPAGE_MASK) 400 if (len & ~huge_page_mask(h))
366 return -EINVAL; 401 return -EINVAL;
367 if (len > TASK_SIZE) 402 if (len > TASK_SIZE)
368 return -ENOMEM; 403 return -ENOMEM;
369 404
370 if (flags & MAP_FIXED) { 405 if (flags & MAP_FIXED) {
371 if (prepare_hugepage_range(addr, len)) 406 if (prepare_hugepage_range(file, addr, len))
372 return -EINVAL; 407 return -EINVAL;
373 return addr; 408 return addr;
374 } 409 }
375 410
376 if (addr) { 411 if (addr) {
377 addr = ALIGN(addr, HPAGE_SIZE); 412 addr = ALIGN(addr, huge_page_size(h));
378 vma = find_vma(mm, addr); 413 vma = find_vma(mm, addr);
379 if (TASK_SIZE - len >= addr && 414 if (TASK_SIZE - len >= addr &&
380 (!vma || addr + len <= vma->vm_start)) 415 (!vma || addr + len <= vma->vm_start))
@@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
390 425
391#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ 426#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
392 427
428#ifdef CONFIG_X86_64
429static __init int setup_hugepagesz(char *opt)
430{
431 unsigned long ps = memparse(opt, &opt);
432 if (ps == PMD_SIZE) {
433 hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
434 } else if (ps == PUD_SIZE && cpu_has_gbpages) {
435 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
436 } else {
437 printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
438 ps >> 20);
439 return 0;
440 }
441 return 1;
442}
443__setup("hugepagesz=", setup_hugepagesz);
444#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9689a5138e64..d37f29376b0c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -844,6 +844,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
844 reserve_early(table_start << PAGE_SHIFT, 844 reserve_early(table_start << PAGE_SHIFT,
845 table_end << PAGE_SHIFT, "PGTABLE"); 845 table_end << PAGE_SHIFT, "PGTABLE");
846 846
847 if (!after_init_bootmem)
848 early_memtest(start, end);
849
847 return end >> PAGE_SHIFT; 850 return end >> PAGE_SHIFT;
848} 851}
849 852
@@ -868,8 +871,6 @@ void __init paging_init(void)
868 */ 871 */
869 sparse_init(); 872 sparse_init();
870 zone_sizes_init(); 873 zone_sizes_init();
871
872 paravirt_post_allocator_init();
873} 874}
874 875
875/* 876/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 306049edd553..129618ca0ea2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -86,43 +86,6 @@ early_param("gbpages", parse_direct_gbpages_on);
86 * around without checking the pgd every time. 86 * around without checking the pgd every time.
87 */ 87 */
88 88
89void show_mem(void)
90{
91 long i, total = 0, reserved = 0;
92 long shared = 0, cached = 0;
93 struct page *page;
94 pg_data_t *pgdat;
95
96 printk(KERN_INFO "Mem-info:\n");
97 show_free_areas();
98 for_each_online_pgdat(pgdat) {
99 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
100 /*
101 * This loop can take a while with 256 GB and
102 * 4k pages so defer the NMI watchdog:
103 */
104 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
105 touch_nmi_watchdog();
106
107 if (!pfn_valid(pgdat->node_start_pfn + i))
108 continue;
109
110 page = pfn_to_page(pgdat->node_start_pfn + i);
111 total++;
112 if (PageReserved(page))
113 reserved++;
114 else if (PageSwapCache(page))
115 cached++;
116 else if (page_count(page))
117 shared += page_count(page) - 1;
118 }
119 }
120 printk(KERN_INFO "%lu pages of RAM\n", total);
121 printk(KERN_INFO "%lu reserved pages\n", reserved);
122 printk(KERN_INFO "%lu pages shared\n", shared);
123 printk(KERN_INFO "%lu pages swap cached\n", cached);
124}
125
126int after_bootmem; 89int after_bootmem;
127 90
128static __init void *spp_getpage(void) 91static __init void *spp_getpage(void)
@@ -517,118 +480,6 @@ static void __init init_gbpages(void)
517 direct_gbpages = 0; 480 direct_gbpages = 0;
518} 481}
519 482
520#ifdef CONFIG_MEMTEST
521
522static void __init memtest(unsigned long start_phys, unsigned long size,
523 unsigned pattern)
524{
525 unsigned long i;
526 unsigned long *start;
527 unsigned long start_bad;
528 unsigned long last_bad;
529 unsigned long val;
530 unsigned long start_phys_aligned;
531 unsigned long count;
532 unsigned long incr;
533
534 switch (pattern) {
535 case 0:
536 val = 0UL;
537 break;
538 case 1:
539 val = -1UL;
540 break;
541 case 2:
542 val = 0x5555555555555555UL;
543 break;
544 case 3:
545 val = 0xaaaaaaaaaaaaaaaaUL;
546 break;
547 default:
548 return;
549 }
550
551 incr = sizeof(unsigned long);
552 start_phys_aligned = ALIGN(start_phys, incr);
553 count = (size - (start_phys_aligned - start_phys))/incr;
554 start = __va(start_phys_aligned);
555 start_bad = 0;
556 last_bad = 0;
557
558 for (i = 0; i < count; i++)
559 start[i] = val;
560 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
561 if (*start != val) {
562 if (start_phys_aligned == last_bad + incr) {
563 last_bad += incr;
564 } else {
565 if (start_bad) {
566 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
567 val, start_bad, last_bad + incr);
568 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
569 }
570 start_bad = last_bad = start_phys_aligned;
571 }
572 }
573 }
574 if (start_bad) {
575 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
576 val, start_bad, last_bad + incr);
577 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
578 }
579
580}
581
582/* default is disabled */
583static int memtest_pattern __initdata;
584
585static int __init parse_memtest(char *arg)
586{
587 if (arg)
588 memtest_pattern = simple_strtoul(arg, NULL, 0);
589 return 0;
590}
591
592early_param("memtest", parse_memtest);
593
594static void __init early_memtest(unsigned long start, unsigned long end)
595{
596 u64 t_start, t_size;
597 unsigned pattern;
598
599 if (!memtest_pattern)
600 return;
601
602 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
603 for (pattern = 0; pattern < memtest_pattern; pattern++) {
604 t_start = start;
605 t_size = 0;
606 while (t_start < end) {
607 t_start = find_e820_area_size(t_start, &t_size, 1);
608
609 /* done ? */
610 if (t_start >= end)
611 break;
612 if (t_start + t_size > end)
613 t_size = end - t_start;
614
615 printk(KERN_CONT "\n %016llx - %016llx pattern %d",
616 (unsigned long long)t_start,
617 (unsigned long long)t_start + t_size, pattern);
618
619 memtest(t_start, t_size, pattern);
620
621 t_start += t_size;
622 }
623 }
624 printk(KERN_CONT "\n");
625}
626#else
627static void __init early_memtest(unsigned long start, unsigned long end)
628{
629}
630#endif
631
632static unsigned long __init kernel_physical_mapping_init(unsigned long start, 483static unsigned long __init kernel_physical_mapping_init(unsigned long start,
633 unsigned long end, 484 unsigned long end,
634 unsigned long page_size_mask) 485 unsigned long page_size_mask)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 24c1d3c30186..016f335bbeea 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -330,6 +330,14 @@ static void __iomem *ioremap_default(resource_size_t phys_addr,
330 return (void __iomem *)ret; 330 return (void __iomem *)ret;
331} 331}
332 332
333void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
334 unsigned long prot_val)
335{
336 return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
337 __builtin_return_address(0));
338}
339EXPORT_SYMBOL(ioremap_prot);
340
333/** 341/**
334 * iounmap - Free a IO remapping 342 * iounmap - Free a IO remapping
335 * @addr: virtual address from ioremap_* 343 * @addr: virtual address from ioremap_*
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
new file mode 100644
index 000000000000..672e17f8262a
--- /dev/null
+++ b/arch/x86/mm/memtest.c
@@ -0,0 +1,123 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9
10#include <asm/e820.h>
11
12static void __init memtest(unsigned long start_phys, unsigned long size,
13 unsigned pattern)
14{
15 unsigned long i;
16 unsigned long *start;
17 unsigned long start_bad;
18 unsigned long last_bad;
19 unsigned long val;
20 unsigned long start_phys_aligned;
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48
49 incr = sizeof(unsigned long);
50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned);
53 start_bad = 0;
54 last_bad = 0;
55
56 for (i = 0; i < count; i++)
57 start[i] = val;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) {
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 } else {
63 if (start_bad) {
64 printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 }
71 }
72 if (start_bad) {
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
74 val, start_bad, last_bad + incr);
75 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
76 }
77
78}
79
80/* default is disabled */
81static int memtest_pattern __initdata;
82
83static int __init parse_memtest(char *arg)
84{
85 if (arg)
86 memtest_pattern = simple_strtoul(arg, NULL, 0);
87 return 0;
88}
89
90early_param("memtest", parse_memtest);
91
92void __init early_memtest(unsigned long start, unsigned long end)
93{
94 u64 t_start, t_size;
95 unsigned pattern;
96
97 if (!memtest_pattern)
98 return;
99
100 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
101 for (pattern = 0; pattern < memtest_pattern; pattern++) {
102 t_start = start;
103 t_size = 0;
104 while (t_start < end) {
105 t_start = find_e820_area_size(t_start, &t_size, 1);
106
107 /* done ? */
108 if (t_start >= end)
109 break;
110 if (t_start + t_size > end)
111 t_size = end - t_start;
112
113 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
114 (unsigned long long)t_start,
115 (unsigned long long)t_start + t_size, pattern);
116
117 memtest(t_start, t_size, pattern);
118
119 t_start += t_size;
120 }
121 }
122 printk(KERN_CONT "\n");
123}
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index b432d5781773..a4dd793d6003 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,15 +20,9 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifndef Dprintk
24#define Dprintk(x...)
25#endif
26
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
29 25
30static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31
32struct memnode memnode; 26struct memnode memnode;
33 27
34s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
@@ -202,7 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
202 nodedata_phys + pgdat_size - 1); 196 nodedata_phys + pgdat_size - 1);
203 197
204 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 198 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
205 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 199 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
206 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 200 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
207 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; 201 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
208 202
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index d4585077977a..2fe30916d4b6 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,6 +12,8 @@
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
15 17
16#include <asm/msr.h> 18#include <asm/msr.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
@@ -373,8 +375,8 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
373 return vma_prot; 375 return vma_prot;
374} 376}
375 377
376#ifdef CONFIG_NONPROMISC_DEVMEM 378#ifdef CONFIG_STRICT_DEVMEM
377/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ 379/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
378static inline int range_is_allowed(unsigned long pfn, unsigned long size) 380static inline int range_is_allowed(unsigned long pfn, unsigned long size)
379{ 381{
380 return 1; 382 return 1;
@@ -398,7 +400,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
398 } 400 }
399 return 1; 401 return 1;
400} 402}
401#endif /* CONFIG_NONPROMISC_DEVMEM */ 403#endif /* CONFIG_STRICT_DEVMEM */
402 404
403int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 405int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
404 unsigned long size, pgprot_t *vma_prot) 406 unsigned long size, pgprot_t *vma_prot)
@@ -489,3 +491,89 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
489 491
490 free_memtype(addr, addr + size); 492 free_memtype(addr, addr + size);
491} 493}
494
495#if defined(CONFIG_DEBUG_FS)
496
497/* get Nth element of the linked list */
498static struct memtype *memtype_get_idx(loff_t pos)
499{
500 struct memtype *list_node, *print_entry;
501 int i = 1;
502
503 print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
504 if (!print_entry)
505 return NULL;
506
507 spin_lock(&memtype_lock);
508 list_for_each_entry(list_node, &memtype_list, nd) {
509 if (pos == i) {
510 *print_entry = *list_node;
511 spin_unlock(&memtype_lock);
512 return print_entry;
513 }
514 ++i;
515 }
516 spin_unlock(&memtype_lock);
517 kfree(print_entry);
518 return NULL;
519}
520
521static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
522{
523 if (*pos == 0) {
524 ++*pos;
525 seq_printf(seq, "PAT memtype list:\n");
526 }
527
528 return memtype_get_idx(*pos);
529}
530
531static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
532{
533 ++*pos;
534 return memtype_get_idx(*pos);
535}
536
537static void memtype_seq_stop(struct seq_file *seq, void *v)
538{
539}
540
541static int memtype_seq_show(struct seq_file *seq, void *v)
542{
543 struct memtype *print_entry = (struct memtype *)v;
544
545 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
546 print_entry->start, print_entry->end);
547 kfree(print_entry);
548 return 0;
549}
550
551static struct seq_operations memtype_seq_ops = {
552 .start = memtype_seq_start,
553 .next = memtype_seq_next,
554 .stop = memtype_seq_stop,
555 .show = memtype_seq_show,
556};
557
558static int memtype_seq_open(struct inode *inode, struct file *file)
559{
560 return seq_open(file, &memtype_seq_ops);
561}
562
563static const struct file_operations memtype_fops = {
564 .open = memtype_seq_open,
565 .read = seq_read,
566 .llseek = seq_lseek,
567 .release = seq_release,
568};
569
570static int __init pat_memtype_list_init(void)
571{
572 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
573 NULL, &memtype_fops);
574 return 0;
575}
576
577late_initcall(pat_memtype_list_init);
578
579#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b4becbf8c570..cab0abbd1ebe 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,53 +20,6 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23void show_mem(void)
24{
25 int total = 0, reserved = 0;
26 int shared = 0, cached = 0;
27 int highmem = 0;
28 struct page *page;
29 pg_data_t *pgdat;
30 unsigned long i;
31 unsigned long flags;
32
33 printk(KERN_INFO "Mem-info:\n");
34 show_free_areas();
35 for_each_online_pgdat(pgdat) {
36 pgdat_resize_lock(pgdat, &flags);
37 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
38 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
39 touch_nmi_watchdog();
40 page = pgdat_page_nr(pgdat, i);
41 total++;
42 if (PageHighMem(page))
43 highmem++;
44 if (PageReserved(page))
45 reserved++;
46 else if (PageSwapCache(page))
47 cached++;
48 else if (page_count(page))
49 shared += page_count(page) - 1;
50 }
51 pgdat_resize_unlock(pgdat, &flags);
52 }
53 printk(KERN_INFO "%d pages of RAM\n", total);
54 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
55 printk(KERN_INFO "%d reserved pages\n", reserved);
56 printk(KERN_INFO "%d pages shared\n", shared);
57 printk(KERN_INFO "%d pages swap cached\n", cached);
58
59 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
60 printk(KERN_INFO "%lu pages writeback\n",
61 global_page_state(NR_WRITEBACK));
62 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
63 printk(KERN_INFO "%lu pages slab\n",
64 global_page_state(NR_SLAB_RECLAIMABLE) +
65 global_page_state(NR_SLAB_UNRECLAIMABLE));
66 printk(KERN_INFO "%lu pages pagetables\n",
67 global_page_state(NR_PAGETABLE));
68}
69
70/* 23/*
71 * Associate a virtual page frame with a given physical page frame 24 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 25 * and protection flags for that frame.
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 7f3329b55d2e..3f90289410e6 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -369,20 +369,34 @@ static int __init ppro_init(char **cpu_type)
369{ 369{
370 __u8 cpu_model = boot_cpu_data.x86_model; 370 __u8 cpu_model = boot_cpu_data.x86_model;
371 371
372 if (cpu_model == 14) 372 switch (cpu_model) {
373 case 0 ... 2:
374 *cpu_type = "i386/ppro";
375 break;
376 case 3 ... 5:
377 *cpu_type = "i386/pii";
378 break;
379 case 6 ... 8:
380 *cpu_type = "i386/piii";
381 break;
382 case 9:
383 *cpu_type = "i386/p6_mobile";
384 break;
385 case 10 ... 13:
386 *cpu_type = "i386/p6";
387 break;
388 case 14:
373 *cpu_type = "i386/core"; 389 *cpu_type = "i386/core";
374 else if (cpu_model == 15 || cpu_model == 23) 390 break;
391 case 15: case 23:
392 *cpu_type = "i386/core_2";
393 break;
394 case 26:
375 *cpu_type = "i386/core_2"; 395 *cpu_type = "i386/core_2";
376 else if (cpu_model > 0xd) 396 break;
397 default:
398 /* Unknown */
377 return 0; 399 return 0;
378 else if (cpu_model == 9) {
379 *cpu_type = "i386/p6_mobile";
380 } else if (cpu_model > 5) {
381 *cpu_type = "i386/piii";
382 } else if (cpu_model > 2) {
383 *cpu_type = "i386/pii";
384 } else {
385 *cpu_type = "i386/ppro";
386 } 400 }
387 401
388 model = &op_ppro_spec; 402 model = &op_ppro_spec;
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index e515e8db842a..d49202e740ea 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -5,13 +5,13 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o 5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o 6obj-$(CONFIG_PCI_OLPC) += olpc.o
7 7
8pci-y := fixup.o 8obj-y += fixup.o
9pci-$(CONFIG_ACPI) += acpi.o 9obj-$(CONFIG_ACPI) += acpi.o
10pci-y += legacy.o irq.o 10obj-y += legacy.o irq.o
11 11
12pci-$(CONFIG_X86_VISWS) += visws.o 12obj-$(CONFIG_X86_VISWS) += visws.o
13 13
14pci-$(CONFIG_X86_NUMAQ) += numa.o 14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15 15
16obj-y += $(pci-y) common.o early.o 16obj-y += common.o early.o
17obj-y += amd_bus.o 17obj-y += amd_bus.o
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index 858dbe3399f9..86631ccbc25a 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -7,15 +7,13 @@
7/* Direct PCI access. This is used for PCI accesses in early boot before 7/* Direct PCI access. This is used for PCI accesses in early boot before
8 the PCI subsystem works. */ 8 the PCI subsystem works. */
9 9
10#define PDprintk(x...)
11
12u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) 10u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
13{ 11{
14 u32 v; 12 u32 v;
15 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
16 v = inl(0xcfc); 14 v = inl(0xcfc);
17 if (v != 0xffffffff) 15 if (v != 0xffffffff)
18 PDprintk("%x reading 4 from %x: %x\n", slot, offset, v); 16 pr_debug("%x reading 4 from %x: %x\n", slot, offset, v);
19 return v; 17 return v;
20} 18}
21 19
@@ -24,7 +22,7 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
24 u8 v; 22 u8 v;
25 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 23 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
26 v = inb(0xcfc + (offset&3)); 24 v = inb(0xcfc + (offset&3));
27 PDprintk("%x reading 1 from %x: %x\n", slot, offset, v); 25 pr_debug("%x reading 1 from %x: %x\n", slot, offset, v);
28 return v; 26 return v;
29} 27}
30 28
@@ -33,28 +31,28 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
33 u16 v; 31 u16 v;
34 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 32 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
35 v = inw(0xcfc + (offset&2)); 33 v = inw(0xcfc + (offset&2));
36 PDprintk("%x reading 2 from %x: %x\n", slot, offset, v); 34 pr_debug("%x reading 2 from %x: %x\n", slot, offset, v);
37 return v; 35 return v;
38} 36}
39 37
40void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, 38void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
41 u32 val) 39 u32 val)
42{ 40{
43 PDprintk("%x writing to %x: %x\n", slot, offset, val); 41 pr_debug("%x writing to %x: %x\n", slot, offset, val);
44 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 42 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
45 outl(val, 0xcfc); 43 outl(val, 0xcfc);
46} 44}
47 45
48void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) 46void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
49{ 47{
50 PDprintk("%x writing to %x: %x\n", slot, offset, val); 48 pr_debug("%x writing to %x: %x\n", slot, offset, val);
51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
52 outb(val, 0xcfc + (offset&3)); 50 outb(val, 0xcfc + (offset&3));
53} 51}
54 52
55void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) 53void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
56{ 54{
57 PDprintk("%x writing to %x: %x\n", slot, offset, val); 55 pr_debug("%x writing to %x: %x\n", slot, offset, val);
58 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 56 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
59 outw(val, 0xcfc + (offset&2)); 57 outw(val, 0xcfc + (offset&2));
60} 58}
@@ -71,7 +69,7 @@ void early_dump_pci_device(u8 bus, u8 slot, u8 func)
71 int j; 69 int j;
72 u32 val; 70 u32 val;
73 71
74 printk("PCI: %02x:%02x:%02x", bus, slot, func); 72 printk(KERN_INFO "PCI: %02x:%02x:%02x", bus, slot, func);
75 73
76 for (i = 0; i < 256; i += 4) { 74 for (i = 0; i < 256; i += 4) {
77 if (!(i & 0x0f)) 75 if (!(i & 0x0f))
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 2aafb67dc5f1..a09505806b82 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -280,6 +280,7 @@ static void pci_track_mmap_page_range(struct vm_area_struct *vma)
280static struct vm_operations_struct pci_mmap_ops = { 280static struct vm_operations_struct pci_mmap_ops = {
281 .open = pci_track_mmap_page_range, 281 .open = pci_track_mmap_page_range,
282 .close = pci_unmap_page_range, 282 .close = pci_unmap_page_range,
283 .access = generic_access_phys,
283}; 284};
284 285
285int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, 286int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 132876cc6fca..ec9ce35e44d6 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -57,14 +57,17 @@ static int __init pci_legacy_init(void)
57 57
58int __init pci_subsys_init(void) 58int __init pci_subsys_init(void)
59{ 59{
60#ifdef CONFIG_X86_NUMAQ
61 pci_numaq_init();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 pci_acpi_init(); 64 pci_acpi_init();
62#endif 65#endif
66#ifdef CONFIG_X86_VISWS
67 pci_visws_init();
68#endif
63 pci_legacy_init(); 69 pci_legacy_init();
64 pcibios_irq_init(); 70 pcibios_irq_init();
65#ifdef CONFIG_X86_NUMAQ
66 pci_numa_init();
67#endif
68 pcibios_init(); 71 pcibios_init();
69 72
70 return 0; 73 return 0;
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numaq_32.c
index 8b5ca1966731..f4b16dc11dad 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numaq_32.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * numa.c - Low-level PCI access for NUMA-Q machines 2 * numaq_32.c - Low-level PCI access for NUMA-Q machines
3 */ 3 */
4 4
5#include <linux/pci.h> 5#include <linux/pci.h>
@@ -151,7 +151,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
151} 151}
152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); 152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
153 153
154int __init pci_numa_init(void) 154int __init pci_numaq_init(void)
155{ 155{
156 int quad; 156 int quad;
157 157
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index 3e25deb821ac..15b9cf6be729 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -108,7 +108,8 @@ extern void __init dmi_check_skip_isa_align(void);
108/* some common used subsys_initcalls */ 108/* some common used subsys_initcalls */
109extern int __init pci_acpi_init(void); 109extern int __init pci_acpi_init(void);
110extern int __init pcibios_irq_init(void); 110extern int __init pcibios_irq_init(void);
111extern int __init pci_numa_init(void); 111extern int __init pci_visws_init(void);
112extern int __init pci_numaq_init(void);
112extern int __init pcibios_init(void); 113extern int __init pcibios_init(void);
113 114
114/* pci-mmconfig.c */ 115/* pci-mmconfig.c */
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 1a7bed492bb1..42f4cb19faca 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -86,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); 86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
87} 87}
88 88
89static int __init pci_visws_init(void) 89int __init pci_visws_init(void)
90{ 90{
91 if (!is_visws_box())
92 return -1;
93
94 pcibios_enable_irq = &pci_visws_enable_irq;
95 pcibios_disable_irq = &pci_visws_disable_irq;
96
91 /* The VISWS supports configuration access type 1 only */ 97 /* The VISWS supports configuration access type 1 only */
92 pci_probe = (pci_probe | PCI_PROBE_CONF1) & 98 pci_probe = (pci_probe | PCI_PROBE_CONF1) &
93 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2); 99 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
@@ -105,18 +111,3 @@ static int __init pci_visws_init(void)
105 pcibios_resource_survey(); 111 pcibios_resource_survey();
106 return 0; 112 return 0;
107} 113}
108
109static __init int pci_subsys_init(void)
110{
111 if (!is_visws_box())
112 return -1;
113
114 pcibios_enable_irq = &pci_visws_enable_irq;
115 pcibios_disable_irq = &pci_visws_disable_irq;
116
117 pci_visws_init();
118 pcibios_init();
119
120 return 0;
121}
122subsys_initcall(pci_subsys_init);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b7ad9f89d21f..4d6ef0a336d6 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
62# Build multiple 32-bit vDSO images to choose from at boot time. 62# Build multiple 32-bit vDSO images to choose from at boot time.
63# 63#
64obj-$(VDSO32-y) += vdso32-syms.lds 64obj-$(VDSO32-y) += vdso32-syms.lds
65vdso32.so-$(CONFIG_X86_32) += int80 65vdso32.so-$(VDSO32-y) += int80
66vdso32.so-$(CONFIG_COMPAT) += syscall 66vdso32.so-$(CONFIG_COMPAT) += syscall
67vdso32.so-$(VDSO32-y) += sysenter 67vdso32.so-$(VDSO32-y) += sysenter
68 68
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 0bce5429a515..513f330c5832 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
193 } 193 }
194} 194}
195 195
196/*
197 * These symbols are defined by vdso32.S to mark the bounds
198 * of the ELF DSO images included therein.
199 */
200extern const char vdso32_default_start, vdso32_default_end;
201extern const char vdso32_sysenter_start, vdso32_sysenter_end;
202static struct page *vdso32_pages[1]; 196static struct page *vdso32_pages[1];
203 197
204#ifdef CONFIG_X86_64 198#ifdef CONFIG_X86_64
205 199
206#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) 200#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
201#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
207 202
208/* May not be __init: called during resume */ 203/* May not be __init: called during resume */
209void syscall32_cpu_init(void) 204void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
226#else /* CONFIG_X86_32 */ 221#else /* CONFIG_X86_32 */
227 222
228#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) 223#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
224#define vdso32_syscall() (0)
229 225
230void enable_sep_cpu(void) 226void enable_sep_cpu(void)
231{ 227{
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
296 gate_vma_init(); 292 gate_vma_init();
297#endif 293#endif
298 294
299 if (!vdso32_sysenter()) { 295 if (vdso32_syscall()) {
300 vsyscall = &vdso32_default_start; 296 vsyscall = &vdso32_syscall_start;
301 vsyscall_len = &vdso32_default_end - &vdso32_default_start; 297 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
302 } else { 298 } else if (vdso32_sysenter()){
303 vsyscall = &vdso32_sysenter_start; 299 vsyscall = &vdso32_sysenter_start;
304 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; 300 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
301 } else {
302 vsyscall = &vdso32_int80_start;
303 vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
305 } 304 }
306 305
307 memcpy(syscall_page, vsyscall, vsyscall_len); 306 memcpy(syscall_page, vsyscall, vsyscall_len);
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
index 1e36f72cab86..2ce5f82c333b 100644
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -2,14 +2,17 @@
2 2
3__INITDATA 3__INITDATA
4 4
5 .globl vdso32_default_start, vdso32_default_end 5 .globl vdso32_int80_start, vdso32_int80_end
6vdso32_default_start: 6vdso32_int80_start:
7#ifdef CONFIG_X86_32
8 .incbin "arch/x86/vdso/vdso32-int80.so" 7 .incbin "arch/x86/vdso/vdso32-int80.so"
9#else 8vdso32_int80_end:
9
10 .globl vdso32_syscall_start, vdso32_syscall_end
11vdso32_syscall_start:
12#ifdef CONFIG_COMPAT
10 .incbin "arch/x86/vdso/vdso32-syscall.so" 13 .incbin "arch/x86/vdso/vdso32-syscall.so"
11#endif 14#endif
12vdso32_default_end: 15vdso32_syscall_end:
13 16
14 .globl vdso32_sysenter_start, vdso32_sysenter_end 17 .globl vdso32_sysenter_start, vdso32_sysenter_end
15vdso32_sysenter_start: 18vdso32_sysenter_start:
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 19a6cfaf5db9..257ba4a10abf 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -21,7 +21,8 @@ unsigned int __read_mostly vdso_enabled = 1;
21extern char vdso_start[], vdso_end[]; 21extern char vdso_start[], vdso_end[];
22extern unsigned short vdso_sync_cpuid; 22extern unsigned short vdso_sync_cpuid;
23 23
24struct page **vdso_pages; 24static struct page **vdso_pages;
25static unsigned vdso_size;
25 26
26static inline void *var_ref(void *p, char *name) 27static inline void *var_ref(void *p, char *name)
27{ 28{
@@ -38,6 +39,7 @@ static int __init init_vdso_vars(void)
38 int i; 39 int i;
39 char *vbase; 40 char *vbase;
40 41
42 vdso_size = npages << PAGE_SHIFT;
41 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 43 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
42 if (!vdso_pages) 44 if (!vdso_pages)
43 goto oom; 45 goto oom;
@@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
101 struct mm_struct *mm = current->mm; 103 struct mm_struct *mm = current->mm;
102 unsigned long addr; 104 unsigned long addr;
103 int ret; 105 int ret;
104 unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
105 106
106 if (!vdso_enabled) 107 if (!vdso_enabled)
107 return 0; 108 return 0;
108 109
109 down_write(&mm->mmap_sem); 110 down_write(&mm->mmap_sem);
110 addr = vdso_addr(mm->start_stack, len); 111 addr = vdso_addr(mm->start_stack, vdso_size);
111 addr = get_unmapped_area(NULL, addr, len, 0, 0); 112 addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
112 if (IS_ERR_VALUE(addr)) { 113 if (IS_ERR_VALUE(addr)) {
113 ret = addr; 114 ret = addr;
114 goto up_fail; 115 goto up_fail;
115 } 116 }
116 117
117 ret = install_special_mapping(mm, addr, len, 118 ret = install_special_mapping(mm, addr, vdso_size,
118 VM_READ|VM_EXEC| 119 VM_READ|VM_EXEC|
119 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 120 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
120 VM_ALWAYSDUMP, 121 VM_ALWAYSDUMP,
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c2cc99580871..3815e425f470 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
15 15
16config XEN_MAX_DOMAIN_MEMORY 16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes" 17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 18 default 8 if X86_32
19 default 32 if X86_64
19 depends on XEN 20 depends on XEN
20 help 21 help
21 The pseudo-physical to machine address array is sized 22 The pseudo-physical to machine address array is sized
22 according to the maximum possible memory size of a Xen 23 according to the maximum possible memory size of a Xen
23 domain. This array uses 1 page per gigabyte, so there's no 24 domain. This array uses 1 page per gigabyte, so there's no
24 need to be too stingy here. \ No newline at end of file 25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on PM
30 default y \ No newline at end of file
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 2ba2d1649131..59c1e539aed2 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1obj-y := enlighten.o setup.o multicalls.o mmu.o \
2 time.o xen-asm.o grant-table.o suspend.o 2 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 3
4obj-$(CONFIG_SMP) += smp.o 4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bb508456ef52..9ff6e3cbf08f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
33#include <xen/interface/sched.h> 33#include <xen/interface/sched.h>
34#include <xen/features.h> 34#include <xen/features.h>
35#include <xen/page.h> 35#include <xen/page.h>
36#include <xen/hvc-console.h>
36 37
37#include <asm/paravirt.h> 38#include <asm/paravirt.h>
38#include <asm/page.h> 39#include <asm/page.h>
@@ -40,12 +41,12 @@
40#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 42#include <asm/fixmap.h>
42#include <asm/processor.h> 43#include <asm/processor.h>
44#include <asm/msr-index.h>
43#include <asm/setup.h> 45#include <asm/setup.h>
44#include <asm/desc.h> 46#include <asm/desc.h>
45#include <asm/pgtable.h> 47#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 48#include <asm/tlbflush.h>
47#include <asm/reboot.h> 49#include <asm/reboot.h>
48#include <asm/pgalloc.h>
49 50
50#include "xen-ops.h" 51#include "xen-ops.h"
51#include "mmu.h" 52#include "mmu.h"
@@ -57,6 +58,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
57DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
58 59
59/* 60/*
61 * Identity map, in addition to plain kernel map. This needs to be
62 * large enough to allocate page table pages to allocate the rest.
63 * Each page can map 2MB.
64 */
65static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
66
67#ifdef CONFIG_X86_64
68/* l3 pud for userspace vsyscall mapping */
69static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
70#endif /* CONFIG_X86_64 */
71
72/*
60 * Note about cr3 (pagetable base) values: 73 * Note about cr3 (pagetable base) values:
61 * 74 *
62 * xen_cr3 contains the current logical cr3 value; it contains the 75 * xen_cr3 contains the current logical cr3 value; it contains the
@@ -167,10 +180,14 @@ void xen_vcpu_restore(void)
167 180
168static void __init xen_banner(void) 181static void __init xen_banner(void)
169{ 182{
183 unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
184 struct xen_extraversion extra;
185 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
186
170 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 187 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
171 pv_info.name); 188 pv_info.name);
172 printk(KERN_INFO "Hypervisor signature: %s%s\n", 189 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
173 xen_start_info->magic, 190 version >> 16, version & 0xffff, extra.extraversion,
174 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); 191 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
175} 192}
176 193
@@ -363,14 +380,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
363 380
364static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 381static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
365{ 382{
366 xen_mc_batch();
367
368 load_TLS_descriptor(t, cpu, 0);
369 load_TLS_descriptor(t, cpu, 1);
370 load_TLS_descriptor(t, cpu, 2);
371
372 xen_mc_issue(PARAVIRT_LAZY_CPU);
373
374 /* 383 /*
375 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 384 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
376 * it means we're in a context switch, and %gs has just been 385 * it means we're in a context switch, and %gs has just been
@@ -379,10 +388,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
379 * Either way, it has been saved, and the new value will get 388 * Either way, it has been saved, and the new value will get
380 * loaded properly. This will go away as soon as Xen has been 389 * loaded properly. This will go away as soon as Xen has been
381 * modified to not save/restore %gs for normal hypercalls. 390 * modified to not save/restore %gs for normal hypercalls.
391 *
392 * On x86_64, this hack is not used for %gs, because gs points
393 * to KERNEL_GS_BASE (and uses it for PDA references), so we
394 * must not zero %gs on x86_64
395 *
396 * For x86_64, we need to zero %fs, otherwise we may get an
397 * exception between the new %fs descriptor being loaded and
398 * %fs being effectively cleared at __switch_to().
382 */ 399 */
383 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 400 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
401#ifdef CONFIG_X86_32
384 loadsegment(gs, 0); 402 loadsegment(gs, 0);
403#else
404 loadsegment(fs, 0);
405#endif
406 }
407
408 xen_mc_batch();
409
410 load_TLS_descriptor(t, cpu, 0);
411 load_TLS_descriptor(t, cpu, 1);
412 load_TLS_descriptor(t, cpu, 2);
413
414 xen_mc_issue(PARAVIRT_LAZY_CPU);
415}
416
417#ifdef CONFIG_X86_64
418static void xen_load_gs_index(unsigned int idx)
419{
420 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
421 BUG();
385} 422}
423#endif
386 424
387static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 425static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
388 const void *ptr) 426 const void *ptr)
@@ -400,23 +438,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
400 preempt_enable(); 438 preempt_enable();
401} 439}
402 440
403static int cvt_gate_to_trap(int vector, u32 low, u32 high, 441static int cvt_gate_to_trap(int vector, const gate_desc *val,
404 struct trap_info *info) 442 struct trap_info *info)
405{ 443{
406 u8 type, dpl; 444 if (val->type != 0xf && val->type != 0xe)
407
408 type = (high >> 8) & 0x1f;
409 dpl = (high >> 13) & 3;
410
411 if (type != 0xf && type != 0xe)
412 return 0; 445 return 0;
413 446
414 info->vector = vector; 447 info->vector = vector;
415 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 448 info->address = gate_offset(*val);
416 info->cs = low >> 16; 449 info->cs = gate_segment(*val);
417 info->flags = dpl; 450 info->flags = val->dpl;
418 /* interrupt gates clear IF */ 451 /* interrupt gates clear IF */
419 if (type == 0xe) 452 if (val->type == 0xe)
420 info->flags |= 4; 453 info->flags |= 4;
421 454
422 return 1; 455 return 1;
@@ -443,11 +476,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
443 476
444 if (p >= start && (p + 8) <= end) { 477 if (p >= start && (p + 8) <= end) {
445 struct trap_info info[2]; 478 struct trap_info info[2];
446 u32 *desc = (u32 *)g;
447 479
448 info[1].address = 0; 480 info[1].address = 0;
449 481
450 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 482 if (cvt_gate_to_trap(entrynum, g, &info[0]))
451 if (HYPERVISOR_set_trap_table(info)) 483 if (HYPERVISOR_set_trap_table(info))
452 BUG(); 484 BUG();
453 } 485 }
@@ -460,13 +492,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
460{ 492{
461 unsigned in, out, count; 493 unsigned in, out, count;
462 494
463 count = (desc->size+1) / 8; 495 count = (desc->size+1) / sizeof(gate_desc);
464 BUG_ON(count > 256); 496 BUG_ON(count > 256);
465 497
466 for (in = out = 0; in < count; in++) { 498 for (in = out = 0; in < count; in++) {
467 const u32 *entry = (u32 *)(desc->address + in * 8); 499 gate_desc *entry = (gate_desc*)(desc->address) + in;
468 500
469 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 501 if (cvt_gate_to_trap(in, entry, &traps[out]))
470 out++; 502 out++;
471 } 503 }
472 traps[out].address = 0; 504 traps[out].address = 0;
@@ -695,33 +727,89 @@ static void set_current_cr3(void *v)
695 x86_write_percpu(xen_current_cr3, (unsigned long)v); 727 x86_write_percpu(xen_current_cr3, (unsigned long)v);
696} 728}
697 729
698static void xen_write_cr3(unsigned long cr3) 730static void __xen_write_cr3(bool kernel, unsigned long cr3)
699{ 731{
700 struct mmuext_op *op; 732 struct mmuext_op *op;
701 struct multicall_space mcs; 733 struct multicall_space mcs;
702 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 734 unsigned long mfn;
703 735
704 BUG_ON(preemptible()); 736 if (cr3)
737 mfn = pfn_to_mfn(PFN_DOWN(cr3));
738 else
739 mfn = 0;
705 740
706 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 741 WARN_ON(mfn == 0 && kernel);
707 742
708 /* Update while interrupts are disabled, so its atomic with 743 mcs = __xen_mc_entry(sizeof(*op));
709 respect to ipis */
710 x86_write_percpu(xen_cr3, cr3);
711 744
712 op = mcs.args; 745 op = mcs.args;
713 op->cmd = MMUEXT_NEW_BASEPTR; 746 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
714 op->arg1.mfn = mfn; 747 op->arg1.mfn = mfn;
715 748
716 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 749 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
717 750
718 /* Update xen_update_cr3 once the batch has actually 751 if (kernel) {
719 been submitted. */ 752 x86_write_percpu(xen_cr3, cr3);
720 xen_mc_callback(set_current_cr3, (void *)cr3); 753
754 /* Update xen_current_cr3 once the batch has actually
755 been submitted. */
756 xen_mc_callback(set_current_cr3, (void *)cr3);
757 }
758}
759
760static void xen_write_cr3(unsigned long cr3)
761{
762 BUG_ON(preemptible());
763
764 xen_mc_batch(); /* disables interrupts */
765
766 /* Update while interrupts are disabled, so its atomic with
767 respect to ipis */
768 x86_write_percpu(xen_cr3, cr3);
769
770 __xen_write_cr3(true, cr3);
771
772#ifdef CONFIG_X86_64
773 {
774 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
775 if (user_pgd)
776 __xen_write_cr3(false, __pa(user_pgd));
777 else
778 __xen_write_cr3(false, 0);
779 }
780#endif
721 781
722 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 782 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
723} 783}
724 784
785static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
786{
787 int ret;
788
789 ret = 0;
790
791 switch(msr) {
792#ifdef CONFIG_X86_64
793 unsigned which;
794 u64 base;
795
796 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
797 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
798 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
799
800 set:
801 base = ((u64)high << 32) | low;
802 if (HYPERVISOR_set_segment_base(which, base) != 0)
803 ret = -EFAULT;
804 break;
805#endif
806 default:
807 ret = native_write_msr_safe(msr, low, high);
808 }
809
810 return ret;
811}
812
725/* Early in boot, while setting up the initial pagetable, assume 813/* Early in boot, while setting up the initial pagetable, assume
726 everything is pinned. */ 814 everything is pinned. */
727static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 815static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -778,6 +866,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
778 xen_alloc_ptpage(mm, pfn, PT_PMD); 866 xen_alloc_ptpage(mm, pfn, PT_PMD);
779} 867}
780 868
869static int xen_pgd_alloc(struct mm_struct *mm)
870{
871 pgd_t *pgd = mm->pgd;
872 int ret = 0;
873
874 BUG_ON(PagePinned(virt_to_page(pgd)));
875
876#ifdef CONFIG_X86_64
877 {
878 struct page *page = virt_to_page(pgd);
879 pgd_t *user_pgd;
880
881 BUG_ON(page->private != 0);
882
883 ret = -ENOMEM;
884
885 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
886 page->private = (unsigned long)user_pgd;
887
888 if (user_pgd != NULL) {
889 user_pgd[pgd_index(VSYSCALL_START)] =
890 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
891 ret = 0;
892 }
893
894 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
895 }
896#endif
897
898 return ret;
899}
900
901static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
902{
903#ifdef CONFIG_X86_64
904 pgd_t *user_pgd = xen_get_user_pgd(pgd);
905
906 if (user_pgd)
907 free_page((unsigned long)user_pgd);
908#endif
909}
910
781/* This should never happen until we're OK to use struct page */ 911/* This should never happen until we're OK to use struct page */
782static void xen_release_ptpage(u32 pfn, unsigned level) 912static void xen_release_ptpage(u32 pfn, unsigned level)
783{ 913{
@@ -803,6 +933,18 @@ static void xen_release_pmd(u32 pfn)
803 xen_release_ptpage(pfn, PT_PMD); 933 xen_release_ptpage(pfn, PT_PMD);
804} 934}
805 935
936#if PAGETABLE_LEVELS == 4
937static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
938{
939 xen_alloc_ptpage(mm, pfn, PT_PUD);
940}
941
942static void xen_release_pud(u32 pfn)
943{
944 xen_release_ptpage(pfn, PT_PUD);
945}
946#endif
947
806#ifdef CONFIG_HIGHPTE 948#ifdef CONFIG_HIGHPTE
807static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 949static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
808{ 950{
@@ -841,68 +983,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
841 983
842static __init void xen_pagetable_setup_start(pgd_t *base) 984static __init void xen_pagetable_setup_start(pgd_t *base)
843{ 985{
844 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
845 int i;
846
847 /* special set_pte for pagetable initialization */
848 pv_mmu_ops.set_pte = xen_set_pte_init;
849
850 init_mm.pgd = base;
851 /*
852 * copy top-level of Xen-supplied pagetable into place. This
853 * is a stand-in while we copy the pmd pages.
854 */
855 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
856
857 /*
858 * For PAE, need to allocate new pmds, rather than
859 * share Xen's, since Xen doesn't like pmd's being
860 * shared between address spaces.
861 */
862 for (i = 0; i < PTRS_PER_PGD; i++) {
863 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
864 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
865
866 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
867 PAGE_SIZE);
868
869 make_lowmem_page_readonly(pmd);
870
871 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
872 } else
873 pgd_clear(&base[i]);
874 }
875
876 /* make sure zero_page is mapped RO so we can use it in pagetables */
877 make_lowmem_page_readonly(empty_zero_page);
878 make_lowmem_page_readonly(base);
879 /*
880 * Switch to new pagetable. This is done before
881 * pagetable_init has done anything so that the new pages
882 * added to the table can be prepared properly for Xen.
883 */
884 xen_write_cr3(__pa(base));
885
886 /* Unpin initial Xen pagetable */
887 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
888 PFN_DOWN(__pa(xen_start_info->pt_base)));
889} 986}
890 987
891void xen_setup_shared_info(void) 988void xen_setup_shared_info(void)
892{ 989{
893 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 990 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
894 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 991 set_fixmap(FIX_PARAVIRT_BOOTMAP,
895 992 xen_start_info->shared_info);
896 /* 993
897 * Create a mapping for the shared info page. 994 HYPERVISOR_shared_info =
898 * Should be set_fixmap(), but shared_info is a machine 995 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
899 * address with no corresponding pseudo-phys address.
900 */
901 set_pte_mfn(addr,
902 PFN_DOWN(xen_start_info->shared_info),
903 PAGE_KERNEL);
904
905 HYPERVISOR_shared_info = (struct shared_info *)addr;
906 } else 996 } else
907 HYPERVISOR_shared_info = 997 HYPERVISOR_shared_info =
908 (struct shared_info *)__va(xen_start_info->shared_info); 998 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -917,26 +1007,32 @@ void xen_setup_shared_info(void)
917 1007
918static __init void xen_pagetable_setup_done(pgd_t *base) 1008static __init void xen_pagetable_setup_done(pgd_t *base)
919{ 1009{
920 /* This will work as long as patching hasn't happened yet
921 (which it hasn't) */
922 pv_mmu_ops.alloc_pte = xen_alloc_pte;
923 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
924 pv_mmu_ops.release_pte = xen_release_pte;
925 pv_mmu_ops.release_pmd = xen_release_pmd;
926 pv_mmu_ops.set_pte = xen_set_pte;
927
928 xen_setup_shared_info(); 1010 xen_setup_shared_info();
929
930 /* Actually pin the pagetable down, but we can't set PG_pinned
931 yet because the page structures don't exist yet. */
932 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
933} 1011}
934 1012
935static __init void xen_post_allocator_init(void) 1013static __init void xen_post_allocator_init(void)
936{ 1014{
1015 pv_mmu_ops.set_pte = xen_set_pte;
937 pv_mmu_ops.set_pmd = xen_set_pmd; 1016 pv_mmu_ops.set_pmd = xen_set_pmd;
938 pv_mmu_ops.set_pud = xen_set_pud; 1017 pv_mmu_ops.set_pud = xen_set_pud;
1018#if PAGETABLE_LEVELS == 4
1019 pv_mmu_ops.set_pgd = xen_set_pgd;
1020#endif
1021
1022 /* This will work as long as patching hasn't happened yet
1023 (which it hasn't) */
1024 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1025 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1026 pv_mmu_ops.release_pte = xen_release_pte;
1027 pv_mmu_ops.release_pmd = xen_release_pmd;
1028#if PAGETABLE_LEVELS == 4
1029 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1030 pv_mmu_ops.release_pud = xen_release_pud;
1031#endif
939 1032
1033#ifdef CONFIG_X86_64
1034 SetPagePinned(virt_to_page(level3_user_vsyscall));
1035#endif
940 xen_mark_init_mm_pinned(); 1036 xen_mark_init_mm_pinned();
941} 1037}
942 1038
@@ -950,6 +1046,7 @@ void xen_setup_vcpu_info_placement(void)
950 1046
951 /* xen_vcpu_setup managed to place the vcpu_info within the 1047 /* xen_vcpu_setup managed to place the vcpu_info within the
952 percpu area for all cpus, so make use of it */ 1048 percpu area for all cpus, so make use of it */
1049#ifdef CONFIG_X86_32
953 if (have_vcpu_info_placement) { 1050 if (have_vcpu_info_placement) {
954 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1051 printk(KERN_INFO "Xen: using vcpu_info placement\n");
955 1052
@@ -959,6 +1056,7 @@ void xen_setup_vcpu_info_placement(void)
959 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1056 pv_irq_ops.irq_enable = xen_irq_enable_direct;
960 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1057 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
961 } 1058 }
1059#endif
962} 1060}
963 1061
964static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1062static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -979,10 +1077,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
979 goto patch_site 1077 goto patch_site
980 1078
981 switch (type) { 1079 switch (type) {
1080#ifdef CONFIG_X86_32
982 SITE(pv_irq_ops, irq_enable); 1081 SITE(pv_irq_ops, irq_enable);
983 SITE(pv_irq_ops, irq_disable); 1082 SITE(pv_irq_ops, irq_disable);
984 SITE(pv_irq_ops, save_fl); 1083 SITE(pv_irq_ops, save_fl);
985 SITE(pv_irq_ops, restore_fl); 1084 SITE(pv_irq_ops, restore_fl);
1085#endif /* CONFIG_X86_32 */
986#undef SITE 1086#undef SITE
987 1087
988 patch_site: 1088 patch_site:
@@ -1025,8 +1125,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1025#ifdef CONFIG_X86_F00F_BUG 1125#ifdef CONFIG_X86_F00F_BUG
1026 case FIX_F00F_IDT: 1126 case FIX_F00F_IDT:
1027#endif 1127#endif
1128#ifdef CONFIG_X86_32
1028 case FIX_WP_TEST: 1129 case FIX_WP_TEST:
1029 case FIX_VDSO: 1130 case FIX_VDSO:
1131# ifdef CONFIG_HIGHMEM
1132 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1133# endif
1134#else
1135 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1136#endif
1030#ifdef CONFIG_X86_LOCAL_APIC 1137#ifdef CONFIG_X86_LOCAL_APIC
1031 case FIX_APIC_BASE: /* maps dummy local APIC */ 1138 case FIX_APIC_BASE: /* maps dummy local APIC */
1032#endif 1139#endif
@@ -1039,6 +1146,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1039 } 1146 }
1040 1147
1041 __native_set_fixmap(idx, pte); 1148 __native_set_fixmap(idx, pte);
1149
1150#ifdef CONFIG_X86_64
1151 /* Replicate changes to map the vsyscall page into the user
1152 pagetable vsyscall mapping. */
1153 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1154 unsigned long vaddr = __fix_to_virt(idx);
1155 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1156 }
1157#endif
1042} 1158}
1043 1159
1044static const struct pv_info xen_info __initdata = { 1160static const struct pv_info xen_info __initdata = {
@@ -1084,18 +1200,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1084 .wbinvd = native_wbinvd, 1200 .wbinvd = native_wbinvd,
1085 1201
1086 .read_msr = native_read_msr_safe, 1202 .read_msr = native_read_msr_safe,
1087 .write_msr = native_write_msr_safe, 1203 .write_msr = xen_write_msr_safe,
1088 .read_tsc = native_read_tsc, 1204 .read_tsc = native_read_tsc,
1089 .read_pmc = native_read_pmc, 1205 .read_pmc = native_read_pmc,
1090 1206
1091 .iret = xen_iret, 1207 .iret = xen_iret,
1092 .irq_enable_sysexit = xen_sysexit, 1208 .irq_enable_sysexit = xen_sysexit,
1209#ifdef CONFIG_X86_64
1210 .usergs_sysret32 = xen_sysret32,
1211 .usergs_sysret64 = xen_sysret64,
1212#endif
1093 1213
1094 .load_tr_desc = paravirt_nop, 1214 .load_tr_desc = paravirt_nop,
1095 .set_ldt = xen_set_ldt, 1215 .set_ldt = xen_set_ldt,
1096 .load_gdt = xen_load_gdt, 1216 .load_gdt = xen_load_gdt,
1097 .load_idt = xen_load_idt, 1217 .load_idt = xen_load_idt,
1098 .load_tls = xen_load_tls, 1218 .load_tls = xen_load_tls,
1219#ifdef CONFIG_X86_64
1220 .load_gs_index = xen_load_gs_index,
1221#endif
1099 1222
1100 .store_gdt = native_store_gdt, 1223 .store_gdt = native_store_gdt,
1101 .store_idt = native_store_idt, 1224 .store_idt = native_store_idt,
@@ -1109,14 +1232,34 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1109 .set_iopl_mask = xen_set_iopl_mask, 1232 .set_iopl_mask = xen_set_iopl_mask,
1110 .io_delay = xen_io_delay, 1233 .io_delay = xen_io_delay,
1111 1234
1235 /* Xen takes care of %gs when switching to usermode for us */
1236 .swapgs = paravirt_nop,
1237
1112 .lazy_mode = { 1238 .lazy_mode = {
1113 .enter = paravirt_enter_lazy_cpu, 1239 .enter = paravirt_enter_lazy_cpu,
1114 .leave = xen_leave_lazy, 1240 .leave = xen_leave_lazy,
1115 }, 1241 },
1116}; 1242};
1117 1243
1244static void __init __xen_init_IRQ(void)
1245{
1246#ifdef CONFIG_X86_64
1247 int i;
1248
1249 /* Create identity vector->irq map */
1250 for(i = 0; i < NR_VECTORS; i++) {
1251 int cpu;
1252
1253 for_each_possible_cpu(cpu)
1254 per_cpu(vector_irq, cpu)[i] = i;
1255 }
1256#endif /* CONFIG_X86_64 */
1257
1258 xen_init_IRQ();
1259}
1260
1118static const struct pv_irq_ops xen_irq_ops __initdata = { 1261static const struct pv_irq_ops xen_irq_ops __initdata = {
1119 .init_IRQ = xen_init_IRQ, 1262 .init_IRQ = __xen_init_IRQ,
1120 .save_fl = xen_save_fl, 1263 .save_fl = xen_save_fl,
1121 .restore_fl = xen_restore_fl, 1264 .restore_fl = xen_restore_fl,
1122 .irq_disable = xen_irq_disable, 1265 .irq_disable = xen_irq_disable,
@@ -1124,14 +1267,13 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
1124 .safe_halt = xen_safe_halt, 1267 .safe_halt = xen_safe_halt,
1125 .halt = xen_halt, 1268 .halt = xen_halt,
1126#ifdef CONFIG_X86_64 1269#ifdef CONFIG_X86_64
1127 .adjust_exception_frame = paravirt_nop, 1270 .adjust_exception_frame = xen_adjust_exception_frame,
1128#endif 1271#endif
1129}; 1272};
1130 1273
1131static const struct pv_apic_ops xen_apic_ops __initdata = { 1274static const struct pv_apic_ops xen_apic_ops __initdata = {
1132#ifdef CONFIG_X86_LOCAL_APIC 1275#ifdef CONFIG_X86_LOCAL_APIC
1133 .apic_write = xen_apic_write, 1276 .apic_write = xen_apic_write,
1134 .apic_write_atomic = xen_apic_write,
1135 .apic_read = xen_apic_read, 1277 .apic_read = xen_apic_read,
1136 .setup_boot_clock = paravirt_nop, 1278 .setup_boot_clock = paravirt_nop,
1137 .setup_secondary_clock = paravirt_nop, 1279 .setup_secondary_clock = paravirt_nop,
@@ -1157,8 +1299,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1157 .pte_update = paravirt_nop, 1299 .pte_update = paravirt_nop,
1158 .pte_update_defer = paravirt_nop, 1300 .pte_update_defer = paravirt_nop,
1159 1301
1160 .pgd_alloc = __paravirt_pgd_alloc, 1302 .pgd_alloc = xen_pgd_alloc,
1161 .pgd_free = paravirt_nop, 1303 .pgd_free = xen_pgd_free,
1162 1304
1163 .alloc_pte = xen_alloc_pte_init, 1305 .alloc_pte = xen_alloc_pte_init,
1164 .release_pte = xen_release_pte_init, 1306 .release_pte = xen_release_pte_init,
@@ -1170,7 +1312,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1170 .kmap_atomic_pte = xen_kmap_atomic_pte, 1312 .kmap_atomic_pte = xen_kmap_atomic_pte,
1171#endif 1313#endif
1172 1314
1173 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1315#ifdef CONFIG_X86_64
1316 .set_pte = xen_set_pte,
1317#else
1318 .set_pte = xen_set_pte_init,
1319#endif
1174 .set_pte_at = xen_set_pte_at, 1320 .set_pte_at = xen_set_pte_at,
1175 .set_pmd = xen_set_pmd_hyper, 1321 .set_pmd = xen_set_pmd_hyper,
1176 1322
@@ -1184,15 +1330,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1184 .make_pte = xen_make_pte, 1330 .make_pte = xen_make_pte,
1185 .make_pgd = xen_make_pgd, 1331 .make_pgd = xen_make_pgd,
1186 1332
1333#ifdef CONFIG_X86_PAE
1187 .set_pte_atomic = xen_set_pte_atomic, 1334 .set_pte_atomic = xen_set_pte_atomic,
1188 .set_pte_present = xen_set_pte_at, 1335 .set_pte_present = xen_set_pte_at,
1189 .set_pud = xen_set_pud_hyper,
1190 .pte_clear = xen_pte_clear, 1336 .pte_clear = xen_pte_clear,
1191 .pmd_clear = xen_pmd_clear, 1337 .pmd_clear = xen_pmd_clear,
1338#endif /* CONFIG_X86_PAE */
1339 .set_pud = xen_set_pud_hyper,
1192 1340
1193 .make_pmd = xen_make_pmd, 1341 .make_pmd = xen_make_pmd,
1194 .pmd_val = xen_pmd_val, 1342 .pmd_val = xen_pmd_val,
1195 1343
1344#if PAGETABLE_LEVELS == 4
1345 .pud_val = xen_pud_val,
1346 .make_pud = xen_make_pud,
1347 .set_pgd = xen_set_pgd_hyper,
1348
1349 .alloc_pud = xen_alloc_pte_init,
1350 .release_pud = xen_release_pte_init,
1351#endif /* PAGETABLE_LEVELS == 4 */
1352
1196 .activate_mm = xen_activate_mm, 1353 .activate_mm = xen_activate_mm,
1197 .dup_mmap = xen_dup_mmap, 1354 .dup_mmap = xen_dup_mmap,
1198 .exit_mmap = xen_exit_mmap, 1355 .exit_mmap = xen_exit_mmap,
@@ -1205,21 +1362,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1205 .set_fixmap = xen_set_fixmap, 1362 .set_fixmap = xen_set_fixmap,
1206}; 1363};
1207 1364
1208#ifdef CONFIG_SMP
1209static const struct smp_ops xen_smp_ops __initdata = {
1210 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1211 .smp_prepare_cpus = xen_smp_prepare_cpus,
1212 .cpu_up = xen_cpu_up,
1213 .smp_cpus_done = xen_smp_cpus_done,
1214
1215 .smp_send_stop = xen_smp_send_stop,
1216 .smp_send_reschedule = xen_smp_send_reschedule,
1217
1218 .send_call_func_ipi = xen_smp_send_call_function_ipi,
1219 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
1220};
1221#endif /* CONFIG_SMP */
1222
1223static void xen_reboot(int reason) 1365static void xen_reboot(int reason)
1224{ 1366{
1225 struct sched_shutdown r = { .reason = reason }; 1367 struct sched_shutdown r = { .reason = reason };
@@ -1264,6 +1406,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
1264 1406
1265static void __init xen_reserve_top(void) 1407static void __init xen_reserve_top(void)
1266{ 1408{
1409#ifdef CONFIG_X86_32
1267 unsigned long top = HYPERVISOR_VIRT_START; 1410 unsigned long top = HYPERVISOR_VIRT_START;
1268 struct xen_platform_parameters pp; 1411 struct xen_platform_parameters pp;
1269 1412
@@ -1271,8 +1414,248 @@ static void __init xen_reserve_top(void)
1271 top = pp.virt_start; 1414 top = pp.virt_start;
1272 1415
1273 reserve_top_address(-top + 2 * PAGE_SIZE); 1416 reserve_top_address(-top + 2 * PAGE_SIZE);
1417#endif /* CONFIG_X86_32 */
1418}
1419
1420/*
1421 * Like __va(), but returns address in the kernel mapping (which is
1422 * all we have until the physical memory mapping has been set up.
1423 */
1424static void *__ka(phys_addr_t paddr)
1425{
1426#ifdef CONFIG_X86_64
1427 return (void *)(paddr + __START_KERNEL_map);
1428#else
1429 return __va(paddr);
1430#endif
1274} 1431}
1275 1432
1433/* Convert a machine address to physical address */
1434static unsigned long m2p(phys_addr_t maddr)
1435{
1436 phys_addr_t paddr;
1437
1438 maddr &= PTE_PFN_MASK;
1439 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1440
1441 return paddr;
1442}
1443
1444/* Convert a machine address to kernel virtual */
1445static void *m2v(phys_addr_t maddr)
1446{
1447 return __ka(m2p(maddr));
1448}
1449
1450#ifdef CONFIG_X86_64
1451static void walk(pgd_t *pgd, unsigned long addr)
1452{
1453 unsigned l4idx = pgd_index(addr);
1454 unsigned l3idx = pud_index(addr);
1455 unsigned l2idx = pmd_index(addr);
1456 unsigned l1idx = pte_index(addr);
1457 pgd_t l4;
1458 pud_t l3;
1459 pmd_t l2;
1460 pte_t l1;
1461
1462 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1463 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1464
1465 l4 = pgd[l4idx];
1466 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1467 xen_raw_printk(" %016lx\n", pgd_val(l4));
1468
1469 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1470 xen_raw_printk(" l3: %016lx\n", l3.pud);
1471 xen_raw_printk(" %016lx\n", pud_val(l3));
1472
1473 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1474 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1475 xen_raw_printk(" %016lx\n", pmd_val(l2));
1476
1477 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1478 xen_raw_printk(" l1: %016lx\n", l1.pte);
1479 xen_raw_printk(" %016lx\n", pte_val(l1));
1480}
1481#endif
1482
1483static void set_page_prot(void *addr, pgprot_t prot)
1484{
1485 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1486 pte_t pte = pfn_pte(pfn, prot);
1487
1488 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1489 addr, pfn, get_phys_to_machine(pfn),
1490 pgprot_val(prot), pte.pte);
1491
1492 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1493 BUG();
1494}
1495
1496static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1497{
1498 unsigned pmdidx, pteidx;
1499 unsigned ident_pte;
1500 unsigned long pfn;
1501
1502 ident_pte = 0;
1503 pfn = 0;
1504 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1505 pte_t *pte_page;
1506
1507 /* Reuse or allocate a page of ptes */
1508 if (pmd_present(pmd[pmdidx]))
1509 pte_page = m2v(pmd[pmdidx].pmd);
1510 else {
1511 /* Check for free pte pages */
1512 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1513 break;
1514
1515 pte_page = &level1_ident_pgt[ident_pte];
1516 ident_pte += PTRS_PER_PTE;
1517
1518 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1519 }
1520
1521 /* Install mappings */
1522 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1523 pte_t pte;
1524
1525 if (pfn > max_pfn_mapped)
1526 max_pfn_mapped = pfn;
1527
1528 if (!pte_none(pte_page[pteidx]))
1529 continue;
1530
1531 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1532 pte_page[pteidx] = pte;
1533 }
1534 }
1535
1536 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1537 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1538
1539 set_page_prot(pmd, PAGE_KERNEL_RO);
1540}
1541
1542#ifdef CONFIG_X86_64
1543static void convert_pfn_mfn(void *v)
1544{
1545 pte_t *pte = v;
1546 int i;
1547
1548 /* All levels are converted the same way, so just treat them
1549 as ptes. */
1550 for(i = 0; i < PTRS_PER_PTE; i++)
1551 pte[i] = xen_make_pte(pte[i].pte);
1552}
1553
1554/*
1555 * Set up the inital kernel pagetable.
1556 *
1557 * We can construct this by grafting the Xen provided pagetable into
1558 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1559 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1560 * means that only the kernel has a physical mapping to start with -
1561 * but that's enough to get __va working. We need to fill in the rest
1562 * of the physical mapping once some sort of allocator has been set
1563 * up.
1564 */
1565static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1566{
1567 pud_t *l3;
1568 pmd_t *l2;
1569
1570 /* Zap identity mapping */
1571 init_level4_pgt[0] = __pgd(0);
1572
1573 /* Pre-constructed entries are in pfn, so convert to mfn */
1574 convert_pfn_mfn(init_level4_pgt);
1575 convert_pfn_mfn(level3_ident_pgt);
1576 convert_pfn_mfn(level3_kernel_pgt);
1577
1578 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1579 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1580
1581 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1582 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1583
1584 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1585 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1586 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1587
1588 /* Set up identity map */
1589 xen_map_identity_early(level2_ident_pgt, max_pfn);
1590
1591 /* Make pagetable pieces RO */
1592 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1593 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1594 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1595 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1596 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1597 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1598
1599 /* Pin down new L4 */
1600 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1601 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1602
1603 /* Unpin Xen-provided one */
1604 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1605
1606 /* Switch over */
1607 pgd = init_level4_pgt;
1608
1609 /*
1610 * At this stage there can be no user pgd, and no page
1611 * structure to attach it to, so make sure we just set kernel
1612 * pgd.
1613 */
1614 xen_mc_batch();
1615 __xen_write_cr3(true, __pa(pgd));
1616 xen_mc_issue(PARAVIRT_LAZY_CPU);
1617
1618 reserve_early(__pa(xen_start_info->pt_base),
1619 __pa(xen_start_info->pt_base +
1620 xen_start_info->nr_pt_frames * PAGE_SIZE),
1621 "XEN PAGETABLES");
1622
1623 return pgd;
1624}
1625#else /* !CONFIG_X86_64 */
1626static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1627
1628static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1629{
1630 pmd_t *kernel_pmd;
1631
1632 init_pg_tables_start = __pa(pgd);
1633 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1634 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1635
1636 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1637 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1638
1639 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1640
1641 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1642 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1643 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1644
1645 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1646 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1647 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1648
1649 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1650
1651 xen_write_cr3(__pa(swapper_pg_dir));
1652
1653 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1654
1655 return swapper_pg_dir;
1656}
1657#endif /* CONFIG_X86_64 */
1658
1276/* First C function to be called on Xen boot */ 1659/* First C function to be called on Xen boot */
1277asmlinkage void __init xen_start_kernel(void) 1660asmlinkage void __init xen_start_kernel(void)
1278{ 1661{
@@ -1301,53 +1684,56 @@ asmlinkage void __init xen_start_kernel(void)
1301 1684
1302 machine_ops = xen_machine_ops; 1685 machine_ops = xen_machine_ops;
1303 1686
1304#ifdef CONFIG_SMP 1687#ifdef CONFIG_X86_64
1305 smp_ops = xen_smp_ops; 1688 /* Disable until direct per-cpu data access. */
1689 have_vcpu_info_placement = 0;
1690 x86_64_init_pda();
1306#endif 1691#endif
1307 1692
1693 xen_smp_init();
1694
1308 /* Get mfn list */ 1695 /* Get mfn list */
1309 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1696 if (!xen_feature(XENFEAT_auto_translated_physmap))
1310 xen_build_dynamic_phys_to_machine(); 1697 xen_build_dynamic_phys_to_machine();
1311 1698
1312 pgd = (pgd_t *)xen_start_info->pt_base; 1699 pgd = (pgd_t *)xen_start_info->pt_base;
1313 1700
1314 init_pg_tables_start = __pa(pgd); 1701 /* Prevent unwanted bits from being set in PTEs. */
1315 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1702 __supported_pte_mask &= ~_PAGE_GLOBAL;
1316 max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; 1703 if (!is_initial_xendomain())
1317 1704 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1318 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1319
1320 /* keep using Xen gdt for now; no urgent need to change it */
1321
1322 x86_write_percpu(xen_cr3, __pa(pgd));
1323 x86_write_percpu(xen_current_cr3, __pa(pgd));
1324 1705
1325 /* Don't do the full vcpu_info placement stuff until we have a 1706 /* Don't do the full vcpu_info placement stuff until we have a
1326 possible map and a non-dummy shared_info. */ 1707 possible map and a non-dummy shared_info. */
1327 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1708 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1328 1709
1710 xen_raw_console_write("mapping kernel into physical memory\n");
1711 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1712
1713 init_mm.pgd = pgd;
1714
1715 /* keep using Xen gdt for now; no urgent need to change it */
1716
1329 pv_info.kernel_rpl = 1; 1717 pv_info.kernel_rpl = 1;
1330 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1718 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1331 pv_info.kernel_rpl = 0; 1719 pv_info.kernel_rpl = 0;
1332 1720
1333 /* Prevent unwanted bits from being set in PTEs. */
1334 __supported_pte_mask &= ~_PAGE_GLOBAL;
1335 if (!is_initial_xendomain())
1336 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1337
1338 /* set the limit of our address space */ 1721 /* set the limit of our address space */
1339 xen_reserve_top(); 1722 xen_reserve_top();
1340 1723
1724#ifdef CONFIG_X86_32
1341 /* set up basic CPUID stuff */ 1725 /* set up basic CPUID stuff */
1342 cpu_detect(&new_cpu_data); 1726 cpu_detect(&new_cpu_data);
1343 new_cpu_data.hard_math = 1; 1727 new_cpu_data.hard_math = 1;
1344 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1728 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1729#endif
1345 1730
1346 /* Poke various useful things into boot_params */ 1731 /* Poke various useful things into boot_params */
1347 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1732 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1348 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1733 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1349 ? __pa(xen_start_info->mod_start) : 0; 1734 ? __pa(xen_start_info->mod_start) : 0;
1350 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1735 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1736 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1351 1737
1352 if (!is_initial_xendomain()) { 1738 if (!is_initial_xendomain()) {
1353 add_preferred_console("xenboot", 0, NULL); 1739 add_preferred_console("xenboot", 0, NULL);
@@ -1355,6 +1741,21 @@ asmlinkage void __init xen_start_kernel(void)
1355 add_preferred_console("hvc", 0, NULL); 1741 add_preferred_console("hvc", 0, NULL);
1356 } 1742 }
1357 1743
1744 xen_raw_console_write("about to get started...\n");
1745
1746#if 0
1747 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1748 &boot_params, __pa_symbol(&boot_params),
1749 __va(__pa_symbol(&boot_params)));
1750
1751 walk(pgd, &boot_params);
1752 walk(pgd, __va(__pa(&boot_params)));
1753#endif
1754
1358 /* Start the world */ 1755 /* Start the world */
1756#ifdef CONFIG_X86_32
1359 i386_start_kernel(); 1757 i386_start_kernel();
1758#else
1759 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1760#endif
1360} 1761}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ff0aa74afaa1..aa37469da696 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
44 44
45#include <asm/pgtable.h> 45#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
47#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 48#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/linkage.h>
49 51
50#include <asm/xen/hypercall.h> 52#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
56#include "multicalls.h" 58#include "multicalls.h"
57#include "mmu.h" 59#include "mmu.h"
58 60
61/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a
63 * redzone above it, so round it up to a PGD boundary.
64 */
65#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
66
67
59#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) 68#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
60#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) 69#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
61 70
62/* Placeholder for holes in the address space */ 71/* Placeholder for holes in the address space */
63static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] 72static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
64 __attribute__((section(".data.page_aligned"))) =
65 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; 73 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
66 74
67 /* Array of pointers to pages containing p2m entries */ 75 /* Array of pointers to pages containing p2m entries */
68static unsigned long *p2m_top[TOP_ENTRIES] 76static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
69 __attribute__((section(".data.page_aligned"))) =
70 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; 77 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
71 78
72/* Arrays of p2m arrays expressed in mfns used for save/restore */ 79/* Arrays of p2m arrays expressed in mfns used for save/restore */
73static unsigned long p2m_top_mfn[TOP_ENTRIES] 80static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
74 __attribute__((section(".bss.page_aligned")));
75 81
76static unsigned long p2m_top_mfn_list[ 82static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
77 PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] 83 __page_aligned_bss;
78 __attribute__((section(".bss.page_aligned")));
79 84
80static inline unsigned p2m_top_index(unsigned long pfn) 85static inline unsigned p2m_top_index(unsigned long pfn)
81{ 86{
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
181 p2m_top[topidx][idx] = mfn; 186 p2m_top[topidx][idx] = mfn;
182} 187}
183 188
184xmaddr_t arbitrary_virt_to_machine(unsigned long address) 189xmaddr_t arbitrary_virt_to_machine(void *vaddr)
185{ 190{
191 unsigned long address = (unsigned long)vaddr;
186 unsigned int level; 192 unsigned int level;
187 pte_t *pte = lookup_address(address, &level); 193 pte_t *pte = lookup_address(address, &level);
188 unsigned offset = address & ~PAGE_MASK; 194 unsigned offset = address & ~PAGE_MASK;
189 195
190 BUG_ON(pte == NULL); 196 BUG_ON(pte == NULL);
191 197
192 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 198 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
193} 199}
194 200
195void make_lowmem_page_readonly(void *vaddr) 201void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
256 262
257 xen_mc_batch(); 263 xen_mc_batch();
258 264
259 u.ptr = virt_to_machine(ptr).maddr; 265 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
260 u.val = pmd_val_ma(val); 267 u.val = pmd_val_ma(val);
261 extend_mmu_update(&u); 268 extend_mmu_update(&u);
262 269
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
283 */ 290 */
284void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 291void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
285{ 292{
286 pgd_t *pgd; 293 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
287 pud_t *pud;
288 pmd_t *pmd;
289 pte_t *pte;
290
291 pgd = swapper_pg_dir + pgd_index(vaddr);
292 if (pgd_none(*pgd)) {
293 BUG();
294 return;
295 }
296 pud = pud_offset(pgd, vaddr);
297 if (pud_none(*pud)) {
298 BUG();
299 return;
300 }
301 pmd = pmd_offset(pud, vaddr);
302 if (pmd_none(*pmd)) {
303 BUG();
304 return;
305 }
306 pte = pte_offset_kernel(pmd, vaddr);
307 /* <mfn,flags> stored as-is, to permit clearing entries */
308 xen_set_pte(pte, mfn_pte(mfn, flags));
309
310 /*
311 * It's enough to flush this one mapping.
312 * (PGE mappings get flushed as well)
313 */
314 __flush_tlb_one(vaddr);
315} 294}
316 295
317void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 296void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -364,8 +343,8 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
364static pteval_t pte_mfn_to_pfn(pteval_t val) 343static pteval_t pte_mfn_to_pfn(pteval_t val)
365{ 344{
366 if (val & _PAGE_PRESENT) { 345 if (val & _PAGE_PRESENT) {
367 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; 346 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
368 pteval_t flags = val & ~PTE_MASK; 347 pteval_t flags = val & PTE_FLAGS_MASK;
369 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; 348 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
370 } 349 }
371 350
@@ -375,8 +354,8 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
375static pteval_t pte_pfn_to_mfn(pteval_t val) 354static pteval_t pte_pfn_to_mfn(pteval_t val)
376{ 355{
377 if (val & _PAGE_PRESENT) { 356 if (val & _PAGE_PRESENT) {
378 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; 357 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
379 pteval_t flags = val & ~PTE_MASK; 358 pteval_t flags = val & PTE_FLAGS_MASK;
380 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 359 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
381 } 360 }
382 361
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
418 397
419 xen_mc_batch(); 398 xen_mc_batch();
420 399
421 u.ptr = virt_to_machine(ptr).maddr; 400 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
422 u.val = pud_val_ma(val); 402 u.val = pud_val_ma(val);
423 extend_mmu_update(&u); 403 extend_mmu_update(&u);
424 404
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
441 421
442void xen_set_pte(pte_t *ptep, pte_t pte) 422void xen_set_pte(pte_t *ptep, pte_t pte)
443{ 423{
424#ifdef CONFIG_X86_PAE
444 ptep->pte_high = pte.pte_high; 425 ptep->pte_high = pte.pte_high;
445 smp_wmb(); 426 smp_wmb();
446 ptep->pte_low = pte.pte_low; 427 ptep->pte_low = pte.pte_low;
428#else
429 *ptep = pte;
430#endif
447} 431}
448 432
433#ifdef CONFIG_X86_PAE
449void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 434void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
450{ 435{
451 set_64bit((u64 *)ptep, pte_val_ma(pte)); 436 set_64bit((u64 *)ptep, native_pte_val(pte));
452} 437}
453 438
454void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 439void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
462{ 447{
463 set_pmd(pmdp, __pmd(0)); 448 set_pmd(pmdp, __pmd(0));
464} 449}
450#endif /* CONFIG_X86_PAE */
465 451
466pmd_t xen_make_pmd(pmdval_t pmd) 452pmd_t xen_make_pmd(pmdval_t pmd)
467{ 453{
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
469 return native_make_pmd(pmd); 455 return native_make_pmd(pmd);
470} 456}
471 457
458#if PAGETABLE_LEVELS == 4
459pudval_t xen_pud_val(pud_t pud)
460{
461 return pte_mfn_to_pfn(pud.pud);
462}
463
464pud_t xen_make_pud(pudval_t pud)
465{
466 pud = pte_pfn_to_mfn(pud);
467
468 return native_make_pud(pud);
469}
470
471pgd_t *xen_get_user_pgd(pgd_t *pgd)
472{
473 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
474 unsigned offset = pgd - pgd_page;
475 pgd_t *user_ptr = NULL;
476
477 if (offset < pgd_index(USER_LIMIT)) {
478 struct page *page = virt_to_page(pgd_page);
479 user_ptr = (pgd_t *)page->private;
480 if (user_ptr)
481 user_ptr += offset;
482 }
483
484 return user_ptr;
485}
486
487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
488{
489 struct mmu_update u;
490
491 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u);
494}
495
496/*
497 * Raw hypercall-based set_pgd, intended for in early boot before
498 * there's a page structure. This implies:
499 * 1. The only existing pagetable is the kernel's
500 * 2. It is always pinned
501 * 3. It has no user pagetable attached to it
502 */
503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
504{
505 preempt_disable();
506
507 xen_mc_batch();
508
509 __xen_set_pgd_hyper(ptr, val);
510
511 xen_mc_issue(PARAVIRT_LAZY_MMU);
512
513 preempt_enable();
514}
515
516void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519
520 /* If page is not pinned, we can just update the entry
521 directly */
522 if (!page_pinned(ptr)) {
523 *ptr = val;
524 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr));
526 *user_ptr = val;
527 }
528 return;
529 }
530
531 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */
533 xen_mc_batch();
534
535 __xen_set_pgd_hyper(ptr, val);
536 if (user_ptr)
537 __xen_set_pgd_hyper(user_ptr, val);
538
539 xen_mc_issue(PARAVIRT_LAZY_MMU);
540}
541#endif /* PAGETABLE_LEVELS == 4 */
542
472/* 543/*
473 (Yet another) pagetable walker. This one is intended for pinning a 544 * (Yet another) pagetable walker. This one is intended for pinning a
474 pagetable. This means that it walks a pagetable and calls the 545 * pagetable. This means that it walks a pagetable and calls the
475 callback function on each page it finds making up the page table, 546 * callback function on each page it finds making up the page table,
476 at every level. It walks the entire pagetable, but it only bothers 547 * at every level. It walks the entire pagetable, but it only bothers
477 pinning pte pages which are below pte_limit. In the normal case 548 * pinning pte pages which are below limit. In the normal case this
478 this will be TASK_SIZE, but at boot we need to pin up to 549 * will be STACK_TOP_MAX, but at boot we need to pin up to
479 FIXADDR_TOP. But the important bit is that we don't pin beyond 550 * FIXADDR_TOP.
480 there, because then we start getting into Xen's ptes. 551 *
481*/ 552 * For 32-bit the important bit is that we don't pin beyond there,
482static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 553 * because then we start getting into Xen's ptes.
554 *
555 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole.
557 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
483 unsigned long limit) 559 unsigned long limit)
484{ 560{
485 pgd_t *pgd = pgd_base;
486 int flush = 0; 561 int flush = 0;
487 unsigned long addr = 0; 562 unsigned hole_low, hole_high;
488 unsigned long pgd_next; 563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
564 unsigned pgdidx, pudidx, pmdidx;
489 565
490 BUG_ON(limit > FIXADDR_TOP); 566 /* The limit is the last byte to be touched */
567 limit--;
568 BUG_ON(limit >= FIXADDR_TOP);
491 569
492 if (xen_feature(XENFEAT_auto_translated_physmap)) 570 if (xen_feature(XENFEAT_auto_translated_physmap))
493 return 0; 571 return 0;
494 572
495 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 573 /*
574 * 64-bit has a great big hole in the middle of the address
575 * space, which contains the Xen mappings. On 32-bit these
576 * will end up making a zero-sized hole and so is a no-op.
577 */
578 hole_low = pgd_index(USER_LIMIT);
579 hole_high = pgd_index(PAGE_OFFSET);
580
581 pgdidx_limit = pgd_index(limit);
582#if PTRS_PER_PUD > 1
583 pudidx_limit = pud_index(limit);
584#else
585 pudidx_limit = 0;
586#endif
587#if PTRS_PER_PMD > 1
588 pmdidx_limit = pmd_index(limit);
589#else
590 pmdidx_limit = 0;
591#endif
592
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
496 pud_t *pud; 596 pud_t *pud;
497 unsigned long pud_limit, pud_next;
498 597
499 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 598 if (pgdidx >= hole_low && pgdidx < hole_high)
599 continue;
500 600
501 if (!pgd_val(*pgd)) 601 if (!pgd_val(pgd[pgdidx]))
502 continue; 602 continue;
503 603
504 pud = pud_offset(pgd, 0); 604 pud = pud_offset(&pgd[pgdidx], 0);
505 605
506 if (PTRS_PER_PUD > 1) /* not folded */ 606 if (PTRS_PER_PUD > 1) /* not folded */
507 flush |= (*func)(virt_to_page(pud), PT_PUD); 607 flush |= (*func)(virt_to_page(pud), PT_PUD);
508 608
509 for (; addr != pud_limit; pud++, addr = pud_next) { 609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
510 pmd_t *pmd; 610 pmd_t *pmd;
511 unsigned long pmd_limit;
512 611
513 pud_next = pud_addr_end(addr, pud_limit); 612 if (pgdidx == pgdidx_limit &&
514 613 pudidx > pudidx_limit)
515 if (pud_next < limit) 614 goto out;
516 pmd_limit = pud_next;
517 else
518 pmd_limit = limit;
519 615
520 if (pud_none(*pud)) 616 if (pud_none(pud[pudidx]))
521 continue; 617 continue;
522 618
523 pmd = pmd_offset(pud, 0); 619 pmd = pmd_offset(&pud[pudidx], 0);
524 620
525 if (PTRS_PER_PMD > 1) /* not folded */ 621 if (PTRS_PER_PMD > 1) /* not folded */
526 flush |= (*func)(virt_to_page(pmd), PT_PMD); 622 flush |= (*func)(virt_to_page(pmd), PT_PMD);
527 623
528 for (; addr != pmd_limit; pmd++) { 624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
529 addr += (PAGE_SIZE * PTRS_PER_PTE); 625 struct page *pte;
530 if ((pmd_limit-1) < (addr-1)) { 626
531 addr = pmd_limit; 627 if (pgdidx == pgdidx_limit &&
532 break; 628 pudidx == pudidx_limit &&
533 } 629 pmdidx > pmdidx_limit)
630 goto out;
534 631
535 if (pmd_none(*pmd)) 632 if (pmd_none(pmd[pmdidx]))
536 continue; 633 continue;
537 634
538 flush |= (*func)(pmd_page(*pmd), PT_PTE); 635 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE);
539 } 637 }
540 } 638 }
541 } 639 }
542 640out:
543 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
544 641
545 return flush; 642 return flush;
546} 643}
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
622{ 719{
623 xen_mc_batch(); 720 xen_mc_batch();
624 721
625 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
626 /* re-enable interrupts for kmap_flush_unused */ 723 /* re-enable interrupts for kmap_flush_unused */
627 xen_mc_issue(0); 724 xen_mc_issue(0);
628 kmap_flush_unused(); 725 kmap_flush_unused();
629 xen_mc_batch(); 726 xen_mc_batch();
630 } 727 }
631 728
729#ifdef CONFIG_X86_64
730 {
731 pgd_t *user_pgd = xen_get_user_pgd(pgd);
732
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734
735 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 }
739 }
740#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
744#endif
632 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */
633 xen_mc_issue(0); 747 xen_mc_issue(0);
634} 748}
635 749
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
656 spin_unlock_irqrestore(&pgd_lock, flags); 770 spin_unlock_irqrestore(&pgd_lock, flags);
657} 771}
658 772
659/* The init_mm pagetable is really pinned as soon as its created, but 773/*
660 that's before we have page structures to store the bits. So do all 774 * The init_mm pagetable is really pinned as soon as its created, but
661 the book-keeping now. */ 775 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now.
777 */
662static __init int mark_pinned(struct page *page, enum pt_level level) 778static __init int mark_pinned(struct page *page, enum pt_level level)
663{ 779{
664 SetPagePinned(page); 780 SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
708 824
709 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 825 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
710 826
711 pgd_walk(pgd, unpin_page, TASK_SIZE); 827#ifdef CONFIG_X86_64
828 {
829 pgd_t *user_pgd = xen_get_user_pgd(pgd);
830
831 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD);
834 }
835 }
836#endif
837
838#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
841#endif
842
843 pgd_walk(pgd, unpin_page, USER_LIMIT);
712 844
713 xen_mc_issue(0); 845 xen_mc_issue(0);
714} 846}
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
727 list_for_each_entry(page, &pgd_list, lru) { 859 list_for_each_entry(page, &pgd_list, lru) {
728 if (PageSavePinned(page)) { 860 if (PageSavePinned(page)) {
729 BUG_ON(!PagePinned(page)); 861 BUG_ON(!PagePinned(page));
730 printk("unpinning pinned %p\n", page_address(page));
731 xen_pgd_unpin((pgd_t *)page_address(page)); 862 xen_pgd_unpin((pgd_t *)page_address(page));
732 ClearPageSavePinned(page); 863 ClearPageSavePinned(page);
733 } 864 }
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
757static void drop_other_mm_ref(void *info) 888static void drop_other_mm_ref(void *info)
758{ 889{
759 struct mm_struct *mm = info; 890 struct mm_struct *mm = info;
891 struct mm_struct *active_mm;
892
893#ifdef CONFIG_X86_64
894 active_mm = read_pda(active_mm);
895#else
896 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
897#endif
760 898
761 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 899 if (active_mm == mm)
762 leave_mm(smp_processor_id()); 900 leave_mm(smp_processor_id());
763 901
764 /* If this cpu still has a stale cr3 reference, then make sure 902 /* If this cpu still has a stale cr3 reference, then make sure
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 297bf9f5b8bc..0f59bd03f9e3 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,18 +10,6 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
44void xen_set_pte(pte_t *ptep, pte_t pteval); 32void xen_set_pte(pte_t *ptep, pte_t pteval);
45void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
46 pte_t *ptep, pte_t pteval); 34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
47void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
48void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); 42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
49void xen_set_pud(pud_t *ptr, pud_t val); 43void xen_set_pud(pud_t *ptr, pud_t val);
50void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); 44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
51void xen_set_pud_hyper(pud_t *ptr, pud_t val); 45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 46
53void xen_pmd_clear(pmd_t *pmdp); 47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
54 55
55pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
56void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 3c63c4da7ed1..9efd1c6c9776 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
76 if (ret) { 76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 78 ret, smp_processor_id());
79 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 80 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 81 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 82 i+1, b->mcidx,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index e0a39595bde3..b6acc3a0af46 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -83,30 +83,72 @@ static void xen_idle(void)
83 83
84/* 84/*
85 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
86 */ 88 */
87static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
88{ 90{
89 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
90 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
91 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
92} 98}
93 99
94void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
95{ 101{
96 int cpu = smp_processor_id(); 102 struct callback_register callback = {
97 extern void xen_sysenter_target(void); 103 .type = type,
98 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
99 static struct callback_register sysenter = {
100 .type = CALLBACKTYPE_sysenter,
101 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
102 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
103 }; 106 };
104 107
105 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
106 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
107 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
108 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
127 if(ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
109 } 143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
150 }
151#endif /* CONFIG_X86_64 */
110} 152}
111 153
112void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
120 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
121 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
122 164
123 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
124 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
125 168
126 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
127 171
128 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
129 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
143 187
144 pm_idle = xen_idle; 188 pm_idle = xen_idle;
145 189
146#ifdef CONFIG_SMP
147 /* fill cpus_possible with all available cpus */
148 xen_fill_possible_map();
149#endif
150
151 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
152 191
153 fiddle_vdso(); 192 fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 233156f39b7f..d8faf79a0a1d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -15,6 +15,7 @@
15 * This does not handle HOTPLUG_CPU yet. 15 * This does not handle HOTPLUG_CPU yet.
16 */ 16 */
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/kernel_stat.h>
18#include <linux/err.h> 19#include <linux/err.h>
19#include <linux/smp.h> 20#include <linux/smp.h>
20 21
@@ -35,6 +36,8 @@
35#include "xen-ops.h" 36#include "xen-ops.h"
36#include "mmu.h" 37#include "mmu.h"
37 38
39static void __cpuinit xen_init_lock_cpu(int cpu);
40
38cpumask_t xen_cpu_initialized_map; 41cpumask_t xen_cpu_initialized_map;
39 42
40static DEFINE_PER_CPU(int, resched_irq); 43static DEFINE_PER_CPU(int, resched_irq);
@@ -66,13 +69,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
66 int cpu = smp_processor_id(); 69 int cpu = smp_processor_id();
67 70
68 cpu_init(); 71 cpu_init();
72 preempt_disable();
73
69 xen_enable_sysenter(); 74 xen_enable_sysenter();
75 xen_enable_syscall();
70 76
71 preempt_disable(); 77 cpu = smp_processor_id();
72 per_cpu(cpu_state, cpu) = CPU_ONLINE; 78 smp_store_cpu_info(cpu);
79 cpu_data(cpu).x86_max_cores = 1;
80 set_cpu_sibling_map(cpu);
73 81
74 xen_setup_cpu_clockevents(); 82 xen_setup_cpu_clockevents();
75 83
84 cpu_set(cpu, cpu_online_map);
85 x86_write_percpu(cpu_state, CPU_ONLINE);
86 wmb();
87
76 /* We can take interrupts now: we're officially "up". */ 88 /* We can take interrupts now: we're officially "up". */
77 local_irq_enable(); 89 local_irq_enable();
78 90
@@ -141,56 +153,39 @@ static int xen_smp_intr_init(unsigned int cpu)
141 return rc; 153 return rc;
142} 154}
143 155
144void __init xen_fill_possible_map(void) 156static void __init xen_fill_possible_map(void)
145{ 157{
146 int i, rc; 158 int i, rc;
147 159
148 for (i = 0; i < NR_CPUS; i++) { 160 for (i = 0; i < NR_CPUS; i++) {
149 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 161 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
150 if (rc >= 0) 162 if (rc >= 0) {
163 num_processors++;
151 cpu_set(i, cpu_possible_map); 164 cpu_set(i, cpu_possible_map);
165 }
152 } 166 }
153} 167}
154 168
155void __init xen_smp_prepare_boot_cpu(void) 169static void __init xen_smp_prepare_boot_cpu(void)
156{ 170{
157 int cpu;
158
159 BUG_ON(smp_processor_id() != 0); 171 BUG_ON(smp_processor_id() != 0);
160 native_smp_prepare_boot_cpu(); 172 native_smp_prepare_boot_cpu();
161 173
162 /* We've switched to the "real" per-cpu gdt, so make sure the 174 /* We've switched to the "real" per-cpu gdt, so make sure the
163 old memory can be recycled */ 175 old memory can be recycled */
164 make_lowmem_page_readwrite(&per_cpu__gdt_page); 176 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
165
166 for_each_possible_cpu(cpu) {
167 cpus_clear(per_cpu(cpu_sibling_map, cpu));
168 /*
169 * cpu_core_map lives in a per cpu area that is cleared
170 * when the per cpu array is allocated.
171 *
172 * cpus_clear(per_cpu(cpu_core_map, cpu));
173 */
174 }
175 177
176 xen_setup_vcpu_info_placement(); 178 xen_setup_vcpu_info_placement();
177} 179}
178 180
179void __init xen_smp_prepare_cpus(unsigned int max_cpus) 181static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
180{ 182{
181 unsigned cpu; 183 unsigned cpu;
182 184
183 for_each_possible_cpu(cpu) { 185 xen_init_lock_cpu(0);
184 cpus_clear(per_cpu(cpu_sibling_map, cpu));
185 /*
186 * cpu_core_ map will be zeroed when the per
187 * cpu area is allocated.
188 *
189 * cpus_clear(per_cpu(cpu_core_map, cpu));
190 */
191 }
192 186
193 smp_store_cpu_info(0); 187 smp_store_cpu_info(0);
188 cpu_data(0).x86_max_cores = 1;
194 set_cpu_sibling_map(0); 189 set_cpu_sibling_map(0);
195 190
196 if (xen_smp_intr_init(0)) 191 if (xen_smp_intr_init(0))
@@ -225,7 +220,7 @@ static __cpuinit int
225cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 220cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
226{ 221{
227 struct vcpu_guest_context *ctxt; 222 struct vcpu_guest_context *ctxt;
228 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 223 struct desc_struct *gdt;
229 224
230 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 225 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
231 return 0; 226 return 0;
@@ -234,12 +229,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
234 if (ctxt == NULL) 229 if (ctxt == NULL)
235 return -ENOMEM; 230 return -ENOMEM;
236 231
232 gdt = get_cpu_gdt_table(cpu);
233
237 ctxt->flags = VGCF_IN_KERNEL; 234 ctxt->flags = VGCF_IN_KERNEL;
238 ctxt->user_regs.ds = __USER_DS; 235 ctxt->user_regs.ds = __USER_DS;
239 ctxt->user_regs.es = __USER_DS; 236 ctxt->user_regs.es = __USER_DS;
240 ctxt->user_regs.fs = __KERNEL_PERCPU;
241 ctxt->user_regs.gs = 0;
242 ctxt->user_regs.ss = __KERNEL_DS; 237 ctxt->user_regs.ss = __KERNEL_DS;
238#ifdef CONFIG_X86_32
239 ctxt->user_regs.fs = __KERNEL_PERCPU;
240#endif
243 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 241 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
244 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 242 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
245 243
@@ -249,11 +247,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
249 247
250 ctxt->ldt_ents = 0; 248 ctxt->ldt_ents = 0;
251 249
252 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 250 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
253 make_lowmem_page_readonly(gdt->gdt); 251 make_lowmem_page_readonly(gdt);
254 252
255 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 253 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
256 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 254 ctxt->gdt_ents = GDT_ENTRIES;
257 255
258 ctxt->user_regs.cs = __KERNEL_CS; 256 ctxt->user_regs.cs = __KERNEL_CS;
259 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 257 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +259,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
261 ctxt->kernel_ss = __KERNEL_DS; 259 ctxt->kernel_ss = __KERNEL_DS;
262 ctxt->kernel_sp = idle->thread.sp0; 260 ctxt->kernel_sp = idle->thread.sp0;
263 261
262#ifdef CONFIG_X86_32
264 ctxt->event_callback_cs = __KERNEL_CS; 263 ctxt->event_callback_cs = __KERNEL_CS;
265 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
266 ctxt->failsafe_callback_cs = __KERNEL_CS; 264 ctxt->failsafe_callback_cs = __KERNEL_CS;
265#endif
266 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
268 268
269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +276,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
276 return 0; 276 return 0;
277} 277}
278 278
279int __cpuinit xen_cpu_up(unsigned int cpu) 279static int __cpuinit xen_cpu_up(unsigned int cpu)
280{ 280{
281 struct task_struct *idle = idle_task(cpu); 281 struct task_struct *idle = idle_task(cpu);
282 int rc; 282 int rc;
@@ -287,10 +287,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
287 return rc; 287 return rc;
288#endif 288#endif
289 289
290#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0);
293 if (cpu > 0) {
294 rc = get_local_pda(cpu);
295 if (rc)
296 return rc;
297 }
298#endif
299
300#ifdef CONFIG_X86_32
290 init_gdt(cpu); 301 init_gdt(cpu);
291 per_cpu(current_task, cpu) = idle; 302 per_cpu(current_task, cpu) = idle;
292 irq_ctx_init(cpu); 303 irq_ctx_init(cpu);
304#else
305 cpu_pda(cpu)->pcurrent = idle;
306 clear_tsk_thread_flag(idle, TIF_FORK);
307#endif
293 xen_setup_timer(cpu); 308 xen_setup_timer(cpu);
309 xen_init_lock_cpu(cpu);
310
311 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
294 312
295 /* make sure interrupts start blocked */ 313 /* make sure interrupts start blocked */
296 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 314 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -306,20 +324,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
306 if (rc) 324 if (rc)
307 return rc; 325 return rc;
308 326
309 smp_store_cpu_info(cpu);
310 set_cpu_sibling_map(cpu);
311 /* This must be done before setting cpu_online_map */
312 wmb();
313
314 cpu_set(cpu, cpu_online_map);
315
316 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 327 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
317 BUG_ON(rc); 328 BUG_ON(rc);
318 329
330 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
331 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
332 barrier();
333 }
334
319 return 0; 335 return 0;
320} 336}
321 337
322void xen_smp_cpus_done(unsigned int max_cpus) 338static void xen_smp_cpus_done(unsigned int max_cpus)
323{ 339{
324} 340}
325 341
@@ -335,12 +351,12 @@ static void stop_self(void *v)
335 BUG(); 351 BUG();
336} 352}
337 353
338void xen_smp_send_stop(void) 354static void xen_smp_send_stop(void)
339{ 355{
340 smp_call_function(stop_self, NULL, 0); 356 smp_call_function(stop_self, NULL, 0);
341} 357}
342 358
343void xen_smp_send_reschedule(int cpu) 359static void xen_smp_send_reschedule(int cpu)
344{ 360{
345 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 361 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
346} 362}
@@ -351,18 +367,18 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
351 367
352 cpus_and(mask, mask, cpu_online_map); 368 cpus_and(mask, mask, cpu_online_map);
353 369
354 for_each_cpu_mask(cpu, mask) 370 for_each_cpu_mask_nr(cpu, mask)
355 xen_send_IPI_one(cpu, vector); 371 xen_send_IPI_one(cpu, vector);
356} 372}
357 373
358void xen_smp_send_call_function_ipi(cpumask_t mask) 374static void xen_smp_send_call_function_ipi(cpumask_t mask)
359{ 375{
360 int cpu; 376 int cpu;
361 377
362 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 378 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
363 379
364 /* Make sure other vcpus get a chance to run if they need to. */ 380 /* Make sure other vcpus get a chance to run if they need to. */
365 for_each_cpu_mask(cpu, mask) { 381 for_each_cpu_mask_nr(cpu, mask) {
366 if (xen_vcpu_stolen(cpu)) { 382 if (xen_vcpu_stolen(cpu)) {
367 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 383 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
368 break; 384 break;
@@ -370,7 +386,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
370 } 386 }
371} 387}
372 388
373void xen_smp_send_call_function_single_ipi(int cpu) 389static void xen_smp_send_call_function_single_ipi(int cpu)
374{ 390{
375 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); 391 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
376} 392}
@@ -379,7 +395,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
379{ 395{
380 irq_enter(); 396 irq_enter();
381 generic_smp_call_function_interrupt(); 397 generic_smp_call_function_interrupt();
398#ifdef CONFIG_X86_32
382 __get_cpu_var(irq_stat).irq_call_count++; 399 __get_cpu_var(irq_stat).irq_call_count++;
400#else
401 add_pda(irq_call_count, 1);
402#endif
383 irq_exit(); 403 irq_exit();
384 404
385 return IRQ_HANDLED; 405 return IRQ_HANDLED;
@@ -389,8 +409,196 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
389{ 409{
390 irq_enter(); 410 irq_enter();
391 generic_smp_call_function_single_interrupt(); 411 generic_smp_call_function_single_interrupt();
412#ifdef CONFIG_X86_32
392 __get_cpu_var(irq_stat).irq_call_count++; 413 __get_cpu_var(irq_stat).irq_call_count++;
414#else
415 add_pda(irq_call_count, 1);
416#endif
393 irq_exit(); 417 irq_exit();
394 418
395 return IRQ_HANDLED; 419 return IRQ_HANDLED;
396} 420}
421
422struct xen_spinlock {
423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
428{
429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
430
431 return xl->lock != 0;
432}
433
434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
437
438 /* Not strictly true; this is only the count of contended
439 lock-takers entering the slow path. */
440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
489 /* check again make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
539 }
540}
541
542static void xen_spin_unlock(struct raw_spinlock *lock)
543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
545
546 smp_wmb(); /* make sure no writes get moved after unlock */
547 xl->lock = 0; /* release lock */
548
549 /* make sure unlock happens before kick */
550 barrier();
551
552 if (unlikely(xl->spinners))
553 xen_spin_unlock_slow(xl);
554}
555
556static __cpuinit void xen_init_lock_cpu(int cpu)
557{
558 int irq;
559 const char *name;
560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
573
574 printk("cpu %d spinlock event irq %d\n", cpu, irq);
575}
576
577static void __init xen_init_spinlocks(void)
578{
579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
585
586static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done,
591
592 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule,
594
595 .send_call_func_ipi = xen_smp_send_call_function_ipi,
596 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
597};
598
599void __init xen_smp_init(void)
600{
601 smp_ops = xen_smp_ops;
602 xen_fill_possible_map();
603 xen_init_spinlocks();
604}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 251669a932d4..2a234db5949b 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
38 xen_cpu_initialized_map = cpu_online_map; 38 xen_cpu_initialized_map = cpu_online_map;
39#endif 39#endif
40 xen_vcpu_restore(); 40 xen_vcpu_restore();
41 xen_timer_resume();
42 } 41 }
43 42
44} 43}
45 44
45void xen_arch_resume(void)
46{
47 /* nothing */
48}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..7f58304fafb3
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 0
30#include <asm/percpu.h>
31
32/*
33 Enable events. This clears the event mask and tests the pending
34 event status with one and operation. If there are pending
35 events, then enter the hypervisor to get them handled.
36 */
37ENTRY(xen_irq_enable_direct)
38 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40
41 /* Preempt here doesn't matter because that will deal with
42 any pending interrupts. The pending check may end up being
43 run on the wrong CPU, but that doesn't hurt. */
44
45 /* Test for pending */
46 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
47 jz 1f
48
492: call check_events
501:
51ENDPATCH(xen_irq_enable_direct)
52 ret
53 ENDPROC(xen_irq_enable_direct)
54 RELOC(xen_irq_enable_direct, 2b+1)
55
56/*
57 Disabling events is simply a matter of making the event mask
58 non-zero.
59 */
60ENTRY(xen_irq_disable_direct)
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct)
63 ret
64 ENDPROC(xen_irq_disable_direct)
65 RELOC(xen_irq_disable_direct, 0)
66
67/*
68 (xen_)save_fl is used to get the current interrupt enable status.
69 Callers expect the status to be in X86_EFLAGS_IF, and other bits
70 may be set in the return value. We take advantage of this by
71 making sure that X86_EFLAGS_IF has the right value (and other bits
72 in that byte are 0), but other bits in the return value are
73 undefined. We need to toggle the state of the bit, because
74 Xen and x86 use opposite senses (mask vs enable).
75 */
76ENTRY(xen_save_fl_direct)
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah
79 addb %ah,%ah
80ENDPATCH(xen_save_fl_direct)
81 ret
82 ENDPROC(xen_save_fl_direct)
83 RELOC(xen_save_fl_direct, 0)
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109
110
111/*
112 Force an event check by making a hypercall,
113 but preserve regs before making the call.
114 */
115check_events:
116 push %rax
117 push %rcx
118 push %rdx
119 push %rsi
120 push %rdi
121 push %r8
122 push %r9
123 push %r10
124 push %r11
125 call force_evtchn_callback
126 pop %r11
127 pop %r10
128 pop %r9
129 pop %r8
130 pop %rdi
131 pop %rsi
132 pop %rdx
133 pop %rcx
134 pop %rax
135 ret
136#endif
137
138ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx
140 mov 8+8(%rsp),%r11
141 ret $16
142
143hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
144/*
145 Xen64 iret frame:
146
147 ss
148 rsp
149 rflags
150 cs
151 rip <-- standard iret frame
152
153 flags
154
155 rcx }
156 r11 }<-- pushed by hypercall page
157rsp -> rax }
158 */
159ENTRY(xen_iret)
160 pushq $0
1611: jmp hypercall_iret
162ENDPATCH(xen_iret)
163RELOC(xen_iret, 1b+1)
164
165/*
166 sysexit is not used for 64-bit processes, so it's
167 only ever used to return to 32-bit compat userspace.
168 */
169ENTRY(xen_sysexit)
170 pushq $__USER32_DS
171 pushq %rcx
172 pushq $X86_EFLAGS_IF
173 pushq $__USER32_CS
174 pushq %rdx
175
176 pushq $0
1771: jmp hypercall_iret
178ENDPATCH(xen_sysexit)
179RELOC(xen_sysexit, 1b+1)
180
181ENTRY(xen_sysret64)
182 /* We're already on the usermode stack at this point, but still
183 with the kernel gs, so we can easily switch back */
184 movq %rsp, %gs:pda_oldrsp
185 movq %gs:pda_kernelstack,%rsp
186
187 pushq $__USER_DS
188 pushq %gs:pda_oldrsp
189 pushq %r11
190 pushq $__USER_CS
191 pushq %rcx
192
193 pushq $VGCF_in_syscall
1941: jmp hypercall_iret
195ENDPATCH(xen_sysret64)
196RELOC(xen_sysret64, 1b+1)
197
198ENTRY(xen_sysret32)
199 /* We're already on the usermode stack at this point, but still
200 with the kernel gs, so we can easily switch back */
201 movq %rsp, %gs:pda_oldrsp
202 movq %gs:pda_kernelstack, %rsp
203
204 pushq $__USER32_DS
205 pushq %gs:pda_oldrsp
206 pushq %r11
207 pushq $__USER32_CS
208 pushq %rcx
209
210 pushq $VGCF_in_syscall
2111: jmp hypercall_iret
212ENDPATCH(xen_sysret32)
213RELOC(xen_sysret32, 1b+1)
214
215/*
216 Xen handles syscall callbacks much like ordinary exceptions,
217 which means we have:
218 - kernel gs
219 - kernel rsp
220 - an iret-like stack frame on the stack (including rcx and r11):
221 ss
222 rsp
223 rflags
224 cs
225 rip
226 r11
227 rsp-> rcx
228
229 In all the entrypoints, we undo all that to make it look
230 like a CPU-generated syscall/sysenter and jump to the normal
231 entrypoint.
232 */
233
234.macro undo_xen_syscall
235 mov 0*8(%rsp),%rcx
236 mov 1*8(%rsp),%r11
237 mov 5*8(%rsp),%rsp
238.endm
239
240/* Normal 64-bit system call target */
241ENTRY(xen_syscall_target)
242 undo_xen_syscall
243 jmp system_call_after_swapgs
244ENDPROC(xen_syscall_target)
245
246#ifdef CONFIG_IA32_EMULATION
247
248/* 32-bit compat syscall target */
249ENTRY(xen_syscall32_target)
250 undo_xen_syscall
251 jmp ia32_cstar_target
252ENDPROC(xen_syscall32_target)
253
254/* 32-bit compat sysenter target */
255ENTRY(xen_sysenter_target)
256 undo_xen_syscall
257 jmp ia32_sysenter_target
258ENDPROC(xen_sysenter_target)
259
260#else /* !CONFIG_IA32_EMULATION */
261
262ENTRY(xen_syscall32_target)
263ENTRY(xen_sysenter_target)
264 lea 16(%rsp), %rsp /* strip %rcx,%r11 */
265 mov $-ENOSYS, %rax
266 pushq $VGCF_in_syscall
267 jmp hypercall_iret
268ENDPROC(xen_syscall32_target)
269ENDPROC(xen_sysenter_target)
270
271#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7c0cf6320a0a..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,15 +5,24 @@
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h> 7#include <linux/init.h>
8
8#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h>
11#include <asm/page.h>
12
9#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
10#include <asm/xen/interface.h> 14#include <asm/xen/interface.h>
11 15
12 __INIT 16 __INIT
13ENTRY(startup_xen) 17ENTRY(startup_xen)
14 movl %esi,xen_start_info
15 cld 18 cld
16 movl $(init_thread_union+THREAD_SIZE),%esp 19#ifdef CONFIG_X86_32
20 mov %esi,xen_start_info
21 mov $init_thread_union+THREAD_SIZE,%esp
22#else
23 mov %rsi,xen_start_info
24 mov $init_thread_union+THREAD_SIZE,%rsp
25#endif
17 jmp xen_start_kernel 26 jmp xen_start_kernel
18 27
19 __FINIT 28 __FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
21.pushsection .text 30.pushsection .text
22 .align PAGE_SIZE_asm 31 .align PAGE_SIZE_asm
23ENTRY(hypercall_page) 32ENTRY(hypercall_page)
24 .skip 0x1000 33 .skip PAGE_SIZE_asm
25.popsection 34.popsection
26 35
27 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
28 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") 37 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
29 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") 38 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
30 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) 39#ifdef CONFIG_X86_32
31 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 40 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
32 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 41#else
42 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
43#endif
44 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
45 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
33 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 46 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 47 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
35 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 48 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
36 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, 49 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
37 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) 50 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
38 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) 51 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
39 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) 52 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
53 ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
40 54
41#endif /*CONFIG_XEN */ 55#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 6f4b1045c1c2..dd3c23152a2e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
26void __init xen_arch_setup(void); 26void __init xen_arch_setup(void);
27void __init xen_init_IRQ(void); 27void __init xen_init_IRQ(void);
28void xen_enable_sysenter(void); 28void xen_enable_sysenter(void);
29void xen_enable_syscall(void);
29void xen_vcpu_restore(void); 30void xen_vcpu_restore(void);
30 31
31void __init xen_build_dynamic_phys_to_machine(void); 32void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
37unsigned long xen_get_wallclock(void); 38unsigned long xen_get_wallclock(void);
38int xen_set_wallclock(unsigned long time); 39int xen_set_wallclock(unsigned long time);
39unsigned long long xen_sched_clock(void); 40unsigned long long xen_sched_clock(void);
40void xen_timer_resume(void);
41 41
42irqreturn_t xen_debug_interrupt(int irq, void *dev_id); 42irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
43 43
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
45 45
46void xen_mark_init_mm_pinned(void); 46void xen_mark_init_mm_pinned(void);
47 47
48void __init xen_fill_possible_map(void);
49
50void __init xen_setup_vcpu_info_placement(void); 48void __init xen_setup_vcpu_info_placement(void);
51void xen_smp_prepare_boot_cpu(void);
52void xen_smp_prepare_cpus(unsigned int max_cpus);
53int xen_cpu_up(unsigned int cpu);
54void xen_smp_cpus_done(unsigned int max_cpus);
55 49
56void xen_smp_send_stop(void); 50#ifdef CONFIG_SMP
57void xen_smp_send_reschedule(int cpu); 51void xen_smp_init(void);
58void xen_smp_send_call_function_ipi(cpumask_t mask);
59void xen_smp_send_call_function_single_ipi(int cpu);
60 52
61extern cpumask_t xen_cpu_initialized_map; 53extern cpumask_t xen_cpu_initialized_map;
54#else
55static inline void xen_smp_init(void) {}
56#endif
62 57
63 58
64/* Declare an asm function, along with symbols needed to make it 59/* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
73DECL_ASM(unsigned long, xen_save_fl_direct, void); 68DECL_ASM(unsigned long, xen_save_fl_direct, void);
74DECL_ASM(void, xen_restore_fl_direct, unsigned long); 69DECL_ASM(void, xen_restore_fl_direct, unsigned long);
75 70
71/* These are not functions, and cannot be called normally */
76void xen_iret(void); 72void xen_iret(void);
77void xen_sysexit(void); 73void xen_sysexit(void);
74void xen_sysret32(void);
75void xen_sysret64(void);
76void xen_adjust_exception_frame(void);
78 77
79#endif /* XEN_OPS_H */ 78#endif /* XEN_OPS_H */