author	Ingo Molnar <mingo@elte.hu>	2008-10-03 13:28:46 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-10-03 13:28:46 -0400
commit	f68ec0c24755e5cdb779be6240925f2175311d84 (patch)
tree	a7b7128e61a8456385d82bd1c7ca5f14eecbf2ca /arch/x86
parent	98920dc3d1113b883cbc73e3293446d3525c6042 (diff)
parent	94aca1dac6f6d21f4b07e4864baf7768cabcc6e7 (diff)
Merge commit 'v2.6.27-rc8' into x86/setup
Diffstat (limited to 'arch/x86')
-rw-r--r--	arch/x86/Kconfig	73
-rw-r--r--	arch/x86/Kconfig.cpu	19
-rw-r--r--	arch/x86/Kconfig.debug	11
-rw-r--r--	arch/x86/Makefile	5
-rw-r--r--	arch/x86/boot/boot.h	10
-rw-r--r--	arch/x86/boot/compressed/misc.c	39
-rw-r--r--	arch/x86/boot/cpu.c	3
-rw-r--r--	arch/x86/boot/cpucheck.c	18
-rw-r--r--	arch/x86/boot/main.c	5
-rw-r--r--	arch/x86/boot/memory.c	1
-rw-r--r--	arch/x86/configs/i386_defconfig	305
-rw-r--r--	arch/x86/configs/x86_64_defconfig	260
-rw-r--r--	arch/x86/ia32/ia32_aout.c	6
-rw-r--r--	arch/x86/ia32/ia32_signal.c	11
-rw-r--r--	arch/x86/ia32/ia32entry.S	115
-rw-r--r--	arch/x86/ia32/sys_ia32.c	2
-rw-r--r--	arch/x86/kernel/Makefile	4
-rw-r--r--	arch/x86/kernel/acpi/boot.c	22
-rw-r--r--	arch/x86/kernel/acpi/sleep.c	8
-rw-r--r--	arch/x86/kernel/alternative.c	36
-rw-r--r--	arch/x86/kernel/amd_iommu.c	283
-rw-r--r--	arch/x86/kernel/amd_iommu_init.c	385
-rw-r--r--	arch/x86/kernel/aperture_64.c	1
-rw-r--r--	arch/x86/kernel/apic_32.c	197
-rw-r--r--	arch/x86/kernel/apic_64.c	33
-rw-r--r--	arch/x86/kernel/apm_32.c	4
-rw-r--r--	arch/x86/kernel/asm-offsets_64.c	11
-rw-r--r--	arch/x86/kernel/bios_uv.c	48
-rw-r--r--	arch/x86/kernel/cpu/addon_cpuid_features.c	17
-rw-r--r--	arch/x86/kernel/cpu/amd.c	11
-rw-r--r--	arch/x86/kernel/cpu/amd_64.c	2
-rw-r--r--	arch/x86/kernel/cpu/bugs.c	29
-rw-r--r--	arch/x86/kernel/cpu/centaur.c	11
-rw-r--r--	arch/x86/kernel/cpu/common.c	18
-rw-r--r--	arch/x86/kernel/cpu/common_64.c	87
-rw-r--r--	arch/x86/kernel/cpu/cpufreq/Kconfig	4
-rw-r--r--	arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c	6
-rw-r--r--	arch/x86/kernel/cpu/cpufreq/elanfreq.c	2
-rw-r--r--	arch/x86/kernel/cpu/cpufreq/p4-clockmod.c	6
-rw-r--r--	arch/x86/kernel/cpu/cpufreq/powernow-k7.h	1
-rw-r--r--	arch/x86/kernel/cpu/cpufreq/powernow-k8.c	11
-rw-r--r--	arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c	149
-rw-r--r--	arch/x86/kernel/cpu/cpufreq/speedstep-ich.c	4
-rw-r--r--	arch/x86/kernel/cpu/cyrix.c	50
-rw-r--r--	arch/x86/kernel/cpu/feature_names.c	3
-rw-r--r--	arch/x86/kernel/cpu/intel.c	10
-rw-r--r--	arch/x86/kernel/cpu/intel_cacheinfo.c	9
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce_64.c	25
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce_amd_64.c	22
-rw-r--r--	arch/x86/kernel/cpu/mcheck/p4.c	4
-rw-r--r--	arch/x86/kernel/cpu/mcheck/therm_throt.c	1
-rw-r--r--	arch/x86/kernel/cpu/mtrr/generic.c	20
-rw-r--r--	arch/x86/kernel/cpu/mtrr/main.c	5
-rw-r--r--	arch/x86/kernel/cpu/perfctr-watchdog.c	12
-rw-r--r--	arch/x86/kernel/cpu/proc.c	2
-rw-r--r--	arch/x86/kernel/cpuid.c	19
-rw-r--r--	arch/x86/kernel/e820.c	35
-rw-r--r--	arch/x86/kernel/early-quirks.c	5
-rw-r--r--	arch/x86/kernel/efi_32.c	4
-rw-r--r--	arch/x86/kernel/entry_32.S	79
-rw-r--r--	arch/x86/kernel/entry_64.S	175
-rw-r--r--	arch/x86/kernel/genapic_64.c	1
-rw-r--r--	arch/x86/kernel/genapic_flat_64.c	2
-rw-r--r--	arch/x86/kernel/genx2apic_uv_x.c	37
-rw-r--r--	arch/x86/kernel/head64.c	12
-rw-r--r--	arch/x86/kernel/head_32.S	8
-rw-r--r--	arch/x86/kernel/head_64.S	1
-rw-r--r--	arch/x86/kernel/hpet.c	53
-rw-r--r--	arch/x86/kernel/io_apic_32.c	59
-rw-r--r--	arch/x86/kernel/io_apic_64.c	78
-rw-r--r--	arch/x86/kernel/io_delay.c	11
-rw-r--r--	arch/x86/kernel/ipi.c	6
-rw-r--r--	arch/x86/kernel/irq_32.c	7
-rw-r--r--	arch/x86/kernel/irqinit_64.c	5
-rw-r--r--	arch/x86/kernel/kdebugfs.c	9
-rw-r--r--	arch/x86/kernel/kgdb.c	43
-rw-r--r--	arch/x86/kernel/kprobes.c	7
-rw-r--r--	arch/x86/kernel/kvmclock.c	2
-rw-r--r--	arch/x86/kernel/ldt.c	6
-rw-r--r--	arch/x86/kernel/machine_kexec_32.c	57
-rw-r--r--	arch/x86/kernel/machine_kexec_64.c	2
-rw-r--r--	arch/x86/kernel/mfgpt_32.c	52
-rw-r--r--	arch/x86/kernel/microcode.c	14
-rw-r--r--	arch/x86/kernel/mmconf-fam10h_64.c	2
-rw-r--r--	arch/x86/kernel/module_64.c	11
-rw-r--r--	arch/x86/kernel/mpparse.c	223
-rw-r--r--	arch/x86/kernel/msr.c	44
-rw-r--r--	arch/x86/kernel/nmi.c	39
-rw-r--r--	arch/x86/kernel/numaq_32.c	197
-rw-r--r--	arch/x86/kernel/paravirt.c	31
-rw-r--r--	arch/x86/kernel/pci-calgary_64.c	174
-rw-r--r--	arch/x86/kernel/pci-dma.c	180
-rw-r--r--	arch/x86/kernel/pci-gart_64.c	19
-rw-r--r--	arch/x86/kernel/pci-nommu.c	16
-rw-r--r--	arch/x86/kernel/pci-swiotlb_64.c	4
-rw-r--r--	arch/x86/kernel/process.c	22
-rw-r--r--	arch/x86/kernel/process_32.c	9
-rw-r--r--	arch/x86/kernel/process_64.c	65
-rw-r--r--	arch/x86/kernel/ptrace.c	151
-rw-r--r--	arch/x86/kernel/reboot.c	11
-rw-r--r--	arch/x86/kernel/relocate_kernel_32.S	178
-rw-r--r--	arch/x86/kernel/setup.c	71
-rw-r--r--	arch/x86/kernel/setup_percpu.c	27
-rw-r--r--	arch/x86/kernel/signal_32.c	11
-rw-r--r--	arch/x86/kernel/signal_64.c	71
-rw-r--r--	arch/x86/kernel/smpboot.c	180
-rw-r--r--	arch/x86/kernel/smpcommon.c	17
-rw-r--r--	arch/x86/kernel/smpcommon_32.c	1
-rw-r--r--	arch/x86/kernel/step.c	35
-rw-r--r--	arch/x86/kernel/syscall_table_32.S	6
-rw-r--r--	arch/x86/kernel/time_32.c	1
-rw-r--r--	arch/x86/kernel/tlb_uv.c	3
-rw-r--r--	arch/x86/kernel/traps_32.c	118
-rw-r--r--	arch/x86/kernel/traps_64.c	57
-rw-r--r--	arch/x86/kernel/tsc.c	242
-rw-r--r--	arch/x86/kernel/tsc_sync.c	6
-rw-r--r--	arch/x86/kernel/visws_quirks.c	48
-rw-r--r--	arch/x86/kernel/vmi_32.c	4
-rw-r--r--	arch/x86/kernel/vmlinux_32.lds.S	8
-rw-r--r--	arch/x86/kernel/vsmp_64.c	2
-rw-r--r--	arch/x86/kvm/Kconfig	1
-rw-r--r--	arch/x86/kvm/Makefile	3
-rw-r--r--	arch/x86/kvm/i8254.c	24
-rw-r--r--	arch/x86/kvm/i8259.c	9
-rw-r--r--	arch/x86/kvm/irq.h	2
-rw-r--r--	arch/x86/kvm/lapic.c	14
-rw-r--r--	arch/x86/kvm/lapic.h	1
-rw-r--r--	arch/x86/kvm/mmu.c	173
-rw-r--r--	arch/x86/kvm/mmu.h	3
-rw-r--r--	arch/x86/kvm/paging_tmpl.h	42
-rw-r--r--	arch/x86/kvm/svm.c	153
-rw-r--r--	arch/x86/kvm/vmx.c	251
-rw-r--r--	arch/x86/kvm/vmx.h	14
-rw-r--r--	arch/x86/kvm/x86.c	430
-rw-r--r--	arch/x86/kvm/x86_emulate.c	257
-rw-r--r--	arch/x86/lguest/boot.c	4
-rw-r--r--	arch/x86/lib/copy_user_64.S	2
-rw-r--r--	arch/x86/lib/copy_user_nocache_64.S	3
-rw-r--r--	arch/x86/lib/msr-on-cpu.c	22
-rw-r--r--	arch/x86/mach-default/setup.c	34
-rw-r--r--	arch/x86/mach-es7000/es7000plat.c	8
-rw-r--r--	arch/x86/mach-rdc321x/platform.c	1
-rw-r--r--	arch/x86/mm/Makefile	3
-rw-r--r--	arch/x86/mm/discontig_32.c	3
-rw-r--r--	arch/x86/mm/dump_pagetables.c	10
-rw-r--r--	arch/x86/mm/gup.c	298
-rw-r--r--	arch/x86/mm/hugetlbpage.c	78
-rw-r--r--	arch/x86/mm/init_32.c	9
-rw-r--r--	arch/x86/mm/init_64.c	197
-rw-r--r--	arch/x86/mm/ioremap.c	18
-rw-r--r--	arch/x86/mm/memtest.c	123
-rw-r--r--	arch/x86/mm/mmio-mod.c	4
-rw-r--r--	arch/x86/mm/numa_64.c	8
-rw-r--r--	arch/x86/mm/pageattr-test.c	3
-rw-r--r--	arch/x86/mm/pageattr.c	27
-rw-r--r--	arch/x86/mm/pat.c	144
-rw-r--r--	arch/x86/mm/pgtable.c	3
-rw-r--r--	arch/x86/mm/pgtable_32.c	47
-rw-r--r--	arch/x86/mm/srat_32.c	12
-rw-r--r--	arch/x86/oprofile/nmi_int.c	79
-rw-r--r--	arch/x86/pci/Makefile	12
-rw-r--r--	arch/x86/pci/amd_bus.c	52
-rw-r--r--	arch/x86/pci/early.c	16
-rw-r--r--	arch/x86/pci/fixup.c	3
-rw-r--r--	arch/x86/pci/i386.c	24
-rw-r--r--	arch/x86/pci/irq.c	108
-rw-r--r--	arch/x86/pci/legacy.c	11
-rw-r--r--	arch/x86/pci/mmconfig-shared.c	67
-rw-r--r--	arch/x86/pci/numaq_32.c (renamed from arch/x86/pci/numa.c)	9
-rw-r--r--	arch/x86/pci/pci.h	3
-rw-r--r--	arch/x86/pci/visws.c	23
-rw-r--r--	arch/x86/power/cpu_32.c	6
-rw-r--r--	arch/x86/power/hibernate_asm_32.S	26
-rw-r--r--	arch/x86/vdso/Makefile	2
-rw-r--r--	arch/x86/vdso/vdso32-setup.c	19
-rw-r--r--	arch/x86/vdso/vdso32.S	13
-rw-r--r--	arch/x86/vdso/vma.c	11
-rw-r--r--	arch/x86/xen/Kconfig	14
-rw-r--r--	arch/x86/xen/Makefile	2
-rw-r--r--	arch/x86/xen/enlighten.c	699
-rw-r--r--	arch/x86/xen/mmu.c	324
-rw-r--r--	arch/x86/xen/mmu.h	29
-rw-r--r--	arch/x86/xen/multicalls.c	1
-rw-r--r--	arch/x86/xen/setup.c	81
-rw-r--r--	arch/x86/xen/smp.c	310
-rw-r--r--	arch/x86/xen/suspend.c	5
-rw-r--r--	arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S)	0
-rw-r--r--	arch/x86/xen/xen-asm_64.S	271
-rw-r--r--	arch/x86/xen/xen-head.S	28
-rw-r--r--	arch/x86/xen/xen-ops.h	21
190 files changed, 6835 insertions, 3381 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 96e0c2ebc388..ed92864d1325 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,12 +21,16 @@ config X86
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
+	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
+	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select HAVE_KRETPROBES
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
+	select HAVE_GENERIC_DMA_COHERENT if X86_32
+	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 
 config ARCH_DEFCONFIG
 	string
@@ -329,20 +333,6 @@ config X86_BIGSMP
 
 endif
 
-config X86_RDC321X
-	bool "RDC R-321x SoC"
-	depends on X86_32
-	select M486
-	select X86_REBOOTFIXUPS
-	select GENERIC_GPIO
-	select LEDS_CLASS
-	select LEDS_GPIO
-	select NEW_LEDS
-	help
-	  This option is needed for RDC R-321x system-on-chip, also known
-	  as R-8610-(G).
-	  If you don't have one of these chips, you should say N here.
-
 config X86_VSMP
 	bool "Support for ScaleMP vSMP"
 	select PARAVIRT
@@ -366,6 +356,16 @@ config X86_VISWS
 	  A kernel compiled for the Visual Workstation will run on general
 	  PCs as well. See <file:Documentation/sgi-visws.txt> for details.
 
+config X86_RDC321X
+	bool "RDC R-321x SoC"
+	depends on X86_32
+	select M486
+	select X86_REBOOTFIXUPS
+	help
+	  This option is needed for RDC R-321x system-on-chip, also known
+	  as R-8610-(G).
+	  If you don't have one of these chips, you should say N here.
+
 config SCHED_NO_NO_OMIT_FRAME_POINTER
 	def_bool y
 	prompt "Single-depth WCHAN output"
@@ -447,7 +447,6 @@ config PARAVIRT_DEBUG
 
 config MEMTEST
 	bool "Memtest"
-	depends on X86_64
 	help
 	  This option adds a kernel parameter 'memtest', which allows memtest
 	  to be set.
@@ -578,35 +577,29 @@ config SWIOTLB
 
 config IOMMU_HELPER
 	def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
+
 config MAXSMP
 	bool "Configure Maximum number of SMP Processors and NUMA Nodes"
-	depends on X86_64 && SMP
+	depends on X86_64 && SMP && BROKEN
 	default n
 	help
 	  Configure maximum number of CPUS and NUMA Nodes for this architecture.
 	  If unsure, say N.
 
-if MAXSMP
 config NR_CPUS
-	int
-	default "4096"
-endif
-
-if !MAXSMP
-config NR_CPUS
-	int "Maximum number of CPUs (2-4096)"
-	range 2 4096
+	int "Maximum number of CPUs (2-512)" if !MAXSMP
+	range 2 512
 	depends on SMP
+	default "4096" if MAXSMP
 	default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
 	default "8"
 	help
 	  This allows you to specify the maximum number of CPUs which this
-	  kernel will support. The maximum supported value is 4096 and the
+	  kernel will support. The maximum supported value is 512 and the
 	  minimum value which makes sense is 2.
 
 	  This is purely to save memory - each supported CPU adds
 	  approximately eight kilobytes to the kernel image.
-endif
 
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
@@ -952,9 +945,9 @@ config NUMA
 	  local memory controller of the CPU and add some more
 	  NUMA awareness to the kernel.
 
-	  For i386 this is currently highly experimental and should be only
+	  For 32-bit this is currently highly experimental and should be only
 	  used for kernel development. It might also cause boot failures.
-	  For x86_64 this is recommended on all multiprocessor Opteron systems.
+	  For 64-bit this is recommended on all multiprocessor Opteron systems.
 	  If the system is EM64T, you should say N unless your system is
 	  EM64T NUMA.
 
@@ -997,17 +990,10 @@ config NUMA_EMU
 	  into virtual nodes when booted with "numa=fake=N", where N is the
 	  number of nodes. This is only useful for debugging.
 
-if MAXSMP
-
-config NODES_SHIFT
-	int
-	default "9"
-endif
-
-if !MAXSMP
 config NODES_SHIFT
-	int "Maximum NUMA Nodes (as a power of 2)"
+	int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
 	range 1 9 if X86_64
+	default "9" if MAXSMP
 	default "6" if X86_64
 	default "4" if X86_NUMAQ
 	default "3"
@@ -1015,7 +1001,6 @@ config NODES_SHIFT
 	help
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system. Increases memory reserved to accomodate various tables.
-endif
 
 config HAVE_ARCH_BOOTMEM_NODE
 	def_bool y
@@ -1264,7 +1249,7 @@ config KEXEC
 	  strongly in flux, so no good recommendation can be made.
 
 config CRASH_DUMP
-	bool "kernel crash dumps (EXPERIMENTAL)"
+	bool "kernel crash dumps"
 	depends on X86_64 || (X86_32 && HIGHMEM)
 	help
 	  Generate crash dump after being started by kexec.
@@ -1277,6 +1262,14 @@ config CRASH_DUMP
 	  (CONFIG_RELOCATABLE=y).
 	  For more details see Documentation/kdump/kdump.txt
 
+config KEXEC_JUMP
+	bool "kexec jump (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on KEXEC && HIBERNATION && X86_32
+	help
+	  Jump between original kernel and kexeced kernel and invoke
+	  code in physical address mode via KEXEC
+
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
 	default "0x1000000" if X86_NUMAQ
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index abff1b84ed5b..b225219c448c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -362,10 +362,6 @@ config X86_ALIGNMENT_16
 	def_bool y
 	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
 
-config X86_GOOD_APIC
-	def_bool y
-	depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
-
 config X86_INTEL_USERCOPY
 	def_bool y
 	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
@@ -386,14 +382,17 @@ config X86_OOSTORE
 # P6_NOPs are a relatively minor optimization that require a family >=
 # 6 processor, except that it is broken on certain VIA chips.
 # Furthermore, AMD chips prefer a totally different sequence of NOPs
-# (which work on all CPUs). As a result, disallow these if we're
-# compiling X86_GENERIC but not X86_64 (these NOPs do work on all
-# x86-64 capable chips); the list of processors in the right-hand clause
-# are the cores that benefit from this optimization.
+# (which work on all CPUs). In addition, it looks like Virtual PC
+# does not understand them.
+#
+# As a result, disallow these if we're not compiling for X86_64 (these
+# NOPs do work on all x86-64 capable chips); the list of processors in
+# the right-hand clause are the cores that benefit from this optimization.
 #
 config X86_P6_NOP
 	def_bool y
-	depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC)
+	depends on X86_64
+	depends on (MCORE2 || MPENTIUM4 || MPSC)
 
 config X86_TSC
 	def_bool y
@@ -418,4 +417,4 @@ config X86_MINIMUM_CPU_FAMILY
 
 config X86_DEBUGCTLMSR
 	def_bool y
-	depends on !(M586MMX || M586TSC || M586 || M486 || M386)
+	depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index ae36bfa814e5..092f019e033a 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,13 +5,15 @@ config TRACE_IRQFLAGS_SUPPORT
 
 source "lib/Kconfig.debug"
 
-config NONPROMISC_DEVMEM
+config STRICT_DEVMEM
 	bool "Filter access to /dev/mem"
 	help
-	  If this option is left off, you allow userspace access to all
+	  If this option is disabled, you allow userspace (root) access to all
 	  of memory, including kernel and userspace memory. Accidental
 	  access to this is obviously disastrous, but specific access can
-	  be used by people debugging the kernel.
+	  be used by people debugging the kernel. Note that with PAT support
+	  enabled, even in this case there are restrictions on /dev/mem
+	  use due to the cache aliasing requirements.
 
 	  If this option is switched on, the /dev/mem file only allows
 	  userspace access to PCI space and the BIOS code and data regions.
@@ -287,7 +289,6 @@ config CPA_DEBUG
 
 config OPTIMIZE_INLINING
 	bool "Allow gcc to uninline functions marked 'inline'"
-	depends on BROKEN
 	help
 	  This option determines if the kernel forces gcc to inline the functions
 	  developers have marked 'inline'. Doing so takes away freedom from gcc to
@@ -298,5 +299,7 @@ config OPTIMIZE_INLINING
 	  become the default in the future, until then this option is there to
 	  test gcc for this.
 
+	  If unsure, say N.
+
 endmenu
 
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 919ce21ea654..f5631da585b6 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -118,11 +118,6 @@ mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
 fcore-$(CONFIG_X86_GENERICARCH)	+= arch/x86/mach-generic/
 mcore-$(CONFIG_X86_GENERICARCH)	:= arch/x86/mach-default/
 
-# RDC R-321x subarch support
-mflags-$(CONFIG_X86_RDC321X)	:= -Iinclude/asm-x86/mach-rdc321x
-mcore-$(CONFIG_X86_RDC321X)	:= arch/x86/mach-default/
-core-$(CONFIG_X86_RDC321X)	+= arch/x86/mach-rdc321x/
-
 # default subarch .h files
 mflags-y += -Iinclude/asm-x86/mach-default
 
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index a34b9982c7cb..cc0ef13fba7a 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -24,10 +24,14 @@
 #include <linux/edd.h>
 #include <asm/boot.h>
 #include <asm/setup.h>
+#include "bitops.h"
+#include <asm/cpufeature.h>
 
 /* Useful macros */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
 
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
 extern struct setup_header hdr;
 extern struct boot_params boot_params;
 
@@ -242,6 +246,12 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize);
 int cmdline_find_option_bool(const char *option);
 
 /* cpu.c, cpucheck.c */
+struct cpu_features {
+	int level;	/* Family, or 64 for x86-64 */
+	int model;
+	u32 flags[NCAPINTS];
+};
+extern struct cpu_features cpu;
 int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
 int validate_cpu(void);
 
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index bc5553b496f7..9fea73706479 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -182,8 +182,6 @@ static unsigned outcnt;
 static int fill_inbuf(void);
 static void flush_window(void);
 static void error(char *m);
-static void gzip_mark(void **);
-static void gzip_release(void **);
 
 /*
  * This is set up by the setup-routine at boot-time
@@ -196,9 +194,6 @@ extern int input_len;
 
 static long bytes_out;
 
-static void *malloc(int size);
-static void free(void *where);
-
 static void *memset(void *s, int c, unsigned n);
 static void *memcpy(void *dest, const void *src, unsigned n);
 
@@ -220,40 +215,6 @@ static int lines, cols;
 
 #include "../../../../lib/inflate.c"
 
-static void *malloc(int size)
-{
-	void *p;
-
-	if (size < 0)
-		error("Malloc error");
-	if (free_mem_ptr <= 0)
-		error("Memory error");
-
-	free_mem_ptr = (free_mem_ptr + 3) & ~3;	/* Align */
-
-	p = (void *)free_mem_ptr;
-	free_mem_ptr += size;
-
-	if (free_mem_ptr >= free_mem_end_ptr)
-		error("Out of memory");
-
-	return p;
-}
-
-static void free(void *where)
-{	/* Don't care */
-}
-
-static void gzip_mark(void **ptr)
-{
-	*ptr = (void *) free_mem_ptr;
-}
-
-static void gzip_release(void **ptr)
-{
-	free_mem_ptr = (memptr) *ptr;
-}
-
 static void scroll(void)
 {
 	int i;
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 92d6fd73dc7d..75298fe2edca 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -16,9 +16,6 @@
  */
 
 #include "boot.h"
-#include "bitops.h"
-#include <asm/cpufeature.h>
-
 #include "cpustr.h"
 
 static char *cpu_name(int level)
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 7804389ee005..4d3ff037201f 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -22,21 +22,13 @@
 
 #ifdef _SETUP
 # include "boot.h"
-# include "bitops.h"
 #endif
 #include <linux/types.h>
-#include <asm/cpufeature.h>
 #include <asm/processor-flags.h>
 #include <asm/required-features.h>
 #include <asm/msr-index.h>
 
-struct cpu_features {
-	int level;	/* Family, or 64 for x86-64 */
-	int model;
-	u32 flags[NCAPINTS];
-};
-
-static struct cpu_features cpu;
+struct cpu_features cpu;
 static u32 cpu_vendor[3];
 static u32 err_flags[NCAPINTS];
 
@@ -46,12 +38,12 @@ static const u32 req_flags[NCAPINTS] =
 {
 	REQUIRED_MASK0,
 	REQUIRED_MASK1,
-	REQUIRED_MASK2,
-	REQUIRED_MASK3,
+	0, /* REQUIRED_MASK2 not implemented in this file */
+	0, /* REQUIRED_MASK3 not implemented in this file */
 	REQUIRED_MASK4,
-	REQUIRED_MASK5,
+	0, /* REQUIRED_MASK5 not implemented in this file */
 	REQUIRED_MASK6,
-	REQUIRED_MASK7,
+	0, /* REQUIRED_MASK7 not implemented in this file */
 };
 
 #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 2296164b54d2..197421db1af1 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -73,6 +73,11 @@ static void keyboard_set_repeat(void)
  */
 static void query_ist(void)
 {
+	/* Some older BIOSes apparently crash on this call, so filter
+	   it from machines too old to have SpeedStep at all. */
+	if (cpu.level < 6)
+		return;
+
 	asm("int $0x15"
 	    : "=a" (boot_params.ist_info.signature),
 	      "=b" (boot_params.ist_info.command),
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 53165c97336b..8c3c25f35578 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -13,7 +13,6 @@
  */
 
 #include "boot.h"
-#include <linux/kernel.h>
 
 #define SMAP	0x534d4150	/* ASCII "SMAP" */
 
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index a7ae4385670e..8cc9eea839e4 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,13 +1,13 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.26-rc1
-# Sun May 4 19:59:02 2008
+# Linux kernel version: 2.6.27-rc4
+# Mon Aug 25 15:04:00 2008
 #
 # CONFIG_64BIT is not set
 CONFIG_X86_32=y
 # CONFIG_X86_64 is not set
 CONFIG_X86=y
-CONFIG_DEFCONFIG_LIST="arch/x86/configs/i386_defconfig"
+CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
 # CONFIG_GENERIC_LOCKBREAK is not set
 CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_CMOS_UPDATE=y
@@ -53,6 +53,7 @@ CONFIG_X86_HT=y
 CONFIG_X86_BIOS_REBOOT=y
 CONFIG_X86_TRAMPOLINE=y
 CONFIG_KTIME_SCALAR=y
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 
 #
 # General setup
@@ -82,6 +83,7 @@ CONFIG_CGROUPS=y
 CONFIG_CGROUP_NS=y
 # CONFIG_CGROUP_DEVICE is not set
 CONFIG_CPUSETS=y
+CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
 CONFIG_GROUP_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
 # CONFIG_RT_GROUP_SCHED is not set
@@ -105,7 +107,6 @@ CONFIG_SYSCTL=y
 # CONFIG_EMBEDDED is not set
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
-CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
@@ -113,6 +114,7 @@ CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
 CONFIG_ELF_CORE=y
+CONFIG_PCSPKR_PLATFORM=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
@@ -132,27 +134,35 @@ CONFIG_MARKERS=y
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_KPROBES=y
+CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
 CONFIG_KRETPROBES=y
+CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
+# CONFIG_HAVE_ARCH_TRACEHOOK is not set
 # CONFIG_HAVE_DMA_ATTRS is not set
+CONFIG_USE_GENERIC_SMP_HELPERS=y
+# CONFIG_HAVE_CLK is not set
 CONFIG_PROC_PAGE_MONITOR=y
+CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
 # CONFIG_TINY_SHMEM is not set
 CONFIG_BASE_SMALL=0
 CONFIG_MODULES=y
+# CONFIG_MODULE_FORCE_LOAD is not set
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODVERSIONS is not set
 # CONFIG_MODULE_SRCVERSION_ALL is not set
-# CONFIG_KMOD is not set
+CONFIG_KMOD=y
 CONFIG_STOP_MACHINE=y
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
 CONFIG_BLK_DEV_IO_TRACE=y
 # CONFIG_LSF is not set
 CONFIG_BLK_DEV_BSG=y
+# CONFIG_BLK_DEV_INTEGRITY is not set
 
 #
 # IO Schedulers
@@ -176,19 +186,17 @@ CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
 CONFIG_SMP=y
+CONFIG_X86_FIND_SMP_CONFIG=y
+CONFIG_X86_MPPARSE=y
 CONFIG_X86_PC=y
 # CONFIG_X86_ELAN is not set
 # CONFIG_X86_VOYAGER is not set
-# CONFIG_X86_NUMAQ is not set
-# CONFIG_X86_SUMMIT is not set
-# CONFIG_X86_BIGSMP is not set
-# CONFIG_X86_VISWS is not set
 # CONFIG_X86_GENERICARCH is not set
-# CONFIG_X86_ES7000 is not set
-# CONFIG_X86_RDC321X is not set
 # CONFIG_X86_VSMP is not set
+# CONFIG_X86_RDC321X is not set
 CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
 # CONFIG_PARAVIRT_GUEST is not set
+# CONFIG_MEMTEST is not set
 # CONFIG_M386 is not set
 # CONFIG_M486 is not set
 # CONFIG_M586 is not set
@@ -215,21 +223,19 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
 # CONFIG_MPSC is not set
 CONFIG_MCORE2=y
 # CONFIG_GENERIC_CPU is not set
-# CONFIG_X86_GENERIC is not set
+CONFIG_X86_GENERIC=y
 CONFIG_X86_CPU=y
 CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=6
+CONFIG_X86_L1_CACHE_SHIFT=7
 CONFIG_X86_XADD=y
 CONFIG_X86_WP_WORKS_OK=y
 CONFIG_X86_INVLPG=y
 CONFIG_X86_BSWAP=y
 CONFIG_X86_POPAD_OK=y
-CONFIG_X86_GOOD_APIC=y
 CONFIG_X86_INTEL_USERCOPY=y
 CONFIG_X86_USE_PPRO_CHECKSUM=y
-CONFIG_X86_P6_NOP=y
 CONFIG_X86_TSC=y
-CONFIG_X86_MINIMUM_CPU_FAMILY=6
+CONFIG_X86_MINIMUM_CPU_FAMILY=4
 CONFIG_X86_DEBUGCTLMSR=y
 CONFIG_HPET_TIMER=y
 CONFIG_HPET_EMULATE_RTC=y
@@ -247,7 +253,7 @@ CONFIG_X86_IO_APIC=y
 CONFIG_VM86=y
 # CONFIG_TOSHIBA is not set
 # CONFIG_I8K is not set
-# CONFIG_X86_REBOOTFIXUPS is not set
+CONFIG_X86_REBOOTFIXUPS=y
 # CONFIG_MICROCODE is not set
 CONFIG_X86_MSR=y
 CONFIG_X86_CPUID=y
@@ -256,32 +262,28 @@ CONFIG_HIGHMEM4G=y
 # CONFIG_HIGHMEM64G is not set
 CONFIG_PAGE_OFFSET=0xC0000000
 CONFIG_HIGHMEM=y
-CONFIG_NEED_NODE_MEMMAP_SIZE=y
 CONFIG_ARCH_FLATMEM_ENABLE=y
 CONFIG_ARCH_SPARSEMEM_ENABLE=y
 CONFIG_ARCH_SELECT_MEMORY_MODEL=y
 CONFIG_SELECT_MEMORY_MODEL=y
-# CONFIG_FLATMEM_MANUAL is not set
+CONFIG_FLATMEM_MANUAL=y
 # CONFIG_DISCONTIGMEM_MANUAL is not set
-CONFIG_SPARSEMEM_MANUAL=y
-CONFIG_SPARSEMEM=y
-CONFIG_HAVE_MEMORY_PRESENT=y
+# CONFIG_SPARSEMEM_MANUAL is not set
+CONFIG_FLATMEM=y
+CONFIG_FLAT_NODE_MEM_MAP=y
 CONFIG_SPARSEMEM_STATIC=y
 # CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set
-
-#
-# Memory hotplug is currently incompatible with Software Suspend
-#
 CONFIG_PAGEFLAGS_EXTENDED=y
 CONFIG_SPLIT_PTLOCK_CPUS=4
 CONFIG_RESOURCES_64BIT=y
 CONFIG_ZONE_DMA_FLAG=1
 CONFIG_BOUNCE=y
 CONFIG_VIRT_TO_BUS=y
-# CONFIG_HIGHPTE is not set
+CONFIG_HIGHPTE=y
 # CONFIG_MATH_EMULATION is not set
 CONFIG_MTRR=y
-# CONFIG_X86_PAT is not set
+# CONFIG_MTRR_SANITIZER is not set
+CONFIG_X86_PAT=y
 CONFIG_EFI=y
 # CONFIG_IRQBALANCE is not set
 CONFIG_SECCOMP=y
@@ -293,6 +295,7 @@ CONFIG_HZ=1000
 CONFIG_SCHED_HRTICK=y
 CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
+# CONFIG_KEXEC_JUMP is not set
 CONFIG_PHYSICAL_START=0x1000000
 CONFIG_RELOCATABLE=y
 CONFIG_PHYSICAL_ALIGN=0x200000
@@ -312,6 +315,7 @@ CONFIG_PM_TRACE_RTC=y
 CONFIG_PM_SLEEP_SMP=y
 CONFIG_PM_SLEEP=y
 CONFIG_SUSPEND=y
+# CONFIG_PM_TEST_SUSPEND is not set
 CONFIG_SUSPEND_FREEZER=y
 CONFIG_HIBERNATION=y
 CONFIG_PM_STD_PARTITION=""
@@ -337,6 +341,7 @@ CONFIG_ACPI_THERMAL=y
 CONFIG_ACPI_BLACKLIST_YEAR=0
 # CONFIG_ACPI_DEBUG is not set
 CONFIG_ACPI_EC=y
+# CONFIG_ACPI_PCI_SLOT is not set
 CONFIG_ACPI_POWER=y
 CONFIG_ACPI_SYSTEM=y
 CONFIG_X86_PM_TIMER=y
@@ -395,8 +400,8 @@ CONFIG_PCI=y
 # CONFIG_PCI_GOBIOS is not set
 # CONFIG_PCI_GOMMCONFIG is not set
 # CONFIG_PCI_GODIRECT is not set
-CONFIG_PCI_GOANY=y
 # CONFIG_PCI_GOOLPC is not set
+CONFIG_PCI_GOANY=y
 CONFIG_PCI_BIOS=y
 CONFIG_PCI_DIRECT=y
 CONFIG_PCI_MMCONFIG=y
@@ -448,10 +453,6 @@ CONFIG_HOTPLUG_PCI=y
 CONFIG_BINFMT_ELF=y
 # CONFIG_BINFMT_AOUT is not set
 CONFIG_BINFMT_MISC=y
-
-#
-# Networking
-#
 CONFIG_NET=y
 
 #
@@ -475,7 +476,10 @@ CONFIG_IP_FIB_HASH=y
 CONFIG_IP_MULTIPLE_TABLES=y
 CONFIG_IP_ROUTE_MULTIPATH=y
 CONFIG_IP_ROUTE_VERBOSE=y
-# CONFIG_IP_PNP is not set
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
 # CONFIG_NET_IPIP is not set
 # CONFIG_NET_IPGRE is not set
 CONFIG_IP_MROUTE=y
@@ -618,7 +622,6 @@ CONFIG_NET_SCHED=y
 # CONFIG_NET_SCH_HTB is not set
 # CONFIG_NET_SCH_HFSC is not set
 # CONFIG_NET_SCH_PRIO is not set
-# CONFIG_NET_SCH_RR is not set
 # CONFIG_NET_SCH_RED is not set
 # CONFIG_NET_SCH_SFQ is not set
 # CONFIG_NET_SCH_TEQL is not set
@@ -680,28 +683,19 @@ CONFIG_FIB_RULES=y
 CONFIG_CFG80211=y
 CONFIG_NL80211=y
 CONFIG_WIRELESS_EXT=y
+CONFIG_WIRELESS_EXT_SYSFS=y
 CONFIG_MAC80211=y
 
 #
 # Rate control algorithm selection
 #
+CONFIG_MAC80211_RC_PID=y
 CONFIG_MAC80211_RC_DEFAULT_PID=y
-# CONFIG_MAC80211_RC_DEFAULT_NONE is not set
-
-#
-# Selecting 'y' for an algorithm will
-#
-
-#
-# build the algorithm into mac80211.
-#
 CONFIG_MAC80211_RC_DEFAULT="pid"
-CONFIG_MAC80211_RC_PID=y
 # CONFIG_MAC80211_MESH is not set
 CONFIG_MAC80211_LEDS=y
 # CONFIG_MAC80211_DEBUGFS is not set
-# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set
-# CONFIG_MAC80211_DEBUG is not set
+# CONFIG_MAC80211_DEBUG_MENU is not set
 # CONFIG_IEEE80211 is not set
 # CONFIG_RFKILL is not set
 # CONFIG_NET_9P is not set
@@ -717,6 +711,8 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_STANDALONE=y
 CONFIG_PREVENT_FIRMWARE_BUILD=y
 CONFIG_FW_LOADER=y
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE=""
 # CONFIG_DEBUG_DRIVER is not set
 CONFIG_DEBUG_DEVRES=y
 # CONFIG_SYS_HYPERVISOR is not set
@@ -749,6 +745,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384
 # CONFIG_BLK_DEV_XIP is not set
 # CONFIG_CDROM_PKTCDVD is not set
 # CONFIG_ATA_OVER_ETH is not set
+# CONFIG_BLK_DEV_HD is not set
 CONFIG_MISC_DEVICES=y
 # CONFIG_IBM_ASM is not set
 # CONFIG_PHANTOM is not set
@@ -760,10 +757,12 @@ CONFIG_MISC_DEVICES=y
 # CONFIG_FUJITSU_LAPTOP is not set
 # CONFIG_TC1100_WMI is not set
 # CONFIG_MSI_LAPTOP is not set
+# CONFIG_COMPAL_LAPTOP is not set
 # CONFIG_SONY_LAPTOP is not set
 # CONFIG_THINKPAD_ACPI is not set
 # CONFIG_INTEL_MENLOW is not set
 # CONFIG_ENCLOSURE_SERVICES is not set
+# CONFIG_HP_ILO is not set
 CONFIG_HAVE_IDE=y
 # CONFIG_IDE is not set
 
@@ -802,12 +801,13 @@ CONFIG_SCSI_WAIT_SCAN=m
 #
 CONFIG_SCSI_SPI_ATTRS=y
 # CONFIG_SCSI_FC_ATTRS is not set
-# CONFIG_SCSI_ISCSI_ATTRS is not set
+CONFIG_SCSI_ISCSI_ATTRS=y
 # CONFIG_SCSI_SAS_ATTRS is not set
 # CONFIG_SCSI_SAS_LIBSAS is not set
 # CONFIG_SCSI_SRP_ATTRS is not set
 # CONFIG_SCSI_LOWLEVEL is not set
 # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
+# CONFIG_SCSI_DH is not set
 CONFIG_ATA=y
 # CONFIG_ATA_NONSTANDARD is not set
 CONFIG_ATA_ACPI=y
@@ -842,7 +842,7 @@ CONFIG_PATA_AMD=y
 # CONFIG_PATA_CS5536 is not set
 # CONFIG_PATA_CYPRESS is not set
 # CONFIG_PATA_EFAR is not set
-# CONFIG_ATA_GENERIC is not set
+CONFIG_ATA_GENERIC=y
 # CONFIG_PATA_HPT366 is not set
 # CONFIG_PATA_HPT37X is not set
 # CONFIG_PATA_HPT3X2N is not set
@@ -852,7 +852,7 @@ CONFIG_PATA_AMD=y
 # CONFIG_PATA_JMICRON is not set
 # CONFIG_PATA_TRIFLEX is not set
 # CONFIG_PATA_MARVELL is not set
-# CONFIG_PATA_MPIIX is not set
+CONFIG_PATA_MPIIX=y
 CONFIG_PATA_OLDPIIX=y
 # CONFIG_PATA_NETCELL is not set
 # CONFIG_PATA_NINJA32 is not set
@@ -871,6 +871,7 @@ CONFIG_PATA_OLDPIIX=y
 # CONFIG_PATA_SIS is not set
 # CONFIG_PATA_VIA is not set
 # CONFIG_PATA_WINBOND is not set
+CONFIG_PATA_SCH=y
 CONFIG_MD=y
 CONFIG_BLK_DEV_MD=y
 # CONFIG_MD_LINEAR is not set
@@ -894,13 +895,16 @@ CONFIG_DM_ZERO=y
 #
 # IEEE 1394 (FireWire) support
 #
+
+#
+# Enable only one of the two stacks, unless you know what you are doing
+#
 # CONFIG_FIREWIRE is not set
 # CONFIG_IEEE1394 is not set
 # CONFIG_I2O is not set
 CONFIG_MACINTOSH_DRIVERS=y
 CONFIG_MAC_EMUMOUSEBTN=y
 CONFIG_NETDEVICES=y
-# CONFIG_NETDEVICES_MULTIQUEUE is not set
 # CONFIG_IFB is not set
 # CONFIG_DUMMY is not set
 # CONFIG_BONDING is not set
@@ -910,7 +914,23 @@ CONFIG_NETDEVICES=y
 # CONFIG_VETH is not set
 # CONFIG_NET_SB1000 is not set
 # CONFIG_ARCNET is not set
-# CONFIG_PHYLIB is not set
+CONFIG_PHYLIB=y
+
+#
+# MII PHY device drivers
+#
+# CONFIG_MARVELL_PHY is not set
+# CONFIG_DAVICOM_PHY is not set
+# CONFIG_QSEMI_PHY is not set
+# CONFIG_LXT_PHY is not set
+# CONFIG_CICADA_PHY is not set
+# CONFIG_VITESSE_PHY is not set
+# CONFIG_SMSC_PHY is not set
+# CONFIG_BROADCOM_PHY is not set
+# CONFIG_ICPLUS_PHY is not set
+# CONFIG_REALTEK_PHY is not set
+# CONFIG_FIXED_PHY is not set
+# CONFIG_MDIO_BITBANG is not set
 CONFIG_NET_ETHERNET=y
 CONFIG_MII=y
 # CONFIG_HAPPYMEAL is not set
@@ -943,10 +963,10 @@ CONFIG_FORCEDETH=y
 CONFIG_E100=y
 # CONFIG_FEALNX is not set
 # CONFIG_NATSEMI is not set
-# CONFIG_NE2K_PCI is not set
+CONFIG_NE2K_PCI=y
 # CONFIG_8139CP is not set
 CONFIG_8139TOO=y
-CONFIG_8139TOO_PIO=y
+# CONFIG_8139TOO_PIO is not set
 # CONFIG_8139TOO_TUNE_TWISTER is not set
 # CONFIG_8139TOO_8129 is not set
 # CONFIG_8139_OLD_RX_RESET is not set
@@ -961,25 +981,24 @@ CONFIG_NETDEV_1000=y
 # CONFIG_ACENIC is not set
 # CONFIG_DL2K is not set
 CONFIG_E1000=y
-# CONFIG_E1000_NAPI is not set
 # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
-# CONFIG_E1000E is not set
-# CONFIG_E1000E_ENABLED is not set
+CONFIG_E1000E=y
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
-# CONFIG_R8169 is not set
+CONFIG_R8169=y
 # CONFIG_SIS190 is not set
 # CONFIG_SKGE is not set
 CONFIG_SKY2=y
 # CONFIG_SKY2_DEBUG is not set
 # CONFIG_VIA_VELOCITY is not set
 CONFIG_TIGON3=y
-# CONFIG_BNX2 is not set
+CONFIG_BNX2=y
 # CONFIG_QLA3XXX is not set
 # CONFIG_ATL1 is not set
+# CONFIG_ATL1E is not set
 CONFIG_NETDEV_10000=y
 # CONFIG_CHELSIO_T1 is not set
 # CONFIG_CHELSIO_T3 is not set
@@ -1019,13 +1038,14 @@ CONFIG_WLAN_80211=y
 # CONFIG_RTL8180 is not set
 # CONFIG_RTL8187 is not set
 # CONFIG_ADM8211 is not set
+# CONFIG_MAC80211_HWSIM is not set
 # CONFIG_P54_COMMON is not set
 CONFIG_ATH5K=y
 # CONFIG_ATH5K_DEBUG is not set
-# CONFIG_IWLWIFI is not set
+# CONFIG_ATH9K is not set
 # CONFIG_IWLCORE is not set
 # CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWL4965 is not set
+# CONFIG_IWLAGN is not set
 # CONFIG_IWL3945 is not set
 # CONFIG_HOSTAP is not set
 # CONFIG_B43 is not set
@@ -1105,6 +1125,7 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y
 # CONFIG_MOUSE_PS2_TOUCHKIT is not set
 # CONFIG_MOUSE_SERIAL is not set
 # CONFIG_MOUSE_APPLETOUCH is not set
+# CONFIG_MOUSE_BCM5974 is not set
 # CONFIG_MOUSE_VSXXXAA is not set
 CONFIG_INPUT_JOYSTICK=y
 # CONFIG_JOYSTICK_ANALOG is not set
@@ -1139,12 +1160,14 @@ CONFIG_INPUT_TOUCHSCREEN=y
 # CONFIG_TOUCHSCREEN_GUNZE is not set
 # CONFIG_TOUCHSCREEN_ELO is not set
 # CONFIG_TOUCHSCREEN_MTOUCH is not set
+# CONFIG_TOUCHSCREEN_INEXIO is not set
 # CONFIG_TOUCHSCREEN_MK712 is not set
 # CONFIG_TOUCHSCREEN_PENMOUNT is not set
 # CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
 # CONFIG_TOUCHSCREEN_TOUCHWIN is not set
 # CONFIG_TOUCHSCREEN_UCB1400 is not set
 # CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
+# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
 CONFIG_INPUT_MISC=y
 # CONFIG_INPUT_PCSPKR is not set
 # CONFIG_INPUT_APANEL is not set
@@ -1173,6 +1196,7 @@ CONFIG_SERIO_LIBPS2=y
 # Character devices
 #
 CONFIG_VT=y
+CONFIG_CONSOLE_TRANSLATIONS=y
 CONFIG_VT_CONSOLE=y
 CONFIG_HW_CONSOLE=y
 CONFIG_VT_HW_CONSOLE_BINDING=y
@@ -1223,8 +1247,8 @@ CONFIG_UNIX98_PTYS=y
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_IPMI_HANDLER is not set
 CONFIG_HW_RANDOM=y
-# CONFIG_HW_RANDOM_INTEL is not set
-# CONFIG_HW_RANDOM_AMD is not set
+CONFIG_HW_RANDOM_INTEL=y
+CONFIG_HW_RANDOM_AMD=y
 CONFIG_HW_RANDOM_GEODE=y
 CONFIG_HW_RANDOM_VIA=y
 CONFIG_NVRAM=y
@@ -1245,7 +1269,6 @@ CONFIG_NVRAM=y
 # CONFIG_CS5535_GPIO is not set
 # CONFIG_RAW_DRIVER is not set
 CONFIG_HPET=y
-# CONFIG_HPET_RTC_IRQ is not set
 # CONFIG_HPET_MMAP is not set
 # CONFIG_HANGCHECK_TIMER is not set
 # CONFIG_TCG_TPM is not set
@@ -1254,43 +1277,64 @@ CONFIG_DEVPORT=y
 CONFIG_I2C=y
 CONFIG_I2C_BOARDINFO=y
 # CONFIG_I2C_CHARDEV is not set
+CONFIG_I2C_HELPER_AUTO=y
 
 #
 # I2C Hardware Bus support
 #
+
+#
+# PC SMBus host controller drivers
+#
 # CONFIG_I2C_ALI1535 is not set
 # CONFIG_I2C_ALI1563 is not set
 # CONFIG_I2C_ALI15X3 is not set
 # CONFIG_I2C_AMD756 is not set
 # CONFIG_I2C_AMD8111 is not set
 CONFIG_I2C_I801=y
-# CONFIG_I2C_I810 is not set
+# CONFIG_I2C_ISCH is not set
 # CONFIG_I2C_PIIX4 is not set
 # CONFIG_I2C_NFORCE2 is not set
-# CONFIG_I2C_OCORES is not set
-# CONFIG_I2C_PARPORT_LIGHT is not set
-# CONFIG_I2C_PROSAVAGE is not set
-# CONFIG_I2C_SAVAGE4 is not set
-# CONFIG_I2C_SIMTEC is not set
-# CONFIG_SCx200_ACB is not set
 # CONFIG_I2C_SIS5595 is not set
 # CONFIG_I2C_SIS630 is not set
 # CONFIG_I2C_SIS96X is not set
-# CONFIG_I2C_TAOS_EVM is not set
-# CONFIG_I2C_STUB is not set
-# CONFIG_I2C_TINY_USB is not set
 # CONFIG_I2C_VIA is not set
 # CONFIG_I2C_VIAPRO is not set
+
+#
+# I2C system bus drivers (mostly embedded / system-on-chip)
+#
+# CONFIG_I2C_OCORES is not set
+# CONFIG_I2C_SIMTEC is not set
+
+#
+# External I2C/SMBus adapter drivers
+#
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_TAOS_EVM is not set
+# CONFIG_I2C_TINY_USB is not set
+
+#
+# Graphics adapter I2C/DDC channel drivers
+#
 # CONFIG_I2C_VOODOO3 is not set
+
+#
+# Other I2C/SMBus bus drivers
+#
 # CONFIG_I2C_PCA_PLATFORM is not set
+# CONFIG_I2C_STUB is not set
+# CONFIG_SCx200_ACB is not set
 
 #
 # Miscellaneous I2C Chip support
 #
 # CONFIG_DS1682 is not set
+# CONFIG_AT24 is not set
 # CONFIG_SENSORS_EEPROM is not set
 # CONFIG_SENSORS_PCF8574 is not set
 # CONFIG_PCF8575 is not set
+# CONFIG_SENSORS_PCA9539 is not set
 # CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_MAX6875 is not set
 # CONFIG_SENSORS_TSL2550 is not set
@@ -1299,6 +1343,8 @@ CONFIG_I2C_I801=y
 # CONFIG_I2C_DEBUG_BUS is not set
 # CONFIG_I2C_DEBUG_CHIP is not set
 # CONFIG_SPI is not set
+CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
+# CONFIG_GPIOLIB is not set
 # CONFIG_W1 is not set
 CONFIG_POWER_SUPPLY=y
 # CONFIG_POWER_SUPPLY_DEBUG is not set
@@ -1360,8 +1406,10 @@ CONFIG_SSB_POSSIBLE=y
 #
 # Multifunction device drivers
 #
+# CONFIG_MFD_CORE is not set
 # CONFIG_MFD_SM501 is not set
 # CONFIG_HTC_PASIC3 is not set
+# CONFIG_MFD_TMIO is not set
 
 #
 # Multimedia devices
@@ -1372,6 +1420,7 @@ CONFIG_SSB_POSSIBLE=y
 #
 # CONFIG_VIDEO_DEV is not set
 # CONFIG_DVB_CORE is not set
+# CONFIG_VIDEO_MEDIA is not set
 
 #
 # Multimedia drivers
@@ -1418,7 +1467,6 @@ CONFIG_FB_CFB_IMAGEBLIT=y
 # CONFIG_FB_SYS_IMAGEBLIT is not set
 # CONFIG_FB_FOREIGN_ENDIAN is not set
 # CONFIG_FB_SYS_FOPS is not set
-CONFIG_FB_DEFERRED_IO=y
 # CONFIG_FB_SVGALIB is not set
 # CONFIG_FB_MACMODES is not set
 # CONFIG_FB_BACKLIGHT is not set
@@ -1463,6 +1511,7 @@ CONFIG_FB_EFI=y
 # CONFIG_FB_TRIDENT is not set
 # CONFIG_FB_ARK is not set
 # CONFIG_FB_PM3 is not set
+# CONFIG_FB_CARMINE is not set
 # CONFIG_FB_GEODE is not set
 # CONFIG_FB_VIRTUAL is not set
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
@@ -1470,6 +1519,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y
 CONFIG_BACKLIGHT_CLASS_DEVICE=y
 # CONFIG_BACKLIGHT_CORGI is not set
 # CONFIG_BACKLIGHT_PROGEAR is not set
+# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
 
 #
 # Display device support
@@ -1488,15 +1538,7 @@ CONFIG_LOGO=y
 # CONFIG_LOGO_LINUX_MONO is not set
 # CONFIG_LOGO_LINUX_VGA16 is not set
 CONFIG_LOGO_LINUX_CLUT224=y
-
-#
-# Sound
-#
 CONFIG_SOUND=y
-
-#
-# Advanced Linux Sound Architecture
-#
 CONFIG_SND=y
 CONFIG_SND_TIMER=y
 CONFIG_SND_PCM=y
@@ -1514,20 +1556,14 @@ CONFIG_SND_VERBOSE_PROCFS=y
 # CONFIG_SND_VERBOSE_PRINTK is not set
 # CONFIG_SND_DEBUG is not set
 CONFIG_SND_VMASTER=y
-
-#
-# Generic devices
-#
+CONFIG_SND_DRIVERS=y
 # CONFIG_SND_PCSP is not set
 # CONFIG_SND_DUMMY is not set
 # CONFIG_SND_VIRMIDI is not set
 # CONFIG_SND_MTPAV is not set
 # CONFIG_SND_SERIAL_U16550 is not set
 # CONFIG_SND_MPU401 is not set
-
-#
-# PCI devices
-#
+CONFIG_SND_PCI=y
 # CONFIG_SND_AD1889 is not set
 # CONFIG_SND_ALS300 is not set
 # CONFIG_SND_ALS4000 is not set
@@ -1602,36 +1638,14 @@ CONFIG_SND_HDA_GENERIC=y
 # CONFIG_SND_VIRTUOSO is not set
 # CONFIG_SND_VX222 is not set
 # CONFIG_SND_YMFPCI is not set
-
-#
-# USB devices
-#
+CONFIG_SND_USB=y
 # CONFIG_SND_USB_AUDIO is not set
 # CONFIG_SND_USB_USX2Y is not set
 # CONFIG_SND_USB_CAIAQ is not set
-
-#
-# PCMCIA devices
-#
+CONFIG_SND_PCMCIA=y
 # CONFIG_SND_VXPOCKET is not set
 # CONFIG_SND_PDAUDIOCF is not set
1618
1619#
1620# System on Chip audio support
1621#
1622# CONFIG_SND_SOC is not set 1648# CONFIG_SND_SOC is not set
1623
1624#
1625# ALSA SoC audio for Freescale SOCs
1626#
1627
1628#
1629# SoC Audio for the Texas Instruments OMAP
1630#
1631
1632#
1633# Open Sound System
1634#
1635# CONFIG_SOUND_PRIME is not set 1649# CONFIG_SOUND_PRIME is not set
1636CONFIG_HID_SUPPORT=y 1650CONFIG_HID_SUPPORT=y
1637CONFIG_HID=y 1651CONFIG_HID=y
@@ -1667,6 +1681,7 @@ CONFIG_USB_DEVICEFS=y
1667# CONFIG_USB_DYNAMIC_MINORS is not set 1681# CONFIG_USB_DYNAMIC_MINORS is not set
1668CONFIG_USB_SUSPEND=y 1682CONFIG_USB_SUSPEND=y
1669# CONFIG_USB_OTG is not set 1683# CONFIG_USB_OTG is not set
1684CONFIG_USB_MON=y
1670 1685
1671# 1686#
1672# USB Host Controller Drivers 1687# USB Host Controller Drivers
@@ -1690,6 +1705,7 @@ CONFIG_USB_UHCI_HCD=y
1690# 1705#
1691# CONFIG_USB_ACM is not set 1706# CONFIG_USB_ACM is not set
1692CONFIG_USB_PRINTER=y 1707CONFIG_USB_PRINTER=y
1708# CONFIG_USB_WDM is not set
1693 1709
1694# 1710#
1695# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' 1711# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
@@ -1711,6 +1727,7 @@ CONFIG_USB_STORAGE=y
1711# CONFIG_USB_STORAGE_ALAUDA is not set 1727# CONFIG_USB_STORAGE_ALAUDA is not set
1712# CONFIG_USB_STORAGE_ONETOUCH is not set 1728# CONFIG_USB_STORAGE_ONETOUCH is not set
1713# CONFIG_USB_STORAGE_KARMA is not set 1729# CONFIG_USB_STORAGE_KARMA is not set
1730# CONFIG_USB_STORAGE_SIERRA is not set
1714# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set 1731# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
1715CONFIG_USB_LIBUSUAL=y 1732CONFIG_USB_LIBUSUAL=y
1716 1733
@@ -1719,7 +1736,6 @@ CONFIG_USB_LIBUSUAL=y
1719# 1736#
1720# CONFIG_USB_MDC800 is not set 1737# CONFIG_USB_MDC800 is not set
1721# CONFIG_USB_MICROTEK is not set 1738# CONFIG_USB_MICROTEK is not set
1722CONFIG_USB_MON=y
1723 1739
1724# 1740#
1725# USB port drivers 1741# USB port drivers
@@ -1732,7 +1748,6 @@ CONFIG_USB_MON=y
1732# CONFIG_USB_EMI62 is not set 1748# CONFIG_USB_EMI62 is not set
1733# CONFIG_USB_EMI26 is not set 1749# CONFIG_USB_EMI26 is not set
1734# CONFIG_USB_ADUTUX is not set 1750# CONFIG_USB_ADUTUX is not set
1735# CONFIG_USB_AUERSWALD is not set
1736# CONFIG_USB_RIO500 is not set 1751# CONFIG_USB_RIO500 is not set
1737# CONFIG_USB_LEGOTOWER is not set 1752# CONFIG_USB_LEGOTOWER is not set
1738# CONFIG_USB_LCD is not set 1753# CONFIG_USB_LCD is not set
@@ -1749,6 +1764,7 @@ CONFIG_USB_MON=y
1749# CONFIG_USB_TRANCEVIBRATOR is not set 1764# CONFIG_USB_TRANCEVIBRATOR is not set
1750# CONFIG_USB_IOWARRIOR is not set 1765# CONFIG_USB_IOWARRIOR is not set
1751# CONFIG_USB_TEST is not set 1766# CONFIG_USB_TEST is not set
1767# CONFIG_USB_ISIGHTFW is not set
1752# CONFIG_USB_GADGET is not set 1768# CONFIG_USB_GADGET is not set
1753# CONFIG_MMC is not set 1769# CONFIG_MMC is not set
1754# CONFIG_MEMSTICK is not set 1770# CONFIG_MEMSTICK is not set
@@ -1758,7 +1774,9 @@ CONFIG_LEDS_CLASS=y
1758# 1774#
1759# LED drivers 1775# LED drivers
1760# 1776#
1777# CONFIG_LEDS_PCA9532 is not set
1761# CONFIG_LEDS_CLEVO_MAIL is not set 1778# CONFIG_LEDS_CLEVO_MAIL is not set
1779# CONFIG_LEDS_PCA955X is not set
1762 1780
1763# 1781#
1764# LED Triggers 1782# LED Triggers
@@ -1804,6 +1822,7 @@ CONFIG_RTC_INTF_DEV=y
1804# CONFIG_RTC_DRV_PCF8583 is not set 1822# CONFIG_RTC_DRV_PCF8583 is not set
1805# CONFIG_RTC_DRV_M41T80 is not set 1823# CONFIG_RTC_DRV_M41T80 is not set
1806# CONFIG_RTC_DRV_S35390A is not set 1824# CONFIG_RTC_DRV_S35390A is not set
1825# CONFIG_RTC_DRV_FM3130 is not set
1807 1826
1808# 1827#
1809# SPI RTC drivers 1828# SPI RTC drivers
@@ -1836,11 +1855,13 @@ CONFIG_DMADEVICES=y
1836# Firmware Drivers 1855# Firmware Drivers
1837# 1856#
1838# CONFIG_EDD is not set 1857# CONFIG_EDD is not set
1858CONFIG_FIRMWARE_MEMMAP=y
1839CONFIG_EFI_VARS=y 1859CONFIG_EFI_VARS=y
1840# CONFIG_DELL_RBU is not set 1860# CONFIG_DELL_RBU is not set
1841# CONFIG_DCDBAS is not set 1861# CONFIG_DCDBAS is not set
1842CONFIG_DMIID=y 1862CONFIG_DMIID=y
1843# CONFIG_ISCSI_IBFT_FIND is not set 1863CONFIG_ISCSI_IBFT_FIND=y
1864CONFIG_ISCSI_IBFT=y
1844 1865
1845# 1866#
1846# File systems 1867# File systems
@@ -1919,14 +1940,27 @@ CONFIG_HUGETLB_PAGE=y
1919# CONFIG_CRAMFS is not set 1940# CONFIG_CRAMFS is not set
1920# CONFIG_VXFS_FS is not set 1941# CONFIG_VXFS_FS is not set
1921# CONFIG_MINIX_FS is not set 1942# CONFIG_MINIX_FS is not set
1943# CONFIG_OMFS_FS is not set
1922# CONFIG_HPFS_FS is not set 1944# CONFIG_HPFS_FS is not set
1923# CONFIG_QNX4FS_FS is not set 1945# CONFIG_QNX4FS_FS is not set
1924# CONFIG_ROMFS_FS is not set 1946# CONFIG_ROMFS_FS is not set
1925# CONFIG_SYSV_FS is not set 1947# CONFIG_SYSV_FS is not set
1926# CONFIG_UFS_FS is not set 1948# CONFIG_UFS_FS is not set
1927CONFIG_NETWORK_FILESYSTEMS=y 1949CONFIG_NETWORK_FILESYSTEMS=y
1928# CONFIG_NFS_FS is not set 1950CONFIG_NFS_FS=y
1951CONFIG_NFS_V3=y
1952CONFIG_NFS_V3_ACL=y
1953CONFIG_NFS_V4=y
1954CONFIG_ROOT_NFS=y
1929# CONFIG_NFSD is not set 1955# CONFIG_NFSD is not set
1956CONFIG_LOCKD=y
1957CONFIG_LOCKD_V4=y
1958CONFIG_NFS_ACL_SUPPORT=y
1959CONFIG_NFS_COMMON=y
1960CONFIG_SUNRPC=y
1961CONFIG_SUNRPC_GSS=y
1962CONFIG_RPCSEC_GSS_KRB5=y
1963# CONFIG_RPCSEC_GSS_SPKM3 is not set
1930# CONFIG_SMB_FS is not set 1964# CONFIG_SMB_FS is not set
1931# CONFIG_CIFS is not set 1965# CONFIG_CIFS is not set
1932# CONFIG_NCP_FS is not set 1966# CONFIG_NCP_FS is not set
@@ -2000,9 +2034,9 @@ CONFIG_NLS_UTF8=y
2000# Kernel hacking 2034# Kernel hacking
2001# 2035#
2002CONFIG_TRACE_IRQFLAGS_SUPPORT=y 2036CONFIG_TRACE_IRQFLAGS_SUPPORT=y
2003# CONFIG_PRINTK_TIME is not set 2037CONFIG_PRINTK_TIME=y
2004# CONFIG_ENABLE_WARN_DEPRECATED is not set 2038CONFIG_ENABLE_WARN_DEPRECATED=y
2005# CONFIG_ENABLE_MUST_CHECK is not set 2039CONFIG_ENABLE_MUST_CHECK=y
2006CONFIG_FRAME_WARN=2048 2040CONFIG_FRAME_WARN=2048
2007CONFIG_MAGIC_SYSRQ=y 2041CONFIG_MAGIC_SYSRQ=y
2008# CONFIG_UNUSED_SYMBOLS is not set 2042# CONFIG_UNUSED_SYMBOLS is not set
@@ -2032,6 +2066,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
2032# CONFIG_DEBUG_INFO is not set 2066# CONFIG_DEBUG_INFO is not set
2033# CONFIG_DEBUG_VM is not set 2067# CONFIG_DEBUG_VM is not set
2034# CONFIG_DEBUG_WRITECOUNT is not set 2068# CONFIG_DEBUG_WRITECOUNT is not set
2069CONFIG_DEBUG_MEMORY_INIT=y
2035# CONFIG_DEBUG_LIST is not set 2070# CONFIG_DEBUG_LIST is not set
2036# CONFIG_DEBUG_SG is not set 2071# CONFIG_DEBUG_SG is not set
2037CONFIG_FRAME_POINTER=y 2072CONFIG_FRAME_POINTER=y
@@ -2042,23 +2077,32 @@ CONFIG_FRAME_POINTER=y
2042# CONFIG_LKDTM is not set 2077# CONFIG_LKDTM is not set
2043# CONFIG_FAULT_INJECTION is not set 2078# CONFIG_FAULT_INJECTION is not set
2044# CONFIG_LATENCYTOP is not set 2079# CONFIG_LATENCYTOP is not set
2080CONFIG_SYSCTL_SYSCALL_CHECK=y
2081CONFIG_HAVE_FTRACE=y
2082CONFIG_HAVE_DYNAMIC_FTRACE=y
2083# CONFIG_FTRACE is not set
2084# CONFIG_IRQSOFF_TRACER is not set
2085# CONFIG_SYSPROF_TRACER is not set
2086# CONFIG_SCHED_TRACER is not set
2087# CONFIG_CONTEXT_SWITCH_TRACER is not set
2045CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2088CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2046# CONFIG_SAMPLES is not set 2089# CONFIG_SAMPLES is not set
2047# CONFIG_KGDB is not set
2048CONFIG_HAVE_ARCH_KGDB=y 2090CONFIG_HAVE_ARCH_KGDB=y
2049# CONFIG_NONPROMISC_DEVMEM is not set 2091# CONFIG_KGDB is not set
2092# CONFIG_STRICT_DEVMEM is not set
2093CONFIG_X86_VERBOSE_BOOTUP=y
2050CONFIG_EARLY_PRINTK=y 2094CONFIG_EARLY_PRINTK=y
2051CONFIG_DEBUG_STACKOVERFLOW=y 2095CONFIG_DEBUG_STACKOVERFLOW=y
2052CONFIG_DEBUG_STACK_USAGE=y 2096CONFIG_DEBUG_STACK_USAGE=y
2053# CONFIG_DEBUG_PAGEALLOC is not set 2097# CONFIG_DEBUG_PAGEALLOC is not set
2098# CONFIG_DEBUG_PER_CPU_MAPS is not set
2054# CONFIG_X86_PTDUMP is not set 2099# CONFIG_X86_PTDUMP is not set
2055CONFIG_DEBUG_RODATA=y 2100CONFIG_DEBUG_RODATA=y
2056# CONFIG_DEBUG_RODATA_TEST is not set 2101# CONFIG_DEBUG_RODATA_TEST is not set
2057CONFIG_DEBUG_NX_TEST=m 2102CONFIG_DEBUG_NX_TEST=m
2058# CONFIG_4KSTACKS is not set 2103# CONFIG_4KSTACKS is not set
2059CONFIG_X86_FIND_SMP_CONFIG=y
2060CONFIG_X86_MPPARSE=y
2061CONFIG_DOUBLEFAULT=y 2104CONFIG_DOUBLEFAULT=y
2105# CONFIG_MMIOTRACE is not set
2062CONFIG_IO_DELAY_TYPE_0X80=0 2106CONFIG_IO_DELAY_TYPE_0X80=0
2063CONFIG_IO_DELAY_TYPE_0XED=1 2107CONFIG_IO_DELAY_TYPE_0XED=1
2064CONFIG_IO_DELAY_TYPE_UDELAY=2 2108CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2070,6 +2114,7 @@ CONFIG_IO_DELAY_0X80=y
2070CONFIG_DEFAULT_IO_DELAY_TYPE=0 2114CONFIG_DEFAULT_IO_DELAY_TYPE=0
2071CONFIG_DEBUG_BOOT_PARAMS=y 2115CONFIG_DEBUG_BOOT_PARAMS=y
2072# CONFIG_CPA_DEBUG is not set 2116# CONFIG_CPA_DEBUG is not set
2117# CONFIG_OPTIMIZE_INLINING is not set
2073 2118
2074# 2119#
2075# Security options 2120# Security options
@@ -2079,7 +2124,6 @@ CONFIG_KEYS_DEBUG_PROC_KEYS=y
2079CONFIG_SECURITY=y 2124CONFIG_SECURITY=y
2080CONFIG_SECURITY_NETWORK=y 2125CONFIG_SECURITY_NETWORK=y
2081# CONFIG_SECURITY_NETWORK_XFRM is not set 2126# CONFIG_SECURITY_NETWORK_XFRM is not set
2082CONFIG_SECURITY_CAPABILITIES=y
2083CONFIG_SECURITY_FILE_CAPABILITIES=y 2127CONFIG_SECURITY_FILE_CAPABILITIES=y
2084# CONFIG_SECURITY_ROOTPLUG is not set 2128# CONFIG_SECURITY_ROOTPLUG is not set
2085CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 2129CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
@@ -2140,6 +2184,10 @@ CONFIG_CRYPTO_HMAC=y
2140# CONFIG_CRYPTO_MD4 is not set 2184# CONFIG_CRYPTO_MD4 is not set
2141CONFIG_CRYPTO_MD5=y 2185CONFIG_CRYPTO_MD5=y
2142# CONFIG_CRYPTO_MICHAEL_MIC is not set 2186# CONFIG_CRYPTO_MICHAEL_MIC is not set
2187# CONFIG_CRYPTO_RMD128 is not set
2188# CONFIG_CRYPTO_RMD160 is not set
2189# CONFIG_CRYPTO_RMD256 is not set
2190# CONFIG_CRYPTO_RMD320 is not set
2143CONFIG_CRYPTO_SHA1=y 2191CONFIG_CRYPTO_SHA1=y
2144# CONFIG_CRYPTO_SHA256 is not set 2192# CONFIG_CRYPTO_SHA256 is not set
2145# CONFIG_CRYPTO_SHA512 is not set 2193# CONFIG_CRYPTO_SHA512 is not set
@@ -2150,7 +2198,7 @@ CONFIG_CRYPTO_SHA1=y
2150# Ciphers 2198# Ciphers
2151# 2199#
2152CONFIG_CRYPTO_AES=y 2200CONFIG_CRYPTO_AES=y
2153# CONFIG_CRYPTO_AES_586 is not set 2201CONFIG_CRYPTO_AES_586=y
2154# CONFIG_CRYPTO_ANUBIS is not set 2202# CONFIG_CRYPTO_ANUBIS is not set
2155CONFIG_CRYPTO_ARC4=y 2203CONFIG_CRYPTO_ARC4=y
2156# CONFIG_CRYPTO_BLOWFISH is not set 2204# CONFIG_CRYPTO_BLOWFISH is not set
@@ -2192,6 +2240,7 @@ CONFIG_GENERIC_FIND_FIRST_BIT=y
2192CONFIG_GENERIC_FIND_NEXT_BIT=y 2240CONFIG_GENERIC_FIND_NEXT_BIT=y
2193# CONFIG_CRC_CCITT is not set 2241# CONFIG_CRC_CCITT is not set
2194# CONFIG_CRC16 is not set 2242# CONFIG_CRC16 is not set
2243CONFIG_CRC_T10DIF=y
2195# CONFIG_CRC_ITU_T is not set 2244# CONFIG_CRC_ITU_T is not set
2196CONFIG_CRC32=y 2245CONFIG_CRC32=y
2197# CONFIG_CRC7 is not set 2246# CONFIG_CRC7 is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 743e80392731..ab91941e5496 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,13 +1,13 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.26-rc1
-# Sun May 4 19:59:57 2008
+# Linux kernel version: 2.6.27-rc4
+# Mon Aug 25 14:40:46 2008
 #
 CONFIG_64BIT=y
 # CONFIG_X86_32 is not set
 CONFIG_X86_64=y
 CONFIG_X86=y
-CONFIG_DEFCONFIG_LIST="arch/x86/configs/x86_64_defconfig"
+CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
 # CONFIG_GENERIC_LOCKBREAK is not set
 CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_CMOS_UPDATE=y
@@ -53,6 +53,7 @@ CONFIG_X86_HT=y
 CONFIG_X86_BIOS_REBOOT=y
 CONFIG_X86_TRAMPOLINE=y
 # CONFIG_KTIME_SCALAR is not set
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 
 #
 # General setup
@@ -82,6 +83,7 @@ CONFIG_CGROUPS=y
 CONFIG_CGROUP_NS=y
 # CONFIG_CGROUP_DEVICE is not set
 CONFIG_CPUSETS=y
+CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
 CONFIG_GROUP_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
 # CONFIG_RT_GROUP_SCHED is not set
@@ -105,7 +107,6 @@ CONFIG_SYSCTL=y
 # CONFIG_EMBEDDED is not set
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
-CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
@@ -113,6 +114,7 @@ CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
 CONFIG_ELF_CORE=y
+CONFIG_PCSPKR_PLATFORM=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
@@ -132,25 +134,33 @@ CONFIG_MARKERS=y
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_KPROBES=y
+CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
 CONFIG_KRETPROBES=y
+CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
+# CONFIG_HAVE_ARCH_TRACEHOOK is not set
 # CONFIG_HAVE_DMA_ATTRS is not set
+CONFIG_USE_GENERIC_SMP_HELPERS=y
+# CONFIG_HAVE_CLK is not set
 CONFIG_PROC_PAGE_MONITOR=y
+# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
 # CONFIG_TINY_SHMEM is not set
 CONFIG_BASE_SMALL=0
 CONFIG_MODULES=y
+# CONFIG_MODULE_FORCE_LOAD is not set
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODVERSIONS is not set
 # CONFIG_MODULE_SRCVERSION_ALL is not set
-# CONFIG_KMOD is not set
+CONFIG_KMOD=y
 CONFIG_STOP_MACHINE=y
 CONFIG_BLOCK=y
 CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_BLK_DEV_BSG=y
+# CONFIG_BLK_DEV_INTEGRITY is not set
 CONFIG_BLOCK_COMPAT=y
 
 #
@@ -175,20 +185,15 @@ CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
 CONFIG_SMP=y
+CONFIG_X86_FIND_SMP_CONFIG=y
+CONFIG_X86_MPPARSE=y
 CONFIG_X86_PC=y
 # CONFIG_X86_ELAN is not set
 # CONFIG_X86_VOYAGER is not set
-# CONFIG_X86_NUMAQ is not set
-# CONFIG_X86_SUMMIT is not set
-# CONFIG_X86_BIGSMP is not set
-# CONFIG_X86_VISWS is not set
 # CONFIG_X86_GENERICARCH is not set
-# CONFIG_X86_ES7000 is not set
-# CONFIG_X86_RDC321X is not set
 # CONFIG_X86_VSMP is not set
 # CONFIG_PARAVIRT_GUEST is not set
-CONFIG_MEMTEST_BOOTPARAM=y
-CONFIG_MEMTEST_BOOTPARAM_VALUE=0
+# CONFIG_MEMTEST is not set
 # CONFIG_M386 is not set
 # CONFIG_M486 is not set
 # CONFIG_M586 is not set
@@ -220,11 +225,12 @@ CONFIG_X86_L1_CACHE_BYTES=64
 CONFIG_X86_INTERNODE_CACHE_BYTES=64
 CONFIG_X86_CMPXCHG=y
 CONFIG_X86_L1_CACHE_SHIFT=6
-CONFIG_X86_GOOD_APIC=y
+CONFIG_X86_WP_WORKS_OK=y
 CONFIG_X86_INTEL_USERCOPY=y
 CONFIG_X86_USE_PPRO_CHECKSUM=y
 CONFIG_X86_P6_NOP=y
 CONFIG_X86_TSC=y
+CONFIG_X86_CMPXCHG64=y
 CONFIG_X86_CMOV=y
 CONFIG_X86_MINIMUM_CPU_FAMILY=64
 CONFIG_X86_DEBUGCTLMSR=y
@@ -234,8 +240,10 @@ CONFIG_DMI=y
 CONFIG_GART_IOMMU=y
 CONFIG_CALGARY_IOMMU=y
 CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
+CONFIG_AMD_IOMMU=y
 CONFIG_SWIOTLB=y
 CONFIG_IOMMU_HELPER=y
+# CONFIG_MAXSMP is not set
 CONFIG_NR_CPUS=4
 # CONFIG_SCHED_SMT is not set
 CONFIG_SCHED_MC=y
@@ -281,6 +289,7 @@ CONFIG_ZONE_DMA_FLAG=1
 CONFIG_BOUNCE=y
 CONFIG_VIRT_TO_BUS=y
 CONFIG_MTRR=y
+# CONFIG_MTRR_SANITIZER is not set
 # CONFIG_X86_PAT is not set
 CONFIG_EFI=y
 CONFIG_SECCOMP=y
@@ -313,6 +322,7 @@ CONFIG_PM_TRACE_RTC=y
 CONFIG_PM_SLEEP_SMP=y
 CONFIG_PM_SLEEP=y
 CONFIG_SUSPEND=y
+# CONFIG_PM_TEST_SUSPEND is not set
 CONFIG_SUSPEND_FREEZER=y
 CONFIG_HIBERNATION=y
 CONFIG_PM_STD_PARTITION=""
@@ -339,6 +349,7 @@ CONFIG_ACPI_NUMA=y
 CONFIG_ACPI_BLACKLIST_YEAR=0
 # CONFIG_ACPI_DEBUG is not set
 CONFIG_ACPI_EC=y
+# CONFIG_ACPI_PCI_SLOT is not set
 CONFIG_ACPI_POWER=y
 CONFIG_ACPI_SYSTEM=y
 CONFIG_X86_PM_TIMER=y
@@ -437,10 +448,6 @@ CONFIG_IA32_EMULATION=y
 CONFIG_COMPAT=y
 CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
 CONFIG_SYSVIPC_COMPAT=y
-
-#
-# Networking
-#
 CONFIG_NET=y
 
 #
@@ -464,7 +471,10 @@ CONFIG_IP_FIB_HASH=y
 CONFIG_IP_MULTIPLE_TABLES=y
 CONFIG_IP_ROUTE_MULTIPATH=y
 CONFIG_IP_ROUTE_VERBOSE=y
-# CONFIG_IP_PNP is not set
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
 # CONFIG_NET_IPIP is not set
 # CONFIG_NET_IPGRE is not set
 CONFIG_IP_MROUTE=y
@@ -607,7 +617,6 @@ CONFIG_NET_SCHED=y
 # CONFIG_NET_SCH_HTB is not set
 # CONFIG_NET_SCH_HFSC is not set
 # CONFIG_NET_SCH_PRIO is not set
-# CONFIG_NET_SCH_RR is not set
 # CONFIG_NET_SCH_RED is not set
 # CONFIG_NET_SCH_SFQ is not set
 # CONFIG_NET_SCH_TEQL is not set
@@ -669,28 +678,19 @@ CONFIG_FIB_RULES=y
 CONFIG_CFG80211=y
 CONFIG_NL80211=y
 CONFIG_WIRELESS_EXT=y
+CONFIG_WIRELESS_EXT_SYSFS=y
 CONFIG_MAC80211=y
 
 #
 # Rate control algorithm selection
 #
+CONFIG_MAC80211_RC_PID=y
 CONFIG_MAC80211_RC_DEFAULT_PID=y
-# CONFIG_MAC80211_RC_DEFAULT_NONE is not set
-
-#
-# Selecting 'y' for an algorithm will
-#
-
-#
-# build the algorithm into mac80211.
-#
 CONFIG_MAC80211_RC_DEFAULT="pid"
-CONFIG_MAC80211_RC_PID=y
 # CONFIG_MAC80211_MESH is not set
 CONFIG_MAC80211_LEDS=y
 # CONFIG_MAC80211_DEBUGFS is not set
-# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set
-# CONFIG_MAC80211_DEBUG is not set
+# CONFIG_MAC80211_DEBUG_MENU is not set
 # CONFIG_IEEE80211 is not set
 # CONFIG_RFKILL is not set
 # CONFIG_NET_9P is not set
@@ -706,6 +706,8 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_STANDALONE=y
 CONFIG_PREVENT_FIRMWARE_BUILD=y
 CONFIG_FW_LOADER=y
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE=""
 # CONFIG_DEBUG_DRIVER is not set
 CONFIG_DEBUG_DEVRES=y
 # CONFIG_SYS_HYPERVISOR is not set
@@ -738,6 +740,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384
 # CONFIG_BLK_DEV_XIP is not set
 # CONFIG_CDROM_PKTCDVD is not set
 # CONFIG_ATA_OVER_ETH is not set
+# CONFIG_BLK_DEV_HD is not set
 CONFIG_MISC_DEVICES=y
 # CONFIG_IBM_ASM is not set
 # CONFIG_PHANTOM is not set
@@ -748,10 +751,14 @@ CONFIG_MISC_DEVICES=y
 # CONFIG_ASUS_LAPTOP is not set
 # CONFIG_FUJITSU_LAPTOP is not set
 # CONFIG_MSI_LAPTOP is not set
+# CONFIG_COMPAL_LAPTOP is not set
 # CONFIG_SONY_LAPTOP is not set
 # CONFIG_THINKPAD_ACPI is not set
 # CONFIG_INTEL_MENLOW is not set
 # CONFIG_ENCLOSURE_SERVICES is not set
+# CONFIG_SGI_XP is not set
+# CONFIG_HP_ILO is not set
+# CONFIG_SGI_GRU is not set
 CONFIG_HAVE_IDE=y
 # CONFIG_IDE is not set
 
@@ -790,12 +797,13 @@ CONFIG_SCSI_WAIT_SCAN=m
 #
 CONFIG_SCSI_SPI_ATTRS=y
 # CONFIG_SCSI_FC_ATTRS is not set
-# CONFIG_SCSI_ISCSI_ATTRS is not set
+CONFIG_SCSI_ISCSI_ATTRS=y
 # CONFIG_SCSI_SAS_ATTRS is not set
 # CONFIG_SCSI_SAS_LIBSAS is not set
 # CONFIG_SCSI_SRP_ATTRS is not set
 # CONFIG_SCSI_LOWLEVEL is not set
 # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
+# CONFIG_SCSI_DH is not set
 CONFIG_ATA=y
 # CONFIG_ATA_NONSTANDARD is not set
 CONFIG_ATA_ACPI=y
@@ -857,6 +865,7 @@ CONFIG_PATA_OLDPIIX=y
 # CONFIG_PATA_SIS is not set
 # CONFIG_PATA_VIA is not set
 # CONFIG_PATA_WINBOND is not set
+CONFIG_PATA_SCH=y
 CONFIG_MD=y
 CONFIG_BLK_DEV_MD=y
 # CONFIG_MD_LINEAR is not set
@@ -880,13 +889,16 @@ CONFIG_DM_ZERO=y
 #
 # IEEE 1394 (FireWire) support
 #
+
+#
+# Enable only one of the two stacks, unless you know what you are doing
+#
 # CONFIG_FIREWIRE is not set
 # CONFIG_IEEE1394 is not set
 # CONFIG_I2O is not set
 CONFIG_MACINTOSH_DRIVERS=y
 CONFIG_MAC_EMUMOUSEBTN=y
 CONFIG_NETDEVICES=y
-# CONFIG_NETDEVICES_MULTIQUEUE is not set
 # CONFIG_IFB is not set
 # CONFIG_DUMMY is not set
 # CONFIG_BONDING is not set
@@ -896,7 +908,23 @@ CONFIG_NETDEVICES=y
 # CONFIG_VETH is not set
 # CONFIG_NET_SB1000 is not set
 # CONFIG_ARCNET is not set
-# CONFIG_PHYLIB is not set
+CONFIG_PHYLIB=y
+
+#
+# MII PHY device drivers
+#
+# CONFIG_MARVELL_PHY is not set
+# CONFIG_DAVICOM_PHY is not set
+# CONFIG_QSEMI_PHY is not set
+# CONFIG_LXT_PHY is not set
+# CONFIG_CICADA_PHY is not set
+# CONFIG_VITESSE_PHY is not set
+# CONFIG_SMSC_PHY is not set
+# CONFIG_BROADCOM_PHY is not set
+# CONFIG_ICPLUS_PHY is not set
+# CONFIG_REALTEK_PHY is not set
+# CONFIG_FIXED_PHY is not set
+# CONFIG_MDIO_BITBANG is not set
 CONFIG_NET_ETHERNET=y
 CONFIG_MII=y
 # CONFIG_HAPPYMEAL is not set
@@ -940,16 +968,15 @@ CONFIG_8139TOO_PIO=y
 # CONFIG_SIS900 is not set
 # CONFIG_EPIC100 is not set
 # CONFIG_SUNDANCE is not set
+# CONFIG_TLAN is not set
 # CONFIG_VIA_RHINE is not set
 # CONFIG_SC92031 is not set
 CONFIG_NETDEV_1000=y
 # CONFIG_ACENIC is not set
 # CONFIG_DL2K is not set
 CONFIG_E1000=y
-# CONFIG_E1000_NAPI is not set
 # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
 # CONFIG_E1000E is not set
-# CONFIG_E1000E_ENABLED is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
 # CONFIG_NS83820 is not set
@@ -965,6 +992,7 @@ CONFIG_TIGON3=y
 # CONFIG_BNX2 is not set
 # CONFIG_QLA3XXX is not set
 # CONFIG_ATL1 is not set
+# CONFIG_ATL1E is not set
 CONFIG_NETDEV_10000=y
 # CONFIG_CHELSIO_T1 is not set
 # CONFIG_CHELSIO_T3 is not set
@@ -1003,13 +1031,14 @@ CONFIG_WLAN_80211=y
 # CONFIG_RTL8180 is not set
 # CONFIG_RTL8187 is not set
 # CONFIG_ADM8211 is not set
+# CONFIG_MAC80211_HWSIM is not set
 # CONFIG_P54_COMMON is not set
 CONFIG_ATH5K=y
 # CONFIG_ATH5K_DEBUG is not set
-# CONFIG_IWLWIFI is not set
+# CONFIG_ATH9K is not set
 # CONFIG_IWLCORE is not set
 # CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWL4965 is not set
+# CONFIG_IWLAGN is not set
 # CONFIG_IWL3945 is not set
 # CONFIG_HOSTAP is not set
 # CONFIG_B43 is not set
@@ -1088,6 +1117,7 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y
 # CONFIG_MOUSE_PS2_TOUCHKIT is not set
 # CONFIG_MOUSE_SERIAL is not set
 # CONFIG_MOUSE_APPLETOUCH is not set
+# CONFIG_MOUSE_BCM5974 is not set
 # CONFIG_MOUSE_VSXXXAA is not set
 CONFIG_INPUT_JOYSTICK=y
 # CONFIG_JOYSTICK_ANALOG is not set
@@ -1122,12 +1152,14 @@ CONFIG_INPUT_TOUCHSCREEN=y
 # CONFIG_TOUCHSCREEN_GUNZE is not set
 # CONFIG_TOUCHSCREEN_ELO is not set
 # CONFIG_TOUCHSCREEN_MTOUCH is not set
+# CONFIG_TOUCHSCREEN_INEXIO is not set
 # CONFIG_TOUCHSCREEN_MK712 is not set
 # CONFIG_TOUCHSCREEN_PENMOUNT is not set
 # CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
 # CONFIG_TOUCHSCREEN_TOUCHWIN is not set
 # CONFIG_TOUCHSCREEN_UCB1400 is not set
 # CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
+# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
 CONFIG_INPUT_MISC=y
 # CONFIG_INPUT_PCSPKR is not set
 # CONFIG_INPUT_APANEL is not set
@@ -1155,6 +1187,7 @@ CONFIG_SERIO_LIBPS2=y
 # Character devices
 #
 CONFIG_VT=y
+CONFIG_CONSOLE_TRANSLATIONS=y
 CONFIG_VT_CONSOLE=y
 CONFIG_HW_CONSOLE=y
 CONFIG_VT_HW_CONSOLE_BINDING=y
@@ -1222,7 +1255,6 @@ CONFIG_NVRAM=y
 # CONFIG_PC8736x_GPIO is not set
 # CONFIG_RAW_DRIVER is not set
 CONFIG_HPET=y
-# CONFIG_HPET_RTC_IRQ is not set
 # CONFIG_HPET_MMAP is not set
 # CONFIG_HANGCHECK_TIMER is not set
 # CONFIG_TCG_TPM is not set
@@ -1231,42 +1263,63 @@ CONFIG_DEVPORT=y
 CONFIG_I2C=y
 CONFIG_I2C_BOARDINFO=y
 # CONFIG_I2C_CHARDEV is not set
+CONFIG_I2C_HELPER_AUTO=y
 
 #
 # I2C Hardware Bus support
 #
+
+#
+# PC SMBus host controller drivers
+#
 # CONFIG_I2C_ALI1535 is not set
 # CONFIG_I2C_ALI1563 is not set
 # CONFIG_I2C_ALI15X3 is not set
 # CONFIG_I2C_AMD756 is not set
 # CONFIG_I2C_AMD8111 is not set
 CONFIG_I2C_I801=y
-# CONFIG_I2C_I810 is not set
+# CONFIG_I2C_ISCH is not set
 # CONFIG_I2C_PIIX4 is not set
 # CONFIG_I2C_NFORCE2 is not set
-# CONFIG_I2C_OCORES is not set
-# CONFIG_I2C_PARPORT_LIGHT is not set
-# CONFIG_I2C_PROSAVAGE is not set
-# CONFIG_I2C_SAVAGE4 is not set
-# CONFIG_I2C_SIMTEC is not set
 # CONFIG_I2C_SIS5595 is not set
 # CONFIG_I2C_SIS630 is not set
 # CONFIG_I2C_SIS96X is not set
-# CONFIG_I2C_TAOS_EVM is not set
-# CONFIG_I2C_STUB is not set
-# CONFIG_I2C_TINY_USB is not set
 # CONFIG_I2C_VIA is not set
 # CONFIG_I2C_VIAPRO is not set
+
+#
+# I2C system bus drivers (mostly embedded / system-on-chip)
+#
+# CONFIG_I2C_OCORES is not set
+# CONFIG_I2C_SIMTEC is not set
+
+#
+# External I2C/SMBus adapter drivers
+#
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_TAOS_EVM is not set
+# CONFIG_I2C_TINY_USB is not set
+
+#
+# Graphics adapter I2C/DDC channel drivers
+#
 # CONFIG_I2C_VOODOO3 is not set
+
+#
+# Other I2C/SMBus bus drivers
+#
 # CONFIG_I2C_PCA_PLATFORM is not set
+# CONFIG_I2C_STUB is not set
 
 #
 # Miscellaneous I2C Chip support
 #
 # CONFIG_DS1682 is not set
+# CONFIG_AT24 is not set
 # CONFIG_SENSORS_EEPROM is not set
 # CONFIG_SENSORS_PCF8574 is not set
 # CONFIG_PCF8575 is not set
+# CONFIG_SENSORS_PCA9539 is not set
 # CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_MAX6875 is not set
 # CONFIG_SENSORS_TSL2550 is not set
@@ -1275,6 +1328,8 @@ CONFIG_I2C_I801=y
 # CONFIG_I2C_DEBUG_BUS is not set
 # CONFIG_I2C_DEBUG_CHIP is not set
 # CONFIG_SPI is not set
+CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
+# CONFIG_GPIOLIB is not set
 # CONFIG_W1 is not set
 CONFIG_POWER_SUPPLY=y
 # CONFIG_POWER_SUPPLY_DEBUG is not set
@@ -1335,8 +1390,10 @@ CONFIG_SSB_POSSIBLE=y
 #
 # Multifunction device drivers
 #
+# CONFIG_MFD_CORE is not set
 # CONFIG_MFD_SM501 is not set
 # CONFIG_HTC_PASIC3 is not set
+# CONFIG_MFD_TMIO is not set
 
 #
 # Multimedia devices
@@ -1347,6 +1404,7 @@ CONFIG_SSB_POSSIBLE=y
 #
 # CONFIG_VIDEO_DEV is not set
 # CONFIG_DVB_CORE is not set
+# CONFIG_VIDEO_MEDIA is not set
 
 #
 # Multimedia drivers
@@ -1387,7 +1445,6 @@ CONFIG_FB_CFB_IMAGEBLIT=y
 # CONFIG_FB_SYS_IMAGEBLIT is not set
 # CONFIG_FB_FOREIGN_ENDIAN is not set
 # CONFIG_FB_SYS_FOPS is not set
-CONFIG_FB_DEFERRED_IO=y
 # CONFIG_FB_SVGALIB is not set
 # CONFIG_FB_MACMODES is not set
 # CONFIG_FB_BACKLIGHT is not set
@@ -1430,6 +1487,7 @@ CONFIG_FB_EFI=y
 # CONFIG_FB_TRIDENT is not set
 # CONFIG_FB_ARK is not set
 # CONFIG_FB_PM3 is not set
+# CONFIG_FB_CARMINE is not set
 # CONFIG_FB_GEODE is not set
 # CONFIG_FB_VIRTUAL is not set
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
@@ -1437,6 +1495,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y
 CONFIG_BACKLIGHT_CLASS_DEVICE=y
 # CONFIG_BACKLIGHT_CORGI is not set
 # CONFIG_BACKLIGHT_PROGEAR is not set
+# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
 
 #
 # Display device support
@@ -1455,15 +1514,7 @@ CONFIG_LOGO=y
 # CONFIG_LOGO_LINUX_MONO is not set
 # CONFIG_LOGO_LINUX_VGA16 is not set
 CONFIG_LOGO_LINUX_CLUT224=y
-
-#
-# Sound
-#
 CONFIG_SOUND=y
-
-#
-# Advanced Linux Sound Architecture
-#
 CONFIG_SND=y
 CONFIG_SND_TIMER=y
 CONFIG_SND_PCM=y
@@ -1481,20 +1532,14 @@ CONFIG_SND_VERBOSE_PROCFS=y
 # CONFIG_SND_VERBOSE_PRINTK is not set
 # CONFIG_SND_DEBUG is not set
 CONFIG_SND_VMASTER=y
-
-#
-# Generic devices
-#
+CONFIG_SND_DRIVERS=y
 # CONFIG_SND_PCSP is not set
 # CONFIG_SND_DUMMY is not set
 # CONFIG_SND_VIRMIDI is not set
 # CONFIG_SND_MTPAV is not set
 # CONFIG_SND_SERIAL_U16550 is not set
 # CONFIG_SND_MPU401 is not set
-
-#
-# PCI devices
-#
+CONFIG_SND_PCI=y
 # CONFIG_SND_AD1889 is not set
 # CONFIG_SND_ALS300 is not set
 # CONFIG_SND_ALS4000 is not set
@@ -1567,36 +1612,14 @@ CONFIG_SND_HDA_GENERIC=y
 # CONFIG_SND_VIRTUOSO is not set
 # CONFIG_SND_VX222 is not set
 # CONFIG_SND_YMFPCI is not set
-
-#
-# USB devices
-#
+CONFIG_SND_USB=y
 # CONFIG_SND_USB_AUDIO is not set
 # CONFIG_SND_USB_USX2Y is not set
 # CONFIG_SND_USB_CAIAQ is not set
-
-#
-# PCMCIA devices
-#
+CONFIG_SND_PCMCIA=y
 # CONFIG_SND_VXPOCKET is not set
 # CONFIG_SND_PDAUDIOCF is not set
-
-#
-# System on Chip audio support
-#
 # CONFIG_SND_SOC is not set
-
-#
-# ALSA SoC audio for Freescale SOCs
-#
-
-#
-# SoC Audio for the Texas Instruments OMAP
-#
-
-#
-# Open Sound System
-#
 # CONFIG_SOUND_PRIME is not set
 CONFIG_HID_SUPPORT=y
 CONFIG_HID=y
@@ -1632,6 +1655,7 @@ CONFIG_USB_DEVICEFS=y
 # CONFIG_USB_DYNAMIC_MINORS is not set
 CONFIG_USB_SUSPEND=y
 # CONFIG_USB_OTG is not set
+CONFIG_USB_MON=y
 
 #
 # USB Host Controller Drivers
@@ -1655,6 +1679,7 @@ CONFIG_USB_UHCI_HCD=y
 #
 # CONFIG_USB_ACM is not set
 CONFIG_USB_PRINTER=y
+# CONFIG_USB_WDM is not set
 
 #
 # NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
@@ -1676,6 +1701,7 @@ CONFIG_USB_STORAGE=y
 # CONFIG_USB_STORAGE_ALAUDA is not set
 # CONFIG_USB_STORAGE_ONETOUCH is not set
 # CONFIG_USB_STORAGE_KARMA is not set
+# CONFIG_USB_STORAGE_SIERRA is not set
 # CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
 CONFIG_USB_LIBUSUAL=y
 
@@ -1684,7 +1710,6 @@ CONFIG_USB_LIBUSUAL=y
 #
 # CONFIG_USB_MDC800 is not set
 # CONFIG_USB_MICROTEK is not set
-CONFIG_USB_MON=y
 
 #
 # USB port drivers
@@ -1697,7 +1722,6 @@ CONFIG_USB_MON=y
 # CONFIG_USB_EMI62 is not set
 # CONFIG_USB_EMI26 is not set
 # CONFIG_USB_ADUTUX is not set
-# CONFIG_USB_AUERSWALD is not set
 # CONFIG_USB_RIO500 is not set
 # CONFIG_USB_LEGOTOWER is not set
 # CONFIG_USB_LCD is not set
@@ -1714,6 +1738,7 @@ CONFIG_USB_MON=y
 # CONFIG_USB_TRANCEVIBRATOR is not set
 # CONFIG_USB_IOWARRIOR is not set
 # CONFIG_USB_TEST is not set
+# CONFIG_USB_ISIGHTFW is not set
 # CONFIG_USB_GADGET is not set
 # CONFIG_MMC is not set
 # CONFIG_MEMSTICK is not set
@@ -1723,7 +1748,9 @@ CONFIG_LEDS_CLASS=y
 #
 # LED drivers
 #
+# CONFIG_LEDS_PCA9532 is not set
 # CONFIG_LEDS_CLEVO_MAIL is not set
+# CONFIG_LEDS_PCA955X is not set
 
 #
 # LED Triggers
@@ -1769,6 +1796,7 @@ CONFIG_RTC_INTF_DEV=y
 # CONFIG_RTC_DRV_PCF8583 is not set
 # CONFIG_RTC_DRV_M41T80 is not set
 # CONFIG_RTC_DRV_S35390A is not set
+# CONFIG_RTC_DRV_FM3130 is not set
 
 #
 # SPI RTC drivers
@@ -1801,11 +1829,13 @@ CONFIG_DMADEVICES=y
 # Firmware Drivers
 #
 # CONFIG_EDD is not set
+CONFIG_FIRMWARE_MEMMAP=y
 CONFIG_EFI_VARS=y
 # CONFIG_DELL_RBU is not set
 # CONFIG_DCDBAS is not set
 CONFIG_DMIID=y
-# CONFIG_ISCSI_IBFT_FIND is not set
+CONFIG_ISCSI_IBFT_FIND=y
+CONFIG_ISCSI_IBFT=y
 
 #
 # File systems
@@ -1885,14 +1915,27 @@ CONFIG_HUGETLB_PAGE=y
 # CONFIG_CRAMFS is not set
 # CONFIG_VXFS_FS is not set
 # CONFIG_MINIX_FS is not set
+# CONFIG_OMFS_FS is not set
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
-# CONFIG_NFS_FS is not set
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_ROOT_NFS=y
 # CONFIG_NFSD is not set
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_NFS_ACL_SUPPORT=y
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=y
+CONFIG_SUNRPC_GSS=y
+CONFIG_RPCSEC_GSS_KRB5=y
+# CONFIG_RPCSEC_GSS_SPKM3 is not set
 # CONFIG_SMB_FS is not set
 # CONFIG_CIFS is not set
 # CONFIG_NCP_FS is not set
@@ -1966,9 +2009,9 @@ CONFIG_NLS_UTF8=y
 # Kernel hacking
 #
 CONFIG_TRACE_IRQFLAGS_SUPPORT=y
-# CONFIG_PRINTK_TIME is not set
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
-# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_PRINTK_TIME=y
+CONFIG_ENABLE_WARN_DEPRECATED=y
+CONFIG_ENABLE_MUST_CHECK=y
 CONFIG_FRAME_WARN=2048
 CONFIG_MAGIC_SYSRQ=y
 # CONFIG_UNUSED_SYMBOLS is not set
@@ -1997,6 +2040,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
 # CONFIG_DEBUG_INFO is not set
 # CONFIG_DEBUG_VM is not set
 # CONFIG_DEBUG_WRITECOUNT is not set
+CONFIG_DEBUG_MEMORY_INIT=y
 # CONFIG_DEBUG_LIST is not set
 # CONFIG_DEBUG_SG is not set
 CONFIG_FRAME_POINTER=y
@@ -2007,11 +2051,20 @@ CONFIG_FRAME_POINTER=y
 # CONFIG_LKDTM is not set
 # CONFIG_FAULT_INJECTION is not set
 # CONFIG_LATENCYTOP is not set
+CONFIG_SYSCTL_SYSCALL_CHECK=y
+CONFIG_HAVE_FTRACE=y
+CONFIG_HAVE_DYNAMIC_FTRACE=y
+# CONFIG_FTRACE is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SYSPROF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
 CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
 # CONFIG_SAMPLES is not set
-# CONFIG_KGDB is not set
 CONFIG_HAVE_ARCH_KGDB=y
-# CONFIG_NONPROMISC_DEVMEM is not set
+# CONFIG_KGDB is not set
+# CONFIG_STRICT_DEVMEM is not set
+CONFIG_X86_VERBOSE_BOOTUP=y
 CONFIG_EARLY_PRINTK=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
@@ -2022,8 +2075,8 @@ CONFIG_DEBUG_RODATA=y
 # CONFIG_DIRECT_GBPAGES is not set
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_DEBUG_NX_TEST=m
-CONFIG_X86_MPPARSE=y
 # CONFIG_IOMMU_DEBUG is not set
+# CONFIG_MMIOTRACE is not set
 CONFIG_IO_DELAY_TYPE_0X80=0
 CONFIG_IO_DELAY_TYPE_0XED=1
 CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2035,6 +2088,7 @@ CONFIG_IO_DELAY_0X80=y
 CONFIG_DEFAULT_IO_DELAY_TYPE=0
 CONFIG_DEBUG_BOOT_PARAMS=y
 # CONFIG_CPA_DEBUG is not set
+# CONFIG_OPTIMIZE_INLINING is not set
 
 #
 # Security options
@@ -2044,7 +2098,6 @@ CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
 CONFIG_SECURITY_NETWORK=y
 # CONFIG_SECURITY_NETWORK_XFRM is not set
-CONFIG_SECURITY_CAPABILITIES=y
 CONFIG_SECURITY_FILE_CAPABILITIES=y
 # CONFIG_SECURITY_ROOTPLUG is not set
 CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
@@ -2105,6 +2158,10 @@ CONFIG_CRYPTO_HMAC=y
 # CONFIG_CRYPTO_MD4 is not set
 CONFIG_CRYPTO_MD5=y
 # CONFIG_CRYPTO_MICHAEL_MIC is not set
+# CONFIG_CRYPTO_RMD128 is not set
+# CONFIG_CRYPTO_RMD160 is not set
+# CONFIG_CRYPTO_RMD256 is not set
+# CONFIG_CRYPTO_RMD320 is not set
 CONFIG_CRYPTO_SHA1=y
 # CONFIG_CRYPTO_SHA256 is not set
 # CONFIG_CRYPTO_SHA512 is not set
@@ -2154,6 +2211,7 @@ CONFIG_GENERIC_FIND_FIRST_BIT=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
 # CONFIG_CRC_CCITT is not set
 # CONFIG_CRC16 is not set
+CONFIG_CRC_T10DIF=y
 # CONFIG_CRC_ITU_T is not set
 CONFIG_CRC32=y
 # CONFIG_CRC7 is not set
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 58cccb6483b0..a0e1dbe67dc1 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -441,12 +441,6 @@ beyond_if:
 	regs->r8 = regs->r9 = regs->r10 = regs->r11 =
 	regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
 	set_fs(USER_DS);
-	if (unlikely(current->ptrace & PT_PTRACED)) {
-		if (current->ptrace & PT_TRACE_EXEC)
-			ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
-		else
-			send_sig(SIGTRAP, current, 0);
-	}
 	return 0;
 }
 
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index cb3856a18c85..20af4c79579a 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -36,6 +36,11 @@
 
 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
 
+#define FIX_EFLAGS	(X86_EFLAGS_AC | X86_EFLAGS_OF | \
+			 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
+			 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
+			 X86_EFLAGS_CF)
+
 asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
@@ -248,7 +253,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 		regs->ss |= 3;
 
 	err |= __get_user(tmpflags, &sc->flags);
-	regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
+	regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
 	/* disable syscall checks */
 	regs->orig_ax = -1;
 
@@ -515,7 +520,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 			compat_sigset_t *set, struct pt_regs *regs)
 {
 	struct rt_sigframe __user *frame;
-	struct exec_domain *ed = current_thread_info()->exec_domain;
 	void __user *restorer;
 	int err = 0;
 
@@ -538,8 +542,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		goto give_sigsegv;
 
-	err |= __put_user((ed && ed->signal_invmap && sig < 32
-			   ? ed->signal_invmap[sig] : sig), &frame->sig);
+	err |= __put_user(sig, &frame->sig);
 	err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
 	err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
 	err |= copy_siginfo_to_user32(&frame->info, info);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 20371d0635e4..ffc1bb4fed7d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -15,6 +15,16 @@
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
 
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE		0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+#define sysexit_audit int_ret_from_sys_call
+#define sysretl_audit int_ret_from_sys_call
+#endif
+
 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
 
 	.macro IA32_ARG_FIXUP noebp=0
@@ -37,6 +47,11 @@
 	movq	%rax,R8(%rsp)
 	.endm
 
+	/*
+	 * Reload arg registers from stack in case ptrace changed them.
+	 * We don't reload %eax because syscall_trace_enter() returned
+	 * the value it wants us to use in the table lookup.
+	 */
 	.macro LOAD_ARGS32 offset
 	movl \offset(%rsp),%r11d
 	movl \offset+8(%rsp),%r10d
@@ -46,7 +61,6 @@
 	movl \offset+48(%rsp),%edx
 	movl \offset+56(%rsp),%esi
 	movl \offset+64(%rsp),%edi
-	movl \offset+72(%rsp),%eax
 	.endm
 
 	.macro CFI_STARTPROC32 simple
@@ -137,21 +151,22 @@ ENTRY(ia32_sysenter_target)
 	.previous
 	GET_THREAD_INFO(%r10)
 	orl	$TS_COMPAT,TI_status(%r10)
-	testl	$(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-		TI_flags(%r10)
+	testl	$_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz	sysenter_tracesys
-sysenter_do_call:
 	cmpl	$(IA32_NR_syscalls-1),%eax
 	ja	ia32_badsys
+sysenter_do_call:
 	IA32_ARG_FIXUP 1
+sysenter_dispatch:
 	call	*ia32_sys_call_table(,%rax,8)
 	movq	%rax,RAX-ARGOFFSET(%rsp)
 	GET_THREAD_INFO(%r10)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	testl	$_TIF_ALLWORK_MASK,TI_flags(%r10)
-	jnz	int_ret_from_sys_call
+	jnz	sysexit_audit
+sysexit_from_sys_call:
 	andl	$~TS_COMPAT,TI_status(%r10)
 	/* clear IF, that popfq doesn't enable interrupts early */
 	andl	$~0x200,EFLAGS-R11(%rsp)
@@ -167,9 +182,63 @@ sysenter_do_call:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS_SYSEXIT32
 
-sysenter_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+	.macro auditsys_entry_common
+	movl %esi,%r9d			/* 6th arg: 4th syscall arg */
+	movl %edx,%r8d			/* 5th arg: 3rd syscall arg */
+	/* (already in %ecx)		   4th arg: 2nd syscall arg */
+	movl %ebx,%edx			/* 3rd arg: 1st syscall arg */
+	movl %eax,%esi			/* 2nd arg: syscall number */
+	movl $AUDIT_ARCH_I386,%edi	/* 1st arg: audit arch */
+	call audit_syscall_entry
+	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall number */
+	cmpl $(IA32_NR_syscalls-1),%eax
+	ja ia32_badsys
+	movl %ebx,%edi			/* reload 1st syscall arg */
+	movl RCX-ARGOFFSET(%rsp),%esi	/* reload 2nd syscall arg */
+	movl RDX-ARGOFFSET(%rsp),%edx	/* reload 3rd syscall arg */
+	movl RSI-ARGOFFSET(%rsp),%ecx	/* reload 4th syscall arg */
+	movl RDI-ARGOFFSET(%rsp),%r8d	/* reload 5th syscall arg */
+	.endm
+
+	.macro auditsys_exit exit,ebpsave=RBP
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	jnz int_ret_from_sys_call
+	TRACE_IRQS_ON
+	sti
+	movl %eax,%esi		/* second arg, syscall return value */
+	cmpl $0,%eax		/* is it < 0? */
+	setl %al		/* 1 if so, 0 if not */
+	movzbl %al,%edi		/* zero-extend that into %edi */
+	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+	call audit_syscall_exit
+	GET_THREAD_INFO(%r10)
+	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall return value */
+	movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
+	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
+	cli
+	TRACE_IRQS_OFF
+	testl %edi,TI_flags(%r10)
+	jnz int_with_check
+	jmp \exit
+	.endm
+
+sysenter_auditsys:
 	CFI_RESTORE_STATE
+	auditsys_entry_common
+	movl %ebp,%r9d			/* reload 6th syscall arg */
+	jmp sysenter_dispatch
+
+sysexit_audit:
+	auditsys_exit sysexit_from_sys_call
+#endif
+
+sysenter_tracesys:
 	xchgl	%r9d,%ebp
+#ifdef CONFIG_AUDITSYSCALL
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	jz	sysenter_auditsys
+#endif
 	SAVE_REST
 	CLEAR_RREGS
 	movq	%r9,R9(%rsp)
@@ -242,21 +311,22 @@ ENTRY(ia32_cstar_target)
 	.previous
 	GET_THREAD_INFO(%r10)
 	orl	$TS_COMPAT,TI_status(%r10)
-	testl	$(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-		TI_flags(%r10)
+	testl	$_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz	cstar_tracesys
 cstar_do_call:
 	cmpl	$IA32_NR_syscalls-1,%eax
 	ja	ia32_badsys
 	IA32_ARG_FIXUP 1
+cstar_dispatch:
 	call	*ia32_sys_call_table(,%rax,8)
 	movq	%rax,RAX-ARGOFFSET(%rsp)
 	GET_THREAD_INFO(%r10)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	testl	$_TIF_ALLWORK_MASK,TI_flags(%r10)
-	jnz	int_ret_from_sys_call
+	jnz	sysretl_audit
+sysretl_from_sys_call:
 	andl	$~TS_COMPAT,TI_status(%r10)
 	RESTORE_ARGS 1,-ARG_SKIP,1,1,1
 	movl	RIP-ARGOFFSET(%rsp),%ecx
@@ -268,8 +338,23 @@ cstar_do_call:
 	CFI_RESTORE rsp
 	USERGS_SYSRET32
 
-cstar_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+cstar_auditsys:
 	CFI_RESTORE_STATE
+	movl %r9d,R9-ARGOFFSET(%rsp)	/* register to be clobbered by call */
+	auditsys_entry_common
+	movl R9-ARGOFFSET(%rsp),%r9d	/* reload 6th syscall arg */
+	jmp cstar_dispatch
+
+sysretl_audit:
+	auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
+#endif
+
+cstar_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	jz	cstar_auditsys
+#endif
 	xchgl	%r9d,%ebp
 	SAVE_REST
 	CLEAR_RREGS
@@ -321,6 +406,7 @@ ENTRY(ia32_syscall)
 	/*CFI_REL_OFFSET	rflags,EFLAGS-RIP*/
 	/*CFI_REL_OFFSET	cs,CS-RIP*/
 	CFI_REL_OFFSET	rip,RIP-RIP
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	SWAPGS
 	/*
 	 * No need to follow this irqs on/off section: the syscall
@@ -336,8 +422,7 @@ ENTRY(ia32_syscall)
 	SAVE_ARGS 0,0,1
 	GET_THREAD_INFO(%r10)
 	orl	$TS_COMPAT,TI_status(%r10)
-	testl	$(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-		TI_flags(%r10)
+	testl	$_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	jnz	ia32_tracesys
 ia32_do_syscall:
 	cmpl	$(IA32_NR_syscalls-1),%eax
@@ -741,4 +826,10 @@ ia32_sys_call_table:
 	.quad sys32_fallocate
 	.quad compat_sys_timerfd_settime	/* 325 */
 	.quad compat_sys_timerfd_gettime
+	.quad compat_sys_signalfd4
+	.quad sys_eventfd2
+	.quad sys_epoll_create1
+	.quad sys_dup3			/* 330 */
+	.quad sys_pipe2
+	.quad sys_inotify_init1
 ia32_syscall_end:
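For readers following the register shuffling in auditsys_entry_common: in C terms it sets up the audit_syscall_entry() call roughly as below. This is a sketch; the exact prototype is assumed from the audit code of this era, and the variable names stand for the user's i386 register values:

	/* sketch: the call auditsys_entry_common marshals */
	audit_syscall_entry(AUDIT_ARCH_I386,	/* %edi: audit arch */
			    eax,		/* %esi: syscall number */
			    ebx,		/* %edx: 1st syscall arg */
			    ecx,		/* %ecx: 2nd syscall arg */
			    edx,		/* %r8:  3rd syscall arg */
			    esi);		/* %r9:  4th syscall arg */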
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index f00afdf61e67..d3c64088b981 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -238,7 +238,7 @@ asmlinkage long sys32_pipe(int __user *fd)
 	int retval;
 	int fds[2];
 
-	retval = do_pipe(fds);
+	retval = do_pipe_flags(fds, 0);
 	if (retval)
 		goto out;
 	if (copy_to_user(fd, fds, sizeof(fds)))
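do_pipe() callers were converted to do_pipe_flags() tree-wide; passing a zero flags argument preserves the old semantics, so the old helper presumably reduces to a wrapper like this (a sketch, not the actual fs/pipe.c code):

	/* sketch: old interface expressed via the new one */
	static inline int do_pipe(int *fd)
	{
		return do_pipe_flags(fd, 0);
	}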
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index da140611bb57..3db651fc8ec5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -7,9 +7,10 @@ extra-y                := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
 ifdef CONFIG_FTRACE
-# Do not profile debug utilities
+# Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
+CFLAGS_REMOVE_paravirt.o = -pg
 endif
 
 #
@@ -102,6 +103,7 @@ obj-$(CONFIG_OLPC)		+= olpc.o
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
         obj-y				+= genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
+        obj-y				+= bios_uv.o
         obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o
         obj-$(CONFIG_AUDIT)		+= audit_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index f489d7a9be92..bfd10fd211cd 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -97,6 +97,8 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 #warning ACPI uses CMPXCHG, i486 and later hardware
 #endif
 
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
 /* --------------------------------------------------------------------------
                               Boot-time Configuration
    -------------------------------------------------------------------------- */
@@ -158,6 +160,14 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 struct acpi_mcfg_allocation *pci_mmcfg_config;
 int pci_mmcfg_config_num;
 
+static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
+{
+	if (!strcmp(mcfg->header.oem_id, "SGI"))
+		acpi_mcfg_64bit_base_addr = TRUE;
+
+	return 0;
+}
+
 int __init acpi_parse_mcfg(struct acpi_table_header *header)
 {
 	struct acpi_table_mcfg *mcfg;
@@ -190,8 +200,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header)
 	}
 
 	memcpy(pci_mmcfg_config, &mcfg[1], config_size);
+
+	acpi_mcfg_oem_check(mcfg);
+
 	for (i = 0; i < pci_mmcfg_config_num; ++i) {
-		if (pci_mmcfg_config[i].address > 0xFFFFFFFF) {
+		if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
+		    !acpi_mcfg_64bit_base_addr) {
 			printk(KERN_ERR PREFIX
 			       "MMCONFIG not in low 4GB of memory\n");
 			kfree(pci_mmcfg_config);
@@ -1021,7 +1035,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
 #endif
 	set_bit(MP_ISA_BUS, mp_bus_not_pci);
-	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+	pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
 
 #ifdef CONFIG_X86_ES7000
 	/*
@@ -1127,8 +1141,8 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 		return gsi;
 	}
 	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
-			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+		pr_debug("Pin %d-%d already programmed\n",
+			 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
 #ifdef CONFIG_X86_32
 		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
 #else
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index a3ddad18aaa3..426e5d91b63a 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -20,7 +20,7 @@ unsigned long acpi_realmode_flags;
 /* address in low memory of the wakeup routine. */
 static unsigned long acpi_realmode;
 
-#ifdef CONFIG_64BIT
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
 static char temp_stack[10240];
 #endif
 
@@ -86,7 +86,7 @@ int acpi_save_state_mem(void)
 #endif /* !CONFIG_64BIT */
 
 	header->pmode_cr0 = read_cr0();
-	header->pmode_cr4 = read_cr4();
+	header->pmode_cr4 = read_cr4_safe();
 	header->realmode_flags = acpi_realmode_flags;
 	header->real_magic = 0x12345678;
 
@@ -150,6 +150,10 @@ static int __init acpi_sleep_setup(char *str)
 			acpi_realmode_flags |= 2;
 		if (strncmp(str, "s3_beep", 7) == 0)
 			acpi_realmode_flags |= 4;
+#ifdef CONFIG_HIBERNATION
+		if (strncmp(str, "s4_nohwsig", 10) == 0)
+			acpi_no_s4_hw_signature();
+#endif
 		if (strncmp(str, "old_ordering", 12) == 0)
 			acpi_old_suspend_ordering();
 		str = strchr(str, ',');
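The new flag is requested the same way as the existing ones, through the comma-separated list that acpi_sleep_setup() parses. Assuming the handler is wired up to the usual acpi_sleep= boot option, a command line could look like:

	acpi_sleep=s4_nohwsig,old_ordering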
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2763cb37b553..65a0c1b48696 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
 extern char __vsyscall_0;
 const unsigned char *const *find_nop_table(void)
 {
-	return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-	       boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+	    boot_cpu_has(X86_FEATURE_NOPL))
+		return p6_nops;
+	else
+		return k8_nops;
 }
 
 #else /* CONFIG_X86_64 */
 
-static const struct nop {
-	int cpuid;
-	const unsigned char *const *noptable;
-} noptypes[] = {
-	{ X86_FEATURE_K8, k8_nops },
-	{ X86_FEATURE_K7, k7_nops },
-	{ X86_FEATURE_P4, p6_nops },
-	{ X86_FEATURE_P3, p6_nops },
-	{ -1, NULL }
-};
-
 const unsigned char *const *find_nop_table(void)
 {
-	const unsigned char *const *noptable = intel_nops;
-	int i;
-
-	for (i = 0; noptypes[i].cpuid >= 0; i++) {
-		if (boot_cpu_has(noptypes[i].cpuid)) {
-			noptable = noptypes[i].noptable;
-			break;
-		}
-	}
-	return noptable;
+	if (boot_cpu_has(X86_FEATURE_K8))
+		return k8_nops;
+	else if (boot_cpu_has(X86_FEATURE_K7))
+		return k7_nops;
+	else if (boot_cpu_has(X86_FEATURE_NOPL))
+		return p6_nops;
+	else
+		return intel_nops;
 }
 
 #endif /* CONFIG_X86_64 */
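The X86_FEATURE_NOPL test keys the choice on what actually matters: whether the CPU implements the multi-byte NOP instruction (opcode 0f 1f) that the p6_nops table is built from, instead of guessing from vendor and family. As a rough illustration of the difference between the two encodings (byte values assumed from the architecture manuals, not taken from the tables above):

	/* sketch: two ways to encode a 3-byte nop */
	static const unsigned char p6_nop3[] = { 0x0f, 0x1f, 0x00 };	/* nopl (%rax), needs NOPL */
	static const unsigned char k8_nop3[] = { 0x66, 0x66, 0x90 };	/* prefixed nop, works anywhere */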
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f2766d84c7a0..042fdc27bc92 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -23,36 +23,49 @@
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
 #include <asm/proto.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
 
 #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
 
-#define to_pages(addr, size) \
-	 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+#define EXIT_LOOP_COUNT 10000000
 
 static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 
-struct command {
+/*
+ * general struct to manage commands sent to an IOMMU
+ */
+struct iommu_cmd {
 	u32 data[4];
 };
 
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e);
 
+/* returns !0 if the IOMMU is caching non-present entries in its TLB */
 static int iommu_has_npcache(struct amd_iommu *iommu)
 {
 	return iommu->cap & IOMMU_CAP_NPCACHE;
 }
 
-static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+/****************************************************************************
+ *
+ * IOMMU command queuing functions
+ *
+ ****************************************************************************/
+
+/*
+ * Writes the command to the IOMMU's command buffer and informs the
+ * hardware about the new command. Must be called with iommu->lock held.
+ */
+static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 {
 	u32 tail, head;
 	u8 *target;
 
 	tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-	target = (iommu->cmd_buf + tail);
+	target = iommu->cmd_buf + tail;
 	memcpy_toio(target, cmd, sizeof(*cmd));
 	tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
 	head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
@@ -63,7 +76,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
 	return 0;
 }
 
-static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+/*
+ * General queuing function for commands. Takes iommu->lock and calls
+ * __iommu_queue_command().
+ */
+static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 {
 	unsigned long flags;
 	int ret;
@@ -75,35 +92,59 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
 	return ret;
 }
 
+/*
+ * This function is called whenever we need to ensure that the IOMMU has
+ * completed execution of all commands we sent. It sends a
+ * COMPLETION_WAIT command and polls the MMIO status register until the
+ * completion-wait bit becomes set, which is how the IOMMU signals that
+ * it is done.
+ */
 static int iommu_completion_wait(struct amd_iommu *iommu)
 {
-	int ret;
-	struct command cmd;
-	volatile u64 ready = 0;
-	unsigned long ready_phys = virt_to_phys(&ready);
+	int ret = 0, ready = 0;
+	unsigned status = 0;
+	struct iommu_cmd cmd;
+	unsigned long flags, i = 0;
 
 	memset(&cmd, 0, sizeof(cmd));
-	cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
-	cmd.data[1] = HIGH_U32(ready_phys);
-	cmd.data[2] = 1; /* value written to 'ready' */
+	cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
 	CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
 
 	iommu->need_sync = 0;
 
-	ret = iommu_queue_command(iommu, &cmd);
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	ret = __iommu_queue_command(iommu, &cmd);
 
 	if (ret)
-		return ret;
+		goto out;
+
+	while (!ready && (i < EXIT_LOOP_COUNT)) {
+		++i;
+		/* wait for the bit to become one */
+		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+		ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
+	}
 
-	while (!ready)
-		cpu_relax();
+	/* set bit back to zero */
+	status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
+	writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+	if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
+		printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
+out:
+	spin_unlock_irqrestore(&iommu->lock, flags);
 
 	return 0;
 }
 
+/*
+ * Command send function for invalidating a device table entry
+ */
 static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
 {
-	struct command cmd;
+	struct iommu_cmd cmd;
+	int ret;
 
 	BUG_ON(iommu == NULL);
 
@@ -111,37 +152,50 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
 	CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
 	cmd.data[0] = devid;
 
+	ret = iommu_queue_command(iommu, &cmd);
+
 	iommu->need_sync = 1;
 
-	return iommu_queue_command(iommu, &cmd);
+	return ret;
 }
 
+/*
+ * Generic command send function for invalidating TLB entries
+ */
 static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
 		u64 address, u16 domid, int pde, int s)
 {
-	struct command cmd;
+	struct iommu_cmd cmd;
+	int ret;
 
 	memset(&cmd, 0, sizeof(cmd));
 	address &= PAGE_MASK;
 	CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
 	cmd.data[1] |= domid;
-	cmd.data[2] = LOW_U32(address);
-	cmd.data[3] = HIGH_U32(address);
-	if (s)
+	cmd.data[2] = lower_32_bits(address);
+	cmd.data[3] = upper_32_bits(address);
+	if (s) /* size bit - we flush more than one 4kb page */
 		cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
-	if (pde)
+	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
 		cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
 
+	ret = iommu_queue_command(iommu, &cmd);
+
 	iommu->need_sync = 1;
 
-	return iommu_queue_command(iommu, &cmd);
+	return ret;
 }
 
+/*
+ * TLB invalidation function which is called from the mapping functions.
+ * It invalidates a single PTE if the range to flush is within a single
+ * page. Otherwise it flushes the whole TLB of the IOMMU.
+ */
 static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 		u64 address, size_t size)
 {
 	int s = 0;
-	unsigned pages = to_pages(address, size);
+	unsigned pages = iommu_num_pages(address, size);
 
 	address &= PAGE_MASK;
 
@@ -159,6 +213,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 	return 0;
 }
 
+/****************************************************************************
+ *
+ * The functions below are used to create the page table mappings for
+ * unity mapped regions.
+ *
+ ****************************************************************************/
+
+/*
+ * Generic mapping function. It maps a physical address into a DMA
+ * address space. It allocates the page table pages if necessary.
+ * In the future it can be extended to a generic mapping function
+ * supporting all features of AMD IOMMU page tables like level skipping
+ * and full 64 bit address spaces.
+ */
 static int iommu_map(struct protection_domain *dom,
 		     unsigned long bus_addr,
 		     unsigned long phys_addr,
@@ -209,6 +277,10 @@ static int iommu_map(struct protection_domain *dom,
 	return 0;
 }
 
+/*
+ * This function checks if a specific unity mapping entry is needed for
+ * this specific IOMMU.
+ */
 static int iommu_for_unity_map(struct amd_iommu *iommu,
 			       struct unity_map_entry *entry)
 {
@@ -223,6 +295,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
 	return 0;
 }
 
+/*
+ * Init the unity mappings for a specific IOMMU in the system
+ *
+ * Basically iterates over all unity mapping entries and applies them to
+ * the default DMA domain of that IOMMU if necessary.
+ */
 static int iommu_init_unity_mappings(struct amd_iommu *iommu)
 {
 	struct unity_map_entry *entry;
@@ -239,6 +317,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
 	return 0;
 }
 
+/*
+ * This function actually applies the mapping to the page table of the
+ * dma_ops domain.
+ */
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e)
 {
@@ -261,6 +343,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 	return 0;
 }
 
+/*
+ * Inits the unity mappings required for a specific device
+ */
 static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
 					  u16 devid)
 {
@@ -278,12 +363,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
 	return 0;
 }
 
+/****************************************************************************
+ *
+ * The next functions belong to the address allocator for the dma_ops
+ * interface functions. They work like the allocators in the other IOMMU
+ * drivers. It's basically a bitmap which marks the allocated pages in
+ * the aperture. Maybe it could be enhanced in the future to a more
+ * efficient allocator.
+ *
+ ****************************************************************************/
 static unsigned long dma_mask_to_pages(unsigned long mask)
 {
 	return (mask >> PAGE_SHIFT) +
 		(PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
 }
 
+/*
+ * The address allocator core function.
+ *
+ * called with domain->lock held
+ */
 static unsigned long dma_ops_alloc_addresses(struct device *dev,
 					     struct dma_ops_domain *dom,
 					     unsigned int pages)
@@ -317,6 +416,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
 	return address;
 }
 
+/*
+ * The address free function.
+ *
+ * called with domain->lock held
+ */
 static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 				   unsigned long address,
 				   unsigned int pages)
@@ -325,6 +429,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 	iommu_area_free(dom->bitmap, address, pages);
 }
 
+/****************************************************************************
+ *
+ * The next functions belong to the domain allocation. A domain is
+ * allocated for every IOMMU as the default domain. If device isolation
+ * is enabled, every device gets its own domain. The most important thing
+ * about domains is the page table mapping the DMA address space they
+ * contain.
+ *
+ ****************************************************************************/
+
 static u16 domain_id_alloc(void)
 {
 	unsigned long flags;
@@ -342,6 +456,10 @@ static u16 domain_id_alloc(void)
 	return id;
 }
 
+/*
+ * Used to reserve address ranges in the aperture (e.g. for exclusion
+ * ranges).
+ */
 static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 				      unsigned long start_page,
 				      unsigned int pages)
@@ -382,6 +500,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
 	free_page((unsigned long)p1);
 }
 
+/*
+ * Free a domain, only used if something went wrong in the
+ * allocation path and we need to free an already allocated page table
+ */
 static void dma_ops_domain_free(struct dma_ops_domain *dom)
 {
 	if (!dom)
@@ -396,6 +518,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
 	kfree(dom);
 }
 
+/*
+ * Allocates a new protection domain usable for the dma_ops functions.
+ * It also initializes the page table and the address allocator data
+ * structures required for the dma_ops interface
+ */
 static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 						   unsigned order)
 {
@@ -436,14 +563,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->bitmap[0] = 1;
 	dma_dom->next_bit = 0;
 
+	/* Initialize the exclusion range if necessary */
 	if (iommu->exclusion_start &&
 	    iommu->exclusion_start < dma_dom->aperture_size) {
 		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
-		int pages = to_pages(iommu->exclusion_start,
-				     iommu->exclusion_length);
+		int pages = iommu_num_pages(iommu->exclusion_start,
+					    iommu->exclusion_length);
 		dma_ops_reserve_addresses(dma_dom, startpage, pages);
 	}
 
+	/*
+	 * At the last step, build the page tables so we don't need to
+	 * allocate page table pages in the dma_ops mapping/unmapping
+	 * path.
+	 */
 	num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
 	dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
 			GFP_KERNEL);
@@ -472,6 +605,10 @@ free_dma_dom:
 	return NULL;
 }
 
+/*
+ * Find out the protection domain structure for a given PCI device. This
+ * will give us the pointer to the page table root for example.
+ */
static struct protection_domain *domain_for_device(u16 devid)
 {
 	struct protection_domain *dom;
@@ -484,6 +621,10 @@ static struct protection_domain *domain_for_device(u16 devid)
 	return dom;
 }
 
+/*
+ * If a device is not yet associated with a domain, this function
+ * assigns it to one and makes the association visible to the hardware.
+ */
 static void set_device_domain(struct amd_iommu *iommu,
 			      struct protection_domain *domain,
 			      u16 devid)
@@ -508,6 +649,19 @@ static void set_device_domain(struct amd_iommu *iommu,
 	iommu->need_sync = 1;
 }
 
+/*****************************************************************************
+ *
+ * The next functions belong to the dma_ops mapping/unmapping code.
+ *
+ *****************************************************************************/
+
+/*
+ * In the dma_ops path we only have the struct device. This function
+ * finds the corresponding IOMMU, the protection domain and the
+ * requestor id for a given device.
+ * If the device is not yet associated with a domain, this is also done
+ * in this function.
+ */
 static int get_device_resources(struct device *dev,
 				struct amd_iommu **iommu,
 				struct protection_domain **domain,
@@ -520,9 +674,10 @@ static int get_device_resources(struct device *dev,
 	BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
 
 	pcidev = to_pci_dev(dev);
-	_bdf = (pcidev->bus->number << 8) | pcidev->devfn;
+	_bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
 
-	if (_bdf >= amd_iommu_last_bdf) {
+	/* device not translated by any IOMMU in the system? */
+	if (_bdf > amd_iommu_last_bdf) {
 		*iommu = NULL;
 		*domain = NULL;
 		*bdf = 0xffff;
@@ -547,6 +702,10 @@ static int get_device_resources(struct device *dev,
 	return 1;
 }
 
+/*
+ * This is the generic map function. It maps one 4kb page at paddr to
+ * the given address in the DMA address space for the domain.
+ */
 static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 				     struct dma_ops_domain *dom,
 				     unsigned long address,
@@ -578,6 +737,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 	return (dma_addr_t)address;
 }
 
+/*
+ * The generic unmapping function for one page in the DMA address space.
+ */
 static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 				 struct dma_ops_domain *dom,
 				 unsigned long address)
@@ -597,6 +759,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 	*pte = 0ULL;
 }
 
+/*
+ * This function contains common code for mapping of a physically
+ * contiguous memory region into DMA address space. It is used by all
+ * mapping functions provided by this IOMMU driver.
+ * Must be called with the domain lock held.
+ */
 static dma_addr_t __map_single(struct device *dev,
 			       struct amd_iommu *iommu,
 			       struct dma_ops_domain *dma_dom,
@@ -609,7 +777,7 @@ static dma_addr_t __map_single(struct device *dev,
 	unsigned int pages;
 	int i;
 
-	pages = to_pages(paddr, size);
+	pages = iommu_num_pages(paddr, size);
 	paddr &= PAGE_MASK;
 
 	address = dma_ops_alloc_addresses(dev, dma_dom, pages);
@@ -628,6 +796,10 @@ out:
 	return address;
 }
 
+/*
+ * Does the reverse of the __map_single function. Must be called with
+ * the domain lock held too
+ */
 static void __unmap_single(struct amd_iommu *iommu,
 			   struct dma_ops_domain *dma_dom,
 			   dma_addr_t dma_addr,
@@ -640,7 +812,7 @@ static void __unmap_single(struct amd_iommu *iommu,
 	if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
 		return;
 
-	pages = to_pages(dma_addr, size);
+	pages = iommu_num_pages(dma_addr, size);
 	dma_addr &= PAGE_MASK;
 	start = dma_addr;
 
@@ -652,6 +824,9 @@ static void __unmap_single(struct amd_iommu *iommu,
 	dma_ops_free_addresses(dma_dom, dma_addr, pages);
 }
 
+/*
+ * The exported map_single function for dma_ops.
+ */
 static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 			     size_t size, int dir)
 {
@@ -664,6 +839,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 	get_device_resources(dev, &iommu, &domain, &devid);
 
 	if (iommu == NULL || domain == NULL)
+		/* device not handled by any AMD IOMMU */
 		return (dma_addr_t)paddr;
 
 	spin_lock_irqsave(&domain->lock, flags);
@@ -683,6 +859,9 @@ out:
 	return addr;
 }
 
+/*
+ * The exported unmap_single function for dma_ops.
+ */
 static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 			 size_t size, int dir)
 {
@@ -692,6 +871,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 	u16 devid;
 
 	if (!get_device_resources(dev, &iommu, &domain, &devid))
+		/* device not handled by any AMD IOMMU */
 		return;
 
 	spin_lock_irqsave(&domain->lock, flags);
@@ -706,6 +886,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
+/*
+ * This is a special map_sg function which is used if we should map a
+ * device which is not handled by an AMD IOMMU in the system.
+ */
 static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
 			   int nelems, int dir)
 {
@@ -720,6 +904,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
 	return nelems;
 }
 
+/*
+ * The exported map_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
 static int map_sg(struct device *dev, struct scatterlist *sglist,
 		  int nelems, int dir)
 {
@@ -775,6 +963,10 @@ unmap:
 	goto out;
 }
 
+/*
+ * The exported unmap_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
 static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 		     int nelems, int dir)
 {
@@ -804,6 +996,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
+/*
+ * The exported alloc_coherent function for dma_ops.
+ */
 static void *alloc_coherent(struct device *dev, size_t size,
 			    dma_addr_t *dma_addr, gfp_t flag)
 {
@@ -851,6 +1046,11 @@ out:
 	return virt_addr;
 }
 
+/*
+ * The exported free_coherent function for dma_ops.
+ * FIXME: fix the generic x86 DMA layer so that it actually calls that
+ * function.
+ */
 static void free_coherent(struct device *dev, size_t size,
 			  void *virt_addr, dma_addr_t dma_addr)
 {
@@ -879,6 +1079,8 @@ free_mem:
 }
 
 /*
+ * The function for pre-allocating protection domains.
+ *
  * Once the driver core informs the DMA layer when a driver grabs a
  * device, we won't need to preallocate the protection domains anymore.
  * For now we have to.
@@ -893,7 +1095,7 @@ void prealloc_protection_domains(void)
 
 	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
 		devid = (dev->bus->number << 8) | dev->devfn;
-		if (devid >= amd_iommu_last_bdf)
+		if (devid > amd_iommu_last_bdf)
 			continue;
 		devid = amd_iommu_alias_table[devid];
 		if (domain_for_device(devid))
@@ -921,12 +1123,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
 	.unmap_sg = unmap_sg,
 };
 
+/*
+ * The function which hooks the AMD IOMMU driver into the dma_ops
+ * interface.
+ */
 int __init amd_iommu_init_dma_ops(void)
 {
 	struct amd_iommu *iommu;
 	int order = amd_iommu_aperture_order;
 	int ret;
 
+	/*
+	 * first allocate a default protection domain for every IOMMU we
+	 * found in the system. Devices not assigned to any other
+	 * protection domain will be assigned to the default one.
+	 */
 	list_for_each_entry(iommu, &amd_iommu_list, list) {
 		iommu->default_dom = dma_ops_domain_alloc(iommu, order);
 		if (iommu->default_dom == NULL)
@@ -936,6 +1146,10 @@ int __init amd_iommu_init_dma_ops(void)
 		goto free_domains;
 	}
 
+	/*
+	 * If device isolation is enabled, pre-allocate the protection
+	 * domains for each device.
+	 */
 	if (amd_iommu_isolate)
 		prealloc_protection_domains();
 
@@ -947,6 +1161,7 @@ int __init amd_iommu_init_dma_ops(void)
 	gart_iommu_aperture = 0;
 #endif
 
+	/* Finally, make the driver visible to the device drivers */
 	dma_ops = &amd_iommu_dma_ops;
 
 	return 0;
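The open-coded to_pages() macro is gone in favor of the generic iommu_num_pages() helper from the iommu-helper code. Judging from the removed macro, the helper presumably computes the page count like this (a sketch; not the actual lib/iommu-helper.c source):

	/* sketch: number of 4kb pages spanned by [addr, addr + len) */
	static inline unsigned long iommu_num_pages(unsigned long addr,
						    unsigned long len)
	{
		return round_up((addr & ~PAGE_MASK) + len, PAGE_SIZE)
			>> PAGE_SHIFT;
	}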
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 2a13e430437d..a69cc0f52042 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -25,20 +25,13 @@
25#include <asm/pci-direct.h> 25#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 26#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 27#include <asm/amd_iommu.h>
28#include <asm/gart.h> 28#include <asm/iommu.h>
29 29
30/* 30/*
31 * definitions for the ACPI scanning code 31 * definitions for the ACPI scanning code
32 */ 32 */
33#define UPDATE_LAST_BDF(x) do {\
34 if ((x) > amd_iommu_last_bdf) \
35 amd_iommu_last_bdf = (x); \
36 } while (0);
37
38#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
39#define PCI_BUS(x) (((x) >> 8) & 0xff) 33#define PCI_BUS(x) (((x) >> 8) & 0xff)
40#define IVRS_HEADER_LENGTH 48 34#define IVRS_HEADER_LENGTH 48
41#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
42 35
43#define ACPI_IVHD_TYPE 0x10 36#define ACPI_IVHD_TYPE 0x10
44#define ACPI_IVMD_TYPE_ALL 0x20 37#define ACPI_IVMD_TYPE_ALL 0x20
@@ -71,6 +64,17 @@
71#define ACPI_DEVFLAG_LINT1 0x80 64#define ACPI_DEVFLAG_LINT1 0x80
72#define ACPI_DEVFLAG_ATSDIS 0x10000000 65#define ACPI_DEVFLAG_ATSDIS 0x10000000
73 66
67/*
68 * ACPI table definitions
69 *
70 * These data structures are laid over the table to parse the important values
71 * out of it.
72 */
73
74/*
75 * structure describing one IOMMU in the ACPI table. Typically followed by one
76 * or more ivhd_entrys.
77 */
74struct ivhd_header { 78struct ivhd_header {
75 u8 type; 79 u8 type;
76 u8 flags; 80 u8 flags;
@@ -83,6 +87,10 @@ struct ivhd_header {
83 u32 reserved; 87 u32 reserved;
84} __attribute__((packed)); 88} __attribute__((packed));
85 89
90/*
91 * A device entry describing which devices a specific IOMMU translates and
92 * which requestor ids they use.
93 */
86struct ivhd_entry { 94struct ivhd_entry {
87 u8 type; 95 u8 type;
88 u16 devid; 96 u16 devid;
@@ -90,6 +98,10 @@ struct ivhd_entry {
90 u32 ext; 98 u32 ext;
91} __attribute__((packed)); 99} __attribute__((packed));
92 100
101/*
102 * An AMD IOMMU memory definition structure. It defines things like exclusion
103 * ranges for devices and regions that should be unity mapped.
104 */
93struct ivmd_header { 105struct ivmd_header {
94 u8 type; 106 u8 type;
95 u8 flags; 107 u8 flags;
@@ -103,22 +115,80 @@ struct ivmd_header {
103 115
104static int __initdata amd_iommu_detected; 116static int __initdata amd_iommu_detected;
105 117
106u16 amd_iommu_last_bdf; 118u16 amd_iommu_last_bdf; /* largest PCI device id we have
107struct list_head amd_iommu_unity_map; 119 to handle */
108unsigned amd_iommu_aperture_order = 26; 120LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
109int amd_iommu_isolate; 121 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */
124
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */
110 127
111struct list_head amd_iommu_list; 128/*
129 * Pointer to the device table which is shared by all AMD IOMMUs
130 * it is indexed by the PCI device id or the HT unit id and contains
131 * information about the domain the device belongs to as well as the
132 * page table root pointer.
133 */
112struct dev_table_entry *amd_iommu_dev_table; 134struct dev_table_entry *amd_iommu_dev_table;
135
136/*
137 * The alias table is a driver specific data structure which contains the
138 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
139 * More than one device can share the same requestor id.
140 */
113u16 *amd_iommu_alias_table; 141u16 *amd_iommu_alias_table;
142
143/*
144 * The rlookup table is used to find the IOMMU which is responsible
145 * for a specific device. It is also indexed by the PCI device id.
146 */
114struct amd_iommu **amd_iommu_rlookup_table; 147struct amd_iommu **amd_iommu_rlookup_table;
148
149/*
150 * The pd table (protection domain table) is used to find the protection domain
151 * data structure a device belongs to. Indexed with the PCI device id too.
152 */
115struct protection_domain **amd_iommu_pd_table; 153struct protection_domain **amd_iommu_pd_table;
154
155/*
156 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
157 * to know which ones are already in use.
158 */
116unsigned long *amd_iommu_pd_alloc_bitmap; 159unsigned long *amd_iommu_pd_alloc_bitmap;
117 160
118static u32 dev_table_size; 161static u32 dev_table_size; /* size of the device table */
119static u32 alias_table_size; 162static u32 alias_table_size; /* size of the alias table */
120static u32 rlookup_table_size; 163static u32 rlookup_table_size; /* size if the rlookup table */
164
165static inline void update_last_devid(u16 devid)
166{
167 if (devid > amd_iommu_last_bdf)
168 amd_iommu_last_bdf = devid;
169}
170
171static inline unsigned long tbl_size(int entry_size)
172{
173 unsigned shift = PAGE_SHIFT +
174 get_order(amd_iommu_last_bdf * entry_size);
175
176 return 1UL << shift;
177}
178
179/****************************************************************************
180 *
181 * AMD IOMMU MMIO register space handling functions
182 *
183 * These functions are used to program the IOMMU device registers in
184 * MMIO space required for that driver.
185 *
186 ****************************************************************************/
121 187
188/*
189 * This function set the exclusion range in the IOMMU. DMA accesses to the
190 * exclusion range are passed through untranslated
191 */
122static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 192static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
123{ 193{
124 u64 start = iommu->exclusion_start & PAGE_MASK; 194 u64 start = iommu->exclusion_start & PAGE_MASK;
@@ -137,6 +207,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
137 &entry, sizeof(entry)); 207 &entry, sizeof(entry));
138} 208}
139 209
210/* Programs the physical address of the device table into the IOMMU hardware */
140static void __init iommu_set_device_table(struct amd_iommu *iommu) 211static void __init iommu_set_device_table(struct amd_iommu *iommu)
141{ 212{
142 u32 entry; 213 u32 entry;
@@ -149,6 +220,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
149 &entry, sizeof(entry)); 220 &entry, sizeof(entry));
150} 221}
151 222
223/* Generic functions to enable/disable certain features of the IOMMU. */
152static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 224static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
153{ 225{
154 u32 ctrl; 226 u32 ctrl;
@@ -167,6 +239,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
167 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
168} 240}
169 241
242/* Function to enable the hardware */
170void __init iommu_enable(struct amd_iommu *iommu) 243void __init iommu_enable(struct amd_iommu *iommu)
171{ 244{
172 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
@@ -176,6 +249,10 @@ void __init iommu_enable(struct amd_iommu *iommu)
176 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
177} 250}
178 251
252/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one.
255 */
179static u8 * __init iommu_map_mmio_space(u64 address) 256static u8 * __init iommu_map_mmio_space(u64 address)
180{ 257{
181 u8 *ret; 258 u8 *ret;
@@ -199,16 +276,33 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
199 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); 276 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
200} 277}
201 278
279/****************************************************************************
280 *
281 * The functions below belong to the first pass of AMD IOMMU ACPI table
282 * parsing. In this pass we try to find out the highest device id this
283 * code has to handle. Upon this information the size of the shared data
284 * structures is determined later.
285 *
286 ****************************************************************************/
287
288/*
289 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU
291 */
202static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) 292static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
203{ 293{
204 u32 cap; 294 u32 cap;
205 295
206 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 296 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
207 UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); 297 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
208 298
209 return 0; 299 return 0;
210} 300}
211 301
302/*
303 * After reading the highest device id from the IOMMU PCI capability header
304 * this function looks if there is a higher device id defined in the ACPI table
305 */
212static int __init find_last_devid_from_ivhd(struct ivhd_header *h) 306static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
213{ 307{
214 u8 *p = (void *)h, *end = (void *)h; 308 u8 *p = (void *)h, *end = (void *)h;
@@ -229,7 +323,8 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
229 case IVHD_DEV_RANGE_END: 323 case IVHD_DEV_RANGE_END:
230 case IVHD_DEV_ALIAS: 324 case IVHD_DEV_ALIAS:
231 case IVHD_DEV_EXT_SELECT: 325 case IVHD_DEV_EXT_SELECT:
232 UPDATE_LAST_BDF(dev->devid); 326 /* all the above subfield types refer to device ids */
327 update_last_devid(dev->devid);
233 break; 328 break;
234 default: 329 default:
235 break; 330 break;
@@ -242,6 +337,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
242 return 0; 337 return 0;
243} 338}
244 339
340/*
341 * Iterate over all IVHD entries in the ACPI table and find the highest device
342 * id which we need to handle. This is the first of three functions which parse
343 * the ACPI table. So we check the checksum here.
344 */
245static int __init find_last_devid_acpi(struct acpi_table_header *table) 345static int __init find_last_devid_acpi(struct acpi_table_header *table)
246{ 346{
247 int i; 347 int i;
@@ -277,19 +377,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
277 return 0; 377 return 0;
278} 378}
279 379
380/****************************************************************************
381 *
382 * The following functions belong the the code path which parses the ACPI table
383 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
384 * data structures, initialize the device/alias/rlookup table and also
385 * basically initialize the hardware.
386 *
387 ****************************************************************************/
388
389/*
390 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
391 * write commands to that buffer later and the IOMMU will execute them
392 * asynchronously
393 */
280static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) 394static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
281{ 395{
282 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, 396 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
283 get_order(CMD_BUFFER_SIZE)); 397 get_order(CMD_BUFFER_SIZE));
284 u64 entry = 0; 398 u64 entry;
285 399
286 if (cmd_buf == NULL) 400 if (cmd_buf == NULL)
287 return NULL; 401 return NULL;
288 402
289 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 403 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
290 404
291 memset(cmd_buf, 0, CMD_BUFFER_SIZE);
292
293 entry = (u64)virt_to_phys(cmd_buf); 405 entry = (u64)virt_to_phys(cmd_buf);
294 entry |= MMIO_CMD_SIZE_512; 406 entry |= MMIO_CMD_SIZE_512;
295 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 407 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -302,11 +414,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
302 414
303static void __init free_command_buffer(struct amd_iommu *iommu) 415static void __init free_command_buffer(struct amd_iommu *iommu)
304{ 416{
305 if (iommu->cmd_buf) 417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
306 free_pages((unsigned long)iommu->cmd_buf,
307 get_order(CMD_BUFFER_SIZE));
308} 418}
309 419
420/* sets a specific bit in the device table entry. */
310static void set_dev_entry_bit(u16 devid, u8 bit) 421static void set_dev_entry_bit(u16 devid, u8 bit)
311{ 422{
312 int i = (bit >> 5) & 0x07; 423 int i = (bit >> 5) & 0x07;
@@ -315,7 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
315 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 426 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
316} 427}
317 428
318static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) 429/* Writes the specific IOMMU for a device into the rlookup table */
430static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
431{
432 amd_iommu_rlookup_table[devid] = iommu;
433}
434
435/*
436 * This function takes the device specific flags read from the ACPI
437 * table and sets up the device table entry with that information
438 */
439static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
440 u16 devid, u32 flags, u32 ext_flags)
319{ 441{
320 if (flags & ACPI_DEVFLAG_INITPASS) 442 if (flags & ACPI_DEVFLAG_INITPASS)
321 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); 443 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@@ -331,13 +453,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
331 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); 453 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
332 if (flags & ACPI_DEVFLAG_LINT1) 454 if (flags & ACPI_DEVFLAG_LINT1)
333 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 455 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
334}
335 456
336static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 457 set_iommu_for_device(iommu, devid);
337{
338 amd_iommu_rlookup_table[devid] = iommu;
339} 458}
340 459
460/*
461 * Reads the device exclusion range from ACPI and initializes the
462 * IOMMU with it
463 */
341static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) 464static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
342{ 465{
343 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 466 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@@ -346,12 +469,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
346 return; 469 return;
347 470
348 if (iommu) { 471 if (iommu) {
472 /*
473 * We can only configure exclusion ranges per IOMMU, not
474 * per device. But we can enable the exclusion range per
475 * device; this is done here
476 */
349 set_dev_entry_bit(m->devid, DEV_ENTRY_EX); 477 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
350 iommu->exclusion_start = m->range_start; 478 iommu->exclusion_start = m->range_start;
351 iommu->exclusion_length = m->range_length; 479 iommu->exclusion_length = m->range_length;
352 } 480 }
353} 481}
354 482
483/*
484 * This function reads capability data from the IOMMU PCI config space
485 * and initializes the driver data structures with it: the hardware
486 * capabilities and the first/last device ids handled by this IOMMU
487 */
355static void __init init_iommu_from_pci(struct amd_iommu *iommu) 488static void __init init_iommu_from_pci(struct amd_iommu *iommu)
356{ 489{
357 int bus = PCI_BUS(iommu->devid); 490 int bus = PCI_BUS(iommu->devid);
@@ -363,10 +496,16 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
363 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
364 497
365 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
366 iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); 499 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
367 iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); 500 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range));
368} 503}
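The calc_devid() helper used above packs a PCI bus/devfn pair into a 16-bit BDF device id; a sketch of the presumed packing:

#include <stdint.h>
#include <stdio.h>

/* presumed to match the driver's calc_devid() macro */
static uint16_t calc_devid(uint8_t bus, uint8_t devfn)
{
	return ((uint16_t)bus << 8) | devfn;
}

int main(void)
{
	uint8_t devfn = (2 << 3) | 0;   /* device 2, function 0 */

	printf("devid = 0x%04x\n", calc_devid(0, devfn));   /* 0x0010 */
	return 0;
}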
369 504
505/*
506 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
507 * initializes the hardware and our data structures with it.
508 */
370static void __init init_iommu_from_acpi(struct amd_iommu *iommu, 509static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
371 struct ivhd_header *h) 510 struct ivhd_header *h)
372{ 511{
@@ -374,7 +513,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
374 u8 *end = p, flags = 0; 513 u8 *end = p, flags = 0;
375 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; 514 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
376 u32 ext_flags = 0; 515 u32 ext_flags = 0;
377 bool alias = 0; 516 bool alias = false;
378 struct ivhd_entry *e; 517 struct ivhd_entry *e;
379 518
380 /* 519 /*
@@ -414,22 +553,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
414 case IVHD_DEV_ALL: 553 case IVHD_DEV_ALL:
415 for (dev_i = iommu->first_device; 554 for (dev_i = iommu->first_device;
416 dev_i <= iommu->last_device; ++dev_i) 555 dev_i <= iommu->last_device; ++dev_i)
417 set_dev_entry_from_acpi(dev_i, e->flags, 0); 556 set_dev_entry_from_acpi(iommu, dev_i,
557 e->flags, 0);
418 break; 558 break;
419 case IVHD_DEV_SELECT: 559 case IVHD_DEV_SELECT:
420 devid = e->devid; 560 devid = e->devid;
421 set_dev_entry_from_acpi(devid, e->flags, 0); 561 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
422 break; 562 break;
423 case IVHD_DEV_SELECT_RANGE_START: 563 case IVHD_DEV_SELECT_RANGE_START:
424 devid_start = e->devid; 564 devid_start = e->devid;
425 flags = e->flags; 565 flags = e->flags;
426 ext_flags = 0; 566 ext_flags = 0;
427 alias = 0; 567 alias = false;
428 break; 568 break;
429 case IVHD_DEV_ALIAS: 569 case IVHD_DEV_ALIAS:
430 devid = e->devid; 570 devid = e->devid;
431 devid_to = e->ext >> 8; 571 devid_to = e->ext >> 8;
432 set_dev_entry_from_acpi(devid, e->flags, 0); 572 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
433 amd_iommu_alias_table[devid] = devid_to; 573 amd_iommu_alias_table[devid] = devid_to;
434 break; 574 break;
435 case IVHD_DEV_ALIAS_RANGE: 575 case IVHD_DEV_ALIAS_RANGE:
@@ -437,24 +577,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
437 flags = e->flags; 577 flags = e->flags;
438 devid_to = e->ext >> 8; 578 devid_to = e->ext >> 8;
439 ext_flags = 0; 579 ext_flags = 0;
440 alias = 1; 580 alias = true;
441 break; 581 break;
442 case IVHD_DEV_EXT_SELECT: 582 case IVHD_DEV_EXT_SELECT:
443 devid = e->devid; 583 devid = e->devid;
444 set_dev_entry_from_acpi(devid, e->flags, e->ext); 584 set_dev_entry_from_acpi(iommu, devid, e->flags,
585 e->ext);
445 break; 586 break;
446 case IVHD_DEV_EXT_SELECT_RANGE: 587 case IVHD_DEV_EXT_SELECT_RANGE:
447 devid_start = e->devid; 588 devid_start = e->devid;
448 flags = e->flags; 589 flags = e->flags;
449 ext_flags = e->ext; 590 ext_flags = e->ext;
450 alias = 0; 591 alias = false;
451 break; 592 break;
452 case IVHD_DEV_RANGE_END: 593 case IVHD_DEV_RANGE_END:
453 devid = e->devid; 594 devid = e->devid;
454 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 595 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
455 if (alias) 596 if (alias)
456 amd_iommu_alias_table[dev_i] = devid_to; 597 amd_iommu_alias_table[dev_i] = devid_to;
457 set_dev_entry_from_acpi( 598 set_dev_entry_from_acpi(iommu,
458 amd_iommu_alias_table[dev_i], 599 amd_iommu_alias_table[dev_i],
459 flags, ext_flags); 600 flags, ext_flags);
460 } 601 }
@@ -467,6 +608,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
467 } 608 }
468} 609}
469 610
611/* Initializes the device->iommu mapping for the driver */
470static int __init init_iommu_devices(struct amd_iommu *iommu) 612static int __init init_iommu_devices(struct amd_iommu *iommu)
471{ 613{
472 u16 i; 614 u16 i;
@@ -494,6 +636,11 @@ static void __init free_iommu_all(void)
494 } 636 }
495} 637}
496 638
639/*
640 * This function glues the initialization steps for one IOMMU
641 * together, allocates the command buffer and programs the
642 * hardware. It does NOT enable the IOMMU; this is done afterwards.
643 */
497static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 644static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
498{ 645{
499 spin_lock_init(&iommu->lock); 646 spin_lock_init(&iommu->lock);
@@ -521,6 +668,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
521 return 0; 668 return 0;
522} 669}
523 670
671/*
672 * Iterates over all IOMMU entries in the ACPI table, allocates the
673 * IOMMU structure and initializes it with init_iommu_one()
674 */
524static int __init init_iommu_all(struct acpi_table_header *table) 675static int __init init_iommu_all(struct acpi_table_header *table)
525{ 676{
526 u8 *p = (u8 *)table, *end = (u8 *)table; 677 u8 *p = (u8 *)table, *end = (u8 *)table;
@@ -528,8 +679,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
528 struct amd_iommu *iommu; 679 struct amd_iommu *iommu;
529 int ret; 680 int ret;
530 681
531 INIT_LIST_HEAD(&amd_iommu_list);
532
533 end += table->length; 682 end += table->length;
534 p += IVRS_HEADER_LENGTH; 683 p += IVRS_HEADER_LENGTH;
535 684
@@ -555,6 +704,14 @@ static int __init init_iommu_all(struct acpi_table_header *table)
555 return 0; 704 return 0;
556} 705}
557 706
707/****************************************************************************
708 *
709 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are
711 * gathered (like exclusion and unity mapping ranges).
712 *
713 ****************************************************************************/
714
558static void __init free_unity_maps(void) 715static void __init free_unity_maps(void)
559{ 716{
560 struct unity_map_entry *entry, *next; 717 struct unity_map_entry *entry, *next;
@@ -565,6 +722,7 @@ static void __init free_unity_maps(void)
565 } 722 }
566} 723}
567 724
725/* Called when we find an exclusion range definition in ACPI */
568static int __init init_exclusion_range(struct ivmd_header *m) 726static int __init init_exclusion_range(struct ivmd_header *m)
569{ 727{
570 int i; 728 int i;
@@ -574,7 +732,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
574 set_device_exclusion_range(m->devid, m); 732 set_device_exclusion_range(m->devid, m);
575 break; 733 break;
576 case ACPI_IVMD_TYPE_ALL: 734 case ACPI_IVMD_TYPE_ALL:
577 for (i = 0; i < amd_iommu_last_bdf; ++i) 735 for (i = 0; i <= amd_iommu_last_bdf; ++i)
578 set_device_exclusion_range(i, m); 736 set_device_exclusion_range(i, m);
579 break; 737 break;
580 case ACPI_IVMD_TYPE_RANGE: 738 case ACPI_IVMD_TYPE_RANGE:
@@ -588,6 +746,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
588 return 0; 746 return 0;
589} 747}
590 748
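A standalone illustration of the inclusive-bound fix in the ACPI_IVMD_TYPE_ALL loop above: amd_iommu_last_bdf holds the highest valid device id, so the tables have last_bdf + 1 entries and loops over them need an inclusive bound.

#include <stdio.h>

int main(void)
{
	int last_bdf = 3;   /* highest device id found in the ACPI table */
	int visited  = 0;
	int i;

	for (i = 0; i <= last_bdf; ++i)   /* '<' would skip id == last_bdf */
		++visited;

	printf("%d entries initialized\n", visited);   /* 4 */
	return 0;
}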
749/* Called for each unity map definition in the ACPI table */
591static int __init init_unity_map_range(struct ivmd_header *m) 750static int __init init_unity_map_range(struct ivmd_header *m)
592{ 751{
593 struct unity_map_entry *e = 0; 752 struct unity_map_entry *e = 0;
@@ -619,13 +778,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
619 return 0; 778 return 0;
620} 779}
621 780
781/* Iterates over all memory definitions we find in the ACPI table */
622static int __init init_memory_definitions(struct acpi_table_header *table) 782static int __init init_memory_definitions(struct acpi_table_header *table)
623{ 783{
624 u8 *p = (u8 *)table, *end = (u8 *)table; 784 u8 *p = (u8 *)table, *end = (u8 *)table;
625 struct ivmd_header *m; 785 struct ivmd_header *m;
626 786
627 INIT_LIST_HEAD(&amd_iommu_unity_map);
628
629 end += table->length; 787 end += table->length;
630 p += IVRS_HEADER_LENGTH; 788 p += IVRS_HEADER_LENGTH;
631 789
@@ -642,6 +800,25 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
642 return 0; 800 return 0;
643} 801}
644 802
803/*
804 * Init the device table so that DMA is blocked for all devices and
805 * all page faults are suppressed
806 */
807static void init_device_table(void)
808{
809 u16 devid;
810
811 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
812 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
813 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
814 set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT);
815 }
816}
817
818/*
819 * This function finally enables all IOMMUs found in the system after
820 * they have been initialized
821 */
645static void __init enable_iommus(void) 822static void __init enable_iommus(void)
646{ 823{
647 struct amd_iommu *iommu; 824 struct amd_iommu *iommu;
@@ -678,6 +855,34 @@ static struct sys_device device_amd_iommu = {
678 .cls = &amd_iommu_sysdev_class, 855 .cls = &amd_iommu_sysdev_class,
679}; 856};
680 857
858/*
859 * This is the core init function for AMD IOMMU hardware in the system.
860 * This function is called from the generic x86 DMA layer initialization
861 * code.
862 *
863 * This function parses the ACPI table for AMD IOMMU (IVRS)
864 * three times:
865 *
866 * Pass 1) Find the highest PCI device id the driver has to handle.
867 * This information determines the size of the data structures
868 * that need to be allocated.
869 *
870 * Pass 2) Initialize the data structures just allocated with the
871 * information in the ACPI table about available AMD IOMMUs
872 * in the system. It also maps the PCI devices in the
873 * system to specific IOMMUs.
874 *
875 * Pass 3) After the basic data structures are allocated and
876 * initialized, update them with information about memory
877 * remapping requirements parsed out of the ACPI table in
878 * this last pass.
879 *
880 * After that the hardware is initialized and ready to go. In the last
881 * step we do some Linux specific things like registering the driver in
882 * the dma_ops interface and initializing the suspend/resume support
883 * functions. Finally the function prints some information about the AMD
884 * IOMMUs and the driver state and enables the hardware.
885 */
681int __init amd_iommu_init(void) 886int __init amd_iommu_init(void)
682{ 887{
683 int i, ret = 0; 888 int i, ret = 0;
@@ -699,14 +904,14 @@ int __init amd_iommu_init(void)
699 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) 904 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
700 return -ENODEV; 905 return -ENODEV;
701 906
702 dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); 907 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
703 alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); 908 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
704 rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); 909 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
705 910
706 ret = -ENOMEM; 911 ret = -ENOMEM;
707 912
708 /* Device table - directly used by all IOMMUs */ 913 /* Device table - directly used by all IOMMUs */
709 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, 914 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
710 get_order(dev_table_size)); 915 get_order(dev_table_size));
711 if (amd_iommu_dev_table == NULL) 916 if (amd_iommu_dev_table == NULL)
712 goto out; 917 goto out;
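The allocation changes in this function replace separate memset() calls with __GFP_ZERO; a userspace analogue of the same cleanup:

#include <stdlib.h>
#include <string.h>

#define TABLE_SIZE 4096

int main(void)
{
	/* before: allocate, then zero by hand */
	unsigned char *a = malloc(TABLE_SIZE);
	if (a)
		memset(a, 0, TABLE_SIZE);

	/* after: ask the allocator for zeroed memory up front */
	unsigned char *b = calloc(1, TABLE_SIZE);

	free(a);
	free(b);
	return 0;
}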
@@ -730,27 +935,26 @@ int __init amd_iommu_init(void)
730 * Protection Domain table - maps devices to protection domains 935 * Protection Domain table - maps devices to protection domains
731 * This table has the same size as the rlookup_table 936 * This table has the same size as the rlookup_table
732 */ 937 */
733 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, 938 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
734 get_order(rlookup_table_size)); 939 get_order(rlookup_table_size));
735 if (amd_iommu_pd_table == NULL) 940 if (amd_iommu_pd_table == NULL)
736 goto free; 941 goto free;
737 942
738 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, 943 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
944 GFP_KERNEL | __GFP_ZERO,
739 get_order(MAX_DOMAIN_ID/8)); 945 get_order(MAX_DOMAIN_ID/8));
740 if (amd_iommu_pd_alloc_bitmap == NULL) 946 if (amd_iommu_pd_alloc_bitmap == NULL)
741 goto free; 947 goto free;
742 948
949 /* init the device table */
950 init_device_table();
951
743 /* 952 /*
744 * memory is allocated now; initialize the device table with all zeroes 953 * let all alias entries point to themselves
745 * and let all alias entries point to itself
746 */ 954 */
747 memset(amd_iommu_dev_table, 0, dev_table_size); 955 for (i = 0; i <= amd_iommu_last_bdf; ++i)
748 for (i = 0; i < amd_iommu_last_bdf; ++i)
749 amd_iommu_alias_table[i] = i; 956 amd_iommu_alias_table[i] = i;
750 957
751 memset(amd_iommu_pd_table, 0, rlookup_table_size);
752 memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
753
754 /* 958 /*
755 * never allocate domain 0 because its used as the non-allocated and 959 * never allocate domain 0 because its used as the non-allocated and
756 * error value placeholder 960 * error value placeholder
@@ -768,15 +972,15 @@ int __init amd_iommu_init(void)
768 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 972 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
769 goto free; 973 goto free;
770 974
771 ret = amd_iommu_init_dma_ops(); 975 ret = sysdev_class_register(&amd_iommu_sysdev_class);
772 if (ret) 976 if (ret)
773 goto free; 977 goto free;
774 978
775 ret = sysdev_class_register(&amd_iommu_sysdev_class); 979 ret = sysdev_register(&device_amd_iommu);
776 if (ret) 980 if (ret)
777 goto free; 981 goto free;
778 982
779 ret = sysdev_register(&device_amd_iommu); 983 ret = amd_iommu_init_dma_ops();
780 if (ret) 984 if (ret)
781 goto free; 985 goto free;
782 986
@@ -795,24 +999,19 @@ out:
795 return ret; 999 return ret;
796 1000
797free: 1001free:
798 if (amd_iommu_pd_alloc_bitmap) 1002 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
799 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
800 1003
801 if (amd_iommu_pd_table) 1004 free_pages((unsigned long)amd_iommu_pd_table,
802 free_pages((unsigned long)amd_iommu_pd_table, 1005 get_order(rlookup_table_size));
803 get_order(rlookup_table_size));
804 1006
805 if (amd_iommu_rlookup_table) 1007 free_pages((unsigned long)amd_iommu_rlookup_table,
806 free_pages((unsigned long)amd_iommu_rlookup_table, 1008 get_order(rlookup_table_size));
807 get_order(rlookup_table_size));
808 1009
809 if (amd_iommu_alias_table) 1010 free_pages((unsigned long)amd_iommu_alias_table,
810 free_pages((unsigned long)amd_iommu_alias_table, 1011 get_order(alias_table_size));
811 get_order(alias_table_size));
812 1012
813 if (amd_iommu_dev_table) 1013 free_pages((unsigned long)amd_iommu_dev_table,
814 free_pages((unsigned long)amd_iommu_dev_table, 1014 get_order(dev_table_size));
815 get_order(dev_table_size));
816 1015
817 free_iommu_all(); 1016 free_iommu_all();
818 1017
@@ -821,6 +1020,13 @@ free:
821 goto out; 1020 goto out;
822} 1021}
823 1022
1023/****************************************************************************
1024 *
1025 * Early detect code. This code runs at IOMMU detection time in the DMA
1026 * layer. It just looks if there is an IVRS ACPI table to detect AMD
1027 * IOMMUs
1028 *
1029 ****************************************************************************/
824static int __init early_amd_iommu_detect(struct acpi_table_header *table) 1030static int __init early_amd_iommu_detect(struct acpi_table_header *table)
825{ 1031{
826 return 0; 1032 return 0;
@@ -828,7 +1034,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
828 1034
829void __init amd_iommu_detect(void) 1035void __init amd_iommu_detect(void)
830{ 1036{
831 if (swiotlb || no_iommu || iommu_detected) 1037 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
832 return; 1038 return;
833 1039
834 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1040 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@@ -841,6 +1047,13 @@ void __init amd_iommu_detect(void)
841 } 1047 }
842} 1048}
843 1049
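A sketch of the relaxed detection guard above: an iommu_detected flag that was set only by the GART aperture code no longer prevents AMD IOMMU initialization.

#include <stdbool.h>
#include <stdio.h>

static bool skip_amd_iommu(bool swiotlb, bool no_iommu,
			   bool iommu_detected, bool gart_aperture)
{
	return swiotlb || no_iommu || (iommu_detected && !gart_aperture);
}

int main(void)
{
	/* GART set iommu_detected, but AMD IOMMU may still take over: */
	printf("%d\n", skip_amd_iommu(false, false, true, true));    /* 0 */

	/* a different IOMMU was detected: stay out of the way */
	printf("%d\n", skip_amd_iommu(false, false, true, false));   /* 1 */
	return 0;
}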
1050/****************************************************************************
1051 *
1052 * Parsing functions for the AMD IOMMU specific kernel command line
1053 * options.
1054 *
1055 ****************************************************************************/
1056
844static int __init parse_amd_iommu_options(char *str) 1057static int __init parse_amd_iommu_options(char *str)
845{ 1058{
846 for (; *str; ++str) { 1059 for (; *str; ++str) {
@@ -853,20 +1066,10 @@ static int __init parse_amd_iommu_options(char *str)
853 1066
854static int __init parse_amd_iommu_size_options(char *str) 1067static int __init parse_amd_iommu_size_options(char *str)
855{ 1068{
856 for (; *str; ++str) { 1069 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
857 if (strcmp(str, "32M") == 0) 1070
858 amd_iommu_aperture_order = 25; 1071 if ((order > 24) && (order < 31))
859 if (strcmp(str, "64M") == 0) 1072 amd_iommu_aperture_order = order;
860 amd_iommu_aperture_order = 26;
861 if (strcmp(str, "128M") == 0)
862 amd_iommu_aperture_order = 27;
863 if (strcmp(str, "256M") == 0)
864 amd_iommu_aperture_order = 28;
865 if (strcmp(str, "512M") == 0)
866 amd_iommu_aperture_order = 29;
867 if (strcmp(str, "1G") == 0)
868 amd_iommu_aperture_order = 30;
869 }
870 1073
871 return 1; 1074 return 1;
872} 1075}
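A userspace approximation of the rewritten size parsing above: memparse() turns a string like "64M" into bytes, get_order() converts that to a page-count order, and adding PAGE_SHIFT (12 for 4K pages) yields the aperture order, accepted only between 25 and 30 (32M..1G). The get_order() below is a simplified stand-in for the kernel helper.

#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned int get_order(unsigned long size)
{
	unsigned int order = 0;

	size = (size - 1) >> PAGE_SHIFT;   /* pages needed, minus one */
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	unsigned long bytes = 64UL << 20;  /* what memparse("64M", ...) returns */
	unsigned int order = PAGE_SHIFT + get_order(bytes);

	if (order > 24 && order < 31)
		printf("amd_iommu_aperture_order = %u\n", order);   /* 26 */
	return 0;
}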
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9f907806c1a5..44e21826db11 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/io.h> 23#include <asm/io.h>
24#include <asm/iommu.h>
24#include <asm/gart.h> 25#include <asm/gart.h>
25#include <asm/pci-direct.h> 26#include <asm/pci-direct.h>
26#include <asm/dma.h> 27#include <asm/dma.h>
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index a437d027f20b..f88bd0d982b0 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
75/* 75/*
76 * Debug level, exported for io_apic.c 76 * Debug level, exported for io_apic.c
77 */ 77 */
78int apic_verbosity; 78unsigned int apic_verbosity;
79 79
80int pic_mode; 80int pic_mode;
81 81
@@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
177 /* Level triggered for 82489DX */ 177 /* Level triggered for 82489DX */
178 if (!lapic_is_integrated()) 178 if (!lapic_is_integrated())
179 v |= APIC_LVT_LEVEL_TRIGGER; 179 v |= APIC_LVT_LEVEL_TRIGGER;
180 apic_write_around(APIC_LVT0, v); 180 apic_write(APIC_LVT0, v);
181} 181}
182 182
183/** 183/**
@@ -212,9 +212,6 @@ int lapic_get_maxlvt(void)
212 * this function twice on the boot CPU, once with a bogus timeout 212 * this function twice on the boot CPU, once with a bogus timeout
213 * value, second time for real. The other (noncalibrating) CPUs 213 * value, second time for real. The other (noncalibrating) CPUs
214 * call this function only once, with the real, calibrated value. 214 * call this function only once, with the real, calibrated value.
215 *
216 * We do reads before writes even if unnecessary, to get around the
217 * P5 APIC double write bug.
218 */ 215 */
219static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 216static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
220{ 217{
@@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
229 if (!irqen) 226 if (!irqen)
230 lvtt_value |= APIC_LVT_MASKED; 227 lvtt_value |= APIC_LVT_MASKED;
231 228
232 apic_write_around(APIC_LVTT, lvtt_value); 229 apic_write(APIC_LVTT, lvtt_value);
233 230
234 /* 231 /*
235 * Divide PICLK by 16 232 * Divide PICLK by 16
236 */ 233 */
237 tmp_value = apic_read(APIC_TDCR); 234 tmp_value = apic_read(APIC_TDCR);
238 apic_write_around(APIC_TDCR, (tmp_value 235 apic_write(APIC_TDCR,
239 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 236 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
240 | APIC_TDR_DIV_16); 237 APIC_TDR_DIV_16);
241 238
242 if (!oneshot) 239 if (!oneshot)
243 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 240 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
244} 241}
245 242
246/* 243/*
@@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
249static int lapic_next_event(unsigned long delta, 246static int lapic_next_event(unsigned long delta,
250 struct clock_event_device *evt) 247 struct clock_event_device *evt)
251{ 248{
252 apic_write_around(APIC_TMICT, delta); 249 apic_write(APIC_TMICT, delta);
253 return 0; 250 return 0;
254} 251}
255 252
@@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
278 case CLOCK_EVT_MODE_SHUTDOWN: 275 case CLOCK_EVT_MODE_SHUTDOWN:
279 v = apic_read(APIC_LVTT); 276 v = apic_read(APIC_LVTT);
280 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 277 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
281 apic_write_around(APIC_LVTT, v); 278 apic_write(APIC_LVTT, v);
282 break; 279 break;
283 case CLOCK_EVT_MODE_RESUME: 280 case CLOCK_EVT_MODE_RESUME:
284 /* Nothing to do here */ 281 /* Nothing to do here */
@@ -372,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
372 } 369 }
373} 370}
374 371
375/* 372static int __init calibrate_APIC_clock(void)
376 * Setup the boot APIC
377 *
378 * Calibrate and verify the result.
379 */
380void __init setup_boot_APIC_clock(void)
381{ 373{
382 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 374 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
383 const long pm_100ms = PMTMR_TICKS_PER_SEC/10; 375 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
@@ -387,24 +379,6 @@ void __init setup_boot_APIC_clock(void)
387 long delta, deltapm; 379 long delta, deltapm;
388 int pm_referenced = 0; 380 int pm_referenced = 0;
389 381
390 /*
391 * The local apic timer can be disabled via the kernel
392 * commandline or from the CPU detection code. Register the lapic
393 * timer as a dummy clock event source on SMP systems, so the
394 * broadcast mechanism is used. On UP systems simply ignore it.
395 */
396 if (local_apic_timer_disabled) {
397 /* No broadcast on UP ! */
398 if (num_possible_cpus() > 1) {
399 lapic_clockevent.mult = 1;
400 setup_APIC_timer();
401 }
402 return;
403 }
404
405 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
406 "calibrating APIC timer ...\n");
407
408 local_irq_disable(); 382 local_irq_disable();
409 383
410 /* Replace the global interrupt handler */ 384 /* Replace the global interrupt handler */
@@ -489,8 +463,6 @@ void __init setup_boot_APIC_clock(void)
489 calibration_result / (1000000 / HZ), 463 calibration_result / (1000000 / HZ),
490 calibration_result % (1000000 / HZ)); 464 calibration_result % (1000000 / HZ));
491 465
492 local_apic_timer_verify_ok = 1;
493
494 /* 466 /*
495 * Do a sanity check on the APIC calibration result 467 * Do a sanity check on the APIC calibration result
496 */ 468 */
@@ -498,12 +470,11 @@ void __init setup_boot_APIC_clock(void)
498 local_irq_enable(); 470 local_irq_enable();
499 printk(KERN_WARNING 471 printk(KERN_WARNING
500 "APIC frequency too slow, disabling apic timer\n"); 472 "APIC frequency too slow, disabling apic timer\n");
501 /* No broadcast on UP ! */ 473 return -1;
502 if (num_possible_cpus() > 1)
503 setup_APIC_timer();
504 return;
505 } 474 }
506 475
476 local_apic_timer_verify_ok = 1;
477
507 /* We trust the pm timer based calibration */ 478 /* We trust the pm timer based calibration */
508 if (!pm_referenced) { 479 if (!pm_referenced) {
509 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 480 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -543,22 +514,55 @@ void __init setup_boot_APIC_clock(void)
543 if (!local_apic_timer_verify_ok) { 514 if (!local_apic_timer_verify_ok) {
544 printk(KERN_WARNING 515 printk(KERN_WARNING
545 "APIC timer disabled due to verification failure.\n"); 516 "APIC timer disabled due to verification failure.\n");
517 return -1;
518 }
519
520 return 0;
521}
522
523/*
524 * Setup the boot APIC
525 *
526 * Calibrate and verify the result.
527 */
528void __init setup_boot_APIC_clock(void)
529{
530 /*
531 * The local apic timer can be disabled via the kernel
532 * commandline or from the CPU detection code. Register the lapic
533 * timer as a dummy clock event source on SMP systems, so the
534 * broadcast mechanism is used. On UP systems simply ignore it.
535 */
536 if (local_apic_timer_disabled) {
546 /* No broadcast on UP ! */ 537 /* No broadcast on UP ! */
547 if (num_possible_cpus() == 1) 538 if (num_possible_cpus() > 1) {
548 return; 539 lapic_clockevent.mult = 1;
549 } else { 540 setup_APIC_timer();
550 /* 541 }
551 * If nmi_watchdog is set to IO_APIC, we need the 542 return;
552 * PIT/HPET going. Otherwise register lapic as a dummy
553 * device.
554 */
555 if (nmi_watchdog != NMI_IO_APIC)
556 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
557 else
558 printk(KERN_WARNING "APIC timer registered as dummy,"
559 " due to nmi_watchdog=%d!\n", nmi_watchdog);
560 } 543 }
561 544
545 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
546 "calibrating APIC timer ...\n");
547
548 if (calibrate_APIC_clock()) {
549 /* No broadcast on UP ! */
550 if (num_possible_cpus() > 1)
551 setup_APIC_timer();
552 return;
553 }
554
555 /*
556 * If nmi_watchdog is set to IO_APIC, we need the
557 * PIT/HPET going. Otherwise register lapic as a dummy
558 * device.
559 */
560 if (nmi_watchdog != NMI_IO_APIC)
561 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
562 else
563 printk(KERN_WARNING "APIC timer registered as dummy,"
564 " due to nmi_watchdog=%d!\n", nmi_watchdog);
565
562 /* Setup the lapic or request the broadcast */ 566 /* Setup the lapic or request the broadcast */
563 setup_APIC_timer(); 567 setup_APIC_timer();
564} 568}
@@ -693,44 +697,44 @@ void clear_local_APIC(void)
693 */ 697 */
694 if (maxlvt >= 3) { 698 if (maxlvt >= 3) {
695 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ 699 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
696 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); 700 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
697 } 701 }
698 /* 702 /*
699 * Careful: we have to set masks only first to deassert 703 * Careful: we have to set masks only first to deassert
700 * any level-triggered sources. 704 * any level-triggered sources.
701 */ 705 */
702 v = apic_read(APIC_LVTT); 706 v = apic_read(APIC_LVTT);
703 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 707 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
704 v = apic_read(APIC_LVT0); 708 v = apic_read(APIC_LVT0);
705 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 709 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
706 v = apic_read(APIC_LVT1); 710 v = apic_read(APIC_LVT1);
707 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); 711 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
708 if (maxlvt >= 4) { 712 if (maxlvt >= 4) {
709 v = apic_read(APIC_LVTPC); 713 v = apic_read(APIC_LVTPC);
710 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); 714 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
711 } 715 }
712 716
713 /* let's not touch this if we didn't frob it */ 717
714#ifdef CONFIG_X86_MCE_P4THERMAL 718#ifdef CONFIG_X86_MCE_P4THERMAL
715 if (maxlvt >= 5) { 719 if (maxlvt >= 5) {
716 v = apic_read(APIC_LVTTHMR); 720 v = apic_read(APIC_LVTTHMR);
717 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); 721 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
718 } 722 }
719#endif 723#endif
720 /* 724 /*
721 * Clean APIC state for other OSs: 725 * Clean APIC state for other OSs:
722 */ 726 */
723 apic_write_around(APIC_LVTT, APIC_LVT_MASKED); 727 apic_write(APIC_LVTT, APIC_LVT_MASKED);
724 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 728 apic_write(APIC_LVT0, APIC_LVT_MASKED);
725 apic_write_around(APIC_LVT1, APIC_LVT_MASKED); 729 apic_write(APIC_LVT1, APIC_LVT_MASKED);
726 if (maxlvt >= 3) 730 if (maxlvt >= 3)
727 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); 731 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
728 if (maxlvt >= 4) 732 if (maxlvt >= 4)
729 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); 733 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
730 734
731#ifdef CONFIG_X86_MCE_P4THERMAL 735#ifdef CONFIG_X86_MCE_P4THERMAL
732 if (maxlvt >= 5) 736 if (maxlvt >= 5)
733 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); 737 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
734#endif 738#endif
735 /* Integrated APIC (!82489DX) ? */ 739 /* Integrated APIC (!82489DX) ? */
736 if (lapic_is_integrated()) { 740 if (lapic_is_integrated()) {
@@ -756,7 +760,7 @@ void disable_local_APIC(void)
756 */ 760 */
757 value = apic_read(APIC_SPIV); 761 value = apic_read(APIC_SPIV);
758 value &= ~APIC_SPIV_APIC_ENABLED; 762 value &= ~APIC_SPIV_APIC_ENABLED;
759 apic_write_around(APIC_SPIV, value); 763 apic_write(APIC_SPIV, value);
760 764
761 /* 765 /*
762 * When LAPIC was disabled by the BIOS and enabled by the kernel, 766 * When LAPIC was disabled by the BIOS and enabled by the kernel,
@@ -865,8 +869,8 @@ void __init sync_Arb_IDs(void)
865 apic_wait_icr_idle(); 869 apic_wait_icr_idle();
866 870
867 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 871 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
868 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 872 apic_write(APIC_ICR,
869 | APIC_DM_INIT); 873 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
870} 874}
871 875
872/* 876/*
@@ -902,16 +906,16 @@ void __init init_bsp_APIC(void)
902 else 906 else
903 value |= APIC_SPIV_FOCUS_DISABLED; 907 value |= APIC_SPIV_FOCUS_DISABLED;
904 value |= SPURIOUS_APIC_VECTOR; 908 value |= SPURIOUS_APIC_VECTOR;
905 apic_write_around(APIC_SPIV, value); 909 apic_write(APIC_SPIV, value);
906 910
907 /* 911 /*
908 * Set up the virtual wire mode. 912 * Set up the virtual wire mode.
909 */ 913 */
910 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 914 apic_write(APIC_LVT0, APIC_DM_EXTINT);
911 value = APIC_DM_NMI; 915 value = APIC_DM_NMI;
912 if (!lapic_is_integrated()) /* 82489DX */ 916 if (!lapic_is_integrated()) /* 82489DX */
913 value |= APIC_LVT_LEVEL_TRIGGER; 917 value |= APIC_LVT_LEVEL_TRIGGER;
914 apic_write_around(APIC_LVT1, value); 918 apic_write(APIC_LVT1, value);
915} 919}
916 920
917static void __cpuinit lapic_setup_esr(void) 921static void __cpuinit lapic_setup_esr(void)
@@ -926,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void)
926 930
927 /* enables sending errors */ 931 /* enables sending errors */
928 value = ERROR_APIC_VECTOR; 932 value = ERROR_APIC_VECTOR;
929 apic_write_around(APIC_LVTERR, value); 933 apic_write(APIC_LVTERR, value);
930 /* 934 /*
931 * spec says clear errors after enabling vector. 935 * spec says clear errors after enabling vector.
932 */ 936 */
@@ -989,7 +993,7 @@ void __cpuinit setup_local_APIC(void)
989 */ 993 */
990 value = apic_read(APIC_TASKPRI); 994 value = apic_read(APIC_TASKPRI);
991 value &= ~APIC_TPRI_MASK; 995 value &= ~APIC_TPRI_MASK;
992 apic_write_around(APIC_TASKPRI, value); 996 apic_write(APIC_TASKPRI, value);
993 997
994 /* 998 /*
995 * After a crash, we no longer service the interrupts and a pending 999 * After a crash, we no longer service the interrupts and a pending
@@ -1047,7 +1051,7 @@ void __cpuinit setup_local_APIC(void)
1047 * Set spurious IRQ vector 1051 * Set spurious IRQ vector
1048 */ 1052 */
1049 value |= SPURIOUS_APIC_VECTOR; 1053 value |= SPURIOUS_APIC_VECTOR;
1050 apic_write_around(APIC_SPIV, value); 1054 apic_write(APIC_SPIV, value);
1051 1055
1052 /* 1056 /*
1053 * Set up LVT0, LVT1: 1057 * Set up LVT0, LVT1:
@@ -1069,7 +1073,7 @@ void __cpuinit setup_local_APIC(void)
1069 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1073 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
1070 smp_processor_id()); 1074 smp_processor_id());
1071 } 1075 }
1072 apic_write_around(APIC_LVT0, value); 1076 apic_write(APIC_LVT0, value);
1073 1077
1074 /* 1078 /*
1075 * only the BP should see the LINT1 NMI signal, obviously. 1079 * only the BP should see the LINT1 NMI signal, obviously.
@@ -1080,7 +1084,7 @@ void __cpuinit setup_local_APIC(void)
1080 value = APIC_DM_NMI | APIC_LVT_MASKED; 1084 value = APIC_DM_NMI | APIC_LVT_MASKED;
1081 if (!integrated) /* 82489DX */ 1085 if (!integrated) /* 82489DX */
1082 value |= APIC_LVT_LEVEL_TRIGGER; 1086 value |= APIC_LVT_LEVEL_TRIGGER;
1083 apic_write_around(APIC_LVT1, value); 1087 apic_write(APIC_LVT1, value);
1084} 1088}
1085 1089
1086void __cpuinit end_local_APIC_setup(void) 1090void __cpuinit end_local_APIC_setup(void)
@@ -1091,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void)
1091 /* Disable the local apic timer */ 1095 /* Disable the local apic timer */
1092 value = apic_read(APIC_LVTT); 1096 value = apic_read(APIC_LVTT);
1093 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1097 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1094 apic_write_around(APIC_LVTT, value); 1098 apic_write(APIC_LVTT, value);
1095 1099
1096 setup_apic_nmi_watchdog(NULL); 1100 setup_apic_nmi_watchdog(NULL);
1097 apic_pm_activate(); 1101 apic_pm_activate();
@@ -1214,9 +1218,6 @@ int apic_version[MAX_APICS];
1214 1218
1215int __init APIC_init_uniprocessor(void) 1219int __init APIC_init_uniprocessor(void)
1216{ 1220{
1217 if (disable_apic)
1218 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1219
1220 if (!smp_found_config && !cpu_has_apic) 1221 if (!smp_found_config && !cpu_has_apic)
1221 return -1; 1222 return -1;
1222 1223
@@ -1419,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1419 value &= ~APIC_VECTOR_MASK; 1420 value &= ~APIC_VECTOR_MASK;
1420 value |= APIC_SPIV_APIC_ENABLED; 1421 value |= APIC_SPIV_APIC_ENABLED;
1421 value |= 0xf; 1422 value |= 0xf;
1422 apic_write_around(APIC_SPIV, value); 1423 apic_write(APIC_SPIV, value);
1423 1424
1424 if (!virt_wire_setup) { 1425 if (!virt_wire_setup) {
1425 /* 1426 /*
@@ -1432,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1432 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1433 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1433 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1434 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1434 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); 1435 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1435 apic_write_around(APIC_LVT0, value); 1436 apic_write(APIC_LVT0, value);
1436 } else { 1437 } else {
1437 /* Disable LVT0 */ 1438 /* Disable LVT0 */
1438 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 1439 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1439 } 1440 }
1440 1441
1441 /* 1442 /*
@@ -1449,12 +1450,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1449 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1450 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1450 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1451 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1451 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1452 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1452 apic_write_around(APIC_LVT1, value); 1453 apic_write(APIC_LVT1, value);
1453 } 1454 }
1454} 1455}
1455 1456
1456unsigned int __cpuinitdata maxcpus = NR_CPUS;
1457
1458void __cpuinit generic_processor_info(int apicid, int version) 1457void __cpuinit generic_processor_info(int apicid, int version)
1459{ 1458{
1460 int cpu; 1459 int cpu;
@@ -1481,12 +1480,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1481 return; 1480 return;
1482 } 1481 }
1483 1482
1484 if (num_processors >= maxcpus) {
1485 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
1486 " Processor ignored.\n", maxcpus);
1487 return;
1488 }
1489
1490 num_processors++; 1483 num_processors++;
1491 cpus_complement(tmp_map, cpu_present_map); 1484 cpus_complement(tmp_map, cpu_present_map);
1492 cpu = first_cpu(tmp_map); 1485 cpu = first_cpu(tmp_map);
@@ -1700,7 +1693,7 @@ early_param("lapic", parse_lapic);
1700static int __init parse_nolapic(char *arg) 1693static int __init parse_nolapic(char *arg)
1701{ 1694{
1702 disable_apic = 1; 1695 disable_apic = 1;
1703 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1696 setup_clear_cpu_cap(X86_FEATURE_APIC);
1704 return 0; 1697 return 0;
1705} 1698}
1706early_param("nolapic", parse_nolapic); 1699early_param("nolapic", parse_nolapic);
@@ -1719,15 +1712,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1719} 1712}
1720early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1713early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1721 1714
1722static int __init apic_set_verbosity(char *str) 1715static int __init apic_set_verbosity(char *arg)
1723{ 1716{
1724 if (strcmp("debug", str) == 0) 1717 if (!arg)
1718 return -EINVAL;
1719
1720 if (strcmp(arg, "debug") == 0)
1725 apic_verbosity = APIC_DEBUG; 1721 apic_verbosity = APIC_DEBUG;
1726 else if (strcmp("verbose", str) == 0) 1722 else if (strcmp(arg, "verbose") == 0)
1727 apic_verbosity = APIC_VERBOSE; 1723 apic_verbosity = APIC_VERBOSE;
1728 return 1; 1724
1725 return 0;
1729} 1726}
1730__setup("apic=", apic_set_verbosity); 1727early_param("apic", apic_set_verbosity);
1731 1728
1732static int __init lapic_insert_resource(void) 1729static int __init lapic_insert_resource(void)
1733{ 1730{
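A sketch of the behavioral difference introduced by moving "apic=" from __setup to early_param above: early_param handlers run earlier and must tolerate a NULL argument, hence the new -EINVAL guard (the verbosity values below are illustrative).

#include <errno.h>
#include <stdio.h>
#include <string.h>

static int apic_verbosity;

static int apic_set_verbosity(const char *arg)
{
	if (!arg)
		return -EINVAL;          /* early_param may pass NULL */

	if (strcmp(arg, "debug") == 0)
		apic_verbosity = 2;
	else if (strcmp(arg, "verbose") == 0)
		apic_verbosity = 1;

	return 0;
}

int main(void)
{
	printf("%d\n", apic_set_verbosity("debug"));   /* 0   */
	printf("%d\n", apic_set_verbosity(NULL));      /* -22 */
	return 0;
}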
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 1e3d32e27c14..446c062e831c 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
54/* 54/*
55 * Debug level, exported for io_apic.c 55 * Debug level, exported for io_apic.c
56 */ 56 */
57int apic_verbosity; 57unsigned int apic_verbosity;
58 58
59/* Have we found an MP table */ 59/* Have we found an MP table */
60int smp_found_config; 60int smp_found_config;
@@ -90,7 +90,6 @@ static unsigned long apic_phys;
90 90
91unsigned long mp_lapic_addr; 91unsigned long mp_lapic_addr;
92 92
93unsigned int __cpuinitdata maxcpus = NR_CPUS;
94/* 93/*
95 * Get the LAPIC version 94 * Get the LAPIC version
96 */ 95 */
@@ -314,7 +313,7 @@ static void setup_APIC_timer(void)
314 313
315#define TICK_COUNT 100000000 314#define TICK_COUNT 100000000
316 315
317static void __init calibrate_APIC_clock(void) 316static int __init calibrate_APIC_clock(void)
318{ 317{
319 unsigned apic, apic_start; 318 unsigned apic, apic_start;
320 unsigned long tsc, tsc_start; 319 unsigned long tsc, tsc_start;
@@ -368,6 +367,17 @@ static void __init calibrate_APIC_clock(void)
368 clockevent_delta2ns(0xF, &lapic_clockevent); 367 clockevent_delta2ns(0xF, &lapic_clockevent);
369 368
370 calibration_result = result / HZ; 369 calibration_result = result / HZ;
370
371 /*
372 * Do a sanity check on the APIC calibration result
373 */
374 if (calibration_result < (1000000 / HZ)) {
375 printk(KERN_WARNING
376 "APIC frequency too slow, disabling apic timer\n");
377 return -1;
378 }
379
380 return 0;
371} 381}
372 382
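The sanity check folded into calibrate_APIC_clock() above rejects timers slower than 1 MHz; calibration_result counts APIC timer ticks per jiffy, so the threshold is 1000000/HZ (the tick rate below is illustrative).

#include <stdio.h>

#define HZ 250

int main(void)
{
	unsigned long ticks_per_sec = 500000;   /* hypothetical slow APIC clock */
	unsigned long calibration_result = ticks_per_sec / HZ;

	if (calibration_result < (1000000 / HZ))
		printf("APIC frequency too slow, disabling apic timer\n");
	return 0;
}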
373/* 383/*
@@ -394,14 +404,7 @@ void __init setup_boot_APIC_clock(void)
394 } 404 }
395 405
396 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 406 printk(KERN_INFO "Using local APIC timer interrupts.\n");
397 calibrate_APIC_clock(); 407 if (calibrate_APIC_clock()) {
398
399 /*
400 * Do a sanity check on the APIC calibration result
401 */
402 if (calibration_result < (1000000 / HZ)) {
403 printk(KERN_WARNING
404 "APIC frequency too slow, disabling apic timer\n");
405 /* No broadcast on UP ! */ 408 /* No broadcast on UP ! */
406 if (num_possible_cpus() > 1) 409 if (num_possible_cpus() > 1)
407 setup_APIC_timer(); 410 setup_APIC_timer();
@@ -1058,12 +1061,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1058 return; 1061 return;
1059 } 1062 }
1060 1063
1061 if (num_processors >= maxcpus) {
1062 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
1063 " Processor ignored.\n", maxcpus);
1064 return;
1065 }
1066
1067 num_processors++; 1064 num_processors++;
1068 cpus_complement(tmp_map, cpu_present_map); 1065 cpus_complement(tmp_map, cpu_present_map);
1069 cpu = first_cpu(tmp_map); 1066 cpu = first_cpu(tmp_map);
@@ -1337,7 +1334,7 @@ early_param("apic", apic_set_verbosity);
1337static __init int setup_disableapic(char *str) 1334static __init int setup_disableapic(char *str)
1338{ 1335{
1339 disable_apic = 1; 1336 disable_apic = 1;
1340 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1337 setup_clear_cpu_cap(X86_FEATURE_APIC);
1341 return 0; 1338 return 0;
1342} 1339}
1343early_param("disableapic", setup_disableapic); 1340early_param("disableapic", setup_disableapic);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9b441331e9..732d1f4e10ee 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -219,7 +219,6 @@
219#include <linux/time.h> 219#include <linux/time.h>
220#include <linux/sched.h> 220#include <linux/sched.h>
221#include <linux/pm.h> 221#include <linux/pm.h>
222#include <linux/pm_legacy.h>
223#include <linux/capability.h> 222#include <linux/capability.h>
224#include <linux/device.h> 223#include <linux/device.h>
225#include <linux/kernel.h> 224#include <linux/kernel.h>
@@ -235,6 +234,7 @@
235#include <asm/uaccess.h> 234#include <asm/uaccess.h>
236#include <asm/desc.h> 235#include <asm/desc.h>
237#include <asm/i8253.h> 236#include <asm/i8253.h>
237#include <asm/olpc.h>
238#include <asm/paravirt.h> 238#include <asm/paravirt.h>
239#include <asm/reboot.h> 239#include <asm/reboot.h>
240 240
@@ -2218,7 +2218,7 @@ static int __init apm_init(void)
2218 2218
2219 dmi_check_system(apm_dmi_table); 2219 dmi_check_system(apm_dmi_table);
2220 2220
2221 if (apm_info.bios.version == 0 || paravirt_enabled()) { 2221 if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
2222 printk(KERN_INFO "apm: BIOS not found.\n"); 2222 printk(KERN_INFO "apm: BIOS not found.\n");
2223 return -ENODEV; 2223 return -ENODEV;
2224 } 2224 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bacf5deeec2d..aa89387006fe 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,6 +18,8 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20 20
21#include <xen/interface/xen.h>
22
21#define __NO_STUBS 1 23#define __NO_STUBS 1
22#undef __SYSCALL 24#undef __SYSCALL
23#undef _ASM_X86_64_UNISTD_H_ 25#undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
131 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 133 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
132 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 134 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
133 OFFSET(BP_version, boot_params, hdr.version); 135 OFFSET(BP_version, boot_params, hdr.version);
136
137 BLANK();
138 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
139#ifdef CONFIG_XEN
140 BLANK();
141 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
142 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
143#undef ENTRY
144#endif
134 return 0; 145 return 0;
135} 146}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
new file mode 100644
index 000000000000..c639bd55391c
--- /dev/null
+++ b/arch/x86/kernel/bios_uv.c
@@ -0,0 +1,48 @@
1/*
2 * BIOS run time interface routines.
3 *
4 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <asm/uv/bios.h>
22
23const char *
24x86_bios_strerror(long status)
25{
26 const char *str;
27 switch (status) {
28 case 0: str = "Call completed without error"; break;
29 case -1: str = "Not implemented"; break;
30 case -2: str = "Invalid argument"; break;
31 case -3: str = "Call completed with error"; break;
32 default: str = "Unknown BIOS status code"; break;
33 }
34 return str;
35}
36
37long
38x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
39 unsigned long *drift_info)
40{
41 struct uv_bios_retval isrv;
42
43 BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
44 *ticks_per_second = isrv.v0;
45 *drift_info = isrv.v1;
46 return isrv.status;
47}
48EXPORT_SYMBOL_GPL(x86_bios_freq_base);
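A hypothetical caller of the new UV BIOS helpers, kernel-context only and not part of this patch; the function name is made up, and the BIOS_FREQ_BASE_REALTIME_CLOCK selector is assumed from the UV headers:

/* kernel-context sketch, not standalone */
static void uv_print_rtc_freq(void)
{
	unsigned long ticks_per_second, drift_info;
	long status;

	status = x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
				    &ticks_per_second, &drift_info);
	if (status)
		printk(KERN_ERR "UV BIOS call failed: %s\n",
		       x86_bios_strerror(status));
	else
		printk(KERN_INFO "UV RTC: %lu ticks/s\n", ticks_per_second);
}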
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 84a8220a6072..a6ef672adbba 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -56,9 +56,22 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
56 56
57 switch (c->x86_vendor) { 57 switch (c->x86_vendor) {
58 case X86_VENDOR_INTEL: 58 case X86_VENDOR_INTEL:
59 if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) 59 /*
60 * There is a known erratum on Pentium III and Core Solo
61 * and Core Duo CPUs.
62 * " Page with PAT set to WC while associated MTRR is UC
63 * may consolidate to UC "
64 * Because of this erratum, it is better to stick with
65 * setting WC in MTRR rather than using PAT on these CPUs.
66 *
67 * Enable PAT WC only on P4, Core 2 or later CPUs.
68 */
69 if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
60 return; 70 return;
61 break; 71
72 pat_disable("PAT WC disabled due to known CPU erratum.");
73 return;
74
62 case X86_VENDOR_AMD: 75 case X86_VENDOR_AMD:
63 case X86_VENDOR_CENTAUR: 76 case X86_VENDOR_CENTAUR:
64 case X86_VENDOR_TRANSMETA: 77 case X86_VENDOR_TRANSMETA:
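A compact restatement of the new Intel PAT gate added above: families above 6 (e.g. the P4's family 0xF) and family-6 models from 15 up (Core 2) keep PAT; earlier family-6 parts hit the WC/UC erratum and get PAT disabled.

#include <stdbool.h>
#include <stdio.h>

static bool intel_pat_ok(unsigned int family, unsigned int model)
{
	return family > 6 || (family == 6 && model >= 15);
}

int main(void)
{
	printf("%d\n", intel_pat_ok(0xF, 4));   /* 1: Pentium 4    */
	printf("%d\n", intel_pat_ok(6, 15));    /* 1: Core 2       */
	printf("%d\n", intel_pat_ok(6, 14));    /* 0: Core Duo era */
	return 0;
}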
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 81a07ca65d44..18514ed26104 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -24,8 +24,6 @@
24extern void vide(void); 24extern void vide(void);
25__asm__(".align 4\nvide: ret"); 25__asm__(".align 4\nvide: ret");
26 26
27int force_mwait __cpuinitdata;
28
29static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
30{ 28{
31 if (cpuid_eax(0x80000000) >= 0x80000007) { 29 if (cpuid_eax(0x80000000) >= 0x80000007) {
@@ -33,6 +31,11 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
33 if (c->x86_power & (1<<8)) 31 if (c->x86_power & (1<<8))
34 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 32 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
35 } 33 }
34
35 /* Set MTRR capability flag if appropriate */
36 if (c->x86_model == 13 || c->x86_model == 9 ||
37 (c->x86_model == 8 && c->x86_mask >= 8))
38 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
36} 39}
37 40
38static void __cpuinit init_amd(struct cpuinfo_x86 *c) 41static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -168,10 +171,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
168 mbytes); 171 mbytes);
169 } 172 }
170 173
171 /* Set MTRR capability flag if appropriate */
172 if (c->x86_model == 13 || c->x86_model == 9 ||
173 (c->x86_model == 8 && c->x86_mask >= 8))
174 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
175 break; 174 break;
176 } 175 }
177 176
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 7c36fb8a28d4..d1692b2a41ff 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ 115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8)) 116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
118} 120}
119 121
120static void __cpuinit init_amd(struct cpuinfo_x86 *c) 122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1b1c56bb338f..c8e315f1aa83 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -50,6 +50,8 @@ static double __initdata y = 3145727.0;
50 */ 50 */
51static void __init check_fpu(void) 51static void __init check_fpu(void)
52{ 52{
53 s32 fdiv_bug;
54
53 if (!boot_cpu_data.hard_math) { 55 if (!boot_cpu_data.hard_math) {
54#ifndef CONFIG_MATH_EMULATION 56#ifndef CONFIG_MATH_EMULATION
55 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); 57 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
@@ -74,8 +76,10 @@ static void __init check_fpu(void)
74 "fistpl %0\n\t" 76 "fistpl %0\n\t"
75 "fwait\n\t" 77 "fwait\n\t"
76 "fninit" 78 "fninit"
77 : "=m" (*&boot_cpu_data.fdiv_bug) 79 : "=m" (*&fdiv_bug)
78 : "m" (*&x), "m" (*&y)); 80 : "m" (*&x), "m" (*&y));
81
82 boot_cpu_data.fdiv_bug = fdiv_bug;
79 if (boot_cpu_data.fdiv_bug) 83 if (boot_cpu_data.fdiv_bug)
80 printk("Hmm, FPU with FDIV bug.\n"); 84 printk("Hmm, FPU with FDIV bug.\n");
81} 85}
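The check_fpu() probe above divides two magic constants whose quotient a flawed Pentium FDIV unit computes wrongly; a plain-C version of the same test (y is the constant shown above, and x is 4195835.0 in the kernel source; the integer cast mirrors the fistpl in the asm):

#include <stdio.h>

int main(void)
{
	volatile double x = 4195835.0, y = 3145727.0;
	int residue = (int)(x - (x / y) * y);   /* 0 on a correct FPU, 256 on a buggy one */

	if (residue != 0)
		printf("Hmm, FPU with FDIV bug.\n");
	else
		printf("FPU divides correctly.\n");
	return 0;
}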
@@ -131,13 +135,7 @@ static void __init check_popad(void)
131 * (due to lack of "invlpg" and working WP on an i386) 135
132 * - In order to run on anything without a TSC, we need to be 136 * - In order to run on anything without a TSC, we need to be
133 * compiled for a i486. 137 * compiled for a i486.
134 * - In order to support the local APIC on a buggy Pentium machine, 138 */
135 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
136 * which happens implicitly if compiled for a Pentium or lower
137 * (unless an advanced selection of CPU features is used) as an
138 * otherwise config implies a properly working local APIC without
139 * the need to do extra reads from the APIC.
140*/
141 139
142static void __init check_config(void) 140static void __init check_config(void)
143{ 141{
@@ -151,21 +149,6 @@ static void __init check_config(void)
151 if (boot_cpu_data.x86 == 3) 149 if (boot_cpu_data.x86 == 3)
152 panic("Kernel requires i486+ for 'invlpg' and other features"); 150 panic("Kernel requires i486+ for 'invlpg' and other features");
153#endif 151#endif
154
155/*
156 * If we were told we had a good local APIC, check for buggy Pentia,
157 * i.e. all B steppings and the C2 stepping of P54C when using their
158 * integrated APIC (see 11AP erratum in "Pentium Processor
159 * Specification Update").
160 */
161#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
162 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
163 && cpu_has_apic
164 && boot_cpu_data.x86 == 5
165 && boot_cpu_data.x86_model == 2
166 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
167 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
168#endif
169} 152}
170 153
171 154
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e0f45edd6a55..a0534c04d38a 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -314,6 +314,16 @@ enum {
314 EAMD3D = 1<<20, 314 EAMD3D = 1<<20,
315}; 315};
316 316
317static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
318{
319 switch (c->x86) {
320 case 5:
321 /* Emulate MTRRs using Centaur's MCR. */
322 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
323 break;
324 }
325}
326
317static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 327static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
318{ 328{
319 329
@@ -462,6 +472,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
462static struct cpu_dev centaur_cpu_dev __cpuinitdata = { 472static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
463 .c_vendor = "Centaur", 473 .c_vendor = "Centaur",
464 .c_ident = { "CentaurHauls" }, 474 .c_ident = { "CentaurHauls" },
475 .c_early_init = early_init_centaur,
465 .c_init = init_centaur, 476 .c_init = init_centaur,
466 .c_size_cache = centaur_size_cache, 477 .c_size_cache = centaur_size_cache,
467}; 478};
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 80ab20d4fa39..4e456bd955bb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <asm/mtrr.h> 13#include <asm/mtrr.h>
14#include <asm/mce.h> 14#include <asm/mce.h>
15#include <asm/pat.h> 15#include <asm/pat.h>
16#include <asm/asm.h>
16#ifdef CONFIG_X86_LOCAL_APIC 17#ifdef CONFIG_X86_LOCAL_APIC
17#include <asm/mpspec.h> 18#include <asm/mpspec.h>
18#include <asm/apic.h> 19#include <asm/apic.h>
@@ -334,11 +335,24 @@ static void __init early_cpu_detect(void)
334 335
335 get_cpu_vendor(c, 1); 336 get_cpu_vendor(c, 1);
336 337
338 early_get_cap(c);
339
337 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 340 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
338 cpu_devs[c->x86_vendor]->c_early_init) 341 cpu_devs[c->x86_vendor]->c_early_init)
339 cpu_devs[c->x86_vendor]->c_early_init(c); 342 cpu_devs[c->x86_vendor]->c_early_init(c);
343}
340 344
341 early_get_cap(c); 345/*
346 * The NOPL instruction is supposed to exist on all CPUs with
347 * family >= 6; unfortunately, that's not true in practice because
348 * of early VIA chips and (more importantly) broken virtualizers that
349 * are not easy to detect. In the latter case it doesn't even *fail*
350 * reliably, so probing for it doesn't even work. Disable it completely
351 * unless we can find a reliable way to detect all the broken cases.
352 */
353static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
354{
355 clear_cpu_cap(c, X86_FEATURE_NOPL);
342} 356}
343 357
344static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 358static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -395,8 +409,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
395 } 409 }
396 410
397 init_scattered_cpuid_features(c); 411 init_scattered_cpuid_features(c);
412 detect_nopl(c);
398 } 413 }
399
400} 414}
401 415
402static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 416static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
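The centaur.c and common.c hunks work as a pair: the vendor table gains a c_early_init hook, and early_cpu_detect() is reordered so that early_get_cap() runs first and the vendor callback runs last, leaving whatever bits the hook sets in force. A minimal compilable sketch of that dispatch order (struct layouts abbreviated and hypothetical):

    #include <stdio.h>

    struct cpuinfo { unsigned int caps; };

    struct cpu_dev {
            const char *c_vendor;
            void (*c_early_init)(struct cpuinfo *c);        /* optional hook */
    };

    static void centaur_early(struct cpuinfo *c)
    {
            c->caps |= 1u << 3;     /* stand-in for set_cpu_cap(c, ...) */
    }

    static const struct cpu_dev centaur = { "Centaur", centaur_early };

    static void early_cpu_detect(struct cpuinfo *c, const struct cpu_dev *dev)
    {
            c->caps = 0x7;                  /* stand-in for early_get_cap(c) */
            if (dev && dev->c_early_init)
                    dev->c_early_init(c);   /* vendor adjustments come last */
    }

    int main(void)
    {
            struct cpuinfo c;

            early_cpu_detect(&c, &centaur);
            printf("caps=%#x\n", c.caps);   /* 0xf */
            return 0;
    }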
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 7b8cc72feb40..a11f5d4477cd 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -7,19 +7,18 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/kgdb.h> 8#include <linux/kgdb.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/string.h>
11#include <linux/delay.h> 10#include <linux/delay.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
13#include <linux/module.h>
14#include <linux/percpu.h> 12#include <linux/percpu.h>
15#include <asm/processor.h>
16#include <asm/i387.h> 13#include <asm/i387.h>
17#include <asm/msr.h> 14#include <asm/msr.h>
18#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
19#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
20#include <asm/mtrr.h> 18#include <asm/mtrr.h>
21#include <asm/mce.h> 19#include <asm/mce.h>
22#include <asm/pat.h> 20#include <asm/pat.h>
21#include <asm/asm.h>
23#include <asm/numa.h> 22#include <asm/numa.h>
24#ifdef CONFIG_X86_LOCAL_APIC 23#ifdef CONFIG_X86_LOCAL_APIC
25#include <asm/mpspec.h> 24#include <asm/mpspec.h>
@@ -217,6 +216,39 @@ static void __init early_cpu_support_print(void)
217 } 216 }
218} 217}
219 218
219/*
220 * The NOPL instruction is supposed to exist on all CPUs with
221 * family >= 6; unfortunately, that's not true in practice because
222 * of early VIA chips and (more importantly) broken virtualizers that
223 * are not easy to detect. Hence, probe for it based on first
224 * principles.
225 *
226 * Note: no 64-bit chip is known to lack these, but put the code here
227 * for consistency with 32 bits, and to make it utterly trivial to
228 * diagnose the problem should it ever surface.
229 */
230static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
231{
232 const u32 nopl_signature = 0x888c53b1; /* Random number */
233 u32 has_nopl = nopl_signature;
234
235 clear_cpu_cap(c, X86_FEATURE_NOPL);
236 if (c->x86 >= 6) {
237 asm volatile("\n"
238 "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
239 "2:\n"
240 " .section .fixup,\"ax\"\n"
241 "3: xor %0,%0\n"
242 " jmp 2b\n"
243 " .previous\n"
244 _ASM_EXTABLE(1b,3b)
245 : "+a" (has_nopl));
246
247 if (has_nopl == nopl_signature)
248 set_cpu_cap(c, X86_FEATURE_NOPL);
249 }
250}
251
220static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); 252static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
221 253
222void __init early_cpu_init(void) 254void __init early_cpu_init(void)
@@ -305,7 +337,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
305 c->x86_capability[2] = cpuid_edx(0x80860001); 337 c->x86_capability[2] = cpuid_edx(0x80860001);
306 } 338 }
307 339
308 c->extended_cpuid_level = cpuid_eax(0x80000000);
309 if (c->extended_cpuid_level >= 0x80000007) 340 if (c->extended_cpuid_level >= 0x80000007)
310 c->x86_power = cpuid_edx(0x80000007); 341 c->x86_power = cpuid_edx(0x80000007);
311 342
@@ -316,18 +347,13 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
316 c->x86_phys_bits = eax & 0xff; 347 c->x86_phys_bits = eax & 0xff;
317 } 348 }
318 349
319 /* Assume all 64-bit CPUs support 32-bit syscall */ 350 detect_nopl(c);
320 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
321 351
322 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 352 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
323 cpu_devs[c->x86_vendor]->c_early_init) 353 cpu_devs[c->x86_vendor]->c_early_init)
324 cpu_devs[c->x86_vendor]->c_early_init(c); 354 cpu_devs[c->x86_vendor]->c_early_init(c);
325 355
326 validate_pat_support(c); 356 validate_pat_support(c);
327
328 /* early_param could clear that, but recall get it set again */
329 if (disable_apic)
330 clear_cpu_cap(c, X86_FEATURE_APIC);
331} 357}
332 358
333/* 359/*
@@ -503,22 +529,24 @@ void pda_init(int cpu)
503 /* others are initialized in smpboot.c */ 529 /* others are initialized in smpboot.c */
504 pda->pcurrent = &init_task; 530 pda->pcurrent = &init_task;
505 pda->irqstackptr = boot_cpu_stack; 531 pda->irqstackptr = boot_cpu_stack;
532 pda->irqstackptr += IRQSTACKSIZE - 64;
506 } else { 533 } else {
507 pda->irqstackptr = (char *) 534 if (!pda->irqstackptr) {
508 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); 535 pda->irqstackptr = (char *)
509 if (!pda->irqstackptr) 536 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
510 panic("cannot allocate irqstack for cpu %d", cpu); 537 if (!pda->irqstackptr)
538 panic("cannot allocate irqstack for cpu %d",
539 cpu);
540 pda->irqstackptr += IRQSTACKSIZE - 64;
541 }
511 542
512 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) 543 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
513 pda->nodenumber = cpu_to_node(cpu); 544 pda->nodenumber = cpu_to_node(cpu);
514 } 545 }
515
516 pda->irqstackptr += IRQSTACKSIZE-64;
517} 546}
518 547
519char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 548char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
520 DEBUG_STKSZ] 549 DEBUG_STKSZ] __page_aligned_bss;
521__attribute__((section(".bss.page_aligned")));
522 550
523extern asmlinkage void ignore_sysret(void); 551extern asmlinkage void ignore_sysret(void);
524 552
@@ -612,19 +640,22 @@ void __cpuinit cpu_init(void)
612 /* 640 /*
613 * set up and load the per-CPU TSS 641 * set up and load the per-CPU TSS
614 */ 642 */
615 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 643 if (!orig_ist->ist[0]) {
616 static const unsigned int order[N_EXCEPTION_STACKS] = { 644 static const unsigned int order[N_EXCEPTION_STACKS] = {
617 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, 645 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
618 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER 646 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
619 }; 647 };
620 if (cpu) { 648 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
621 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); 649 if (cpu) {
622 if (!estacks) 650 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
623 panic("Cannot allocate exception stack %ld %d\n", 651 if (!estacks)
624 v, cpu); 652 panic("Cannot allocate exception "
653 "stack %ld %d\n", v, cpu);
654 }
655 estacks += PAGE_SIZE << order[v];
656 orig_ist->ist[v] = t->x86_tss.ist[v] =
657 (unsigned long)estacks;
625 } 658 }
626 estacks += PAGE_SIZE << order[v];
627 orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
628 } 659 }
629 660
630 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); 661 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
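detect_nopl() above executes the probed opcode directly and leans on the kernel's exception-table fixup (_ASM_EXTABLE) to survive the #UD on CPUs or virtualizers that reject it. The same probe can be demonstrated from userspace, where the recovery mechanism is a SIGILL handler rather than a fixup entry; a minimal sketch, x86 only, POSIX signals assumed:

    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>

    static sigjmp_buf probe_env;

    static void on_sigill(int sig)
    {
            (void)sig;
            siglongjmp(probe_env, 1);
    }

    static int nopl_works(void)
    {
            struct sigaction sa, old;
            int ok = 0;

            memset(&sa, 0, sizeof(sa));
            sa.sa_handler = on_sigill;
            sigemptyset(&sa.sa_mask);
            sigaction(SIGILL, &sa, &old);

            if (sigsetjmp(probe_env, 1) == 0) {
                    /* the probed opcode: nopl %eax (0f 1f c0) */
                    asm volatile(".byte 0x0f, 0x1f, 0xc0");
                    ok = 1;
            }

            sigaction(SIGILL, &old, NULL);
            return ok;
    }

    int main(void)
    {
            printf("nopl: %s\n", nopl_works() ? "executes" : "raises #UD");
            return 0;
    }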
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index cb7a5715596d..efae3b22a0ff 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -235,9 +235,9 @@ config X86_LONGHAUL
235 If in doubt, say N. 235 If in doubt, say N.
236 236
237config X86_E_POWERSAVER 237config X86_E_POWERSAVER
238 tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" 238 tristate "VIA C7 Enhanced PowerSaver"
239 select CPU_FREQ_TABLE 239 select CPU_FREQ_TABLE
240 depends on X86_32 && EXPERIMENTAL 240 depends on X86_32
241 help 241 help
242 This adds the CPUFreq driver for VIA C7 processors. 242 This adds the CPUFreq driver for VIA C7 processors.
243 243
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index b0c8208df9fa..dd097b835839 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd)
202 cpumask_t saved_mask = current->cpus_allowed; 202 cpumask_t saved_mask = current->cpus_allowed;
203 unsigned int i; 203 unsigned int i;
204 204
205 for_each_cpu_mask(i, cmd->mask) { 205 for_each_cpu_mask_nr(i, cmd->mask) {
206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); 206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
207 do_drv_write(cmd); 207 do_drv_write(cmd);
208 } 208 }
@@ -451,7 +451,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
451 451
452 freqs.old = perf->states[perf->state].core_frequency * 1000; 452 freqs.old = perf->states[perf->state].core_frequency * 1000;
453 freqs.new = data->freq_table[next_state].frequency; 453 freqs.new = data->freq_table[next_state].frequency;
454 for_each_cpu_mask(i, cmd.mask) { 454 for_each_cpu_mask_nr(i, cmd.mask) {
455 freqs.cpu = i; 455 freqs.cpu = i;
456 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 456 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
457 } 457 }
@@ -466,7 +466,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
466 } 466 }
467 } 467 }
468 468
469 for_each_cpu_mask(i, cmd.mask) { 469 for_each_cpu_mask_nr(i, cmd.mask) {
470 freqs.cpu = i; 470 freqs.cpu = i;
471 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 471 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
472 } 472 }
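This file and several hunks below (p4-clockmod, powernow-k8, speedstep-ich, intel_cacheinfo, mce_amd_64) convert for_each_cpu_mask() to for_each_cpu_mask_nr(), whose iteration is bounded by nr_cpu_ids (the highest CPU id actually possible on the running system, plus one) rather than the compile-time NR_CPUS. The real macros skip straight between set bits; the toy program below illustrates only the bound that changes:

    #include <stdio.h>

    #define NR_CPUS 4096                    /* compile-time ceiling */
    static const int nr_cpu_ids = 4;        /* what this machine can hold */

    int main(void)
    {
            unsigned long mask = (1UL << 0) | (1UL << 2);   /* toy cpumask */
            int cpu, hits = 0, steps_nr = 0, steps_full = 0;

            /* the _nr variant stops at nr_cpu_ids... */
            for (cpu = 0; cpu < nr_cpu_ids; cpu++, steps_nr++)
                    if (mask & (1UL << cpu))
                            hits++;

            /* ...the old bound walks all the way out to NR_CPUS */
            for (cpu = 0; cpu < NR_CPUS; cpu++, steps_full++)
                    ;

            printf("hits=%d, bounded scan=%d steps, full scan=%d steps\n",
                   hits, steps_nr, steps_full);
            return 0;
    }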
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 94619c22f563..e4a4bf870e94 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -44,7 +44,7 @@ struct s_elan_multiplier {
44 * It is important that the frequencies 44 * It is important that the frequencies
45 * are listed in ascending order here! 45 * are listed in ascending order here!
46 */ 46 */
47struct s_elan_multiplier elan_multiplier[] = { 47static struct s_elan_multiplier elan_multiplier[] = {
48 {1000, 0x02, 0x18}, 48 {1000, 0x02, 0x18},
49 {2000, 0x02, 0x10}, 49 {2000, 0x02, 0x10},
50 {4000, 0x02, 0x08}, 50 {4000, 0x02, 0x08},
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 199e4e05e5dc..f1685fb91fbd 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
122 return 0; 122 return 0;
123 123
124 /* notifiers */ 124 /* notifiers */
125 for_each_cpu_mask(i, policy->cpus) { 125 for_each_cpu_mask_nr(i, policy->cpus) {
126 freqs.cpu = i; 126 freqs.cpu = i;
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 } 128 }
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software 130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3 131 * Developer's Manual, Volume 3
132 */ 132 */
133 for_each_cpu_mask(i, policy->cpus) 133 for_each_cpu_mask_nr(i, policy->cpus)
134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); 134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
135 135
136 /* notifiers */ 136 /* notifiers */
137 for_each_cpu_mask(i, policy->cpus) { 137 for_each_cpu_mask_nr(i, policy->cpus) {
138 freqs.cpu = i; 138 freqs.cpu = i;
139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
140 } 140 }
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
index f8a63b3664e3..35fb4eaf6e1c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
@@ -1,5 +1,4 @@
1/* 1/*
2 * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $
3 * (C) 2003 Dave Jones. 2 * (C) 2003 Dave Jones.
4 * 3 *
5 * Licensed under the terms of the GNU GPL License version 2. 4 * Licensed under the terms of the GNU GPL License version 2.
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 206791eb46e3..84bb395038d8 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -66,7 +66,6 @@ static u32 find_freq_from_fid(u32 fid)
66 return 800 + (fid * 100); 66 return 800 + (fid * 100);
67} 67}
68 68
69
70/* Return a frequency in KHz, given an input fid */ 69/* Return a frequency in KHz, given an input fid */
71static u32 find_khz_freq_from_fid(u32 fid) 70static u32 find_khz_freq_from_fid(u32 fid)
72{ 71{
@@ -78,7 +77,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p
78 return data[pstate].frequency; 77 return data[pstate].frequency;
79} 78}
80 79
81
82/* Return the vco fid for an input fid 80/* Return the vco fid for an input fid
83 * 81 *
84 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids 82 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
@@ -166,7 +164,6 @@ static void fidvid_msr_init(void)
166 wrmsr(MSR_FIDVID_CTL, lo, hi); 164 wrmsr(MSR_FIDVID_CTL, lo, hi);
167} 165}
168 166
169
170/* write the new fid value along with the other control fields to the msr */ 167/* write the new fid value along with the other control fields to the msr */
171static int write_new_fid(struct powernow_k8_data *data, u32 fid) 168static int write_new_fid(struct powernow_k8_data *data, u32 fid)
172{ 169{
@@ -966,7 +963,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
966 freqs.old = find_khz_freq_from_fid(data->currfid); 963 freqs.old = find_khz_freq_from_fid(data->currfid);
967 freqs.new = find_khz_freq_from_fid(fid); 964 freqs.new = find_khz_freq_from_fid(fid);
968 965
969 for_each_cpu_mask(i, *(data->available_cores)) { 966 for_each_cpu_mask_nr(i, *(data->available_cores)) {
970 freqs.cpu = i; 967 freqs.cpu = i;
971 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 968 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
972 } 969 }
@@ -974,7 +971,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
974 res = transition_fid_vid(data, fid, vid); 971 res = transition_fid_vid(data, fid, vid);
975 freqs.new = find_khz_freq_from_fid(data->currfid); 972 freqs.new = find_khz_freq_from_fid(data->currfid);
976 973
977 for_each_cpu_mask(i, *(data->available_cores)) { 974 for_each_cpu_mask_nr(i, *(data->available_cores)) {
978 freqs.cpu = i; 975 freqs.cpu = i;
979 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 976 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
980 } 977 }
@@ -997,7 +994,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
997 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 994 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
998 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 995 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
999 996
1000 for_each_cpu_mask(i, *(data->available_cores)) { 997 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1001 freqs.cpu = i; 998 freqs.cpu = i;
1002 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 999 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1003 } 1000 }
@@ -1005,7 +1002,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1005 res = transition_pstate(data, pstate); 1002 res = transition_pstate(data, pstate);
1006 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1003 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1007 1004
1008 for_each_cpu_mask(i, *(data->available_cores)) { 1005 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1009 freqs.cpu = i; 1006 freqs.cpu = i;
1010 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1007 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1011 } 1008 }
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 908dd347c67e..15e13c01cc36 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -28,7 +28,8 @@
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@lists.linux.org.uk"
30 30
31#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
32 33
33#define INTEL_MSR_RANGE (0xffff) 34#define INTEL_MSR_RANGE (0xffff)
34 35
@@ -66,11 +67,12 @@ struct cpu_model
66 67
67 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ 68 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
68}; 69};
69static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); 70static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
71 const struct cpu_id *x);
70 72
71/* Operating points for current CPU */ 73/* Operating points for current CPU */
72static struct cpu_model *centrino_model[NR_CPUS]; 74static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
73static const struct cpu_id *centrino_cpu[NR_CPUS]; 75static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
74 76
75static struct cpufreq_driver centrino_driver; 77static struct cpufreq_driver centrino_driver;
76 78
@@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
255 return -ENOENT; 257 return -ENOENT;
256 } 258 }
257 259
258 centrino_model[policy->cpu] = model; 260 per_cpu(centrino_model, policy->cpu) = model;
259 261
260 dprintk("found \"%s\": max frequency: %dkHz\n", 262 dprintk("found \"%s\": max frequency: %dkHz\n",
261 model->model_name, model->max_freq); 263 model->model_name, model->max_freq);
@@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
264} 266}
265 267
266#else 268#else
267static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } 269static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
270{
271 return -ENODEV;
272}
268#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ 273#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
269 274
270static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) 275static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
276 const struct cpu_id *x)
271{ 277{
272 if ((c->x86 == x->x86) && 278 if ((c->x86 == x->x86) &&
273 (c->x86_model == x->x86_model) && 279 (c->x86_model == x->x86_model) &&
@@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
286 * for centrino, as some DSDTs are buggy. 292 * for centrino, as some DSDTs are buggy.
287 * Ideally, this can be done using the acpi_data structure. 293 * Ideally, this can be done using the acpi_data structure.
288 */ 294 */
289 if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || 295 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
290 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || 296 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
291 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { 297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
292 msr = (msr >> 8) & 0xff; 298 msr = (msr >> 8) & 0xff;
293 return msr * 100000; 299 return msr * 100000;
294 } 300 }
295 301
296 if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) 302 if ((!per_cpu(centrino_model, cpu)) ||
303 (!per_cpu(centrino_model, cpu)->op_points))
297 return 0; 304 return 0;
298 305
299 msr &= 0xffff; 306 msr &= 0xffff;
300 for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { 307 for (i = 0;
301 if (msr == centrino_model[cpu]->op_points[i].index) 308 per_cpu(centrino_model, cpu)->op_points[i].frequency
302 return centrino_model[cpu]->op_points[i].frequency; 309 != CPUFREQ_TABLE_END;
310 i++) {
311 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
312 return per_cpu(centrino_model, cpu)->
313 op_points[i].frequency;
303 } 314 }
304 if (failsafe) 315 if (failsafe)
305 return centrino_model[cpu]->op_points[i-1].frequency; 316 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
306 else 317 else
307 return 0; 318 return 0;
308} 319}
@@ -347,7 +358,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
347 int i; 358 int i;
348 359
349 /* Only Intel makes Enhanced Speedstep-capable CPUs */ 360 /* Only Intel makes Enhanced Speedstep-capable CPUs */
350 if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) 361 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
362 !cpu_has(cpu, X86_FEATURE_EST))
351 return -ENODEV; 363 return -ENODEV;
352 364
353 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) 365 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
@@ -361,9 +373,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
361 break; 373 break;
362 374
363 if (i != N_IDS) 375 if (i != N_IDS)
364 centrino_cpu[policy->cpu] = &cpu_ids[i]; 376 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
365 377
366 if (!centrino_cpu[policy->cpu]) { 378 if (!per_cpu(centrino_cpu, policy->cpu)) {
367 dprintk("found unsupported CPU with " 379 dprintk("found unsupported CPU with "
368 "Enhanced SpeedStep: send /proc/cpuinfo to " 380 "Enhanced SpeedStep: send /proc/cpuinfo to "
369 MAINTAINER "\n"); 381 MAINTAINER "\n");
@@ -386,23 +398,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
386 /* check to see if it stuck */ 398 /* check to see if it stuck */
387 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 399 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
388 if (!(l & (1<<16))) { 400 if (!(l & (1<<16))) {
389 printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); 401 printk(KERN_INFO PFX
402 "couldn't enable Enhanced SpeedStep\n");
390 return -ENODEV; 403 return -ENODEV;
391 } 404 }
392 } 405 }
393 406
394 freq = get_cur_freq(policy->cpu); 407 freq = get_cur_freq(policy->cpu);
395 408 policy->cpuinfo.transition_latency = 10000;
396 policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ 409 /* 10uS transition latency */
397 policy->cur = freq; 410 policy->cur = freq;
398 411
399 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); 412 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
400 413
401 ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); 414 ret = cpufreq_frequency_table_cpuinfo(policy,
415 per_cpu(centrino_model, policy->cpu)->op_points);
402 if (ret) 416 if (ret)
403 return (ret); 417 return (ret);
404 418
405 cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); 419 cpufreq_frequency_table_get_attr(
420 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
406 421
407 return 0; 422 return 0;
408} 423}
@@ -411,12 +426,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
411{ 426{
412 unsigned int cpu = policy->cpu; 427 unsigned int cpu = policy->cpu;
413 428
414 if (!centrino_model[cpu]) 429 if (!per_cpu(centrino_model, cpu))
415 return -ENODEV; 430 return -ENODEV;
416 431
417 cpufreq_frequency_table_put_attr(cpu); 432 cpufreq_frequency_table_put_attr(cpu);
418 433
419 centrino_model[cpu] = NULL; 434 per_cpu(centrino_model, cpu) = NULL;
420 435
421 return 0; 436 return 0;
422} 437}
@@ -430,17 +445,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
430 */ 445 */
431static int centrino_verify (struct cpufreq_policy *policy) 446static int centrino_verify (struct cpufreq_policy *policy)
432{ 447{
433 return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); 448 return cpufreq_frequency_table_verify(policy,
449 per_cpu(centrino_model, policy->cpu)->op_points);
434} 450}
435 451
436/** 452/**
437 * centrino_setpolicy - set a new CPUFreq policy 453 * centrino_setpolicy - set a new CPUFreq policy
438 * @policy: new policy 454 * @policy: new policy
439 * @target_freq: the target frequency 455 * @target_freq: the target frequency
440 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 456 * @relation: how that frequency relates to achieved frequency
457 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
441 * 458 *
442 * Sets a new CPUFreq policy. 459 * Sets a new CPUFreq policy.
443 */ 460 */
461struct allmasks {
462 cpumask_t online_policy_cpus;
463 cpumask_t saved_mask;
464 cpumask_t set_mask;
465 cpumask_t covered_cpus;
466};
467
444static int centrino_target (struct cpufreq_policy *policy, 468static int centrino_target (struct cpufreq_policy *policy,
445 unsigned int target_freq, 469 unsigned int target_freq,
446 unsigned int relation) 470 unsigned int relation)
@@ -448,48 +472,55 @@ static int centrino_target (struct cpufreq_policy *policy,
448 unsigned int newstate = 0; 472 unsigned int newstate = 0;
449 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; 473 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
450 struct cpufreq_freqs freqs; 474 struct cpufreq_freqs freqs;
451 cpumask_t online_policy_cpus;
452 cpumask_t saved_mask;
453 cpumask_t set_mask;
454 cpumask_t covered_cpus;
455 int retval = 0; 475 int retval = 0;
456 unsigned int j, k, first_cpu, tmp; 476 unsigned int j, k, first_cpu, tmp;
457 477 CPUMASK_ALLOC(allmasks);
458 if (unlikely(centrino_model[cpu] == NULL)) 478 CPUMASK_PTR(online_policy_cpus, allmasks);
459 return -ENODEV; 479 CPUMASK_PTR(saved_mask, allmasks);
480 CPUMASK_PTR(set_mask, allmasks);
481 CPUMASK_PTR(covered_cpus, allmasks);
482
483 if (unlikely(allmasks == NULL))
484 return -ENOMEM;
485
486 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
487 retval = -ENODEV;
488 goto out;
489 }
460 490
461 if (unlikely(cpufreq_frequency_table_target(policy, 491 if (unlikely(cpufreq_frequency_table_target(policy,
462 centrino_model[cpu]->op_points, 492 per_cpu(centrino_model, cpu)->op_points,
463 target_freq, 493 target_freq,
464 relation, 494 relation,
465 &newstate))) { 495 &newstate))) {
466 return -EINVAL; 496 retval = -EINVAL;
497 goto out;
467 } 498 }
468 499
469#ifdef CONFIG_HOTPLUG_CPU 500#ifdef CONFIG_HOTPLUG_CPU
470 /* cpufreq holds the hotplug lock, so we are safe from here on */ 501 /* cpufreq holds the hotplug lock, so we are safe from here on */
471 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); 502 cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
472#else 503#else
473 online_policy_cpus = policy->cpus; 504 *online_policy_cpus = policy->cpus;
474#endif 505#endif
475 506
476 saved_mask = current->cpus_allowed; 507 *saved_mask = current->cpus_allowed;
477 first_cpu = 1; 508 first_cpu = 1;
478 cpus_clear(covered_cpus); 509 cpus_clear(*covered_cpus);
479 for_each_cpu_mask(j, online_policy_cpus) { 510 for_each_cpu_mask_nr(j, *online_policy_cpus) {
480 /* 511 /*
481 * Support for SMP systems. 512 * Support for SMP systems.
482 * Make sure we are running on CPU that wants to change freq 513 * Make sure we are running on CPU that wants to change freq
483 */ 514 */
484 cpus_clear(set_mask); 515 cpus_clear(*set_mask);
485 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 516 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
486 cpus_or(set_mask, set_mask, online_policy_cpus); 517 cpus_or(*set_mask, *set_mask, *online_policy_cpus);
487 else 518 else
488 cpu_set(j, set_mask); 519 cpu_set(j, *set_mask);
489 520
490 set_cpus_allowed_ptr(current, &set_mask); 521 set_cpus_allowed_ptr(current, set_mask);
491 preempt_disable(); 522 preempt_disable();
492 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { 523 if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) {
493 dprintk("couldn't limit to CPUs in this domain\n"); 524 dprintk("couldn't limit to CPUs in this domain\n");
494 retval = -EAGAIN; 525 retval = -EAGAIN;
495 if (first_cpu) { 526 if (first_cpu) {
@@ -500,7 +531,7 @@ static int centrino_target (struct cpufreq_policy *policy,
500 break; 531 break;
501 } 532 }
502 533
503 msr = centrino_model[cpu]->op_points[newstate].index; 534 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
504 535
505 if (first_cpu) { 536 if (first_cpu) {
506 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 537 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
@@ -517,7 +548,7 @@ static int centrino_target (struct cpufreq_policy *policy,
517 dprintk("target=%dkHz old=%d new=%d msr=%04x\n", 548 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
518 target_freq, freqs.old, freqs.new, msr); 549 target_freq, freqs.old, freqs.new, msr);
519 550
520 for_each_cpu_mask(k, online_policy_cpus) { 551 for_each_cpu_mask_nr(k, *online_policy_cpus) {
521 freqs.cpu = k; 552 freqs.cpu = k;
522 cpufreq_notify_transition(&freqs, 553 cpufreq_notify_transition(&freqs,
523 CPUFREQ_PRECHANGE); 554 CPUFREQ_PRECHANGE);
@@ -536,11 +567,11 @@ static int centrino_target (struct cpufreq_policy *policy,
536 break; 567 break;
537 } 568 }
538 569
539 cpu_set(j, covered_cpus); 570 cpu_set(j, *covered_cpus);
540 preempt_enable(); 571 preempt_enable();
541 } 572 }
542 573
543 for_each_cpu_mask(k, online_policy_cpus) { 574 for_each_cpu_mask_nr(k, *online_policy_cpus) {
544 freqs.cpu = k; 575 freqs.cpu = k;
545 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 576 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
546 } 577 }
@@ -553,30 +584,32 @@ static int centrino_target (struct cpufreq_policy *policy,
553 * Best effort undo.. 584 * Best effort undo..
554 */ 585 */
555 586
556 if (!cpus_empty(covered_cpus)) { 587 if (!cpus_empty(*covered_cpus))
557 for_each_cpu_mask(j, covered_cpus) { 588 for_each_cpu_mask_nr(j, *covered_cpus) {
558 set_cpus_allowed_ptr(current, 589 set_cpus_allowed_ptr(current,
559 &cpumask_of_cpu(j)); 590 &cpumask_of_cpu(j));
560 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 591 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
561 } 592 }
562 }
563 593
564 tmp = freqs.new; 594 tmp = freqs.new;
565 freqs.new = freqs.old; 595 freqs.new = freqs.old;
566 freqs.old = tmp; 596 freqs.old = tmp;
567 for_each_cpu_mask(j, online_policy_cpus) { 597 for_each_cpu_mask_nr(j, *online_policy_cpus) {
568 freqs.cpu = j; 598 freqs.cpu = j;
569 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 599 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 600 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
571 } 601 }
572 } 602 }
573 set_cpus_allowed_ptr(current, &saved_mask); 603 set_cpus_allowed_ptr(current, saved_mask);
574 return 0; 604 retval = 0;
605 goto out;
575 606
576migrate_end: 607migrate_end:
577 preempt_enable(); 608 preempt_enable();
578 set_cpus_allowed_ptr(current, &saved_mask); 609 set_cpus_allowed_ptr(current, saved_mask);
579 return 0; 610out:
611 CPUMASK_FREE(allmasks);
612 return retval;
580} 613}
581 614
582static struct freq_attr* centrino_attr[] = { 615static struct freq_attr* centrino_attr[] = {
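Two footprint fixes land in the centrino driver: the NR_CPUS-sized model and id arrays become DEFINE_PER_CPU variables, and centrino_target() stops keeping four cpumask_t locals on the stack (with NR_CPUS=4096 those four masks alone cost 2 KiB) by going through CPUMASK_ALLOC()/CPUMASK_PTR()/CPUMASK_FREE(), which is what forces the new out: label and retval plumbing. Abbreviated from that era's linux/cpumask.h (a sketch, not verbatim):

    #if NR_CPUS > BITS_PER_LONG
    #define CPUMASK_ALLOC(m)        struct m *m = kmalloc(sizeof(*m), GFP_KERNEL)
    #define CPUMASK_FREE(m)         kfree(m)
    #else                           /* small systems: keep it on the stack */
    #define CPUMASK_ALLOC(m)        struct m _m, *m = &_m
    #define CPUMASK_FREE(m)
    #endif
    #define CPUMASK_PTR(v, m)       cpumask_t *v = &(m->v)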
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 1b50244b1fdf..191f7263c61d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
279 279
280 cpus_allowed = current->cpus_allowed; 280 cpus_allowed = current->cpus_allowed;
281 281
282 for_each_cpu_mask(i, policy->cpus) { 282 for_each_cpu_mask_nr(i, policy->cpus) {
283 freqs.cpu = i; 283 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
285 } 285 }
@@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
292 /* allow to be run on all CPUs */ 292 /* allow to be run on all CPUs */
293 set_cpus_allowed_ptr(current, &cpus_allowed); 293 set_cpus_allowed_ptr(current, &cpus_allowed);
294 294
295 for_each_cpu_mask(i, policy->cpus) { 295 for_each_cpu_mask_nr(i, policy->cpus) {
296 freqs.cpu = i; 296 freqs.cpu = i;
297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
298 } 298 }
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 3fd7a67bb06a..898a5a2002ed 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -15,13 +15,11 @@
15/* 15/*
16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU 16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
17 */ 17 */
18static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) 18static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
19{ 19{
20 unsigned char ccr2, ccr3; 20 unsigned char ccr2, ccr3;
21 unsigned long flags;
22 21
23 /* we test for DEVID by checking whether CCR3 is writable */ 22 /* we test for DEVID by checking whether CCR3 is writable */
24 local_irq_save(flags);
25 ccr3 = getCx86(CX86_CCR3); 23 ccr3 = getCx86(CX86_CCR3);
26 setCx86(CX86_CCR3, ccr3 ^ 0x80); 24 setCx86(CX86_CCR3, ccr3 ^ 0x80);
27 getCx86(0xc0); /* dummy to change bus */ 25 getCx86(0xc0); /* dummy to change bus */
@@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
44 *dir0 = getCx86(CX86_DIR0); 42 *dir0 = getCx86(CX86_DIR0);
45 *dir1 = getCx86(CX86_DIR1); 43 *dir1 = getCx86(CX86_DIR1);
46 } 44 }
47 local_irq_restore(flags);
48} 45}
49 46
47static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
48{
49 unsigned long flags;
50
51 local_irq_save(flags);
52 __do_cyrix_devid(dir0, dir1);
53 local_irq_restore(flags);
54}
50/* 55/*
51 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in 56 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
52 * order to identify the Cyrix CPU model after we're out of setup.c 57 * order to identify the Cyrix CPU model after we're out of setup.c
@@ -134,23 +139,6 @@ static void __cpuinit set_cx86_memwb(void)
134 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); 139 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
135} 140}
136 141
137static void __cpuinit set_cx86_inc(void)
138{
139 unsigned char ccr3;
140
141 printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n");
142
143 ccr3 = getCx86(CX86_CCR3);
144 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
145 /* PCR1 -- Performance Control */
146 /* Incrementor on, whatever that is */
147 setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02);
148 /* PCR0 -- Performance Control */
149 /* Incrementor Margin 10 */
150 setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
151 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
152}
153
154/* 142/*
155 * Configure later MediaGX and/or Geode processor. 143 * Configure later MediaGX and/or Geode processor.
156 */ 144 */
@@ -174,11 +162,28 @@ static void __cpuinit geode_configure(void)
174 162
175 set_cx86_memwb(); 163 set_cx86_memwb();
176 set_cx86_reorder(); 164 set_cx86_reorder();
177 set_cx86_inc();
178 165
179 local_irq_restore(flags); 166 local_irq_restore(flags);
180} 167}
181 168
169static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
170{
171 unsigned char dir0, dir0_msn, dir1 = 0;
172
173 __do_cyrix_devid(&dir0, &dir1);
174 dir0_msn = dir0 >> 4; /* identifies CPU "family" */
175
176 switch (dir0_msn) {
177 case 3: /* 6x86/6x86L */
178 /* Emulate MTRRs using Cyrix's ARRs. */
179 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
180 break;
181 case 5: /* 6x86MX/M II */
182 /* Emulate MTRRs using Cyrix's ARRs. */
183 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
184 break;
185 }
186}
182 187
183static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) 188static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
184{ 189{
@@ -434,6 +439,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
434static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { 439static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
435 .c_vendor = "Cyrix", 440 .c_vendor = "Cyrix",
436 .c_ident = { "CyrixInstead" }, 441 .c_ident = { "CyrixInstead" },
442 .c_early_init = early_init_cyrix,
437 .c_init = init_cyrix, 443 .c_init = init_cyrix,
438 .c_identify = cyrix_identify, 444 .c_identify = cyrix_identify,
439}; 445};
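Like the centaur.c hunk earlier, this one only needs c_early_init to publish an MTRR-emulation capability bit before the MTRR code initializes; mtrr_bp_init() selects its ops structure from those bits, roughly as sketched below (abbreviated from mtrr/main.c of this era, so treat the exact shape as an assumption):

    switch (boot_cpu_data.x86_vendor) {
    case X86_VENDOR_CENTAUR:
            if (cpu_has_centaur_mcr)
                    mtrr_if = &centaur_mtrr_ops;
            break;
    case X86_VENDOR_CYRIX:
            if (cpu_has_cyrix_arr)
                    mtrr_if = &cyrix_mtrr_ops;
            break;
    }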
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
index e43ad4ad4cba..c9017799497c 100644
--- a/arch/x86/kernel/cpu/feature_names.c
+++ b/arch/x86/kernel/cpu/feature_names.c
@@ -39,7 +39,8 @@ const char * const x86_cap_flags[NCAPINTS*32] = {
39 NULL, NULL, NULL, NULL, 39 NULL, NULL, NULL, NULL,
40 "constant_tsc", "up", NULL, "arch_perfmon", 40 "constant_tsc", "up", NULL, "arch_perfmon",
41 "pebs", "bts", NULL, NULL, 41 "pebs", "bts", NULL, NULL,
42 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, 42 "rep_good", NULL, NULL, NULL,
43 "nopl", NULL, NULL, NULL,
43 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 44 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
44 45
45 /* Intel-defined (#2) */ 46 /* Intel-defined (#2) */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 70609efdf1da..b75f2569b8f8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
227 if (cpu_has_bts) 227 if (cpu_has_bts)
228 ds_init_intel(c); 228 ds_init_intel(c);
229 229
230 /*
231 * See if we have a good local APIC by checking for buggy Pentia,
232 * i.e. all B steppings and the C2 stepping of P54C when using their
233 * integrated APIC (see 11AP erratum in "Pentium Processor
234 * Specification Update").
235 */
236 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
237 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
238 set_cpu_cap(c, X86_FEATURE_11AP);
239
230#ifdef CONFIG_X86_NUMAQ 240#ifdef CONFIG_X86_NUMAQ
231 numaq_tsc_disable(); 241 numaq_tsc_disable();
232#endif 242#endif
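The test migrated here from bugs.c packs family and model into one constant: the P54C is family 5, model 2, so (c->x86 << 8 | c->x86_model << 4) equals 0x520, and the affected steppings are every B step (x86_mask < 0x6) plus C2 (x86_mask == 0xb, i.e. 11). A standalone illustration of the encoding:

    #include <stdio.h>

    static int is_11ap_affected(int family, int model, int stepping)
    {
            return (family << 8 | model << 4) == 0x520 &&
                   (stepping < 0x6 || stepping == 0xb);
    }

    int main(void)
    {
            printf("family 5, model 2, stepping 11: %d\n",
                   is_11ap_affected(5, 2, 0xb));        /* 1: affected */
            printf("family 5, model 4, stepping 3:  %d\n",
                   is_11ap_affected(5, 4, 0x3));        /* 0: not P54C */
            return 0;
    }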
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 2c8afafa18e8..6b0a10b002f1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -489,7 +489,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
489 int sibling; 489 int sibling;
490 490
491 this_leaf = CPUID4_INFO_IDX(cpu, index); 491 this_leaf = CPUID4_INFO_IDX(cpu, index);
492 for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { 492 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
493 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 493 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
494 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 494 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
495 } 495 }
@@ -780,15 +780,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
780 } 780 }
781 kobject_put(per_cpu(cache_kobject, cpu)); 781 kobject_put(per_cpu(cache_kobject, cpu));
782 cpuid4_cache_sysfs_exit(cpu); 782 cpuid4_cache_sysfs_exit(cpu);
783 break; 783 return retval;
784 } 784 }
785 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 785 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
786 } 786 }
787 if (!retval) 787 cpu_set(cpu, cache_dev_map);
788 cpu_set(cpu, cache_dev_map);
789 788
790 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 789 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
791 return retval; 790 return 0;
792} 791}
793 792
794static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 793static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
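The cache_add_dev() change turns the failure break into an immediate return, so the success-only tail (cpu_set() into cache_dev_map and the final KOBJ_ADD uevent) no longer needs its if (!retval) guard. The shape of the fix, with hypothetical add_one()/unwind() helpers standing in for the kobject calls:

    for (i = 0; i < n; i++) {
            retval = add_one(i);            /* hypothetical */
            if (retval) {
                    unwind(i);              /* undo items 0..i-1 */
                    return retval;          /* was: break */
            }
    }
    mark_success();                         /* runs only on full success */
    return 0;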
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index c4a7ec31394c..726a5fcdf341 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
580 char __user *buf = ubuf; 580 char __user *buf = ubuf;
581 int i, err; 581 int i, err;
582 582
583 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); 583 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
584 if (!cpu_tsc) 584 if (!cpu_tsc)
585 return -ENOMEM; 585 return -ENOMEM;
586 586
@@ -759,13 +759,18 @@ static struct sysdev_class mce_sysclass = {
759}; 759};
760 760
761DEFINE_PER_CPU(struct sys_device, device_mce); 761DEFINE_PER_CPU(struct sys_device, device_mce);
762void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
762 763
763/* Why are there no generic functions for this? */ 764/* Why are there no generic functions for this? */
764#define ACCESSOR(name, var, start) \ 765#define ACCESSOR(name, var, start) \
765 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ 766 static ssize_t show_ ## name(struct sys_device *s, \
767 struct sysdev_attribute *attr, \
768 char *buf) { \
766 return sprintf(buf, "%lx\n", (unsigned long)var); \ 769 return sprintf(buf, "%lx\n", (unsigned long)var); \
767 } \ 770 } \
768 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ 771 static ssize_t set_ ## name(struct sys_device *s, \
772 struct sysdev_attribute *attr, \
773 const char *buf, size_t siz) { \
769 char *end; \ 774 char *end; \
770 unsigned long new = simple_strtoul(buf, &end, 0); \ 775 unsigned long new = simple_strtoul(buf, &end, 0); \
771 if (end == buf) return -EINVAL; \ 776 if (end == buf) return -EINVAL; \
@@ -786,14 +791,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart())
786ACCESSOR(bank4ctl,bank[4],mce_restart()) 791ACCESSOR(bank4ctl,bank[4],mce_restart())
787ACCESSOR(bank5ctl,bank[5],mce_restart()) 792ACCESSOR(bank5ctl,bank[5],mce_restart())
788 793
789static ssize_t show_trigger(struct sys_device *s, char *buf) 794static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
795 char *buf)
790{ 796{
791 strcpy(buf, trigger); 797 strcpy(buf, trigger);
792 strcat(buf, "\n"); 798 strcat(buf, "\n");
793 return strlen(trigger) + 1; 799 return strlen(trigger) + 1;
794} 800}
795 801
796static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) 802static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
803 const char *buf,size_t siz)
797{ 804{
798 char *p; 805 char *p;
799 int len; 806 int len;
@@ -806,12 +813,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
806} 813}
807 814
808static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 815static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
809ACCESSOR(tolerant,tolerant,) 816static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
810ACCESSOR(check_interval,check_interval,mce_restart()) 817ACCESSOR(check_interval,check_interval,mce_restart())
811static struct sysdev_attribute *mce_attributes[] = { 818static struct sysdev_attribute *mce_attributes[] = {
812 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, 819 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
813 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, 820 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
814 &attr_tolerant, &attr_check_interval, &attr_trigger, 821 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
815 NULL 822 NULL
816}; 823};
817 824
@@ -877,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
877 case CPU_ONLINE: 884 case CPU_ONLINE:
878 case CPU_ONLINE_FROZEN: 885 case CPU_ONLINE_FROZEN:
879 mce_create_device(cpu); 886 mce_create_device(cpu);
887 if (threshold_cpu_callback)
888 threshold_cpu_callback(action, cpu);
880 break; 889 break;
881 case CPU_DEAD: 890 case CPU_DEAD:
882 case CPU_DEAD_FROZEN: 891 case CPU_DEAD_FROZEN:
892 if (threshold_cpu_callback)
893 threshold_cpu_callback(action, cpu);
883 mce_remove_device(cpu); 894 mce_remove_device(cpu);
884 break; 895 break;
885 } 896 }
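Most of the churn above is mechanical: 2.6.27 adds a struct sysdev_attribute * parameter to sysdev show/store callbacks, so every accessor here (the ACCESSOR() macro, show_trigger()/set_trigger(), and therm_throt's generator below) is re-declared, and the handwritten tolerant accessor becomes a stock SYSDEV_INT_ATTR(), which is why the attribute table now lists &attr_tolerant.attr. The new prototypes, sketched with a hypothetical attribute name:

    static ssize_t show_foo(struct sys_device *s,
                            struct sysdev_attribute *attr, char *buf);
    static ssize_t store_foo(struct sys_device *s,
                             struct sysdev_attribute *attr,
                             const char *buf, size_t count);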
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 7c9a813e1193..5eb390a4b2e9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 if (err) 527 if (err)
528 goto out_free; 528 goto out_free;
529 529
530 for_each_cpu_mask(i, b->cpus) { 530 for_each_cpu_mask_nr(i, b->cpus) {
531 if (i == cpu) 531 if (i == cpu)
532 continue; 532 continue;
533 533
@@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
617#endif 617#endif
618 618
619 /* remove all sibling symlinks before unregistering */ 619 /* remove all sibling symlinks before unregistering */
620 for_each_cpu_mask(i, b->cpus) { 620 for_each_cpu_mask_nr(i, b->cpus) {
621 if (i == cpu) 621 if (i == cpu)
622 continue; 622 continue;
623 623
@@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
628 deallocate_threshold_block(cpu, bank); 628 deallocate_threshold_block(cpu, bank);
629 629
630free_out: 630free_out:
631 kobject_del(b->kobj);
631 kobject_put(b->kobj); 632 kobject_put(b->kobj);
632 kfree(b); 633 kfree(b);
633 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
@@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu)
645} 646}
646 647
647/* get notified when a cpu comes on/off */ 648/* get notified when a cpu comes on/off */
648static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, 649static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action,
649 unsigned long action, void *hcpu) 650 unsigned int cpu)
650{ 651{
651 /* cpu was unsigned int to begin with */
652 unsigned int cpu = (unsigned long)hcpu;
653
654 if (cpu >= NR_CPUS) 652 if (cpu >= NR_CPUS)
655 goto out; 653 return;
656 654
657 switch (action) { 655 switch (action) {
658 case CPU_ONLINE: 656 case CPU_ONLINE:
@@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
666 default: 664 default:
667 break; 665 break;
668 } 666 }
669 out:
670 return NOTIFY_OK;
671} 667}
672 668
673static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
674 .notifier_call = threshold_cpu_callback,
675};
676
677static __init int threshold_init_device(void) 669static __init int threshold_init_device(void)
678{ 670{
679 unsigned lcpu = 0; 671 unsigned lcpu = 0;
@@ -684,7 +676,7 @@ static __init int threshold_init_device(void)
684 if (err) 676 if (err)
685 return err; 677 return err;
686 } 678 }
687 register_hotcpu_notifier(&threshold_cpu_notifier); 679 threshold_cpu_callback = amd_64_threshold_cpu_callback;
688 return 0; 680 return 0;
689} 681}
690 682
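Replacing the private hotplug notifier with an assignment to threshold_cpu_callback fixes an ordering problem: the generic MCE notifier in the previous file now invokes the AMD hook immediately after mce_create_device() on CPU_ONLINE and immediately before mce_remove_device() on CPU_DEAD, instead of relying on unspecified notifier ordering. Reduced to its moving parts (schematic, not compilable as-is):

    /* shared hook, defined in mce_64.c */
    void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

    /* generic notifier (mce_64.c) */
    case CPU_ONLINE:
            mce_create_device(cpu);
            if (threshold_cpu_callback)
                    threshold_cpu_callback(action, cpu);
            break;

    /* vendor side (mce_amd_64.c) plugs in at init time */
    threshold_cpu_callback = amd_64_threshold_cpu_callback;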
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index eef001ad3bde..9b60fce09f75 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
102 /* The temperature transition interrupt handler setup */ 102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */ 103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ 104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write_around(APIC_LVTTHMR, h); 105 apic_write(APIC_LVTTHMR, h);
106 106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); 108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); 114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
115 115
116 l = apic_read(APIC_LVTTHMR); 116 l = apic_read(APIC_LVTTHMR);
117 apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); 118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119 119
120 /* enable thermal throttle processing */ 120 /* enable thermal throttle processing */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1f4cc48c14c6..d5ae2243f0b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0);
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \
38 char *buf) \ 39 char *buf) \
39{ \ 40{ \
40 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 509bd3d9eacd..cb7d3b6a80eb 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -379,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
379 unsigned long *size, mtrr_type *type) 379 unsigned long *size, mtrr_type *type)
380{ 380{
381 unsigned int mask_lo, mask_hi, base_lo, base_hi; 381 unsigned int mask_lo, mask_hi, base_lo, base_hi;
382 unsigned int tmp, hi;
382 383
383 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 384 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
384 if ((mask_lo & 0x800) == 0) { 385 if ((mask_lo & 0x800) == 0) {
@@ -392,8 +393,23 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
392 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); 393 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
393 394
394 /* Work out the shifted address mask. */ 395 /* Work out the shifted address mask. */
395 mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) 396 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
396 | mask_lo >> PAGE_SHIFT; 397 mask_lo = size_or_mask | tmp;
398 /* Expand tmp with high bits to all 1s */
399 hi = fls(tmp);
400 if (hi > 0) {
401 tmp |= ~((1<<(hi - 1)) - 1);
402
403 if (tmp != mask_lo) {
404 static int once = 1;
405
406 if (once) {
407 printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
408 once = 0;
409 }
410 mask_lo = tmp;
411 }
412 }
397 413
398 /* This works correctly if size is a power of two, i.e. a 414 /* This works correctly if size is a power of two, i.e. a
399 contiguous range. */ 415 contiguous range. */
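The new mask-fixup code takes the highest set bit of the BIOS-programmed MTRR mask and forces every bit from there upward to 1, repairing masks written for fewer physical-address bits than the CPU actually reports. A standalone worked example, using a compiler builtin where the kernel uses fls():

    #include <stdio.h>

    /* like the kernel's fls(): 1-based index of the highest set bit, 0 for 0 */
    static int fls32(unsigned int x)
    {
            return x ? 32 - __builtin_clz(x) : 0;
    }

    int main(void)
    {
            unsigned int tmp = 0x00ff0000;  /* mask missing its top bits */
            int hi = fls32(tmp);            /* 24 */

            if (hi > 0)
                    tmp |= ~((1u << (hi - 1)) - 1); /* set bits 23..31 */

            printf("%#010x\n", tmp);        /* prints 0xffff0000 */
            return 0;
    }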
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6f23969c8faf..b117d7f8a564 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -1496,11 +1496,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1496 1496
1497 /* kvm/qemu doesn't have mtrr set right, don't trim them all */ 1497 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1498 if (!highest_pfn) { 1498 if (!highest_pfn) {
1499 if (!kvm_para_available()) { 1499 WARN(!kvm_para_available(), KERN_WARNING
1500 printk(KERN_WARNING
1501 "WARNING: strange, CPU MTRRs all blank?\n"); 1500 "WARNING: strange, CPU MTRRs all blank?\n");
1502 WARN_ON(1);
1503 }
1504 return 0; 1501 return 0;
1505 } 1502 }
1506 1503
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 6d4bdc02388a..05cc22dbd4ff 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -250,7 +250,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr,
250 250
251 do_div(count, nmi_hz); 251 do_div(count, nmi_hz);
252 if(descr) 252 if(descr)
253 Dprintk("setting %s to -0x%08Lx\n", descr, count); 253 pr_debug("setting %s to -0x%08Lx\n", descr, count);
254 wrmsrl(perfctr_msr, 0 - count); 254 wrmsrl(perfctr_msr, 0 - count);
255} 255}
256 256
@@ -261,7 +261,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr,
261 261
262 do_div(count, nmi_hz); 262 do_div(count, nmi_hz);
263 if(descr) 263 if(descr)
264 Dprintk("setting %s to -0x%08Lx\n", descr, count); 264 pr_debug("setting %s to -0x%08Lx\n", descr, count);
265 wrmsr(perfctr_msr, (u32)(-count), 0); 265 wrmsr(perfctr_msr, (u32)(-count), 0);
266} 266}
267 267
@@ -478,7 +478,13 @@ static int setup_p4_watchdog(unsigned nmi_hz)
478 perfctr_msr = MSR_P4_IQ_PERFCTR1; 478 perfctr_msr = MSR_P4_IQ_PERFCTR1;
479 evntsel_msr = MSR_P4_CRU_ESCR0; 479 evntsel_msr = MSR_P4_CRU_ESCR0;
480 cccr_msr = MSR_P4_IQ_CCCR1; 480 cccr_msr = MSR_P4_IQ_CCCR1;
481 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); 481
482 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
483 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
484 cccr_val = P4_CCCR_OVF_PMI0;
485 else
486 cccr_val = P4_CCCR_OVF_PMI1;
487 cccr_val |= P4_CCCR_ESCR_SELECT(4);
482 } 488 }
483 489
484 evntsel = P4_ESCR_EVENT_SELECT(0x3F) 490 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 0d0d9057e7c0..a26c480b9491 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,7 +160,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
160{ 160{
161 if (*pos == 0) /* just in case, cpu 0 is not the first */ 161 if (*pos == 0) /* just in case, cpu 0 is not the first */
162 *pos = first_cpu(cpu_online_map); 162 *pos = first_cpu(cpu_online_map);
163 if ((*pos) < NR_CPUS && cpu_online(*pos)) 163 if ((*pos) < nr_cpu_ids && cpu_online(*pos))
164 return &cpu_data(*pos); 164 return &cpu_data(*pos);
165 return NULL; 165 return NULL;
166} 166}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2de5fa2bbf77..8e9cd6a8ec12 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -89,6 +89,8 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
89 struct cpuid_regs cmd; 89 struct cpuid_regs cmd;
90 int cpu = iminor(file->f_path.dentry->d_inode); 90 int cpu = iminor(file->f_path.dentry->d_inode);
91 u64 pos = *ppos; 91 u64 pos = *ppos;
92 ssize_t bytes = 0;
93 int err = 0;
92 94
93 if (count % 16) 95 if (count % 16)
94 return -EINVAL; /* Invalid chunk size */ 96 return -EINVAL; /* Invalid chunk size */
@@ -96,14 +98,19 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
96 for (; count; count -= 16) { 98 for (; count; count -= 16) {
97 cmd.eax = pos; 99 cmd.eax = pos;
98 cmd.ecx = pos >> 32; 100 cmd.ecx = pos >> 32;
99 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); 101 err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
100 if (copy_to_user(tmp, &cmd, 16)) 102 if (err)
101 return -EFAULT; 103 break;
104 if (copy_to_user(tmp, &cmd, 16)) {
105 err = -EFAULT;
106 break;
107 }
102 tmp += 16; 108 tmp += 16;
109 bytes += 16;
103 *ppos = ++pos; 110 *ppos = ++pos;
104 } 111 }
105 112
106 return tmp - buf; 113 return bytes ? bytes : err;
107} 114}
108 115
109static int cpuid_open(struct inode *inode, struct file *file) 116static int cpuid_open(struct inode *inode, struct file *file)
@@ -141,8 +148,8 @@ static __cpuinit int cpuid_device_create(int cpu)
141{ 148{
142 struct device *dev; 149 struct device *dev;
143 150
144 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), 151 dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu),
145 "cpu%d", cpu); 152 NULL, "cpu%d", cpu);
146 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 153 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
147} 154}
148 155
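cpuid_read() now copes with two realities at once: smp_call_function_single() can fail (for instance when the target CPU is offline), and the read() contract says a call that has already transferred data must report the byte count and leave the error for the next call; only a read that moved nothing returns the error itself. In outline, with a hypothetical produce_record() helper standing in for the cross-CPU call:

    ssize_t bytes = 0;
    int err = 0;

    while (count) {
            err = produce_record(&rec);     /* hypothetical helper */
            if (err)
                    break;
            if (copy_to_user(buf + bytes, &rec, 16)) {
                    err = -EFAULT;
                    break;
            }
            bytes += 16;
            count -= 16;
    }
    return bytes ? bytes : err;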
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 28c29180b380..66e48aa2dd1b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
 	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
 		count++;
 
-	printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count);
+	printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+	       count, start, end);
 	for (i = 0; i < count; i++) {
 		struct early_res *r = &early_res[i];
 		printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
@@ -1202,7 +1203,7 @@ static int __init parse_memmap_opt(char *p)
 	if (!p)
 		return -EINVAL;
 
-	if (!strcmp(p, "exactmap")) {
+	if (!strncmp(p, "exactmap", 8)) {
 #ifdef CONFIG_CRASH_DUMP
 	/*
 	 * If we are doing a crash dump, we still need to know
@@ -1298,11 +1299,6 @@ void __init e820_reserve_resources(void)
 	}
 }
 
-/*
- * Non-standard memory setup can be specified via this quirk:
- */
-char * (*arch_memory_setup_quirk)(void);
-
 char *__init default_machine_specific_memory_setup(void)
 {
 	char *who = "BIOS-e820";
@@ -1343,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void)
 
 char *__init __attribute__((weak)) machine_specific_memory_setup(void)
 {
-	if (arch_memory_setup_quirk) {
-		char *who = arch_memory_setup_quirk();
+	if (x86_quirks->arch_memory_setup) {
+		char *who = x86_quirks->arch_memory_setup();
 
 		if (who)
 			return who;
@@ -1367,24 +1363,3 @@ void __init setup_memory_map(void)
 	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
 	e820_print_map(who);
 }
-
-#ifdef CONFIG_X86_64
-int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
-{
-	int i;
-
-	if (slot < 0 || slot >= e820.nr_map)
-		return -1;
-	for (i = slot; i < e820.nr_map; i++) {
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		break;
-	}
-	if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
-		return -1;
-	*addr = e820.map[i].addr;
-	*size = min_t(u64, e820.map[i].size + e820.map[i].addr,
-		max_pfn << PAGE_SHIFT) - *addr;
-	return i + 1;
-}
-#endif
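Two notes on this hunk. The strncmp("exactmap", 8) change makes the keyword a prefix match, so "exactmap" is still recognized even when the parser sees trailing text attached to it; the typical kdump-style usage looks like "memmap=exactmap memmap=640K@0 memmap=...". And the removed arch_memory_setup_quirk global is replaced by the x86_quirks hook table. A hedged sketch of how a subarch would now supply that quirk (my_memory_setup and my_quirks are hypothetical names; the field itself is the one the diff tests):

static char *__init my_memory_setup(void)
{
	/* build a custom e820 table here instead of trusting the BIOS */
	return "MY-BOARD-quirk";	/* label printed with the RAM map */
}

static struct x86_quirks my_quirks __initdata = {
	.arch_memory_setup = my_memory_setup,
};

/* early boot code would then point x86_quirks at &my_quirks */
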
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a0e11c0cc872..4353cf5e6fac 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -16,10 +16,7 @@
 #include <asm/dma.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
-
-#ifdef CONFIG_GART_IOMMU
-#include <asm/gart.h>
-#endif
+#include <asm/iommu.h>
 
 static void __init fix_hypertransport_config(int num, int slot, int func)
 {
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index 4b63c8e1f13b..5cab48ee61a4 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -53,7 +53,7 @@ void efi_call_phys_prelog(void)
 	 * directory. If I have PAE, I just need to duplicate one entry in
 	 * page directory.
 	 */
-	cr4 = read_cr4();
+	cr4 = read_cr4_safe();
 
 	if (cr4 & X86_CR4_PAE) {
 		efi_bak_pg_dir_pointer[0].pgd =
@@ -91,7 +91,7 @@ void efi_call_phys_epilog(void)
 	gdt_descr.size = GDT_SIZE - 1;
 	load_gdt(&gdt_descr);
 
-	cr4 = read_cr4();
+	cr4 = read_cr4_safe();
 
 	if (cr4 & X86_CR4_PAE) {
 		swapper_pg_dir[pgd_index(0)].pgd =
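The switch to read_cr4_safe() matters because "mov %cr4" faults on 32-bit CPUs that predate CR4; the safe variant absorbs the fault and yields 0, which also makes the PAE test above correctly fall to the non-PAE path. Roughly how that tolerance works on x86-32 (a sketch under the assumption of the usual exception-table fixup pattern, not a verbatim copy of the kernel's helper):

static inline unsigned long cr4_read_safe(void)
{
	unsigned long val = 0;

	/* if the mov faults, the extable entry resumes at 2: with val == 0 */
	asm volatile("1: mov %%cr4, %0\n"
		     "2:\n"
		     _ASM_EXTABLE(1b, 2b)
		     : "+r" (val));
	return val;
}
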
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 6bc07f0f1202..109792bc7cfa 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -54,6 +54,16 @@
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
 
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386	(EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE	   0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+#define sysenter_audit	syscall_trace_entry
+#define sysexit_audit	syscall_exit_work
+#endif
+
 /*
  * We use macros for low-level operations which need to be overridden
  * for paravirtualization. The following will never clobber any registers:
@@ -332,8 +342,9 @@ sysenter_past_esp:
 	GET_THREAD_INFO(%ebp)
 
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
-	jnz syscall_trace_entry
+	testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+	jnz sysenter_audit
+sysenter_do_call:
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
@@ -343,7 +354,8 @@ sysenter_past_esp:
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
-	jne syscall_exit_work
+	jne sysexit_audit
+sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
 	movl PT_EIP(%esp), %edx
 	movl PT_OLDESP(%esp), %ecx
@@ -351,6 +363,45 @@ sysenter_past_esp:
 	TRACE_IRQS_ON
 1:	mov PT_FS(%esp), %fs
 	ENABLE_INTERRUPTS_SYSEXIT
+
+#ifdef CONFIG_AUDITSYSCALL
+sysenter_audit:
+	testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	jnz syscall_trace_entry
+	addl $4,%esp
+	CFI_ADJUST_CFA_OFFSET -4
+	/* %esi already in 8(%esp)	   6th arg: 4th syscall arg */
+	/* %edx already in 4(%esp)	   5th arg: 3rd syscall arg */
+	/* %ecx already in 0(%esp)	   4th arg: 2nd syscall arg */
+	movl %ebx,%ecx			/* 3rd arg: 1st syscall arg */
+	movl %eax,%edx			/* 2nd arg: syscall number */
+	movl $AUDIT_ARCH_I386,%eax	/* 1st arg: audit arch */
+	call audit_syscall_entry
+	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	movl PT_EAX(%esp),%eax		/* reload syscall number */
+	jmp sysenter_do_call
+
+sysexit_audit:
+	testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+	jne syscall_exit_work
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_ANY)
+	movl %eax,%edx		/* second arg, syscall return value */
+	cmpl $0,%eax		/* is it < 0? */
+	setl %al		/* 1 if so, 0 if not */
+	movzbl %al,%eax		/* zero-extend that */
+	inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+	call audit_syscall_exit
+	DISABLE_INTERRUPTS(CLBR_ANY)
+	TRACE_IRQS_OFF
+	movl TI_flags(%ebp), %ecx
+	testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+	jne syscall_exit_work
+	movl PT_EAX(%esp),%eax	/* reload syscall return value */
+	jmp sysenter_exit
+#endif
+
 	CFI_ENDPROC
 .pushsection .fixup,"ax"
 2:	movl $0,PT_FS(%esp)
@@ -370,7 +421,7 @@ ENTRY(system_call)
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation / emulation
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -383,10 +434,6 @@ syscall_exit:
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
-	testl $X86_EFLAGS_TF,PT_EFLAGS(%esp)	# If tracing set singlestep flag on exit
-	jz no_singlestep
-	orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx	# current->work
 	jne syscall_exit_work
@@ -514,12 +561,8 @@ END(work_pending)
 syscall_trace_entry:
 	movl $-ENOSYS,PT_EAX(%esp)
 	movl %esp, %eax
-	xorl %edx,%edx
-	call do_syscall_trace
-	cmpl $0, %eax
-	jne resume_userspace		# ret != 0 -> running under PTRACE_SYSEMU,
-					# so must skip actual syscall
-	movl PT_ORIG_EAX(%esp), %eax
+	call syscall_trace_enter
+	/* What it returned is what we'll actually use. */
 	cmpl $(nr_syscalls), %eax
 	jnae syscall_call
 	jmp syscall_exit
@@ -528,14 +571,13 @@ END(syscall_trace_entry)
 	# perform syscall exit tracing
 	ALIGN
 syscall_exit_work:
-	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
+	testb $_TIF_WORK_SYSCALL_EXIT, %cl
 	jz work_pending
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)	# could let do_syscall_trace() call
+	ENABLE_INTERRUPTS(CLBR_ANY)	# could let syscall_trace_leave() call
 					# schedule() instead
 	movl %esp, %eax
-	movl $1, %edx
-	call do_syscall_trace
+	call syscall_trace_leave
 	jmp resume_userspace
 END(syscall_exit_work)
 	CFI_ENDPROC
@@ -1024,6 +1066,7 @@ ENDPROC(kernel_thread_helper)
 ENTRY(xen_sysenter_target)
 	RING0_INT_FRAME
 	addl $5*4, %esp		/* remove xen-provided frame */
+	CFI_ADJUST_CFA_OFFSET -5*4
 	jmp sysenter_past_esp
 	CFI_ENDPROC
 
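The register shuffling in sysenter_audit/sysexit_audit lines up the live sysenter registers with the C audit entry points. As best I can tell from this kernel series, the prototypes being targeted are roughly the following (shown for orientation; the exit-value computation restates what the setl/inc sequence does):

void audit_syscall_entry(int arch, int major,
			 unsigned long a1, unsigned long a2,
			 unsigned long a3, unsigned long a4);
void audit_syscall_exit(int valid, long return_code);

/* sysexit_audit in C terms: derive AUDITSC_SUCCESS(1)/AUDITSC_FAILURE(2)
 * from the sign of the syscall return value in %eax. */
int valid = (ret < 0) ? 2 /* AUDITSC_FAILURE */ : 1 /* AUDITSC_SUCCESS */;
audit_syscall_exit(valid, ret);
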
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ae63e584c340..89434d439605 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -53,6 +53,12 @@
 #include <asm/paravirt.h>
 #include <asm/ftrace.h>
 
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT 0x80000000
+#define __AUDIT_ARCH_LE	   0x40000000
+
 	.code64
 
 #ifdef CONFIG_FTRACE
@@ -349,9 +355,9 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-		TI_flags(%rcx)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
 	jnz tracesys
+system_call_fastpath:
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
 	movq %r10,%rcx
@@ -403,16 +409,16 @@ sysret_careful:
 sysret_signal:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	testl $_TIF_DO_NOTIFY_MASK,%edx
-	jz 1f
-
-	/* Really a signal */
+#ifdef CONFIG_AUDITSYSCALL
+	bt $TIF_SYSCALL_AUDIT,%edx
+	jc sysret_audit
+#endif
 	/* edx: work flags (arg3) */
 	leaq do_notify_resume(%rip),%rax
 	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
 	xorl %esi,%esi # oldset -> arg2
 	call ptregscall_common
-1:	movl $_TIF_WORK_MASK,%edi
+	movl $_TIF_WORK_MASK,%edi
 	/* Use IRET because user could have changed frame. This
 	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
 	DISABLE_INTERRUPTS(CLBR_NONE)
@@ -423,14 +429,56 @@ badsys:
 	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
 	jmp ret_from_sys_call
 
+#ifdef CONFIG_AUDITSYSCALL
+	/*
+	 * Fast path for syscall audit without full syscall trace.
+	 * We just call audit_syscall_entry() directly, and then
+	 * jump back to the normal fast path.
+	 */
+auditsys:
+	movq %r10,%r9			/* 6th arg: 4th syscall arg */
+	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
+	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
+	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
+	movq %rax,%rsi			/* 2nd arg: syscall number */
+	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
+	call audit_syscall_entry
+	LOAD_ARGS 0		/* reload call-clobbered registers */
+	jmp system_call_fastpath
+
+	/*
+	 * Return fast path for syscall audit. Call audit_syscall_exit()
+	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
+	 * masked off.
+	 */
+sysret_audit:
+	movq %rax,%rsi		/* second arg, syscall return value */
+	cmpq $0,%rax		/* is it < 0? */
+	setl %al		/* 1 if so, 0 if not */
+	movzbl %al,%edi		/* zero-extend that into %edi */
+	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+	call audit_syscall_exit
+	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
+	jmp sysret_check
+#endif	/* CONFIG_AUDITSYSCALL */
+
 	/* Do syscall tracing */
 tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+	jz auditsys
+#endif
 	SAVE_REST
 	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
 	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp,%rdi
 	call syscall_trace_enter
-	LOAD_ARGS ARGOFFSET	/* reload args from stack in case ptrace changed it */
+	/*
+	 * Reload arg registers from stack in case ptrace changed them.
+	 * We don't reload %rax because syscall_trace_enter() returned
+	 * the value it wants us to use in the table lookup.
+	 */
+	LOAD_ARGS ARGOFFSET, 1
 	RESTORE_REST
 	cmpq $__NR_syscall_max,%rax
 	ja int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
@@ -444,6 +492,7 @@ tracesys:
 * Has correct top of stack, but partial stack frame.
 */
 	.globl int_ret_from_sys_call
+	.globl int_with_check
 int_ret_from_sys_call:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
@@ -483,7 +532,7 @@ int_very_careful:
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_REST
 	/* Check for syscall exit trace */
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+	testl $_TIF_WORK_SYSCALL_EXIT,%edx
 	jz int_signal
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET 8
@@ -491,7 +540,7 @@ int_very_careful:
 	call syscall_trace_leave
 	popq %rdi
 	CFI_ADJUST_CFA_OFFSET -8
-	andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
+	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
 	jmp int_restore_rest
 
 int_signal:
@@ -1189,6 +1238,7 @@ END(device_not_available)
 	/* runs on exception stack */
 KPROBE_ENTRY(debug)
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1248,7 @@ KPROBE_END(debug)
 	/* runs on exception stack */
 KPROBE_ENTRY(nmi)
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $-1
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_nmi, 0, 0
@@ -1211,6 +1262,7 @@ KPROBE_END(nmi)
 
 KPROBE_ENTRY(int3)
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1289,7 @@ END(coprocessor_segment_overrun)
 	/* runs on exception stack */
 ENTRY(double_fault)
 	XCPT_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	paranoidentry do_double_fault
 	jmp paranoid_exit1
 	CFI_ENDPROC
@@ -1253,6 +1306,7 @@ END(segment_not_present)
 	/* runs on exception stack */
 ENTRY(stack_segment)
 	XCPT_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	paranoidentry do_stack_segment
 	jmp paranoid_exit1
 	CFI_ENDPROC
@@ -1278,6 +1332,7 @@ END(spurious_interrupt_bug)
 	/* runs on exception stack */
 ENTRY(machine_check)
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_machine_check
@@ -1312,3 +1367,103 @@ KPROBE_ENTRY(ignore_sysret)
 	sysret
 	CFI_ENDPROC
 ENDPROC(ignore_sysret)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+	zeroentry xen_do_hypervisor_callback
+END(xen_hypervisor_callback)
+
+/*
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+*/
+ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
+	CFI_STARTPROC
+/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+   see the correct pointer to the pt_regs */
+	movq %rdi, %rsp            # we don't return, adjust the stack frame
+	CFI_ENDPROC
+	CFI_DEFAULT_STACK
+11:	incl %gs:pda_irqcount
+	movq %rsp,%rbp
+	CFI_DEF_CFA_REGISTER rbp
+	cmovzq %gs:pda_irqstackptr,%rsp
+	pushq %rbp			# backlink for old unwinder
+	call xen_evtchn_do_upcall
+	popq %rsp
+	CFI_DEF_CFA_REGISTER rsp
+	decl %gs:pda_irqcount
+	jmp  error_exit
+	CFI_ENDPROC
+END(do_hypervisor_callback)
+
+/*
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we in category 1.
+*/
+ENTRY(xen_failsafe_callback)
+	framesz = (RIP-0x30)	/* workaround buggy gas */
+	_frame framesz
+	CFI_REL_OFFSET rcx, 0
+	CFI_REL_OFFSET r11, 8
+	movw %ds,%cx
+	cmpw %cx,0x10(%rsp)
+	CFI_REMEMBER_STATE
+	jne 1f
+	movw %es,%cx
+	cmpw %cx,0x18(%rsp)
+	jne 1f
+	movw %fs,%cx
+	cmpw %cx,0x20(%rsp)
+	jne 1f
+	movw %gs,%cx
+	cmpw %cx,0x28(%rsp)
+	jne 1f
+	/* All segments match their saved values => Category 2 (Bad IRET). */
+	movq (%rsp),%rcx
+	CFI_RESTORE rcx
+	movq 8(%rsp),%r11
+	CFI_RESTORE r11
+	addq $0x30,%rsp
+	CFI_ADJUST_CFA_OFFSET -0x30
+	pushq $0
+	CFI_ADJUST_CFA_OFFSET 8
+	pushq %r11
+	CFI_ADJUST_CFA_OFFSET 8
+	pushq %rcx
+	CFI_ADJUST_CFA_OFFSET 8
+	jmp general_protection
+	CFI_RESTORE_STATE
+1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+	movq (%rsp),%rcx
+	CFI_RESTORE rcx
+	movq 8(%rsp),%r11
+	CFI_RESTORE r11
+	addq $0x30,%rsp
+	CFI_ADJUST_CFA_OFFSET -0x30
+	pushq $0
+	CFI_ADJUST_CFA_OFFSET 8
+	SAVE_ALL
+	jmp error_exit
+	CFI_ENDPROC
+END(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
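The 64-bit auditsys block is the same idea as the 32-bit one, but the marshalling follows the x86-64 SysV argument registers. A C-terms restatement of what the moves set up, assuming the audit_syscall_entry() prototype noted for the 32-bit side:

/*
 * auditsys, unrolled (argument registers rdi, rsi, rdx, rcx, r8, r9):
 *
 *   audit_syscall_entry(AUDIT_ARCH_X86_64,  // %edi
 *                       nr,                 // %rsi <- %rax (syscall number)
 *                       a1, a2, a3, a4);    // %rdx,%rcx,%r8,%r9 <-
 *                                           //   %rdi,%rsi,%rdx,%r10
 *
 * LOAD_ARGS 0 then restores the call-clobbered registers so the code
 * can fall back into system_call_fastpath as if nothing happened.
 */
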
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 1fa8be5bd217..eaff0bbb1444 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -99,3 +99,4 @@ int is_uv_system(void)
 {
 	return uv_system_type != UV_NONE;
 }
+EXPORT_SYMBOL_GPL(is_uv_system);
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 1a9c68845ee8..786548a62d38 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -168,7 +168,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
 	 * May as well be the first.
 	 */
 	cpu = first_cpu(cpumask);
-	if ((unsigned)cpu < NR_CPUS)
+	if ((unsigned)cpu < nr_cpu_ids)
 		return per_cpu(x86_cpu_to_apicid, cpu);
 	else
 		return BAD_APICID;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 711f11c30b06..bfa837cb16be 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -24,6 +24,7 @@
 #include <asm/pgtable.h>
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
 
 DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
 short uv_possible_blades;
 EXPORT_SYMBOL_GPL(uv_possible_blades);
 
+unsigned long sn_rtc_cycles_per_second;
+EXPORT_SYMBOL(sn_rtc_cycles_per_second);
+
 /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
 
 static cpumask_t uv_target_cpus(void)
@@ -94,7 +98,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector)
 {
 	unsigned int cpu;
 
-	for (cpu = 0; cpu < NR_CPUS; ++cpu)
+	for_each_possible_cpu(cpu)
 		if (cpu_isset(cpu, mask))
 			uv_send_IPI_one(cpu, vector);
 }
@@ -128,7 +132,7 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
 	 * May as well be the first.
 	 */
 	cpu = first_cpu(cpumask);
-	if ((unsigned)cpu < NR_CPUS)
+	if ((unsigned)cpu < nr_cpu_ids)
 		return per_cpu(x86_cpu_to_apicid, cpu);
 	else
 		return BAD_APICID;
@@ -218,7 +222,7 @@ static __init void map_low_mmrs(void)
 
 enum map_type {map_wb, map_uc};
 
-static void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
+static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
 {
 	unsigned long bytes, paddr;
 
@@ -272,7 +276,26 @@ static __init void map_mmioh_high(int max_pnode)
 	map_high("MMIOH", mmioh.s.base, shift, map_uc);
 }
 
-static __init void uv_system_init(void)
+static __init void uv_rtc_init(void)
+{
+	long status, ticks_per_sec, drift;
+
+	status =
+	    x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
+				&drift);
+	if (status != 0 || ticks_per_sec < 100000) {
+		printk(KERN_WARNING
+			"unable to determine platform RTC clock frequency, "
+			"guessing.\n");
+		/* BIOS gives wrong value for clock freq. so guess */
+		sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
+	} else
+		sn_rtc_cycles_per_second = ticks_per_sec;
+}
+
+static bool uv_system_inited;
+
+void __init uv_system_init(void)
 {
 	union uvh_si_addr_map_config_u m_n_config;
 	union uvh_node_id_u node_id;
@@ -326,6 +349,8 @@ static __init void uv_system_init(void)
 	gnode_upper = (((unsigned long)node_id.s.node_id) &
 		       ~((1 << n_val) - 1)) << m_val;
 
+	uv_rtc_init();
+
 	for_each_present_cpu(cpu) {
 		nid = cpu_to_node(cpu);
 		pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
@@ -360,6 +385,7 @@ static __init void uv_system_init(void)
 	map_mmr_high(max_pnode);
 	map_config_high(max_pnode);
 	map_mmioh_high(max_pnode);
+	uv_system_inited = true;
 }
 
 /*
@@ -368,8 +394,7 @@ static __init void uv_system_init(void)
 */
 void __cpuinit uv_cpu_init(void)
 {
-	if (!uv_node_to_blade)
-		uv_system_init();
+	BUG_ON(!uv_system_inited);
 
 	uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
 
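The fallback constant in uv_rtc_init() appears to encode a guessed RTC period: reading 1000000000000UL as picoseconds per second and 30000UL as a 30 ns period, the guess works out to about 33.3 MHz. A quick unpacking of the arithmetic:

/* 1e12 ps/sec divided by a 30000 ps (30 ns) period:
 * 1000000000000 / 30000 = 33333333 cycles/sec (~33.3 MHz). */
unsigned long guess = 1000000000000UL / 30000UL;	/* == 33333333 */
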
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c97819829146..9bfc4d72fb2e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
 static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
 #endif
 
+void __init x86_64_init_pda(void)
+{
+	_cpu_pda = __cpu_pda;
+	cpu_pda(0) = &_boot_cpu_pda;
+	pda_init(0);
+}
+
 static void __init zap_identity_mappings(void)
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
@@ -81,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
 	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
 			(__START_KERNEL & PGDIR_MASK)));
+	BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
@@ -102,9 +110,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
 
 	early_printk("Kernel alive\n");
 
-	_cpu_pda = __cpu_pda;
-	cpu_pda(0) = &_boot_cpu_pda;
-	pda_init(0);
+	x86_64_init_pda();
 
 	early_printk("Kernel really alive\n");
 
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f67e93441caf..a7010c3a377a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -456,9 +456,6 @@ is386:	movl $2,%ecx		# set MP
 1:
 #endif /* CONFIG_SMP */
 	jmp *(initial_code)
-.align 4
-ENTRY(initial_code)
-	.long i386_start_kernel
 
 /*
 * We depend on ET to be correct. This checks for 287/387.
@@ -601,6 +598,11 @@ ignore_int:
 #endif
 	iret
 
+.section .cpuinit.data,"wa"
+.align 4
+ENTRY(initial_code)
+	.long i386_start_kernel
+
 .section .text
 /*
 * Real beginning of normal "text" segment
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b07ac7b217cb..db3280afe886 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -407,6 +407,7 @@ ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
 	.quad   0x0000000000000000
 
+#include "../../x86/xen/xen-head.S"
 
 	.section .bss, "aw", @nobits
 	.align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 0ea6a19bfdfe..73deaffadd03 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -210,8 +210,8 @@ static void hpet_legacy_clockevent_register(void)
 	/* Calculate the min / max delta */
 	hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
 							   &hpet_clockevent);
-	hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
-							   &hpet_clockevent);
+	/* 5 usec minimum reprogramming delta. */
+	hpet_clockevent.min_delta_ns = 5000;
 
 	/*
 	 * Start hpet with the boot cpu mask and make it
@@ -270,15 +270,22 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode,
 }
 
 static int hpet_legacy_next_event(unsigned long delta,
 				  struct clock_event_device *evt)
 {
-	unsigned long cnt;
+	u32 cnt;
 
 	cnt = hpet_readl(HPET_COUNTER);
-	cnt += delta;
+	cnt += (u32) delta;
 	hpet_writel(cnt, HPET_T0_CMP);
 
-	return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0;
+	/*
+	 * We need to read back the CMP register to make sure that
+	 * what we wrote hit the chip before we compare it to the
+	 * counter.
+	 */
+	WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt);
+
+	return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
 }
 
 /*
@@ -359,6 +366,7 @@ static int hpet_clocksource_register(void)
 int __init hpet_enable(void)
 {
 	unsigned long id;
+	int i;
 
 	if (!is_hpet_capable())
 		return 0;
@@ -369,6 +377,29 @@ int __init hpet_enable(void)
 	 * Read the period and check for a sane value:
 	 */
 	hpet_period = hpet_readl(HPET_PERIOD);
+
+	/*
+	 * AMD SB700 based systems with spread spectrum enabled use a
+	 * SMM based HPET emulation to provide proper frequency
+	 * setting. The SMM code is initialized with the first HPET
+	 * register access and takes some time to complete. During
+	 * this time the config register reads 0xffffffff. We check
+	 * for max. 1000 loops whether the config register reads a non
+	 * 0xffffffff value to make sure that HPET is up and running
+	 * before we go further. A counting loop is safe, as the HPET
+	 * access takes thousands of CPU cycles. On non SB700 based
+	 * machines this check is only done once and has no side
+	 * effects.
+	 */
+	for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
+		if (i == 1000) {
+			printk(KERN_WARNING
+			       "HPET config register value = 0xFFFFFFFF. "
+			       "Disabling HPET\n");
+			goto out_nohpet;
+		}
+	}
+
 	if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
 		goto out_nohpet;
 
@@ -468,7 +499,7 @@ void hpet_disable(void)
 #define RTC_NUM_INTS		1
 
 static unsigned long hpet_rtc_flags;
-static unsigned long hpet_prev_update_sec;
+static int hpet_prev_update_sec;
 static struct rtc_time hpet_alarm_time;
 static unsigned long hpet_pie_count;
 static unsigned long hpet_t1_cmp;
@@ -575,6 +606,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
 
 	hpet_rtc_flags |= bit_mask;
 
+	if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE))
+		hpet_prev_update_sec = -1;
+
 	if (!oldbits)
 		hpet_rtc_timer_init();
 
@@ -652,7 +686,7 @@ static void hpet_rtc_timer_reinit(void)
 		if (hpet_rtc_flags & RTC_PIE)
 			hpet_pie_count += lost_ints;
 		if (printk_ratelimit())
-			printk(KERN_WARNING "rtc: lost %d interrupts\n",
+			printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
 			       lost_ints);
 	}
 }
@@ -670,7 +704,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
 
 	if (hpet_rtc_flags & RTC_UIE &&
 	    curr_time.tm_sec != hpet_prev_update_sec) {
-		rtc_int_flag = RTC_UF;
+		if (hpet_prev_update_sec >= 0)
+			rtc_int_flag = RTC_UF;
 		hpet_prev_update_sec = curr_time.tm_sec;
 	}
 
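The new -ETIME test in hpet_legacy_next_event() is a wrap-safe "has the free-running 32-bit counter already passed the comparator?" check. Subtracting in u32 and reinterpreting the difference as s32 yields the signed distance modulo 2^32, so the test stays correct across counter wrap as long as the distance is under 2^31 ticks. The idiom, isolated:

typedef unsigned int u32;
typedef signed int s32;

/* True if 'now' is at or past 'cmp', modulo 2^32. */
static inline int counter_passed(u32 now, u32 cmp)
{
	return (s32)(now - cmp) >= 0;
}

The old "(long)(... - cnt) > 0" form was wrong on 64-bit, where the unsigned 32-bit difference never becomes negative when widened to long.
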
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 558abf4c796a..09cddb57bec4 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -57,7 +57,7 @@ atomic_t irq_mis_count;
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
 static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
+DEFINE_SPINLOCK(vector_lock);
 
 int timer_through_8259 __initdata;
 
@@ -756,7 +756,7 @@ void send_IPI_self(int vector)
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
-	apic_write_around(APIC_ICR, cfg);
+	apic_write(APIC_ICR, cfg);
 }
 #endif /* !CONFIG_SMP */
 
@@ -1209,10 +1209,6 @@ static int assign_irq_vector(int irq)
 	return vector;
 }
 
-void setup_vector_irq(int cpu)
-{
-}
-
 static struct irq_chip ioapic_chip;
 
 #define IOAPIC_AUTO	-1
@@ -2030,7 +2026,7 @@ static void mask_lapic_irq(unsigned int irq)
 	unsigned long v;
 
 	v = apic_read(APIC_LVT0);
-	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
 static void unmask_lapic_irq(unsigned int irq)
@@ -2038,7 +2034,7 @@ static void unmask_lapic_irq(unsigned int irq)
 	unsigned long v;
 
 	v = apic_read(APIC_LVT0);
-	apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
 static struct irq_chip lapic_chip __read_mostly = {
@@ -2168,7 +2164,7 @@ static inline void __init check_timer(void)
 	 * The AEOI mode will finish them in the 8259A
 	 * automatically.
 	 */
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
 	timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
 
@@ -2177,8 +2173,9 @@ static inline void __init check_timer(void)
 	pin2  = ioapic_i8259.pin;
 	apic2 = ioapic_i8259.apic;
 
-	printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-		vector, apic1, pin1, apic2, pin2);
+	apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+		    "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+		    vector, apic1, pin1, apic2, pin2);
 
 	/*
 	 * Some BIOS writers are clueless and report the ExtINTA
@@ -2216,12 +2213,13 @@ static inline void __init check_timer(void)
 		}
 		clear_IO_APIC_pin(apic1, pin1);
 		if (!no_pin1)
-			printk(KERN_ERR "..MP-BIOS bug: "
-			       "8254 timer not connected to IO-APIC\n");
+			apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+				    "8254 timer not connected to IO-APIC\n");
 
-		printk(KERN_INFO "...trying to set up timer (IRQ0) "
-		       "through the 8259A ... ");
-		printk("\n..... (found pin %d) ...", pin2);
+		apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+			    "(IRQ0) through the 8259A ...\n");
+		apic_printk(APIC_QUIET, KERN_INFO
+			    "..... (found apic %d pin %d) ...\n", apic2, pin2);
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
@@ -2230,7 +2228,7 @@ static inline void __init check_timer(void)
 			unmask_IO_APIC_irq(0);
 			enable_8259A_irq(0);
 			if (timer_irq_works()) {
-				printk("works.\n");
+				apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
 				timer_through_8259 = 1;
 				if (nmi_watchdog == NMI_IO_APIC) {
 					disable_8259A_irq(0);
@@ -2244,44 +2242,47 @@ static inline void __init check_timer(void)
 		 */
 		disable_8259A_irq(0);
 		clear_IO_APIC_pin(apic2, pin2);
-		printk(" failed.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
 	}
 
 	if (nmi_watchdog == NMI_IO_APIC) {
-		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+		apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+			    "through the IO-APIC - disabling NMI Watchdog!\n");
 		nmi_watchdog = NMI_NONE;
 	}
 	timer_ack = 0;
 
-	printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+	apic_printk(APIC_QUIET, KERN_INFO
+		    "...trying to set up timer as Virtual Wire IRQ...\n");
 
 	lapic_register_intr(0, vector);
-	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
+	apic_write(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
 	if (timer_irq_works()) {
-		printk(" works.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
 		goto out;
 	}
 	disable_8259A_irq(0);
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
-	printk(" failed.\n");
+	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+	apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
 
-	printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+	apic_printk(APIC_QUIET, KERN_INFO
+		    "...trying to set up timer as ExtINT IRQ...\n");
 
 	init_8259A(0);
 	make_8259A_irq(0);
-	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+	apic_write(APIC_LVT0, APIC_DM_EXTINT);
 
 	unlock_ExtINT_logic();
 
 	if (timer_irq_works()) {
-		printk(" works.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
 		goto out;
 	}
-	printk(" failed :(.\n");
+	apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
 	panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
-	      "report. Then try booting with the 'noapic' option");
+	      "report. Then try booting with the 'noapic' option.\n");
 out:
 	local_irq_restore(flags);
 }
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 6510cde36b35..61a83b70c18f 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -45,6 +45,7 @@
 #include <asm/proto.h>
 #include <asm/acpi.h>
 #include <asm/dma.h>
+#include <asm/i8259.h>
 #include <asm/nmi.h>
 #include <asm/msidef.h>
 #include <asm/hypertransport.h>
@@ -100,7 +101,7 @@ int timer_through_8259 __initdata;
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
 static DEFINE_SPINLOCK(ioapic_lock);
-DEFINE_SPINLOCK(vector_lock);
+static DEFINE_SPINLOCK(vector_lock);
 
 /*
 * # of IRQ routing registers
@@ -696,6 +697,19 @@ static int pin_2_irq(int idx, int apic, int pin)
 	return irq;
 }
 
+void lock_vector_lock(void)
+{
+	/* Used to the online set of cpus does not change
+	 * during assign_irq_vector.
+	 */
+	spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+	spin_unlock(&vector_lock);
+}
+
 static int __assign_irq_vector(int irq, cpumask_t mask)
 {
 	/*
@@ -731,7 +745,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
 		return 0;
 	}
 
-	for_each_cpu_mask(cpu, mask) {
+	for_each_cpu_mask_nr(cpu, mask) {
 		cpumask_t domain, new_mask;
 		int new_cpu;
 		int vector, offset;
@@ -752,7 +766,7 @@ next:
 			continue;
 		if (vector == IA32_SYSCALL_VECTOR)
 			goto next;
-		for_each_cpu_mask(new_cpu, new_mask)
+		for_each_cpu_mask_nr(new_cpu, new_mask)
 			if (per_cpu(vector_irq, new_cpu)[vector] != -1)
 				goto next;
 		/* Found one! */
@@ -762,7 +776,7 @@ next:
 			cfg->move_in_progress = 1;
 			cfg->old_domain = cfg->domain;
 		}
-		for_each_cpu_mask(new_cpu, new_mask)
+		for_each_cpu_mask_nr(new_cpu, new_mask)
 			per_cpu(vector_irq, new_cpu)[vector] = irq;
 		cfg->vector = vector;
 		cfg->domain = domain;
@@ -794,14 +808,14 @@ static void __clear_irq_vector(int irq)
 
 	vector = cfg->vector;
 	cpus_and(mask, cfg->domain, cpu_online_map);
-	for_each_cpu_mask(cpu, mask)
+	for_each_cpu_mask_nr(cpu, mask)
 		per_cpu(vector_irq, cpu)[vector] = -1;
 
 	cfg->vector = 0;
 	cpus_clear(cfg->domain);
 }
 
-static void __setup_vector_irq(int cpu)
+void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
 	/* This function must be called with vector_lock held */
@@ -824,14 +838,6 @@ static void __setup_vector_irq(int cpu)
 	}
 }
 
-void setup_vector_irq(int cpu)
-{
-	spin_lock(&vector_lock);
-	__setup_vector_irq(smp_processor_id());
-	spin_unlock(&vector_lock);
-}
-
 
 static struct irq_chip ioapic_chip;
 
 static void ioapic_register_intr(int irq, unsigned long trigger)
@@ -1372,12 +1378,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
 static int ioapic_retrigger_irq(unsigned int irq)
 {
 	struct irq_cfg *cfg = &irq_cfg[irq];
-	cpumask_t mask;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	mask = cpumask_of_cpu(first_cpu(cfg->domain));
-	send_IPI_mask(mask, cfg->vector);
+	send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
 	return 1;
@@ -1696,8 +1700,9 @@ static inline void __init check_timer(void)
 	pin2  = ioapic_i8259.pin;
 	apic2 = ioapic_i8259.apic;
 
-	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-		cfg->vector, apic1, pin1, apic2, pin2);
+	apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+		    "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+		    cfg->vector, apic1, pin1, apic2, pin2);
 
 	/*
 	 * Some BIOS writers are clueless and report the ExtINTA
@@ -1735,14 +1740,13 @@ static inline void __init check_timer(void)
 	}
 	clear_IO_APIC_pin(apic1, pin1);
 	if (!no_pin1)
-		apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
+		apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
 			    "8254 timer not connected to IO-APIC\n");
 
-	apic_printk(APIC_VERBOSE,KERN_INFO
-		    "...trying to set up timer (IRQ0) "
-		    "through the 8259A ... ");
-	apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
-		    apic2, pin2);
+	apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+		    "(IRQ0) through the 8259A ...\n");
+	apic_printk(APIC_QUIET, KERN_INFO
+		    "..... (found apic %d pin %d) ...\n", apic2, pin2);
 	/*
 	 * legacy devices should be connected to IO APIC #0
 	 */
@@ -1751,7 +1755,7 @@ static inline void __init check_timer(void)
 		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
-			apic_printk(APIC_VERBOSE," works.\n");
+			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
 			timer_through_8259 = 1;
 			if (nmi_watchdog == NMI_IO_APIC) {
 				disable_8259A_irq(0);
@@ -1765,29 +1769,32 @@ static inline void __init check_timer(void)
 	 */
 	disable_8259A_irq(0);
 	clear_IO_APIC_pin(apic2, pin2);
-	apic_printk(APIC_VERBOSE," failed.\n");
+	apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
 	}
 
 	if (nmi_watchdog == NMI_IO_APIC) {
-		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+		apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+			    "through the IO-APIC - disabling NMI Watchdog!\n");
 		nmi_watchdog = NMI_NONE;
 	}
 
-	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+	apic_printk(APIC_QUIET, KERN_INFO
+		    "...trying to set up timer as Virtual Wire IRQ...\n");
 
 	lapic_register_intr(0);
 	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
 	if (timer_irq_works()) {
-		apic_printk(APIC_VERBOSE," works.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
 		goto out;
 	}
 	disable_8259A_irq(0);
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
-	apic_printk(APIC_VERBOSE," failed.\n");
+	apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
 
-	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+	apic_printk(APIC_QUIET, KERN_INFO
+		    "...trying to set up timer as ExtINT IRQ...\n");
 
 	init_8259A(0);
 	make_8259A_irq(0);
@@ -1796,11 +1803,12 @@ static inline void __init check_timer(void)
 	unlock_ExtINT_logic();
 
 	if (timer_irq_works()) {
-		apic_printk(APIC_VERBOSE," works.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
 		goto out;
 	}
-	apic_printk(APIC_VERBOSE," failed :(.\n");
-	panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+	apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+	panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
+	      "report. Then try booting with the 'noapic' option.\n");
 out:
 	local_irq_restore(flags);
 }
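The new lock_vector_lock()/unlock_vector_lock() pair exists so code outside this file can hold vector_lock (now static again) across a window where the online CPU set must not change while vectors are assigned. A hedged sketch of the intended pairing; the caller side is an assumption, since this diff only shows the helpers and the now-exported __setup_vector_irq():

/* e.g. in the CPU bring-up path, before marking the CPU online: */
lock_vector_lock();
__setup_vector_irq(smp_processor_id());	/* populate vector_irq safely */
/* ... set the CPU online while the vector table cannot shift ... */
unlock_vector_lock();
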
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 5921e5f0a640..720d2607aacb 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "30BF")
 		}
 	},
+	{
+		.callback	= dmi_io_delay_0xed_port,
+		.ident		= "Presario F700",
+		.matches	= {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME, "30D3")
+		}
+	},
 	{ }
 };
 
@@ -103,6 +111,9 @@ void __init io_delay_init(void)
 
 static int __init io_delay_param(char *s)
 {
+	if (!s)
+		return -EINVAL;
+
 	if (!strcmp(s, "0x80"))
 		io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
 	else if (!strcmp(s, "0xed"))
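The NULL check guards the case where the option is given bare on the command line ("io_delay" with no "=value"): early_param() handlers can then be invoked with s == NULL, and the strcmp() calls would oops. The generic shape of a defensive handler (my_opt/my_type are made-up names for illustration):

static int my_type;

static int __init my_param(char *s)
{
	if (!s)			/* "my_opt" given without "=value" */
		return -EINVAL;
	if (!strcmp(s, "fast"))
		my_type = 1;
	return 0;
}
early_param("my_opt", my_param);
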
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 9d98cda39ad9..3f7537b669d3 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
-	apic_write_around(APIC_ICR, cfg);
+	apic_write(APIC_ICR, cfg);
 }
 
 void send_IPI_self(int vector)
@@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
 	 * prepare target chip field
 	 */
 	cfg = __prepare_ICR2(mask);
-	apic_write_around(APIC_ICR2, cfg);
+	apic_write(APIC_ICR2, cfg);
 
 	/*
 	 * program the ICR
@@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
-	apic_write_around(APIC_ICR, cfg);
+	apic_write(APIC_ICR, cfg);
 }
 
 /*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 47a6f6f12478..1cf8c1fcc088 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -83,11 +83,8 @@ union irq_ctx {
 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
 
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
+static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
 
 static void call_on_stack(void *func, void *stack)
 {
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 0373e88de95a..1f26fd9ec4f4 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -43,10 +43,11 @@
 
 #define BUILD_IRQ(nr)				\
 	asmlinkage void IRQ_NAME(nr);		\
-	asm("\n.p2align\n"			\
+	asm("\n.text\n.p2align\n"		\
 	    "IRQ" #nr "_interrupt:\n\t"		\
 	    "push $~(" #nr ") ; "		\
-	    "jmp common_interrupt");
+	    "jmp common_interrupt\n"		\
+	    ".previous");
 
 #define BI(x,y) \
 	BUILD_IRQ(x##y)
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index c03205991718..ff7d3b0124f1 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -12,9 +12,13 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/io.h> 13#include <linux/io.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h>
15 16
16#include <asm/setup.h> 17#include <asm/setup.h>
17 18
19struct dentry *arch_debugfs_dir;
20EXPORT_SYMBOL(arch_debugfs_dir);
21
18#ifdef CONFIG_DEBUG_BOOT_PARAMS 22#ifdef CONFIG_DEBUG_BOOT_PARAMS
19struct setup_data_node { 23struct setup_data_node {
20 u64 paddr; 24 u64 paddr;
@@ -135,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
135 if (PageHighMem(pg)) { 139 if (PageHighMem(pg)) {
136 data = ioremap_cache(pa_data, sizeof(*data)); 140 data = ioremap_cache(pa_data, sizeof(*data));
137 if (!data) { 141 if (!data) {
142 kfree(node);
138 error = -ENXIO; 143 error = -ENXIO;
139 goto err_dir; 144 goto err_dir;
140 } 145 }
@@ -209,6 +214,10 @@ static int __init arch_kdebugfs_init(void)
209{ 214{
210 int error = 0; 215 int error = 0;
211 216
217 arch_debugfs_dir = debugfs_create_dir("x86", NULL);
218 if (!arch_debugfs_dir)
219 return -ENOMEM;
220
212#ifdef CONFIG_DEBUG_BOOT_PARAMS 221#ifdef CONFIG_DEBUG_BOOT_PARAMS
213 error = boot_params_kdebugfs_init(); 222 error = boot_params_kdebugfs_init();
214#endif 223#endif
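
The kdebugfs.c change does two things: it frees the setup_data node on the ioremap_cache() failure path (previously a leak), and it creates a shared, exported "x86" debugfs directory for other arch code to parent entries under. A hedged usage sketch -- the directory name below is illustrative, not something this diff adds:

        #include <linux/debugfs.h>

        extern struct dentry *arch_debugfs_dir;

        static struct dentry *example_dir;

        static int __init example_debugfs_init(void)
        {
                /* nests under /sys/kernel/debug/x86/ */
                example_dir = debugfs_create_dir("mmio_trace", arch_debugfs_dir);
                if (!example_dir)
                        return -ENOMEM; /* NULL on failure in this era */
                return 0;
        }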
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b8..8282a2139681 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
69 */ 69 */
70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) 70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
71{ 71{
72#ifndef CONFIG_X86_32
73 u32 *gdb_regs32 = (u32 *)gdb_regs;
74#endif
72 gdb_regs[GDB_AX] = regs->ax; 75 gdb_regs[GDB_AX] = regs->ax;
73 gdb_regs[GDB_BX] = regs->bx; 76 gdb_regs[GDB_BX] = regs->bx;
74 gdb_regs[GDB_CX] = regs->cx; 77 gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
76 gdb_regs[GDB_SI] = regs->si; 79 gdb_regs[GDB_SI] = regs->si;
77 gdb_regs[GDB_DI] = regs->di; 80 gdb_regs[GDB_DI] = regs->di;
78 gdb_regs[GDB_BP] = regs->bp; 81 gdb_regs[GDB_BP] = regs->bp;
79 gdb_regs[GDB_PS] = regs->flags;
80 gdb_regs[GDB_PC] = regs->ip; 82 gdb_regs[GDB_PC] = regs->ip;
81#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
84 gdb_regs[GDB_PS] = regs->flags;
82 gdb_regs[GDB_DS] = regs->ds; 85 gdb_regs[GDB_DS] = regs->ds;
83 gdb_regs[GDB_ES] = regs->es; 86 gdb_regs[GDB_ES] = regs->es;
84 gdb_regs[GDB_CS] = regs->cs; 87 gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
94 gdb_regs[GDB_R13] = regs->r13; 97 gdb_regs[GDB_R13] = regs->r13;
95 gdb_regs[GDB_R14] = regs->r14; 98 gdb_regs[GDB_R14] = regs->r14;
96 gdb_regs[GDB_R15] = regs->r15; 99 gdb_regs[GDB_R15] = regs->r15;
100 gdb_regs32[GDB_PS] = regs->flags;
101 gdb_regs32[GDB_CS] = regs->cs;
102 gdb_regs32[GDB_SS] = regs->ss;
97#endif 103#endif
98 gdb_regs[GDB_SP] = regs->sp; 104 gdb_regs[GDB_SP] = regs->sp;
99} 105}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
112 */ 118 */
113void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) 119void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
114{ 120{
121#ifndef CONFIG_X86_32
122 u32 *gdb_regs32 = (u32 *)gdb_regs;
123#endif
115 gdb_regs[GDB_AX] = 0; 124 gdb_regs[GDB_AX] = 0;
116 gdb_regs[GDB_BX] = 0; 125 gdb_regs[GDB_BX] = 0;
117 gdb_regs[GDB_CX] = 0; 126 gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
129 gdb_regs[GDB_FS] = 0xFFFF; 138 gdb_regs[GDB_FS] = 0xFFFF;
130 gdb_regs[GDB_GS] = 0xFFFF; 139 gdb_regs[GDB_GS] = 0xFFFF;
131#else 140#else
132 gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 141 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
133 gdb_regs[GDB_PC] = 0; 142 gdb_regs32[GDB_CS] = __KERNEL_CS;
143 gdb_regs32[GDB_SS] = __KERNEL_DS;
144 gdb_regs[GDB_PC] = p->thread.ip;
134 gdb_regs[GDB_R8] = 0; 145 gdb_regs[GDB_R8] = 0;
135 gdb_regs[GDB_R9] = 0; 146 gdb_regs[GDB_R9] = 0;
136 gdb_regs[GDB_R10] = 0; 147 gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
153 */ 164 */
154void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) 165void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
155{ 166{
167#ifndef CONFIG_X86_32
168 u32 *gdb_regs32 = (u32 *)gdb_regs;
169#endif
156 regs->ax = gdb_regs[GDB_AX]; 170 regs->ax = gdb_regs[GDB_AX];
157 regs->bx = gdb_regs[GDB_BX]; 171 regs->bx = gdb_regs[GDB_BX];
158 regs->cx = gdb_regs[GDB_CX]; 172 regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
160 regs->si = gdb_regs[GDB_SI]; 174 regs->si = gdb_regs[GDB_SI];
161 regs->di = gdb_regs[GDB_DI]; 175 regs->di = gdb_regs[GDB_DI];
162 regs->bp = gdb_regs[GDB_BP]; 176 regs->bp = gdb_regs[GDB_BP];
163 regs->flags = gdb_regs[GDB_PS];
164 regs->ip = gdb_regs[GDB_PC]; 177 regs->ip = gdb_regs[GDB_PC];
165#ifdef CONFIG_X86_32 178#ifdef CONFIG_X86_32
179 regs->flags = gdb_regs[GDB_PS];
166 regs->ds = gdb_regs[GDB_DS]; 180 regs->ds = gdb_regs[GDB_DS];
167 regs->es = gdb_regs[GDB_ES]; 181 regs->es = gdb_regs[GDB_ES];
168 regs->cs = gdb_regs[GDB_CS]; 182 regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
175 regs->r13 = gdb_regs[GDB_R13]; 189 regs->r13 = gdb_regs[GDB_R13];
176 regs->r14 = gdb_regs[GDB_R14]; 190 regs->r14 = gdb_regs[GDB_R14];
177 regs->r15 = gdb_regs[GDB_R15]; 191 regs->r15 = gdb_regs[GDB_R15];
192 regs->flags = gdb_regs32[GDB_PS];
193 regs->cs = gdb_regs32[GDB_CS];
194 regs->ss = gdb_regs32[GDB_SS];
178#endif 195#endif
179} 196}
180 197
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 if (remcomInBuffer[0] == 's') { 395 if (remcomInBuffer[0] == 's') {
379 linux_regs->flags |= X86_EFLAGS_TF; 396 linux_regs->flags |= X86_EFLAGS_TF;
380 kgdb_single_step = 1; 397 kgdb_single_step = 1;
381 if (kgdb_contthread) { 398 atomic_set(&kgdb_cpu_doing_single_step,
382 atomic_set(&kgdb_cpu_doing_single_step, 399 raw_smp_processor_id());
383 raw_smp_processor_id());
384 }
385 } 400 }
386 401
387 get_debugreg(dr6, 6); 402 get_debugreg(dr6, 6);
@@ -466,9 +481,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
466 481
467 case DIE_DEBUG: 482 case DIE_DEBUG:
468 if (atomic_read(&kgdb_cpu_doing_single_step) == 483 if (atomic_read(&kgdb_cpu_doing_single_step) ==
469 raw_smp_processor_id() && 484 raw_smp_processor_id()) {
470 user_mode(regs)) 485 if (user_mode(regs))
471 return single_step_cont(regs, args); 486 return single_step_cont(regs, args);
487 break;
488 } else if (test_thread_flag(TIF_SINGLESTEP))
 489			} else if (test_thread_flag(TIF_SINGLESTEP))
 490				/* This means a user thread is single-stepping
 491				 * a system call, which should be ignored
491 */
492 return NOTIFY_DONE;
472 /* fall through */ 493 /* fall through */
473 default: 494 default:
474 if (user_mode(regs)) 495 if (user_mode(regs))
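
On the kgdb.c register-layout hunks: amd64 gdb keeps eflags, cs and ss as 32-bit slots packed after the 64-bit general registers and rip, so the code aliases the buffer through a u32 pointer; indexing the unsigned long array with those GDB_* constants would land on the wrong bytes. A minimal sketch of the aliasing (slot indices are the kernel's GDB_* constants, assumed here rather than restated):

        unsigned long gdb_regs[NUMREGBYTES / sizeof(unsigned long)];
        u32 *gdb_regs32 = (u32 *)gdb_regs;      /* same bytes, 32-bit view */

        /* lands at byte offset GDB_PS * 4, inside the packed 32-bit tail */
        gdb_regs32[GDB_PS] = regs->flags;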
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b8c6743a13da..6c27679ec6aa 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
431 regs->ip = (unsigned long)p->ainsn.insn; 431 regs->ip = (unsigned long)p->ainsn.insn;
432} 432}
433 433
434/* Called with kretprobe_lock held */
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 434void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
436 struct pt_regs *regs) 435 struct pt_regs *regs)
437{ 436{
@@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
682 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; 681 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
683 682
684 INIT_HLIST_HEAD(&empty_rp); 683 INIT_HLIST_HEAD(&empty_rp);
685 spin_lock_irqsave(&kretprobe_lock, flags); 684 kretprobe_hash_lock(current, &head, &flags);
686 head = kretprobe_inst_table_head(current);
687 /* fixup registers */ 685 /* fixup registers */
688#ifdef CONFIG_X86_64 686#ifdef CONFIG_X86_64
689 regs->cs = __KERNEL_CS; 687 regs->cs = __KERNEL_CS;
@@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
732 730
733 kretprobe_assert(ri, orig_ret_address, trampoline_address); 731 kretprobe_assert(ri, orig_ret_address, trampoline_address);
734 732
735 spin_unlock_irqrestore(&kretprobe_lock, flags); 733 kretprobe_hash_unlock(current, &flags);
736 734
737 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 735 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
738 hlist_del(&ri->hlist); 736 hlist_del(&ri->hlist);
@@ -860,7 +858,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
860 858
861 resume_execution(cur, regs, kcb); 859 resume_execution(cur, regs, kcb);
862 regs->flags |= kcb->kprobe_saved_flags; 860 regs->flags |= kcb->kprobe_saved_flags;
863 trace_hardirqs_fixup_flags(regs->flags);
864 861
865 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { 862 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
866 kcb->kprobe_status = KPROBE_HIT_SSDONE; 863 kcb->kprobe_status = KPROBE_HIT_SSDONE;
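
The kprobes.c hunks replace the global kretprobe_lock with the new per-task hashed locking, which is why the "Called with kretprobe_lock held" comment goes away. The pairing the code converts to, reduced to a sketch:

        struct hlist_head *head;
        unsigned long flags;

        /* lock and list head are both derived from hash(current), so
         * contention is per bucket rather than global */
        kretprobe_hash_lock(current, &head, &flags);
        /* ... walk 'head' and recycle return instances ... */
        kretprobe_hash_unlock(current, &flags);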
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 87edf1ceb1df..d02def06ca91 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -113,7 +113,7 @@ static void kvm_setup_secondary_clock(void)
113#endif 113#endif
114 114
115#ifdef CONFIG_SMP 115#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void) 116static void __init kvm_smp_prepare_boot_cpu(void)
117{ 117{
118 WARN_ON(kvm_register_clock("primary cpu clock")); 118 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu(); 119 native_smp_prepare_boot_cpu();
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a8449571858a..b68e21f06f4f 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -62,12 +62,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
62 62
63 if (reload) { 63 if (reload) {
64#ifdef CONFIG_SMP 64#ifdef CONFIG_SMP
65 cpumask_t mask;
66
67 preempt_disable(); 65 preempt_disable();
68 load_LDT(pc); 66 load_LDT(pc);
69 mask = cpumask_of_cpu(smp_processor_id()); 67 if (!cpus_equal(current->mm->cpu_vm_mask,
70 if (!cpus_equal(current->mm->cpu_vm_mask, mask)) 68 cpumask_of_cpu(smp_processor_id())))
71 smp_call_function(flush_ldt, current->mm, 1); 69 smp_call_function(flush_ldt, current->mm, 1);
72 preempt_enable(); 70 preempt_enable();
73#else 71#else
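
The ldt.c hunk exists to drop an on-stack cpumask_t, which is anything but small on big configs. Back-of-envelope sizing, assuming the usual one-bit-per-CPU layout:

        /* NR_CPUS = 4096 -> 4096 / 8 = 512 bytes of kernel stack for one
         * temporary; hence building the mask inline in the cpus_equal()
         * argument instead of in a local. */
        #define CPUMASK_BYTES (BITS_TO_LONGS(NR_CPUS) * sizeof(long))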
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55af..0732adba05ca 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -12,6 +12,7 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/suspend.h>
15 16
16#include <asm/pgtable.h> 17#include <asm/pgtable.h>
17#include <asm/pgalloc.h> 18#include <asm/pgalloc.h>
@@ -22,6 +23,7 @@
22#include <asm/cpufeature.h> 23#include <asm/cpufeature.h>
23#include <asm/desc.h> 24#include <asm/desc.h>
24#include <asm/system.h> 25#include <asm/system.h>
26#include <asm/cacheflush.h>
25 27
26#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 28#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
27static u32 kexec_pgd[1024] PAGE_ALIGNED; 29static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -77,7 +79,7 @@ static void load_segments(void)
77/* 79/*
 78 * An architecture hook called to validate the 80 * An architecture hook called to validate the
79 * proposed image and prepare the control pages 81 * proposed image and prepare the control pages
80 * as needed. The pages for KEXEC_CONTROL_CODE_SIZE 82 * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE
81 * have been allocated, but the segments have yet 83 * have been allocated, but the segments have yet
 82 * to be copied into the kernel. 84 * to be copied into the kernel.
83 * 85 *
@@ -85,10 +87,12 @@ static void load_segments(void)
85 * reboot code buffer to allow us to avoid allocations 87 * reboot code buffer to allow us to avoid allocations
86 * later. 88 * later.
87 * 89 *
88 * Currently nothing. 90 * Make control page executable.
89 */ 91 */
90int machine_kexec_prepare(struct kimage *image) 92int machine_kexec_prepare(struct kimage *image)
91{ 93{
94 if (nx_enabled)
95 set_pages_x(image->control_code_page, 1);
92 return 0; 96 return 0;
93} 97}
94 98
@@ -98,27 +102,54 @@ int machine_kexec_prepare(struct kimage *image)
98 */ 102 */
99void machine_kexec_cleanup(struct kimage *image) 103void machine_kexec_cleanup(struct kimage *image)
100{ 104{
105 if (nx_enabled)
106 set_pages_nx(image->control_code_page, 1);
101} 107}
102 108
103/* 109/*
104 * Do not allocate memory (or fail in any way) in machine_kexec(). 110 * Do not allocate memory (or fail in any way) in machine_kexec().
105 * We are past the point of no return, committed to rebooting now. 111 * We are past the point of no return, committed to rebooting now.
106 */ 112 */
107NORET_TYPE void machine_kexec(struct kimage *image) 113void machine_kexec(struct kimage *image)
108{ 114{
109 unsigned long page_list[PAGES_NR]; 115 unsigned long page_list[PAGES_NR];
110 void *control_page; 116 void *control_page;
117 int save_ftrace_enabled;
118 asmlinkage unsigned long
119 (*relocate_kernel_ptr)(unsigned long indirection_page,
120 unsigned long control_page,
121 unsigned long start_address,
122 unsigned int has_pae,
123 unsigned int preserve_context);
124
125#ifdef CONFIG_KEXEC_JUMP
126 if (kexec_image->preserve_context)
127 save_processor_state();
128#endif
111 129
112 tracer_disable(); 130 save_ftrace_enabled = __ftrace_enabled_save();
113 131
114 /* Interrupts aren't acceptable while we reboot */ 132 /* Interrupts aren't acceptable while we reboot */
115 local_irq_disable(); 133 local_irq_disable();
116 134
135 if (image->preserve_context) {
136#ifdef CONFIG_X86_IO_APIC
137 /* We need to put APICs in legacy mode so that we can
 138	 * get timer interrupts in the second kernel. kexec/kdump
 139	 * paths already have calls to disable_IO_APIC() in
 140	 * one form or another. The kexec jump path also needs
 141	 * one.
142 */
143 disable_IO_APIC();
144#endif
145 }
146
117 control_page = page_address(image->control_code_page); 147 control_page = page_address(image->control_code_page);
118 memcpy(control_page, relocate_kernel, PAGE_SIZE); 148 memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
119 149
150 relocate_kernel_ptr = control_page;
120 page_list[PA_CONTROL_PAGE] = __pa(control_page); 151 page_list[PA_CONTROL_PAGE] = __pa(control_page);
121 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 152 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
122 page_list[PA_PGD] = __pa(kexec_pgd); 153 page_list[PA_PGD] = __pa(kexec_pgd);
123 page_list[VA_PGD] = (unsigned long)kexec_pgd; 154 page_list[VA_PGD] = (unsigned long)kexec_pgd;
124#ifdef CONFIG_X86_PAE 155#ifdef CONFIG_X86_PAE
@@ -131,6 +162,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
131 page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 162 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
132 page_list[PA_PTE_1] = __pa(kexec_pte1); 163 page_list[PA_PTE_1] = __pa(kexec_pte1);
133 page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 164 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
165 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
134 166
135 /* The segment registers are funny things, they have both a 167 /* The segment registers are funny things, they have both a
136 * visible and an invisible part. Whenever the visible part is 168 * visible and an invisible part. Whenever the visible part is
@@ -149,8 +181,17 @@ NORET_TYPE void machine_kexec(struct kimage *image)
149 set_idt(phys_to_virt(0),0); 181 set_idt(phys_to_virt(0),0);
150 182
151 /* now call it */ 183 /* now call it */
152 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 184 image->start = relocate_kernel_ptr((unsigned long)image->head,
153 image->start, cpu_has_pae); 185 (unsigned long)page_list,
186 image->start, cpu_has_pae,
187 image->preserve_context);
188
189#ifdef CONFIG_KEXEC_JUMP
190 if (kexec_image->preserve_context)
191 restore_processor_state();
192#endif
193
194 __ftrace_enabled_restore(save_ftrace_enabled);
154} 195}
155 196
156void arch_crash_save_vmcoreinfo(void) 197void arch_crash_save_vmcoreinfo(void)
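
Most of the machine_kexec_32.c churn wires up kexec jump: the control page is made executable, relocate_kernel() grows a preserve_context argument, may return, and its return value becomes the resume address. The new control flow, condensed to a sketch (details the hunks already show are elided):

        if (image->preserve_context)
                save_processor_state();         /* CONFIG_KEXEC_JUMP */

        local_irq_disable();
        /* ... build page_list[], copy relocate_kernel to the control page ... */

        image->start = relocate_kernel_ptr((unsigned long)image->head,
                                           (unsigned long)page_list,
                                           image->start, cpu_has_pae,
                                           image->preserve_context);

        if (image->preserve_context)
                restore_processor_state();      /* we came back: resume */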
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a3..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
181 * Do not allocate memory (or fail in any way) in machine_kexec(). 181 * Do not allocate memory (or fail in any way) in machine_kexec().
182 * We are past the point of no return, committed to rebooting now. 182 * We are past the point of no return, committed to rebooting now.
183 */ 183 */
184NORET_TYPE void machine_kexec(struct kimage *image) 184void machine_kexec(struct kimage *image)
185{ 185{
186 unsigned long page_list[PAGES_NR]; 186 unsigned long page_list[PAGES_NR];
187 void *control_page; 187 void *control_page;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 07c0f828f488..3b599518c322 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -33,6 +33,8 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <asm/geode.h> 34#include <asm/geode.h>
35 35
36#define MFGPT_DEFAULT_IRQ 7
37
36static struct mfgpt_timer_t { 38static struct mfgpt_timer_t {
37 unsigned int avail:1; 39 unsigned int avail:1;
38} mfgpt_timers[MFGPT_MAX_TIMERS]; 40} mfgpt_timers[MFGPT_MAX_TIMERS];
@@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
157} 159}
158EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); 160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
159 161
160int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) 162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
161{ 163{
162 u32 val, dummy; 164 u32 zsel, lpc, dummy;
163 int offset; 165 int shift;
164 166
165 if (timer < 0 || timer >= MFGPT_MAX_TIMERS) 167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
166 return -EIO; 168 return -EIO;
167 169
168 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) 170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
 173	 * 2, and we must not use or change it.
 174	 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
 175	 * IRQ of the 1st. This can only happen when forcing an IRQ; calling this
 176	 * with *irq==0 is safe. Currently there _are_ no two such drivers.
177 */
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
169 return -EIO; 181 return -EIO;
170 182
171 rdmsr(MSR_PIC_ZSEL_LOW, val, dummy); 183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
172 188
173 offset = (timer % 4) * 4; 189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
174 190 if (*irq < 1 || *irq == 2 || *irq > 15)
175 val &= ~((0xF << offset) | (0xF << (offset + 16))); 191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
176 195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
177 if (enable) { 199 if (enable) {
178 val |= (irq & 0x0F) << (offset); 200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
179 val |= (irq & 0x0F) << (offset + 16); 201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
180 } 202 }
181 203
182 wrmsr(MSR_PIC_ZSEL_LOW, val, dummy);
183 return 0; 204 return 0;
184} 205}
185 206
@@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
242static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; 263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
243static u16 mfgpt_event_clock; 264static u16 mfgpt_event_clock;
244 265
245static int irq = 7; 266static int irq;
246static int __init mfgpt_setup(char *str) 267static int __init mfgpt_setup(char *str)
247{ 268{
248 get_option(&str, &irq); 269 get_option(&str, &irq);
@@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void)
346 mfgpt_event_clock = timer; 367 mfgpt_event_clock = timer;
347 368
348 /* Set up the IRQ on the MFGPT side */ 369 /* Set up the IRQ on the MFGPT side */
349 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) { 370 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
350 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); 371 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
351 return -EIO; 372 return -EIO;
352 } 373 }
@@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void)
374 &mfgpt_clockevent); 395 &mfgpt_clockevent);
375 396
376 printk(KERN_INFO 397 printk(KERN_INFO
377 "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); 398 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
399 timer, irq);
378 clockevents_register_device(&mfgpt_clockevent); 400 clockevents_register_device(&mfgpt_clockevent);
379 401
380 return 0; 402 return 0;
381 403
382err: 404err:
383 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq); 405 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
384 printk(KERN_ERR 406 printk(KERN_ERR
385 "mfgpt-timer: Unable to set up the MFGPT clock source\n"); 407 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
386 return -EIO; 408 return -EIO;
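
On the ZSEL arithmetic in the mfgpt_32.c hunk: the MSR packs one 4-bit IRQ mapping per timer/comparator pair, and the shift expression selects the nibble. A worked instance with illustrative values:

        /* timer 1, CMP2: shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4
         *              = (4 + 1) * 4 = 20
         * so this pair's mapping lives in bits 23:20 of MSR_PIC_ZSEL_LOW. */
        unsigned int timer = 1, irq = 7;
        u32 shift = (4 + timer % 4) * 4;        /* CMP2 half of the register */
        u32 zsel = 0;

        zsel = (zsel & ~(0xF << shift)) | (irq << shift);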
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 56b933119a04..652fa5c38ebe 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -644,7 +644,9 @@ static void microcode_fini_cpu(int cpu)
644 mutex_unlock(&microcode_mutex); 644 mutex_unlock(&microcode_mutex);
645} 645}
646 646
647static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) 647static ssize_t reload_store(struct sys_device *dev,
648 struct sysdev_attribute *attr,
649 const char *buf, size_t sz)
648{ 650{
649 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 651 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
650 char *end; 652 char *end;
@@ -655,9 +657,7 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
655 if (end == buf) 657 if (end == buf)
656 return -EINVAL; 658 return -EINVAL;
657 if (val == 1) { 659 if (val == 1) {
658 cpumask_t old; 660 cpumask_t old = current->cpus_allowed;
659
660 old = current->cpus_allowed;
661 661
662 get_online_cpus(); 662 get_online_cpus();
663 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 663 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
@@ -674,14 +674,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
674 return sz; 674 return sz;
675} 675}
676 676
677static ssize_t version_show(struct sys_device *dev, char *buf) 677static ssize_t version_show(struct sys_device *dev,
678 struct sysdev_attribute *attr, char *buf)
678{ 679{
679 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 680 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
680 681
681 return sprintf(buf, "0x%x\n", uci->rev); 682 return sprintf(buf, "0x%x\n", uci->rev);
682} 683}
683 684
684static ssize_t pf_show(struct sys_device *dev, char *buf) 685static ssize_t pf_show(struct sys_device *dev,
686 struct sysdev_attribute *attr, char *buf)
685{ 687{
686 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 688 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
687 689
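
The microcode.c hunks track the tree-wide sysdev callback signature change (a struct sysdev_attribute * now rides along). The attribute declarations these functions back are presumably still the usual macro form:

        /* assumed unchanged by this hunk; SYSDEV_ATTR builds the
         * struct sysdev_attribute whose pointer is the new parameter */
        static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
        static SYSDEV_ATTR(version, 0400, version_show, NULL);
        static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);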
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index fdfdc550b366..efc2f361fe85 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -238,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
238 {} 238 {}
239}; 239};
240 240
241void __init check_enable_amd_mmconf_dmi(void) 241void __cpuinit check_enable_amd_mmconf_dmi(void)
242{ 242{
243 dmi_check_system(mmconf_dmi_table); 243 dmi_check_system(mmconf_dmi_table);
244} 244}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index a888e67f5874..6ba87830d4b1 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/mm.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/bug.h> 27#include <linux/bug.h>
27 28
@@ -150,7 +151,8 @@ int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
151 struct module *me) 152 struct module *me)
152{ 153{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 156 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155 157
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 158 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +162,8 @@ int module_finalize(const Elf_Ehdr *hdr,
160 alt = s; 162 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162 locks= s; 164 locks= s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s;
163 } 167 }
164 168
165 if (alt) { 169 if (alt) {
@@ -175,6 +179,11 @@ int module_finalize(const Elf_Ehdr *hdr,
175 tseg, tseg + text->sh_size); 179 tseg, tseg + text->sh_size);
176 } 180 }
177 181
182 if (para) {
183 void *pseg = (void *)para->sh_addr;
184 apply_paravirt(pseg, pseg + para->sh_size);
185 }
186
178 return module_bug_finalize(hdr, sechdrs, me); 187 return module_bug_finalize(hdr, sechdrs, me);
179} 188}
180 189
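
With the module_64.c hunk, modules get their .parainstructions section run through apply_paravirt() at load time, the same patching pass vmlinux gets at boot. Assumed shape of one record in that section (era-appropriate sketch; the real layout lives in <asm/paravirt.h>):

        struct paravirt_patch_site_sketch {
                u8 *instr;      /* address of the patchable call site */
                u8 instrtype;   /* which pv_ops slot the site invokes */
                u8 len;         /* bytes available for the patch */
                u16 clobbers;   /* registers the site may clobber */
        };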
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 3b25e49380c6..b3fb430725cb 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/e820.h> 28#include <asm/e820.h>
29#include <asm/trampoline.h> 29#include <asm/trampoline.h>
30#include <asm/setup.h>
30 31
31#include <mach_apic.h> 32#include <mach_apic.h>
32#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
@@ -48,77 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
48 return sum & 0xFF; 49 return sum & 0xFF;
49} 50}
50 51
51#ifdef CONFIG_X86_NUMAQ 52static void __init MP_processor_info(struct mpc_config_processor *m)
52int found_numaq;
53/*
54 * Have to match translation table entries to main table entries by counter
55 * hence the mpc_record variable .... can't see a less disgusting way of
56 * doing this ....
57 */
58struct mpc_config_translation {
59 unsigned char mpc_type;
60 unsigned char trans_len;
61 unsigned char trans_type;
62 unsigned char trans_quad;
63 unsigned char trans_global;
64 unsigned char trans_local;
65 unsigned short trans_reserved;
66};
67
68
69static int mpc_record;
70static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
71 __cpuinitdata;
72
73static inline int generate_logical_apicid(int quad, int phys_apicid)
74{
75 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
76}
77
78
79static inline int mpc_apic_id(struct mpc_config_processor *m,
80 struct mpc_config_translation *translation_record)
81{
82 int quad = translation_record->trans_quad;
83 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
84
85 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
86 m->mpc_apicid,
87 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
88 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
89 m->mpc_apicver, quad, logical_apicid);
90 return logical_apicid;
91}
92
93int mp_bus_id_to_node[MAX_MP_BUSSES];
94
95int mp_bus_id_to_local[MAX_MP_BUSSES];
96
97static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
98 struct mpc_config_translation *translation)
99{
100 int quad = translation->trans_quad;
101 int local = translation->trans_local;
102
103 mp_bus_id_to_node[m->mpc_busid] = quad;
104 mp_bus_id_to_local[m->mpc_busid] = local;
105 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
106 m->mpc_busid, name, quad);
107}
108
109int quad_local_to_mp_bus_id [NR_CPUS/4][4];
110static void mpc_oem_pci_bus(struct mpc_config_bus *m,
111 struct mpc_config_translation *translation)
112{
113 int quad = translation->trans_quad;
114 int local = translation->trans_local;
115
116 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
117}
118
119#endif
120
121static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
122{ 53{
123 int apicid; 54 int apicid;
124 char *bootup_cpu = ""; 55 char *bootup_cpu = "";
@@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
127 disabled_cpus++; 58 disabled_cpus++;
128 return; 59 return;
129 } 60 }
130#ifdef CONFIG_X86_NUMAQ 61
131 if (found_numaq) 62 if (x86_quirks->mpc_apic_id)
132 apicid = mpc_apic_id(m, translation_table[mpc_record]); 63 apicid = x86_quirks->mpc_apic_id(m);
133 else 64 else
134 apicid = m->mpc_apicid; 65 apicid = m->mpc_apicid;
135#else 66
136 apicid = m->mpc_apicid;
137#endif
138 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 67 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
139 bootup_cpu = " (Bootup-CPU)"; 68 bootup_cpu = " (Bootup-CPU)";
140 boot_cpu_physical_apicid = m->mpc_apicid; 69 boot_cpu_physical_apicid = m->mpc_apicid;
@@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
151 memcpy(str, m->mpc_bustype, 6); 80 memcpy(str, m->mpc_bustype, 6);
152 str[6] = 0; 81 str[6] = 0;
153 82
154#ifdef CONFIG_X86_NUMAQ 83 if (x86_quirks->mpc_oem_bus_info)
155 if (found_numaq) 84 x86_quirks->mpc_oem_bus_info(m, str);
156 mpc_oem_bus_info(m, str, translation_table[mpc_record]); 85 else
157#else 86 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
158 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
159#endif
160 87
161#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
162 if (m->mpc_busid >= MAX_MP_BUSSES) { 89 if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
173 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
174#endif 101#endif
175 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
176#ifdef CONFIG_X86_NUMAQ 103 if (x86_quirks->mpc_oem_pci_bus)
177 if (found_numaq) 104 x86_quirks->mpc_oem_pci_bus(m);
178 mpc_oem_pci_bus(m, translation_table[mpc_record]); 105
179#endif
180 clear_bit(m->mpc_busid, mp_bus_not_pci); 106 clear_bit(m->mpc_busid, mp_bus_not_pci);
181#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
182 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@@ -228,7 +154,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
228 154
229static void print_MP_intsrc_info(struct mpc_config_intsrc *m) 155static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
230{ 156{
231 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," 157 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
232 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 158 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
233 m->mpc_irqtype, m->mpc_irqflag & 3, 159 m->mpc_irqtype, m->mpc_irqflag & 3,
234 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, 160 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
@@ -237,7 +163,7 @@ static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
237 163
238static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) 164static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
239{ 165{
240 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," 166 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
241 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 167 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
242 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, 168 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
243 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, 169 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
@@ -309,90 +235,13 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
309 235
310static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) 236static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
311{ 237{
312 printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," 238 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
313 " IRQ %02x, APIC ID %x, APIC LINT %02x\n", 239 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
314 m->mpc_irqtype, m->mpc_irqflag & 3, 240 m->mpc_irqtype, m->mpc_irqflag & 3,
315 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, 241 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
316 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); 242 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
317} 243}
318 244
319#ifdef CONFIG_X86_NUMAQ
320static void __init MP_translation_info(struct mpc_config_translation *m)
321{
322 printk(KERN_INFO
323 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
324 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
325 m->trans_local);
326
327 if (mpc_record >= MAX_MPC_ENTRY)
328 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
329 else
330 translation_table[mpc_record] = m; /* stash this for later */
331 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
332 node_set_online(m->trans_quad);
333}
334
335/*
336 * Read/parse the MPC oem tables
337 */
338
339static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
340 unsigned short oemsize)
341{
342 int count = sizeof(*oemtable); /* the header size */
343 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
344
345 mpc_record = 0;
346 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
347 oemtable);
348 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
349 printk(KERN_WARNING
350 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
351 oemtable->oem_signature[0], oemtable->oem_signature[1],
352 oemtable->oem_signature[2], oemtable->oem_signature[3]);
353 return;
354 }
355 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
356 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
357 return;
358 }
359 while (count < oemtable->oem_length) {
360 switch (*oemptr) {
361 case MP_TRANSLATION:
362 {
363 struct mpc_config_translation *m =
364 (struct mpc_config_translation *)oemptr;
365 MP_translation_info(m);
366 oemptr += sizeof(*m);
367 count += sizeof(*m);
368 ++mpc_record;
369 break;
370 }
371 default:
372 {
373 printk(KERN_WARNING
374 "Unrecognised OEM table entry type! - %d\n",
375 (int)*oemptr);
376 return;
377 }
378 }
379 }
380}
381
382void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
383 char *productid)
384{
385 if (strncmp(oem, "IBM NUMA", 8))
386 printk("Warning! Not a NUMA-Q system!\n");
387 else
388 found_numaq = 1;
389
390 if (mpc->mpc_oemptr)
391 smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
392 mpc->mpc_oemsize);
393}
394#endif /* CONFIG_X86_NUMAQ */
395
396/* 245/*
397 * Read/parse the MPC 246 * Read/parse the MPC
398 */ 247 */
@@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
457 } else 306 } else
458 mps_oem_check(mpc, oem, str); 307 mps_oem_check(mpc, oem, str);
459#endif 308#endif
460
461 /* save the local APIC address, it might be non-default */ 309 /* save the local APIC address, it might be non-default */
462 if (!acpi_lapic) 310 if (!acpi_lapic)
463 mp_lapic_addr = mpc->mpc_lapic; 311 mp_lapic_addr = mpc->mpc_lapic;
@@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
465 if (early) 313 if (early)
466 return 1; 314 return 1;
467 315
316 if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
317 struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
318 x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
319 }
320
468 /* 321 /*
469 * Now process the configuration blocks. 322 * Now process the configuration blocks.
470 */ 323 */
471#ifdef CONFIG_X86_NUMAQ 324 if (x86_quirks->mpc_record)
472 mpc_record = 0; 325 *x86_quirks->mpc_record = 0;
473#endif 326
474 while (count < mpc->mpc_length) { 327 while (count < mpc->mpc_length) {
475 switch (*mpt) { 328 switch (*mpt) {
476 case MP_PROCESSOR: 329 case MP_PROCESSOR:
@@ -536,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
536 count = mpc->mpc_length; 389 count = mpc->mpc_length;
537 break; 390 break;
538 } 391 }
539#ifdef CONFIG_X86_NUMAQ 392 if (x86_quirks->mpc_record)
540 ++mpc_record; 393 (*x86_quirks->mpc_record)++;
541#endif
542 } 394 }
543 395
544#ifdef CONFIG_X86_GENERICARCH 396#ifdef CONFIG_X86_GENERICARCH
@@ -632,7 +484,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
632} 484}
633 485
634 486
635static void construct_ioapic_table(int mpc_default_type) 487static void __init construct_ioapic_table(int mpc_default_type)
636{ 488{
637 struct mpc_config_ioapic ioapic; 489 struct mpc_config_ioapic ioapic;
638 struct mpc_config_bus bus; 490 struct mpc_config_bus bus;
@@ -677,7 +529,7 @@ static void construct_ioapic_table(int mpc_default_type)
677 construct_default_ioirq_mptable(mpc_default_type); 529 construct_default_ioirq_mptable(mpc_default_type);
678} 530}
679#else 531#else
680static inline void construct_ioapic_table(int mpc_default_type) { } 532static inline void __init construct_ioapic_table(int mpc_default_type) { }
681#endif 533#endif
682 534
683static inline void __init construct_default_ISA_mptable(int mpc_default_type) 535static inline void __init construct_default_ISA_mptable(int mpc_default_type)
@@ -726,20 +578,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
726static struct intel_mp_floating *mpf_found; 578static struct intel_mp_floating *mpf_found;
727 579
728/* 580/*
729 * Machine specific quirk for finding the SMP config before other setup
730 * activities destroy the table:
731 */
732int (*mach_get_smp_config_quirk)(unsigned int early);
733
734/*
735 * Scan the memory blocks for an SMP configuration block. 581 * Scan the memory blocks for an SMP configuration block.
736 */ 582 */
737static void __init __get_smp_config(unsigned int early) 583static void __init __get_smp_config(unsigned int early)
738{ 584{
739 struct intel_mp_floating *mpf = mpf_found; 585 struct intel_mp_floating *mpf = mpf_found;
740 586
741 if (mach_get_smp_config_quirk) { 587 if (x86_quirks->mach_get_smp_config) {
742 if (mach_get_smp_config_quirk(early)) 588 if (x86_quirks->mach_get_smp_config(early))
743 return; 589 return;
744 } 590 }
745 if (acpi_lapic && early) 591 if (acpi_lapic && early)
@@ -849,7 +695,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
849 unsigned int *bp = phys_to_virt(base); 695 unsigned int *bp = phys_to_virt(base);
850 struct intel_mp_floating *mpf; 696 struct intel_mp_floating *mpf;
851 697
852 printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); 698 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
699 bp, length);
853 BUILD_BUG_ON(sizeof(*mpf) != 16); 700 BUILD_BUG_ON(sizeof(*mpf) != 16);
854 701
855 while (length > 0) { 702 while (length > 0) {
@@ -899,14 +746,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
899 return 0; 746 return 0;
900} 747}
901 748
902int (*mach_find_smp_config_quirk)(unsigned int reserve);
903
904static void __init __find_smp_config(unsigned int reserve) 749static void __init __find_smp_config(unsigned int reserve)
905{ 750{
906 unsigned int address; 751 unsigned int address;
907 752
908 if (mach_find_smp_config_quirk) { 753 if (x86_quirks->mach_find_smp_config) {
909 if (mach_find_smp_config_quirk(reserve)) 754 if (x86_quirks->mach_find_smp_config(reserve))
910 return; 755 return;
911 } 756 }
912 /* 757 /*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index a153b3905f60..2e2af5d18191 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -72,21 +72,28 @@ static ssize_t msr_read(struct file *file, char __user *buf,
72 u32 data[2]; 72 u32 data[2];
73 u32 reg = *ppos; 73 u32 reg = *ppos;
74 int cpu = iminor(file->f_path.dentry->d_inode); 74 int cpu = iminor(file->f_path.dentry->d_inode);
75 int err; 75 int err = 0;
76 ssize_t bytes = 0;
76 77
77 if (count % 8) 78 if (count % 8)
78 return -EINVAL; /* Invalid chunk size */ 79 return -EINVAL; /* Invalid chunk size */
79 80
80 for (; count; count -= 8) { 81 for (; count; count -= 8) {
81 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); 82 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
82 if (err) 83 if (err) {
83 return -EIO; 84 if (err == -EFAULT) /* Fix idiotic error code */
84 if (copy_to_user(tmp, &data, 8)) 85 err = -EIO;
85 return -EFAULT; 86 break;
87 }
88 if (copy_to_user(tmp, &data, 8)) {
89 err = -EFAULT;
90 break;
91 }
86 tmp += 2; 92 tmp += 2;
93 bytes += 8;
87 } 94 }
88 95
89 return ((char __user *)tmp) - buf; 96 return bytes ? bytes : err;
90} 97}
91 98
92static ssize_t msr_write(struct file *file, const char __user *buf, 99static ssize_t msr_write(struct file *file, const char __user *buf,
@@ -96,21 +103,28 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
96 u32 data[2]; 103 u32 data[2];
97 u32 reg = *ppos; 104 u32 reg = *ppos;
98 int cpu = iminor(file->f_path.dentry->d_inode); 105 int cpu = iminor(file->f_path.dentry->d_inode);
99 int err; 106 int err = 0;
107 ssize_t bytes = 0;
100 108
101 if (count % 8) 109 if (count % 8)
102 return -EINVAL; /* Invalid chunk size */ 110 return -EINVAL; /* Invalid chunk size */
103 111
104 for (; count; count -= 8) { 112 for (; count; count -= 8) {
105 if (copy_from_user(&data, tmp, 8)) 113 if (copy_from_user(&data, tmp, 8)) {
106 return -EFAULT; 114 err = -EFAULT;
115 break;
116 }
107 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); 117 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
108 if (err) 118 if (err) {
109 return -EIO; 119 if (err == -EFAULT) /* Fix idiotic error code */
120 err = -EIO;
121 break;
122 }
110 tmp += 2; 123 tmp += 2;
124 bytes += 8;
111 } 125 }
112 126
113 return ((char __user *)tmp) - buf; 127 return bytes ? bytes : err;
114} 128}
115 129
116static int msr_open(struct inode *inode, struct file *file) 130static int msr_open(struct inode *inode, struct file *file)
@@ -131,7 +145,7 @@ static int msr_open(struct inode *inode, struct file *file)
131 ret = -EIO; /* MSR not supported */ 145 ret = -EIO; /* MSR not supported */
132out: 146out:
133 unlock_kernel(); 147 unlock_kernel();
134 return 0; 148 return ret;
135} 149}
136 150
137/* 151/*
@@ -149,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu)
149{ 163{
150 struct device *dev; 164 struct device *dev;
151 165
152 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), 166 dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu),
153 "msr%d", cpu); 167 NULL, "msr%d", cpu);
154 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 168 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
155} 169}
156 170
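
The msr.c loops now report partial transfers and only return the error when nothing moved, which is ordinary read(2)/write(2) semantics; the open path also stops swallowing its error code. A user-space sketch of the chunked interface (assumptions: the msr module is loaded, /dev/cpu/0/msr exists, and the caller is privileged; MSR 0x10 is just an illustrative register):

        #include <fcntl.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                uint64_t val;
                int fd = open("/dev/cpu/0/msr", O_RDONLY);

                if (fd < 0)
                        return 1;
                /* the file offset selects the MSR; each chunk is 8 bytes */
                if (pread(fd, &val, sizeof(val), 0x10) != sizeof(val)) {
                        close(fd);
                        return 1;
                }
                printf("MSR 0x10: %#llx\n", (unsigned long long)val);
                close(fd);
                return 0;
        }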
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ec024b3baad0..abb78a2cc4ad 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -114,6 +114,23 @@ static __init void nmi_cpu_busy(void *data)
114} 114}
115#endif 115#endif
116 116
117static void report_broken_nmi(int cpu, int *prev_nmi_count)
118{
119 printk(KERN_CONT "\n");
120
121 printk(KERN_WARNING
122 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
123 cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
124
125 printk(KERN_WARNING
126 "Please report this to bugzilla.kernel.org,\n");
127 printk(KERN_WARNING
128 "and attach the output of the 'dmesg' command.\n");
129
130 per_cpu(wd_enabled, cpu) = 0;
131 atomic_dec(&nmi_active);
132}
133
117int __init check_nmi_watchdog(void) 134int __init check_nmi_watchdog(void)
118{ 135{
119 unsigned int *prev_nmi_count; 136 unsigned int *prev_nmi_count;
@@ -141,15 +158,8 @@ int __init check_nmi_watchdog(void)
141 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
142 if (!per_cpu(wd_enabled, cpu)) 159 if (!per_cpu(wd_enabled, cpu))
143 continue; 160 continue;
144 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { 161 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
145 printk(KERN_WARNING "WARNING: CPU#%d: NMI " 162 report_broken_nmi(cpu, prev_nmi_count);
146 "appears to be stuck (%d->%d)!\n",
147 cpu,
148 prev_nmi_count[cpu],
149 get_nmi_count(cpu));
150 per_cpu(wd_enabled, cpu) = 0;
151 atomic_dec(&nmi_active);
152 }
153 } 163 }
154 endflag = 1; 164 endflag = 1;
155 if (!atomic_read(&nmi_active)) { 165 if (!atomic_read(&nmi_active)) {
@@ -263,7 +273,7 @@ late_initcall(init_lapic_nmi_sysfs);
263 273
264static void __acpi_nmi_enable(void *__unused) 274static void __acpi_nmi_enable(void *__unused)
265{ 275{
266 apic_write_around(APIC_LVT0, APIC_DM_NMI); 276 apic_write(APIC_LVT0, APIC_DM_NMI);
267} 277}
268 278
269/* 279/*
@@ -277,7 +287,7 @@ void acpi_nmi_enable(void)
277 287
278static void __acpi_nmi_disable(void *__unused) 288static void __acpi_nmi_disable(void *__unused)
279{ 289{
280 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); 290 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
281} 291}
282 292
283/* 293/*
@@ -448,6 +458,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
448 458
449#ifdef CONFIG_SYSCTL 459#ifdef CONFIG_SYSCTL
450 460
461static int __init setup_unknown_nmi_panic(char *str)
462{
463 unknown_nmi_panic = 1;
464 return 1;
465}
466__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
467
451static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) 468static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
452{ 469{
453 unsigned char reason = get_nmi_reason(); 470 unsigned char reason = get_nmi_reason();
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index a23e8233b9ac..eecc8c18f010 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -33,6 +33,7 @@
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h> 34#include <asm/mpspec.h>
35#include <asm/e820.h> 35#include <asm/e820.h>
36#include <asm/setup.h>
36 37
37#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) 38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
38 39
@@ -71,6 +72,188 @@ static void __init smp_dump_qct(void)
71 } 72 }
72} 73}
73 74
75
76void __cpuinit numaq_tsc_disable(void)
77{
78 if (!found_numaq)
79 return;
80
81 if (num_online_nodes() > 1) {
82 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
83 setup_clear_cpu_cap(X86_FEATURE_TSC);
84 }
85}
86
87static int __init numaq_pre_time_init(void)
88{
89 numaq_tsc_disable();
90 return 0;
91}
92
93int found_numaq;
94/*
95 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this ....
98 */
99struct mpc_config_translation {
100 unsigned char mpc_type;
101 unsigned char trans_len;
102 unsigned char trans_type;
103 unsigned char trans_quad;
104 unsigned char trans_global;
105 unsigned char trans_local;
106 unsigned short trans_reserved;
107};
108
109/* x86_quirks member */
110static int mpc_record;
111static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
112 __cpuinitdata;
113
114static inline int generate_logical_apicid(int quad, int phys_apicid)
115{
116 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
117}
118
119/* x86_quirks member */
120static int mpc_apic_id(struct mpc_config_processor *m)
121{
122 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
124
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->mpc_apicid,
127 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
128 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
129 m->mpc_apicver, quad, logical_apicid);
130 return logical_apicid;
131}
132
133int mp_bus_id_to_node[MAX_MP_BUSSES];
134
135int mp_bus_id_to_local[MAX_MP_BUSSES];
136
137/* x86_quirks member */
138static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
139{
140 int quad = translation_table[mpc_record]->trans_quad;
141 int local = translation_table[mpc_record]->trans_local;
142
143 mp_bus_id_to_node[m->mpc_busid] = quad;
144 mp_bus_id_to_local[m->mpc_busid] = local;
145 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
146 m->mpc_busid, name, quad);
147}
148
149int quad_local_to_mp_bus_id [NR_CPUS/4][4];
150
151/* x86_quirks member */
152static void mpc_oem_pci_bus(struct mpc_config_bus *m)
153{
154 int quad = translation_table[mpc_record]->trans_quad;
155 int local = translation_table[mpc_record]->trans_local;
156
157 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
158}
159
160static void __init MP_translation_info(struct mpc_config_translation *m)
161{
162 printk(KERN_INFO
163 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
164 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
165 m->trans_local);
166
167 if (mpc_record >= MAX_MPC_ENTRY)
168 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
169 else
170 translation_table[mpc_record] = m; /* stash this for later */
171 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
172 node_set_online(m->trans_quad);
173}
174
175static int __init mpf_checksum(unsigned char *mp, int len)
176{
177 int sum = 0;
178
179 while (len--)
180 sum += *mp++;
181
182 return sum & 0xFF;
183}
184
185/*
186 * Read/parse the MPC oem tables
187 */
188
189static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
190 unsigned short oemsize)
191{
192 int count = sizeof(*oemtable); /* the header size */
193 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
194
195 mpc_record = 0;
196 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
197 oemtable);
198 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
199 printk(KERN_WARNING
200 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
201 oemtable->oem_signature[0], oemtable->oem_signature[1],
202 oemtable->oem_signature[2], oemtable->oem_signature[3]);
203 return;
204 }
205 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
206 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
207 return;
208 }
209 while (count < oemtable->oem_length) {
210 switch (*oemptr) {
211 case MP_TRANSLATION:
212 {
213 struct mpc_config_translation *m =
214 (struct mpc_config_translation *)oemptr;
215 MP_translation_info(m);
216 oemptr += sizeof(*m);
217 count += sizeof(*m);
218 ++mpc_record;
219 break;
220 }
221 default:
222 {
223 printk(KERN_WARNING
224 "Unrecognised OEM table entry type! - %d\n",
225 (int)*oemptr);
226 return;
227 }
228 }
229 }
230}
231
232static struct x86_quirks numaq_x86_quirks __initdata = {
233 .arch_pre_time_init = numaq_pre_time_init,
234 .arch_time_init = NULL,
235 .arch_pre_intr_init = NULL,
236 .arch_memory_setup = NULL,
237 .arch_intr_init = NULL,
238 .arch_trap_init = NULL,
239 .mach_get_smp_config = NULL,
240 .mach_find_smp_config = NULL,
241 .mpc_record = &mpc_record,
242 .mpc_apic_id = mpc_apic_id,
243 .mpc_oem_bus_info = mpc_oem_bus_info,
244 .mpc_oem_pci_bus = mpc_oem_pci_bus,
245 .smp_read_mpc_oem = smp_read_mpc_oem,
246};
247
248void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
249 char *productid)
250{
251 if (strncmp(oem, "IBM NUMA", 8))
252 printk("Warning! Not a NUMA-Q system!\n");
253 else
254 found_numaq = 1;
255}
256
74static __init void early_check_numaq(void) 257static __init void early_check_numaq(void)
75{ 258{
76 /* 259 /*
@@ -82,6 +265,9 @@ static __init void early_check_numaq(void)
82 */ 265 */
83 if (smp_found_config) 266 if (smp_found_config)
84 early_get_smp_config(); 267 early_get_smp_config();
268
269 if (found_numaq)
270 x86_quirks = &numaq_x86_quirks;
85} 271}
86 272
87int __init get_memcfg_numaq(void) 273int __init get_memcfg_numaq(void)
@@ -92,14 +278,3 @@ int __init get_memcfg_numaq(void)
92 smp_dump_qct(); 278 smp_dump_qct();
93 return 1; 279 return 1;
94} 280}
95
96void __init numaq_tsc_disable(void)
97{
98 if (!found_numaq)
99 return;
100
101 if (num_online_nodes() > 1) {
102 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
103 setup_clear_cpu_cap(X86_FEATURE_TSC);
104 }
105}
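
Net effect of the mpparse.c/numaq_32.c pair: the NUMA-Q translation-table code moves here essentially verbatim and is wired up through the new x86_quirks hook table, so the generic MP-table parser no longer carries #ifdef CONFIG_X86_NUMAQ blocks. A hedged sketch of how another subarch would reuse the same hooks (all names below are illustrative):

        static int example_get_smp_config(unsigned int early)
        {
                return 0;       /* 0 = let the generic code proceed */
        }

        static struct x86_quirks example_x86_quirks __initdata = {
                /* fill only the members you need; NULL hooks fall back */
                .mach_get_smp_config = example_get_smp_config,
        };

        static void __init example_early_setup(void)
        {
                x86_quirks = &example_x86_quirks;       /* before MP parsing */
        }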
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e0f571d58c19..300da17e61cb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h> 31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h>
32#include <asm/time.h> 33#include <asm/time.h>
33#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
34#include <asm/irq.h> 35#include <asm/irq.h>
@@ -123,6 +124,7 @@ static void *get_call_destination(u8 type)
123 .pv_irq_ops = pv_irq_ops, 124 .pv_irq_ops = pv_irq_ops,
124 .pv_apic_ops = pv_apic_ops, 125 .pv_apic_ops = pv_apic_ops,
125 .pv_mmu_ops = pv_mmu_ops, 126 .pv_mmu_ops = pv_mmu_ops,
127 .pv_lock_ops = pv_lock_ops,
126 }; 128 };
127 return *((void **)&tmpl + type); 129 return *((void **)&tmpl + type);
128} 130}
@@ -266,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
266 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
267} 269}
268 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
269struct pv_info pv_info = { 282struct pv_info pv_info = {
270 .name = "bare hardware", 283 .name = "bare hardware",
271 .paravirt_enabled = 0, 284 .paravirt_enabled = 0,
@@ -361,7 +374,6 @@ struct pv_cpu_ops pv_cpu_ops = {
361struct pv_apic_ops pv_apic_ops = { 374struct pv_apic_ops pv_apic_ops = {
362#ifdef CONFIG_X86_LOCAL_APIC 375#ifdef CONFIG_X86_LOCAL_APIC
363 .apic_write = native_apic_write, 376 .apic_write = native_apic_write,
364 .apic_write_atomic = native_apic_write_atomic,
365 .apic_read = native_apic_read, 377 .apic_read = native_apic_read,
366 .setup_boot_clock = setup_boot_APIC_clock, 378 .setup_boot_clock = setup_boot_APIC_clock,
367 .setup_secondary_clock = setup_secondary_APIC_clock, 379 .setup_secondary_clock = setup_secondary_APIC_clock,
@@ -373,6 +385,9 @@ struct pv_mmu_ops pv_mmu_ops = {
373#ifndef CONFIG_X86_64 385#ifndef CONFIG_X86_64
374 .pagetable_setup_start = native_pagetable_setup_start, 386 .pagetable_setup_start = native_pagetable_setup_start,
375 .pagetable_setup_done = native_pagetable_setup_done, 387 .pagetable_setup_done = native_pagetable_setup_done,
388#else
389 .pagetable_setup_start = paravirt_nop,
390 .pagetable_setup_done = paravirt_nop,
376#endif 391#endif
377 392
378 .read_cr2 = native_read_cr2, 393 .read_cr2 = native_read_cr2,
@@ -428,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
428#endif /* PAGETABLE_LEVELS >= 3 */ 443#endif /* PAGETABLE_LEVELS >= 3 */
429 444
430 .pte_val = native_pte_val, 445 .pte_val = native_pte_val,
431 .pte_flags = native_pte_val, 446 .pte_flags = native_pte_flags,
432 .pgd_val = native_pgd_val, 447 .pgd_val = native_pgd_val,
433 448
434 .make_pte = native_make_pte, 449 .make_pte = native_make_pte,
@@ -446,6 +461,18 @@ struct pv_mmu_ops pv_mmu_ops = {
446 .set_fixmap = native_set_fixmap, 461 .set_fixmap = native_set_fixmap,
447}; 462};
448 463
464struct pv_lock_ops pv_lock_ops = {
465#ifdef CONFIG_SMP
466 .spin_is_locked = __ticket_spin_is_locked,
467 .spin_is_contended = __ticket_spin_is_contended,
468
469 .spin_lock = __ticket_spin_lock,
470 .spin_trylock = __ticket_spin_trylock,
471 .spin_unlock = __ticket_spin_unlock,
472#endif
473};
474EXPORT_SYMBOL(pv_lock_ops);
475
449EXPORT_SYMBOL_GPL(pv_time_ops); 476EXPORT_SYMBOL_GPL(pv_time_ops);
450EXPORT_SYMBOL (pv_cpu_ops); 477EXPORT_SYMBOL (pv_cpu_ops);
451EXPORT_SYMBOL (pv_mmu_ops); 478EXPORT_SYMBOL (pv_mmu_ops);
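
The paravirt.c hunks make spinlocks pluggable: pv_lock_ops defaults to the native ticket-lock implementations, and a hypervisor guest can call paravirt_use_bytelocks() during setup to swap in the byte-lock variants wholesale. A sketch of the call-through this creates (assumption about the wrapper shape in <asm/spinlock.h>):

        static inline void example_spin_lock(raw_spinlock_t *lock)
        {
                /* indirect today; boot-time patching can devirtualize it */
                pv_lock_ops.spin_lock(lock);
        }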
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6959b5c45df4..dcdac6c826e9 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -29,6 +29,7 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h>
32#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
33#include <linux/bitops.h> 34#include <linux/bitops.h>
34#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
@@ -36,7 +37,8 @@
36#include <linux/delay.h> 37#include <linux/delay.h>
37#include <linux/scatterlist.h> 38#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h> 39#include <linux/iommu-helper.h>
39#include <asm/gart.h> 40
41#include <asm/iommu.h>
40#include <asm/calgary.h> 42#include <asm/calgary.h>
41#include <asm/tce.h> 43#include <asm/tce.h>
42#include <asm/pci-direct.h> 44#include <asm/pci-direct.h>
@@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl);
167static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); 169static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
168static void calioc2_tce_cache_blast(struct iommu_table *tbl); 170static void calioc2_tce_cache_blast(struct iommu_table *tbl);
169static void calioc2_dump_error_regs(struct iommu_table *tbl); 171static void calioc2_dump_error_regs(struct iommu_table *tbl);
172static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
173static void get_tce_space_from_tar(void);
170 174
171static struct cal_chipset_ops calgary_chip_ops = { 175static struct cal_chipset_ops calgary_chip_ops = {
172 .handle_quirks = calgary_handle_quirks, 176 .handle_quirks = calgary_handle_quirks,
@@ -339,9 +343,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
339 /* were we called with bad_dma_address? */ 343 /* were we called with bad_dma_address? */
340 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 344 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
341 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 345 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
342 printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " 346 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
343 "address 0x%Lx\n", dma_addr); 347 "address 0x%Lx\n", dma_addr);
344 WARN_ON(1);
345 return; 348 return;
346 } 349 }
347 350
@@ -410,22 +413,6 @@ static void calgary_unmap_sg(struct device *dev,
410 } 413 }
411} 414}
412 415
413static int calgary_nontranslate_map_sg(struct device* dev,
414 struct scatterlist *sg, int nelems, int direction)
415{
416 struct scatterlist *s;
417 int i;
418
419 for_each_sg(sg, s, nelems, i) {
420 struct page *p = sg_page(s);
421
422 BUG_ON(!p);
423 s->dma_address = virt_to_bus(sg_virt(s));
424 s->dma_length = s->length;
425 }
426 return nelems;
427}
428
429static int calgary_map_sg(struct device *dev, struct scatterlist *sg, 416static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
430 int nelems, int direction) 417 int nelems, int direction)
431{ 418{
@@ -436,9 +423,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
436 unsigned long entry; 423 unsigned long entry;
437 int i; 424 int i;
438 425
439 if (!translation_enabled(tbl))
440 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
441
442 for_each_sg(sg, s, nelems, i) { 426 for_each_sg(sg, s, nelems, i) {
443 BUG_ON(!sg_page(s)); 427 BUG_ON(!sg_page(s));
444 428
@@ -474,7 +458,6 @@ error:
474static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, 458static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
475 size_t size, int direction) 459 size_t size, int direction)
476{ 460{
477 dma_addr_t dma_handle = bad_dma_address;
478 void *vaddr = phys_to_virt(paddr); 461 void *vaddr = phys_to_virt(paddr);
479 unsigned long uaddr; 462 unsigned long uaddr;
480 unsigned int npages; 463 unsigned int npages;
@@ -483,12 +466,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
483 uaddr = (unsigned long)vaddr; 466 uaddr = (unsigned long)vaddr;
484 npages = num_dma_pages(uaddr, size); 467 npages = num_dma_pages(uaddr, size);
485 468
486 if (translation_enabled(tbl)) 469 return iommu_alloc(dev, tbl, vaddr, npages, direction);
487 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
488 else
489 dma_handle = virt_to_bus(vaddr);
490
491 return dma_handle;
492} 470}
493 471
494static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, 472static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -497,9 +475,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
497 struct iommu_table *tbl = find_iommu_table(dev); 475 struct iommu_table *tbl = find_iommu_table(dev);
498 unsigned int npages; 476 unsigned int npages;
499 477
500 if (!translation_enabled(tbl))
501 return;
502
503 npages = num_dma_pages(dma_handle, size); 478 npages = num_dma_pages(dma_handle, size);
504 iommu_free(tbl, dma_handle, npages); 479 iommu_free(tbl, dma_handle, npages);
505} 480}
@@ -522,18 +497,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
522 goto error; 497 goto error;
523 memset(ret, 0, size); 498 memset(ret, 0, size);
524 499
525 if (translation_enabled(tbl)) { 500 /* set up tces to cover the allocated range */
526 /* set up tces to cover the allocated range */ 501 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
527 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 502 if (mapping == bad_dma_address)
528 if (mapping == bad_dma_address) 503 goto free;
529 goto free; 504 *dma_handle = mapping;
530
531 *dma_handle = mapping;
532 } else /* non translated slot */
533 *dma_handle = virt_to_bus(ret);
534
535 return ret; 505 return ret;
536
537free: 506free:
538 free_pages((unsigned long)ret, get_order(size)); 507 free_pages((unsigned long)ret, get_order(size));
539 ret = NULL; 508 ret = NULL;
@@ -541,7 +510,7 @@ error:
541 return ret; 510 return ret;
542} 511}
543 512
544static const struct dma_mapping_ops calgary_dma_ops = { 513static struct dma_mapping_ops calgary_dma_ops = {
545 .alloc_coherent = calgary_alloc_coherent, 514 .alloc_coherent = calgary_alloc_coherent,
546 .map_single = calgary_map_single, 515 .map_single = calgary_map_single,
547 .unmap_single = calgary_unmap_single, 516 .unmap_single = calgary_unmap_single,
@@ -830,7 +799,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
830 799
831 tbl = pci_iommu(dev->bus); 800 tbl = pci_iommu(dev->bus);
832 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; 801 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
833 tce_free(tbl, 0, tbl->it_size); 802
803 if (is_kdump_kernel())
804 calgary_init_bitmap_from_tce_table(tbl);
805 else
806 tce_free(tbl, 0, tbl->it_size);
834 807
835 if (is_calgary(dev->device)) 808 if (is_calgary(dev->device))
836 tbl->chip_ops = &calgary_chip_ops; 809 tbl->chip_ops = &calgary_chip_ops;
@@ -1209,6 +1182,10 @@ static int __init calgary_init(void)
1209 if (ret) 1182 if (ret)
1210 return ret; 1183 return ret;
1211 1184
1185 /* Purely for kdump kernel case */
1186 if (is_kdump_kernel())
1187 get_tce_space_from_tar();
1188
1212 do { 1189 do {
1213 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); 1190 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1214 if (!dev) 1191 if (!dev)
@@ -1230,6 +1207,16 @@ static int __init calgary_init(void)
1230 goto error; 1207 goto error;
1231 } while (1); 1208 } while (1);
1232 1209
1210 dev = NULL;
1211 for_each_pci_dev(dev) {
1212 struct iommu_table *tbl;
1213
1214 tbl = find_iommu_table(&dev->dev);
1215
1216 if (translation_enabled(tbl))
1217 dev->dev.archdata.dma_ops = &calgary_dma_ops;
1218 }
1219
1233 return ret; 1220 return ret;
1234 1221
1235error: 1222error:
@@ -1251,6 +1238,7 @@ error:
1251 calgary_disable_translation(dev); 1238 calgary_disable_translation(dev);
1252 calgary_free_bus(dev); 1239 calgary_free_bus(dev);
1253 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ 1240 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1241 dev->dev.archdata.dma_ops = NULL;
1254 } while (1); 1242 } while (1);
1255 1243
1256 return ret; 1244 return ret;
@@ -1280,13 +1268,15 @@ static inline int __init determine_tce_table_size(u64 ram)
1280static int __init build_detail_arrays(void) 1268static int __init build_detail_arrays(void)
1281{ 1269{
1282 unsigned long ptr; 1270 unsigned long ptr;
1283 int i, scal_detail_size, rio_detail_size; 1271 unsigned numnodes, i;
1272 int scal_detail_size, rio_detail_size;
1284 1273
1285 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ 1274 numnodes = rio_table_hdr->num_scal_dev;
1275 if (numnodes > MAX_NUMNODES){
1286 printk(KERN_WARNING 1276 printk(KERN_WARNING
1287 "Calgary: MAX_NUMNODES too low! Defined as %d, " 1277 "Calgary: MAX_NUMNODES too low! Defined as %d, "
1288 "but system has %d nodes.\n", 1278 "but system has %d nodes.\n",
1289 MAX_NUMNODES, rio_table_hdr->num_scal_dev); 1279 MAX_NUMNODES, numnodes);
1290 return -ENODEV; 1280 return -ENODEV;
1291 } 1281 }
1292 1282
@@ -1307,8 +1297,7 @@ static int __init build_detail_arrays(void)
1307 } 1297 }
1308 1298
1309 ptr = ((unsigned long)rio_table_hdr) + 3; 1299 ptr = ((unsigned long)rio_table_hdr) + 3;
1310 for (i = 0; i < rio_table_hdr->num_scal_dev; 1300 for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
1311 i++, ptr += scal_detail_size)
1312 scal_devs[i] = (struct scal_detail *)ptr; 1301 scal_devs[i] = (struct scal_detail *)ptr;
1313 1302
1314 for (i = 0; i < rio_table_hdr->num_rio_dev; 1303 for (i = 0; i < rio_table_hdr->num_rio_dev;
@@ -1339,6 +1328,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1339 return (val != 0xffffffff); 1328 return (val != 0xffffffff);
1340} 1329}
1341 1330
1331/*
1332 * calgary_init_bitmap_from_tce_table():
1333 * Function for the kdump case. In the second/kdump kernel, initialize
1334 * the bitmap based on the tce table entries obtained from the first kernel
1335 */
1336static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1337{
1338 u64 *tp;
1339 unsigned int index;
1340 tp = ((u64 *)tbl->it_base);
1341 for (index = 0 ; index < tbl->it_size; index++) {
1342 if (*tp != 0x0)
1343 set_bit(index, tbl->it_map);
1344 tp++;
1345 }
1346}
1347
1348/*
1349 * get_tce_space_from_tar():
1350 * Function for the kdump case. Get the tce tables from the first kernel
1351 * by reading the contents of the base address register of the calgary iommu
1352 */
1353static void __init get_tce_space_from_tar(void)
1354{
1355 int bus;
1356 void __iomem *target;
1357 unsigned long tce_space;
1358
1359 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1360 struct calgary_bus_info *info = &bus_info[bus];
1361 unsigned short pci_device;
1362 u32 val;
1363
1364 val = read_pci_config(bus, 0, 0, 0);
1365 pci_device = (val & 0xFFFF0000) >> 16;
1366
1367 if (!is_cal_pci_dev(pci_device))
1368 continue;
1369 if (info->translation_disabled)
1370 continue;
1371
1372 if (calgary_bus_has_devices(bus, pci_device) ||
1373 translate_empty_slots) {
1374 target = calgary_reg(bus_info[bus].bbar,
1375 tar_offset(bus));
1376 tce_space = be64_to_cpu(readq(target));
1377 tce_space = tce_space & TAR_SW_BITS;
1378
1379 tce_space = tce_space & (~specified_table_size);
1380 info->tce_space = (u64 *)__va(tce_space);
1381 }
1382 }
1383 return;
1384}
1385
1342void __init detect_calgary(void) 1386void __init detect_calgary(void)
1343{ 1387{
1344 int bus; 1388 int bus;
@@ -1394,7 +1438,8 @@ void __init detect_calgary(void)
1394 return; 1438 return;
1395 } 1439 }
1396 1440
1397 specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); 1441 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
1442 saved_max_pfn : max_pfn) * PAGE_SIZE);
1398 1443
1399 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 1444 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1400 struct calgary_bus_info *info = &bus_info[bus]; 1445 struct calgary_bus_info *info = &bus_info[bus];
@@ -1412,10 +1457,16 @@ void __init detect_calgary(void)
1412 1457
1413 if (calgary_bus_has_devices(bus, pci_device) || 1458 if (calgary_bus_has_devices(bus, pci_device) ||
1414 translate_empty_slots) { 1459 translate_empty_slots) {
1415 tbl = alloc_tce_table(); 1460 /*
1416 if (!tbl) 1461 * If it is a kdump kernel, find and use the tce tables
1417 goto cleanup; 1462 * from the first kernel, else allocate tce tables here
1418 info->tce_space = tbl; 1463 */
1464 if (!is_kdump_kernel()) {
1465 tbl = alloc_tce_table();
1466 if (!tbl)
1467 goto cleanup;
1468 info->tce_space = tbl;
1469 }
1419 calgary_found = 1; 1470 calgary_found = 1;
1420 } 1471 }
1421 } 1472 }
@@ -1430,6 +1481,10 @@ void __init detect_calgary(void)
1430 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1481 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1431 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1482 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1432 debugging ? "enabled" : "disabled"); 1483 debugging ? "enabled" : "disabled");
1484
1485 /* swiotlb for devices that aren't behind the Calgary. */
1486 if (max_pfn > MAX_DMA32_PFN)
1487 swiotlb = 1;
1433 } 1488 }
1434 return; 1489 return;
1435 1490
@@ -1446,7 +1501,7 @@ int __init calgary_iommu_init(void)
1446{ 1501{
1447 int ret; 1502 int ret;
1448 1503
1449 if (no_iommu || swiotlb) 1504 if (no_iommu || (swiotlb && !calgary_detected))
1450 return -ENODEV; 1505 return -ENODEV;
1451 1506
1452 if (!calgary_detected) 1507 if (!calgary_detected)
@@ -1459,15 +1514,14 @@ int __init calgary_iommu_init(void)
1459 if (ret) { 1514 if (ret) {
1460 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " 1515 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1461 "falling back to no_iommu\n", ret); 1516 "falling back to no_iommu\n", ret);
1462 if (max_pfn > MAX_DMA32_PFN)
1463 printk(KERN_ERR "WARNING more than 4GB of memory, "
1464 "32bit PCI may malfunction.\n");
1465 return ret; 1517 return ret;
1466 } 1518 }
1467 1519
1468 force_iommu = 1; 1520 force_iommu = 1;
1469 bad_dma_address = 0x0; 1521 bad_dma_address = 0x0;
1470 dma_ops = &calgary_dma_ops; 1522 /* dma_ops is set to swiotlb or nommu */
1523 if (!dma_ops)
1524 dma_ops = &nommu_dma_ops;
1471 1525
1472 return 0; 1526 return 0;
1473} 1527}
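
The kdump changes above work because the Calgary TCE mappings made by the crashed kernel are still live in memory: the second kernel locates them through the TAR register and then rebuilds its allocation bitmap from the non-zero entries instead of freeing the table. A self-contained sketch of that bitmap rebuild follows; the table size and contents are invented for the demo, whereas the real size comes out of the TAR.

#include <stdint.h>
#include <stdio.h>

#define TCE_ENTRIES   64
#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Mirror of calgary_init_bitmap_from_tce_table(): any non-zero TCE entry
 * left behind by the first kernel marks its slot as in-use, so the kdump
 * kernel never hands that IOMMU page out again. */
static void init_bitmap_from_tce_table(const uint64_t *tce, unsigned int n,
				       unsigned long *bitmap)
{
	for (unsigned int i = 0; i < n; i++)
		if (tce[i] != 0)
			bitmap[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);
}

int main(void)
{
	uint64_t tce[TCE_ENTRIES] = { 0 };
	unsigned long bitmap[TCE_ENTRIES / BITS_PER_LONG + 1] = { 0 };

	tce[3] = 0xdeadbeefULL;	/* pretend the first kernel mapped slot 3 */
	init_bitmap_from_tce_table(tce, TCE_ENTRIES, bitmap);
	printf("slot 3 in use: %lu\n", (bitmap[0] >> 3) & 1UL);
	return 0;
}
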
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 8467ec2320f1..87d4d6964ec2 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -5,14 +5,13 @@
5 5
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/calgary.h> 9#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 10#include <asm/amd_iommu.h>
11 11
12int forbid_dac __read_mostly; 12static int forbid_dac __read_mostly;
13EXPORT_SYMBOL(forbid_dac);
14 13
15const struct dma_mapping_ops *dma_ops; 14struct dma_mapping_ops *dma_ops;
16EXPORT_SYMBOL(dma_ops); 15EXPORT_SYMBOL(dma_ops);
17 16
18static int iommu_sac_force __read_mostly; 17static int iommu_sac_force __read_mostly;
@@ -114,22 +113,24 @@ void __init pci_iommu_alloc(void)
114 * The order of these functions is important for 113 * The order of these functions is important for
115 * fall-back/fail-over reasons 114 * fall-back/fail-over reasons
116 */ 115 */
117#ifdef CONFIG_GART_IOMMU
118 gart_iommu_hole_init(); 116 gart_iommu_hole_init();
119#endif
120 117
121#ifdef CONFIG_CALGARY_IOMMU
122 detect_calgary(); 118 detect_calgary();
123#endif
124 119
125 detect_intel_iommu(); 120 detect_intel_iommu();
126 121
127 amd_iommu_detect(); 122 amd_iommu_detect();
128 123
129#ifdef CONFIG_SWIOTLB
130 pci_swiotlb_init(); 124 pci_swiotlb_init();
131#endif
132} 125}
126
127unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
128{
129 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
130
131 return size >> PAGE_SHIFT;
132}
133EXPORT_SYMBOL(iommu_num_pages);
133#endif 134#endif
134 135
135/* 136/*
@@ -184,9 +185,7 @@ static __init int iommu_setup(char *p)
184 swiotlb = 1; 185 swiotlb = 1;
185#endif 186#endif
186 187
187#ifdef CONFIG_GART_IOMMU
188 gart_parse_options(p); 188 gart_parse_options(p);
189#endif
190 189
191#ifdef CONFIG_CALGARY_IOMMU 190#ifdef CONFIG_CALGARY_IOMMU
192 if (!strncmp(p, "calgary", 7)) 191 if (!strncmp(p, "calgary", 7))
@@ -201,136 +200,19 @@ static __init int iommu_setup(char *p)
201} 200}
202early_param("iommu", iommu_setup); 201early_param("iommu", iommu_setup);
203 202
204#ifdef CONFIG_X86_32
205int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
206 dma_addr_t device_addr, size_t size, int flags)
207{
208 void __iomem *mem_base = NULL;
209 int pages = size >> PAGE_SHIFT;
210 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
211
212 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
213 goto out;
214 if (!size)
215 goto out;
216 if (dev->dma_mem)
217 goto out;
218
219 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
220
221 mem_base = ioremap(bus_addr, size);
222 if (!mem_base)
223 goto out;
224
225 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
226 if (!dev->dma_mem)
227 goto out;
228 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
229 if (!dev->dma_mem->bitmap)
230 goto free1_out;
231
232 dev->dma_mem->virt_base = mem_base;
233 dev->dma_mem->device_base = device_addr;
234 dev->dma_mem->size = pages;
235 dev->dma_mem->flags = flags;
236
237 if (flags & DMA_MEMORY_MAP)
238 return DMA_MEMORY_MAP;
239
240 return DMA_MEMORY_IO;
241
242 free1_out:
243 kfree(dev->dma_mem);
244 out:
245 if (mem_base)
246 iounmap(mem_base);
247 return 0;
248}
249EXPORT_SYMBOL(dma_declare_coherent_memory);
250
251void dma_release_declared_memory(struct device *dev)
252{
253 struct dma_coherent_mem *mem = dev->dma_mem;
254
255 if (!mem)
256 return;
257 dev->dma_mem = NULL;
258 iounmap(mem->virt_base);
259 kfree(mem->bitmap);
260 kfree(mem);
261}
262EXPORT_SYMBOL(dma_release_declared_memory);
263
264void *dma_mark_declared_memory_occupied(struct device *dev,
265 dma_addr_t device_addr, size_t size)
266{
267 struct dma_coherent_mem *mem = dev->dma_mem;
268 int pos, err;
269 int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
270
271 pages >>= PAGE_SHIFT;
272
273 if (!mem)
274 return ERR_PTR(-EINVAL);
275
276 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
277 err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
278 if (err != 0)
279 return ERR_PTR(err);
280 return mem->virt_base + (pos << PAGE_SHIFT);
281}
282EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
283
284static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
285 dma_addr_t *dma_handle, void **ret)
286{
287 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
288 int order = get_order(size);
289
290 if (mem) {
291 int page = bitmap_find_free_region(mem->bitmap, mem->size,
292 order);
293 if (page >= 0) {
294 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
295 *ret = mem->virt_base + (page << PAGE_SHIFT);
296 memset(*ret, 0, size);
297 }
298 if (mem->flags & DMA_MEMORY_EXCLUSIVE)
299 *ret = NULL;
300 }
301 return (mem != NULL);
302}
303
304static int dma_release_coherent(struct device *dev, int order, void *vaddr)
305{
306 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
307
308 if (mem && vaddr >= mem->virt_base && vaddr <
309 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
310 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
311
312 bitmap_release_region(mem->bitmap, page, order);
313 return 1;
314 }
315 return 0;
316}
317#else
318#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
319#define dma_release_coherent(dev, order, vaddr) (0)
320#endif /* CONFIG_X86_32 */
321
322int dma_supported(struct device *dev, u64 mask) 203int dma_supported(struct device *dev, u64 mask)
323{ 204{
205 struct dma_mapping_ops *ops = get_dma_ops(dev);
206
324#ifdef CONFIG_PCI 207#ifdef CONFIG_PCI
325 if (mask > 0xffffffff && forbid_dac > 0) { 208 if (mask > 0xffffffff && forbid_dac > 0) {
326 printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", 209 dev_info(dev, "PCI: Disallowing DAC for device\n");
327 dev->bus_id);
328 return 0; 210 return 0;
329 } 211 }
330#endif 212#endif
331 213
332 if (dma_ops->dma_supported) 214 if (ops->dma_supported)
333 return dma_ops->dma_supported(dev, mask); 215 return ops->dma_supported(dev, mask);
334 216
335 /* Copied from i386. Doesn't make much sense, because it will 217 /* Copied from i386. Doesn't make much sense, because it will
336 only work for pci_alloc_coherent. 218 only work for pci_alloc_coherent.
@@ -351,8 +233,7 @@ int dma_supported(struct device *dev, u64 mask)
351 type. Normally this doesn't make any difference, but gives 233 type. Normally this doesn't make any difference, but gives
352 more gentle handling of IOMMU overflow. */ 234 more gentle handling of IOMMU overflow. */
353 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { 235 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
354 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", 236 dev_info(dev, "Force SAC with mask %Lx\n", mask);
355 dev->bus_id, mask);
356 return 0; 237 return 0;
357 } 238 }
358 239
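
dma_supported() now fetches a per-device ops table through get_dma_ops() instead of always dereferencing the global dma_ops; this is what lets calgary_init() above attach calgary_dma_ops only to devices behind a translated bus while everything else falls back to the global (swiotlb or nommu) ops. A sketch of that fallback pattern, with illustrative names rather than the kernel's exact definitions:

#include <stdio.h>

struct dma_mapping_ops { const char *name; };

static struct dma_mapping_ops nommu_ops   = { "nommu" };
static struct dma_mapping_ops calgary_ops = { "calgary" };
static struct dma_mapping_ops *global_dma_ops = &nommu_ops;

struct device { struct dma_mapping_ops *archdata_dma_ops; };

/* Use the device's private ops if one was installed, else the global
 * default; this stands in for the kernel's get_dma_ops() helper. */
static struct dma_mapping_ops *get_dma_ops(struct device *dev)
{
	if (dev && dev->archdata_dma_ops)
		return dev->archdata_dma_ops;
	return global_dma_ops;
}

int main(void)
{
	struct device plain = { NULL };
	struct device behind_calgary = { &calgary_ops };

	printf("%s\n", get_dma_ops(&plain)->name);          /* nommu */
	printf("%s\n", get_dma_ops(&behind_calgary)->name); /* calgary */
	return 0;
}
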
@@ -378,6 +259,7 @@ void *
378dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, 259dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
379 gfp_t gfp) 260 gfp_t gfp)
380{ 261{
262 struct dma_mapping_ops *ops = get_dma_ops(dev);
381 void *memory = NULL; 263 void *memory = NULL;
382 struct page *page; 264 struct page *page;
383 unsigned long dma_mask = 0; 265 unsigned long dma_mask = 0;
@@ -387,7 +269,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
387 /* ignore region specifiers */ 269 /* ignore region specifiers */
388 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 270 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
389 271
390 if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory)) 272 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
391 return memory; 273 return memory;
392 274
393 if (!dev) { 275 if (!dev) {
@@ -446,8 +328,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
446 /* Let low level make its own zone decisions */ 328 /* Let low level make its own zone decisions */
447 gfp &= ~(GFP_DMA32|GFP_DMA); 329 gfp &= ~(GFP_DMA32|GFP_DMA);
448 330
449 if (dma_ops->alloc_coherent) 331 if (ops->alloc_coherent)
450 return dma_ops->alloc_coherent(dev, size, 332 return ops->alloc_coherent(dev, size,
451 dma_handle, gfp); 333 dma_handle, gfp);
452 return NULL; 334 return NULL;
453 } 335 }
@@ -459,14 +341,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
459 } 341 }
460 } 342 }
461 343
462 if (dma_ops->alloc_coherent) { 344 if (ops->alloc_coherent) {
463 free_pages((unsigned long)memory, get_order(size)); 345 free_pages((unsigned long)memory, get_order(size));
464 gfp &= ~(GFP_DMA|GFP_DMA32); 346 gfp &= ~(GFP_DMA|GFP_DMA32);
465 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); 347 return ops->alloc_coherent(dev, size, dma_handle, gfp);
466 } 348 }
467 349
468 if (dma_ops->map_simple) { 350 if (ops->map_simple) {
469 *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), 351 *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
470 size, 352 size,
471 PCI_DMA_BIDIRECTIONAL); 353 PCI_DMA_BIDIRECTIONAL);
472 if (*dma_handle != bad_dma_address) 354 if (*dma_handle != bad_dma_address)
@@ -488,29 +370,27 @@ EXPORT_SYMBOL(dma_alloc_coherent);
488void dma_free_coherent(struct device *dev, size_t size, 370void dma_free_coherent(struct device *dev, size_t size,
489 void *vaddr, dma_addr_t bus) 371 void *vaddr, dma_addr_t bus)
490{ 372{
373 struct dma_mapping_ops *ops = get_dma_ops(dev);
374
491 int order = get_order(size); 375 int order = get_order(size);
492 WARN_ON(irqs_disabled()); /* for portability */ 376 WARN_ON(irqs_disabled()); /* for portability */
493 if (dma_release_coherent(dev, order, vaddr)) 377 if (dma_release_from_coherent(dev, order, vaddr))
494 return; 378 return;
495 if (dma_ops->unmap_single) 379 if (ops->unmap_single)
496 dma_ops->unmap_single(dev, bus, size, 0); 380 ops->unmap_single(dev, bus, size, 0);
497 free_pages((unsigned long)vaddr, order); 381 free_pages((unsigned long)vaddr, order);
498} 382}
499EXPORT_SYMBOL(dma_free_coherent); 383EXPORT_SYMBOL(dma_free_coherent);
500 384
501static int __init pci_iommu_init(void) 385static int __init pci_iommu_init(void)
502{ 386{
503#ifdef CONFIG_CALGARY_IOMMU
504 calgary_iommu_init(); 387 calgary_iommu_init();
505#endif
506 388
507 intel_iommu_init(); 389 intel_iommu_init();
508 390
509 amd_iommu_init(); 391 amd_iommu_init();
510 392
511#ifdef CONFIG_GART_IOMMU
512 gart_iommu_init(); 393 gart_iommu_init();
513#endif
514 394
515 no_iommu_init(); 395 no_iommu_init();
516 return 0; 396 return 0;
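
The new iommu_num_pages() helper, now shared with the GART code below, counts how many whole pages a byte range touches, including the partial page the start address lands in. A standalone rendering of the same arithmetic:

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE) >> PAGE_SHIFT, written
 * with the usual power-of-two rounding idiom. */
static unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
{
	unsigned long size = ((addr & ~PAGE_MASK) + len + PAGE_SIZE - 1)
				& PAGE_MASK;

	return size >> PAGE_SHIFT;
}

int main(void)
{
	/* 1 byte at the very end of a page still spans 1 page... */
	printf("%lu\n", iommu_num_pages(0x1fff, 1));	/* 1 */
	/* ...but 2 bytes there cross into the next page. */
	printf("%lu\n", iommu_num_pages(0x1fff, 2));	/* 2 */
	printf("%lu\n", iommu_num_pages(0x1000, 8192));	/* 2 */
	return 0;
}
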
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c3fe78406d18..49285f8fd4d5 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -32,6 +32,7 @@
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/iommu.h>
35#include <asm/gart.h> 36#include <asm/gart.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
37#include <asm/swiotlb.h> 38#include <asm/swiotlb.h>
@@ -66,9 +67,6 @@ static u32 gart_unmapped_entry;
66 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) 67 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
67#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) 68#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
68 69
69#define to_pages(addr, size) \
70 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
71
72#define EMERGENCY_PAGES 32 /* = 128KB */ 70#define EMERGENCY_PAGES 32 /* = 128KB */
73 71
74#ifdef CONFIG_AGP 72#ifdef CONFIG_AGP
@@ -197,9 +195,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
197 * out. Hopefully no network devices use single mappings that big. 195 * out. Hopefully no network devices use single mappings that big.
198 */ 196 */
199 197
200 printk(KERN_ERR 198 dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
201 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
202 size, dev->bus_id);
203 199
204 if (size > PAGE_SIZE*EMERGENCY_PAGES) { 200 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
205 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 201 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
@@ -242,7 +238,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
242static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 238static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
243 size_t size, int dir) 239 size_t size, int dir)
244{ 240{
245 unsigned long npages = to_pages(phys_mem, size); 241 unsigned long npages = iommu_num_pages(phys_mem, size);
246 unsigned long iommu_page = alloc_iommu(dev, npages); 242 unsigned long iommu_page = alloc_iommu(dev, npages);
247 int i; 243 int i;
248 244
@@ -305,7 +301,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
305 return; 301 return;
306 302
307 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; 303 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
308 npages = to_pages(dma_addr, size); 304 npages = iommu_num_pages(dma_addr, size);
309 for (i = 0; i < npages; i++) { 305 for (i = 0; i < npages; i++) {
310 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 306 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
311 CLEAR_LEAK(iommu_page + i); 307 CLEAR_LEAK(iommu_page + i);
@@ -388,7 +384,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
388 } 384 }
389 385
390 addr = phys_addr; 386 addr = phys_addr;
391 pages = to_pages(s->offset, s->length); 387 pages = iommu_num_pages(s->offset, s->length);
392 while (pages--) { 388 while (pages--) {
393 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 389 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
394 SET_LEAK(iommu_page); 390 SET_LEAK(iommu_page);
@@ -471,7 +467,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
471 467
472 seg_size += s->length; 468 seg_size += s->length;
473 need = nextneed; 469 need = nextneed;
474 pages += to_pages(s->offset, s->length); 470 pages += iommu_num_pages(s->offset, s->length);
475 ps = s; 471 ps = s;
476 } 472 }
477 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) 473 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
@@ -693,8 +689,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
693 689
694extern int agp_amd64_init(void); 690extern int agp_amd64_init(void);
695 691
696static const struct dma_mapping_ops gart_dma_ops = { 692static struct dma_mapping_ops gart_dma_ops = {
697 .mapping_error = NULL,
698 .map_single = gart_map_single, 693 .map_single = gart_map_single,
699 .map_simple = gart_map_simple, 694 .map_simple = gart_map_simple,
700 .unmap_single = gart_unmap_single, 695 .unmap_single = gart_unmap_single,
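
The GPTE_ENCODE/GPTE_DECODE macros visible in the hunk context pack a 40-bit physical address into a 32-bit GART PTE: bits 12-31 stay in place, and bits 32-39 move down into PTE bits 4-11. The sketch below checks that the pair round-trips; GPTE_VALID and GPTE_COHERENT are taken to be 1 and 2, my reading of the kernel source of this era.

#include <stdint.h>
#include <stdio.h>

#define GPTE_VALID    1u
#define GPTE_COHERENT 2u
#define GPTE_ENCODE(x) ((uint32_t)(((x) & 0xfffff000) | (((x) >> 32) << 4) \
				   | GPTE_VALID | GPTE_COHERENT))
#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((uint64_t)(x) & 0xff0) << 28))

int main(void)
{
	uint64_t phys = 0x12abcde000ULL;	/* page-aligned, 40-bit */
	uint32_t pte  = GPTE_ENCODE(phys);
	uint64_t back = GPTE_DECODE(pte);

	printf("pte  = 0x%08x\n", pte);
	printf("back = 0x%llx (%s)\n", (unsigned long long)back,
	       back == phys ? "round-trips" : "mismatch");
	return 0;
}
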
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index aec43d56f49c..3f91f71cdc3e 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -7,7 +7,7 @@
7#include <linux/dma-mapping.h> 7#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 8#include <linux/scatterlist.h>
9 9
10#include <asm/gart.h> 10#include <asm/iommu.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
@@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75/* Make sure we keep the same behaviour */ 75struct dma_mapping_ops nommu_dma_ops = {
76static int nommu_mapping_error(dma_addr_t dma_addr)
77{
78#ifdef CONFIG_X86_32
79 return 0;
80#else
81 return (dma_addr == bad_dma_address);
82#endif
83}
84
85
86const struct dma_mapping_ops nommu_dma_ops = {
87 .map_single = nommu_map_single, 76 .map_single = nommu_map_single,
88 .map_sg = nommu_map_sg, 77 .map_sg = nommu_map_sg,
89 .mapping_error = nommu_mapping_error,
90 .is_phys = 1, 78 .is_phys = 1,
91}; 79};
92 80
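
With the mapping_error hook gone, nommu_dma_ops is reduced to the identity mapping: a DMA handle is simply the buffer's bus address, checked against the device's DMA mask. A rough userspace sketch of that idea; virt_to_bus() is faked with the virtual address here, while the real helper returns a physical address.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t dma_addr_t;

static dma_addr_t virt_to_bus(void *p) { return (uintptr_t)p; }

/* No IOMMU: the "mapping" is the identity, subject to the mask. */
static dma_addr_t nommu_map_single(uint64_t dev_mask, void *buf)
{
	dma_addr_t bus = virt_to_bus(buf);

	if (bus & ~dev_mask)
		return 0;	/* would be bad_dma_address in the kernel */
	return bus;
}

int main(void)
{
	char buf[64];
	dma_addr_t h = nommu_map_single(~0ULL, buf);

	printf("dma handle: %#llx\n", (unsigned long long)h);
	return 0;
}
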
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 82299cd1d04d..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -5,7 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/dma-mapping.h> 6#include <linux/dma-mapping.h>
7 7
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/swiotlb.h> 9#include <asm/swiotlb.h>
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); 18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
19} 19}
20 20
21const struct dma_mapping_ops swiotlb_dma_ops = { 21struct dma_mapping_ops swiotlb_dma_ops = {
22 .mapping_error = swiotlb_dma_mapping_error, 22 .mapping_error = swiotlb_dma_mapping_error,
23 .alloc_coherent = swiotlb_alloc_coherent, 23 .alloc_coherent = swiotlb_alloc_coherent,
24 .free_coherent = swiotlb_free_coherent, 24 .free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4d629c62f4f8..876e91890777 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -199,6 +200,7 @@ static void poll_idle(void)
199 * 200 *
200 * idle=mwait overrides this decision and forces the usage of mwait. 201 * idle=mwait overrides this decision and forces the usage of mwait.
201 */ 202 */
203static int __cpuinitdata force_mwait;
202 204
203#define MWAIT_INFO 0x05 205#define MWAIT_INFO 0x05
204#define MWAIT_ECX_EXTENDED_INFO 0x01 206#define MWAIT_ECX_EXTENDED_INFO 0x01
@@ -244,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
244 return 1; 246 return 1;
245} 247}
246 248
249static cpumask_t c1e_mask = CPU_MASK_NONE;
250static int c1e_detected;
251
252void c1e_remove_cpu(int cpu)
253{
254 cpu_clear(cpu, c1e_mask);
255}
256
247/* 257/*
248 * C1E aware idle routine. We check for C1E active in the interrupt 258 * C1E aware idle routine. We check for C1E active in the interrupt
249 * pending message MSR. If we detect C1E, then we handle it the same 259 * pending message MSR. If we detect C1E, then we handle it the same
@@ -251,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
251 */ 261 */
252static void c1e_idle(void) 262static void c1e_idle(void)
253{ 263{
254 static cpumask_t c1e_mask = CPU_MASK_NONE;
255 static int c1e_detected;
256
257 if (need_resched()) 264 if (need_resched())
258 return; 265 return;
259 266
@@ -263,8 +270,10 @@ static void c1e_idle(void)
263 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 270 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
264 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 271 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
265 c1e_detected = 1; 272 c1e_detected = 1;
266 mark_tsc_unstable("TSC halt in C1E"); 273 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
267 printk(KERN_INFO "System has C1E enabled\n"); 274 mark_tsc_unstable("TSC halt in AMD C1E");
275 printk(KERN_INFO "System has AMD C1E enabled\n");
276 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
268 } 277 }
269 } 278 }
270 279
@@ -326,6 +335,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
326 335
327static int __init idle_setup(char *str) 336static int __init idle_setup(char *str)
328{ 337{
338 if (!str)
339 return -EINVAL;
340
329 if (!strcmp(str, "poll")) { 341 if (!strcmp(str, "poll")) {
330 printk("using polling idle threads.\n"); 342 printk("using polling idle threads.\n");
331 pm_idle = poll_idle; 343 pm_idle = poll_idle;
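
c1e_idle() detects C1E by reading the AMD interrupt-pending MSR and testing the C1E-active bits, and with this patch it only marks the TSC unstable when the CPU lacks a constant TSC. Below is a userspace sketch of the same probe via the msr driver; the MSR number and mask are my reading of this era's headers and should be double-checked, and the program needs root plus /dev/cpu/0/msr (msr module loaded).

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_K8_INT_PENDING_MSG	0xc0010055	/* assumed value */
#define K8_INTP_C1E_ACTIVE_MASK	0x18000000	/* assumed value */

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	/* The msr driver maps the pread offset to the MSR number. */
	if (fd < 0 || pread(fd, &val, sizeof(val),
			    MSR_K8_INT_PENDING_MSG) != sizeof(val)) {
		perror("msr read");
		return 1;
	}
	printf("C1E %s\n", (val & K8_INTP_C1E_ACTIVE_MASK)
	       ? "active: TSC may halt in idle" : "not active");
	close(fd);
	return 0;
}
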
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0c3927accb00..31f40b24bf5d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,6 +55,7 @@
55#include <asm/tlbflush.h> 55#include <asm/tlbflush.h>
56#include <asm/cpu.h> 56#include <asm/cpu.h>
57#include <asm/kdebug.h> 57#include <asm/kdebug.h>
58#include <asm/idle.h>
58 59
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60 61
@@ -88,6 +89,7 @@ static void cpu_exit_clear(void)
88 cpu_clear(cpu, cpu_callin_map); 89 cpu_clear(cpu, cpu_callin_map);
89 90
90 numa_remove_cpu(cpu); 91 numa_remove_cpu(cpu);
92 c1e_remove_cpu(cpu);
91} 93}
92 94
93/* We don't actually take CPU down, just spin without interrupts. */ 95/* We don't actually take CPU down, just spin without interrupts. */
@@ -95,7 +97,6 @@ static inline void play_dead(void)
95{ 97{
96 /* This must be done before dead CPU ack */ 98 /* This must be done before dead CPU ack */
97 cpu_exit_clear(); 99 cpu_exit_clear();
98 wbinvd();
99 mb(); 100 mb();
100 /* Ack it */ 101 /* Ack it */
101 __get_cpu_var(cpu_state) = CPU_DEAD; 102 __get_cpu_var(cpu_state) = CPU_DEAD;
@@ -104,8 +105,8 @@ static inline void play_dead(void)
104 * With physical CPU hotplug, we should halt the cpu 105 * With physical CPU hotplug, we should halt the cpu
105 */ 106 */
106 local_irq_disable(); 107 local_irq_disable();
107 while (1) 108 /* mask all interrupts, flush any and all caches, and halt */
108 halt(); 109 wbinvd_halt();
109} 110}
110#else 111#else
111static inline void play_dead(void) 112static inline void play_dead(void)
@@ -128,7 +129,7 @@ void cpu_idle(void)
128 129
129 /* endless idle loop with no priority at all */ 130 /* endless idle loop with no priority at all */
130 while (1) { 131 while (1) {
131 tick_nohz_stop_sched_tick(); 132 tick_nohz_stop_sched_tick(1);
132 while (!need_resched()) { 133 while (!need_resched()) {
133 134
134 check_pgt_cache(); 135 check_pgt_cache();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a8e53626ac9a..e12e0e4dd256 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -93,14 +93,15 @@ DECLARE_PER_CPU(int, cpu_state);
93static inline void play_dead(void) 93static inline void play_dead(void)
94{ 94{
95 idle_task_exit(); 95 idle_task_exit();
96 wbinvd(); 96 c1e_remove_cpu(raw_smp_processor_id());
97
97 mb(); 98 mb();
98 /* Ack it */ 99 /* Ack it */
99 __get_cpu_var(cpu_state) = CPU_DEAD; 100 __get_cpu_var(cpu_state) = CPU_DEAD;
100 101
101 local_irq_disable(); 102 local_irq_disable();
102 while (1) 103 /* mask all interrupts, flush any and all caches, and halt */
103 halt(); 104 wbinvd_halt();
104} 105}
105#else 106#else
106static inline void play_dead(void) 107static inline void play_dead(void)
@@ -120,7 +121,7 @@ void cpu_idle(void)
120 current_thread_info()->status |= TS_POLLING; 121 current_thread_info()->status |= TS_POLLING;
121 /* endless idle loop with no priority at all */ 122 /* endless idle loop with no priority at all */
122 while (1) { 123 while (1) {
123 tick_nohz_stop_sched_tick(); 124 tick_nohz_stop_sched_tick(1);
124 while (!need_resched()) { 125 while (!need_resched()) {
125 126
126 rmb(); 127 rmb();
@@ -537,8 +538,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
537struct task_struct * 538struct task_struct *
538__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 539__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
539{ 540{
540 struct thread_struct *prev = &prev_p->thread, 541 struct thread_struct *prev = &prev_p->thread;
541 *next = &next_p->thread; 542 struct thread_struct *next = &next_p->thread;
542 int cpu = smp_processor_id(); 543 int cpu = smp_processor_id();
543 struct tss_struct *tss = &per_cpu(init_tss, cpu); 544 struct tss_struct *tss = &per_cpu(init_tss, cpu);
544 unsigned fsindex, gsindex; 545 unsigned fsindex, gsindex;
@@ -586,35 +587,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
586 587
587 /* 588 /*
588 * Switch FS and GS. 589 * Switch FS and GS.
590 *
591 * Segment register != 0 always requires a reload. Also
592 * reload when it has changed. When prev process used 64bit
593 * base always reload to avoid an information leak.
589 */ 594 */
590 { 595 if (unlikely(fsindex | next->fsindex | prev->fs)) {
591 /* segment register != 0 always requires a reload. 596 loadsegment(fs, next->fsindex);
592 also reload when it has changed. 597 /*
593 when prev process used 64bit base always reload 598 * Check if the user used a selector != 0; if yes
594 to avoid an information leak. */ 599 * clear 64bit base, since overloaded base is always
595 if (unlikely(fsindex | next->fsindex | prev->fs)) { 600 * mapped to the Null selector
596 loadsegment(fs, next->fsindex); 601 */
597 /* check if the user used a selector != 0 602 if (fsindex)
598 * if yes clear 64bit base, since overloaded base
599 * is always mapped to the Null selector
600 */
601 if (fsindex)
602 prev->fs = 0; 603 prev->fs = 0;
603 } 604 }
604 /* when next process has a 64bit base use it */ 605 /* when next process has a 64bit base use it */
605 if (next->fs) 606 if (next->fs)
606 wrmsrl(MSR_FS_BASE, next->fs); 607 wrmsrl(MSR_FS_BASE, next->fs);
607 prev->fsindex = fsindex; 608 prev->fsindex = fsindex;
608 609
609 if (unlikely(gsindex | next->gsindex | prev->gs)) { 610 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex); 611 load_gs_index(next->gsindex);
611 if (gsindex) 612 if (gsindex)
612 prev->gs = 0; 613 prev->gs = 0;
613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
617 } 614 }
615 if (next->gs)
616 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
617 prev->gsindex = gsindex;
618 618
619 /* Must be after DS reload */ 619 /* Must be after DS reload */
620 unlazy_fpu(prev_p); 620 unlazy_fpu(prev_p);
@@ -627,7 +627,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
627 write_pda(pcurrent, next_p); 627 write_pda(pcurrent, next_p);
628 628
629 write_pda(kernelstack, 629 write_pda(kernelstack,
630 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 630 (unsigned long)task_stack_page(next_p) +
631 THREAD_SIZE - PDA_STACKOFFSET);
631#ifdef CONFIG_CC_STACKPROTECTOR 632#ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary, next_p->stack_canary); 633 write_pda(stack_canary, next_p->stack_canary);
633 /* 634 /*
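
The rewritten FS/GS block in __switch_to() folds three reasons to reload a segment (current selector non-zero, next selector non-zero, or a leftover 64-bit base that must not leak) into a single branch by OR-ing the values: the result is non-zero iff any condition holds. A tiny demonstration of the predicate:

#include <stdio.h>

/* Same shape as unlikely(fsindex | next->fsindex | prev->fs). */
static int needs_reload(unsigned cur_sel, unsigned next_sel,
			unsigned long prev_base)
{
	return (cur_sel | next_sel | prev_base) != 0;
}

int main(void)
{
	printf("%d\n", needs_reload(0, 0, 0));		/* 0: skip reload */
	printf("%d\n", needs_reload(0x33, 0, 0));	/* 1 */
	printf("%d\n", needs_reload(0, 0, 0xdead));	/* 1 */
	return 0;
}
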
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 77040b6070e1..e37dccce85db 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1357#endif 1357#endif
1358} 1358}
1359 1359
1360#ifdef CONFIG_X86_32
1361
1362void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) 1360void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1363{ 1361{
1364 struct siginfo info; 1362 struct siginfo info;
@@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1377 force_sig_info(SIGTRAP, &info, tsk); 1375 force_sig_info(SIGTRAP, &info, tsk);
1378} 1376}
1379 1377
1380/* notification of system call entry/exit
1381 * - triggered by current->work.syscall_trace
1382 */
1383int do_syscall_trace(struct pt_regs *regs, int entryexit)
1384{
1385 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1386 /*
1387 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1388 * interception
1389 */
1390 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1391 int ret = 0;
1392
1393 /* do the secure computing check first */
1394 if (!entryexit)
1395 secure_computing(regs->orig_ax);
1396
1397 if (unlikely(current->audit_context)) {
1398 if (entryexit)
1399 audit_syscall_exit(AUDITSC_RESULT(regs->ax),
1400 regs->ax);
1401 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1402 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1403 * not used, entry.S will call us only on syscall exit, not
1404 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1405 * calling send_sigtrap() on syscall entry.
1406 *
1407 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1408 * is_singlestep is false, despite its name, so we will still do
1409 * the correct thing.
1410 */
1411 else if (is_singlestep)
1412 goto out;
1413 }
1414
1415 if (!(current->ptrace & PT_PTRACED))
1416 goto out;
1417
1418 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1419 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1420 * here. We have to check this and return */
1421 if (is_sysemu && entryexit)
1422 return 0;
1423
1424 /* Fake a debug trap */
1425 if (is_singlestep)
1426 send_sigtrap(current, regs, 0);
1427
1428 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1429 goto out;
1430
1431 /* the 0x80 provides a way for the tracing parent to distinguish
1432 between a syscall stop and SIGTRAP delivery */
1433 /* Note that the debugger could change the result of test_thread_flag!*/
1434 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1435
1436 /*
1437 * this isn't the same as continuing with a signal, but it will do
1438 * for normal use. strace only continues with a signal if the
1439 * stopping signal is not SIGTRAP. -brl
1440 */
1441 if (current->exit_code) {
1442 send_sig(current->exit_code, current, 1);
1443 current->exit_code = 0;
1444 }
1445 ret = is_sysemu;
1446out:
1447 if (unlikely(current->audit_context) && !entryexit)
1448 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1449 regs->bx, regs->cx, regs->dx, regs->si);
1450 if (ret == 0)
1451 return 0;
1452
1453 regs->orig_ax = -1; /* force skip of syscall restarting */
1454 if (unlikely(current->audit_context))
1455 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1456 return 1;
1457}
1458
1459#else /* CONFIG_X86_64 */
1460
1461static void syscall_trace(struct pt_regs *regs) 1378static void syscall_trace(struct pt_regs *regs)
1462{ 1379{
1380 if (!(current->ptrace & PT_PTRACED))
1381 return;
1463 1382
1464#if 0 1383#if 0
1465 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", 1384 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
@@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs)
1481 } 1400 }
1482} 1401}
1483 1402
1484asmlinkage void syscall_trace_enter(struct pt_regs *regs) 1403#ifdef CONFIG_X86_32
1404# define IS_IA32 1
1405#elif defined CONFIG_IA32_EMULATION
1406# define IS_IA32 test_thread_flag(TIF_IA32)
1407#else
1408# define IS_IA32 0
1409#endif
1410
1411/*
1412 * We must return the syscall number to actually look up in the table.
1413 * This can be -1L to skip running any syscall at all.
1414 */
1415asmregparm long syscall_trace_enter(struct pt_regs *regs)
1485{ 1416{
1417 long ret = 0;
1418
1419 /*
1420 * If we stepped into a sysenter/syscall insn, it trapped in
1421 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
1422 * If user-mode had set TF itself, then it's still clear from
1423 * do_debug() and we need to set it again to restore the user
1424 * state. If we entered on the slow path, TF was already set.
1425 */
1426 if (test_thread_flag(TIF_SINGLESTEP))
1427 regs->flags |= X86_EFLAGS_TF;
1428
1486 /* do the secure computing check first */ 1429 /* do the secure computing check first */
1487 secure_computing(regs->orig_ax); 1430 secure_computing(regs->orig_ax);
1488 1431
1489 if (test_thread_flag(TIF_SYSCALL_TRACE) 1432 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1490 && (current->ptrace & PT_PTRACED)) 1433 ret = -1L;
1434
1435 if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
1491 syscall_trace(regs); 1436 syscall_trace(regs);
1492 1437
1493 if (unlikely(current->audit_context)) { 1438 if (unlikely(current->audit_context)) {
1494 if (test_thread_flag(TIF_IA32)) { 1439 if (IS_IA32)
1495 audit_syscall_entry(AUDIT_ARCH_I386, 1440 audit_syscall_entry(AUDIT_ARCH_I386,
1496 regs->orig_ax, 1441 regs->orig_ax,
1497 regs->bx, regs->cx, 1442 regs->bx, regs->cx,
1498 regs->dx, regs->si); 1443 regs->dx, regs->si);
1499 } else { 1444#ifdef CONFIG_X86_64
1445 else
1500 audit_syscall_entry(AUDIT_ARCH_X86_64, 1446 audit_syscall_entry(AUDIT_ARCH_X86_64,
1501 regs->orig_ax, 1447 regs->orig_ax,
1502 regs->di, regs->si, 1448 regs->di, regs->si,
1503 regs->dx, regs->r10); 1449 regs->dx, regs->r10);
1504 } 1450#endif
1505 } 1451 }
1452
1453 return ret ?: regs->orig_ax;
1506} 1454}
1507 1455
1508asmlinkage void syscall_trace_leave(struct pt_regs *regs) 1456asmregparm void syscall_trace_leave(struct pt_regs *regs)
1509{ 1457{
1510 if (unlikely(current->audit_context)) 1458 if (unlikely(current->audit_context))
1511 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1459 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1512 1460
1513 if ((test_thread_flag(TIF_SYSCALL_TRACE) 1461 if (test_thread_flag(TIF_SYSCALL_TRACE))
1514 || test_thread_flag(TIF_SINGLESTEP))
1515 && (current->ptrace & PT_PTRACED))
1516 syscall_trace(regs); 1462 syscall_trace(regs);
1517}
1518 1463
1519#endif /* CONFIG_X86_32 */ 1464 /*
1465 * If TIF_SYSCALL_EMU is set, we only get here because of
1466 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1467 * We already reported this syscall instruction in
1468 * syscall_trace_enter(), so don't do any more now.
1469 */
1470 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1471 return;
1472
1473 /*
1474 * If we are single-stepping, synthesize a trap to follow the
1475 * system call instruction.
1476 */
1477 if (test_thread_flag(TIF_SINGLESTEP) &&
1478 (current->ptrace & PT_PTRACED))
1479 send_sigtrap(current, regs, 0);
1480}
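
syscall_trace_enter() now returns ret ?: regs->orig_ax, using GCC's binary ?: extension: the left operand is evaluated once and yielded if non-zero, otherwise the right operand is. So a tracer that forced ret to -1L makes the entry path skip the syscall table lookup entirely. A compact demonstration (39 is getpid on x86-64):

#include <stdio.h>

/* Same shape as the return statement in syscall_trace_enter(). */
static long trace_enter(long ret, long orig_ax)
{
	return ret ?: orig_ax;	/* GNU C extension: a ?: b */
}

int main(void)
{
	printf("%ld\n", trace_enter(0, 39));	/* 39: run getpid */
	printf("%ld\n", trace_enter(-1L, 39));	/* -1: skip the syscall */
	return 0;
}
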
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f8a62160e151..724adfc63cb9 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), 177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
178 }, 178 },
179 }, 179 },
180 { /* Handle problems with rebooting on Dell T5400's */
181 .callback = set_bios_reboot,
182 .ident = "Dell Precision T5400",
183 .matches = {
184 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
185 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
186 },
187 },
180 { /* Handle problems with rebooting on HP laptops */ 188 { /* Handle problems with rebooting on HP laptops */
181 .callback = set_bios_reboot, 189 .callback = set_bios_reboot,
182 .ident = "HP Compaq Laptop", 190 .ident = "HP Compaq Laptop",
@@ -403,10 +411,9 @@ void native_machine_shutdown(void)
403{ 411{
404 /* Stop the cpus and apics */ 412 /* Stop the cpus and apics */
405#ifdef CONFIG_SMP 413#ifdef CONFIG_SMP
406 int reboot_cpu_id;
407 414
408 /* The boot cpu is always logical cpu 0 */ 415 /* The boot cpu is always logical cpu 0 */
409 reboot_cpu_id = 0; 416 int reboot_cpu_id = 0;
410 417
411#ifdef CONFIG_X86_32 418#ifdef CONFIG_X86_32
412 /* See if there has been given a command line override */ 419 /* See if there has been given a command line override */
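
The new Dell T5400 entry rides the DMI quirk-table pattern: each entry names a vendor/product pair and a callback to run on the first machine that matches. A userspace sketch of the same table walk; the identification strings are passed in directly here, whereas the kernel compares against the firmware's DMI tables.

#include <stdio.h>
#include <string.h>

struct quirk {
	const char *vendor, *product;
	void (*callback)(const char *ident);
	const char *ident;
};

static void set_bios_reboot(const char *ident)
{
	printf("quirk: forcing BIOS reboot for %s\n", ident);
}

static const struct quirk quirks[] = {
	{ "Dell Inc.", "Precision WorkStation T5400",
	  set_bios_reboot, "Dell Precision T5400" },
	{ NULL, NULL, NULL, NULL }	/* table terminator */
};

/* Walk the table and fire the callback of the first matching entry. */
static void apply_quirks(const char *vendor, const char *product)
{
	for (const struct quirk *q = quirks; q->vendor; q++)
		if (!strcmp(vendor, q->vendor) && !strcmp(product, q->product))
			q->callback(q->ident);
}

int main(void)
{
	apply_quirks("Dell Inc.", "Precision WorkStation T5400");
	return 0;
}
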
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..6f50664b2ba5 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,45 @@
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21#define PAE_PGD_ATTR (_PAGE_PRESENT) 21#define PAE_PGD_ATTR (_PAGE_PRESENT)
22 22
23/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * ~ control_page + PAGE_SIZE are used as data storage and stack for
25 * jumping back
26 */
27#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
28
29/* Minimal CPU state */
30#define ESP DATA(0x0)
31#define CR0 DATA(0x4)
32#define CR3 DATA(0x8)
33#define CR4 DATA(0xc)
34
35/* other data */
36#define CP_VA_CONTROL_PAGE DATA(0x10)
37#define CP_PA_PGD DATA(0x14)
38#define CP_PA_SWAP_PAGE DATA(0x18)
39#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
40
23 .text 41 .text
24 .align PAGE_SIZE 42 .align PAGE_SIZE
25 .globl relocate_kernel 43 .globl relocate_kernel
26relocate_kernel: 44relocate_kernel:
27 movl 8(%esp), %ebp /* list of pages */ 45 /* Save the CPU context, used for jumping back */
46
47 pushl %ebx
48 pushl %esi
49 pushl %edi
50 pushl %ebp
51 pushf
52
53 movl 20+8(%esp), %ebp /* list of pages */
54 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
55 movl %esp, ESP(%edi)
56 movl %cr0, %eax
57 movl %eax, CR0(%edi)
58 movl %cr3, %eax
59 movl %eax, CR3(%edi)
60 movl %cr4, %eax
61 movl %eax, CR4(%edi)
28 62
29#ifdef CONFIG_X86_PAE 63#ifdef CONFIG_X86_PAE
30 /* map the control page at its virtual address */ 64 /* map the control page at its virtual address */
@@ -138,15 +172,25 @@ relocate_kernel:
138 172
139relocate_new_kernel: 173relocate_new_kernel:
140 /* read the arguments and say goodbye to the stack */ 174 /* read the arguments and say goodbye to the stack */
141 movl 4(%esp), %ebx /* page_list */ 175 movl 20+4(%esp), %ebx /* page_list */
142 movl 8(%esp), %ebp /* list of pages */ 176 movl 20+8(%esp), %ebp /* list of pages */
143 movl 12(%esp), %edx /* start address */ 177 movl 20+12(%esp), %edx /* start address */
144 movl 16(%esp), %ecx /* cpu_has_pae */ 178 movl 20+16(%esp), %ecx /* cpu_has_pae */
179 movl 20+20(%esp), %esi /* preserve_context */
145 180
146 /* zero out flags, and disable interrupts */ 181 /* zero out flags, and disable interrupts */
147 pushl $0 182 pushl $0
148 popfl 183 popfl
149 184
185 /* save some information for jumping back */
186 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
187 movl %edi, CP_VA_CONTROL_PAGE(%edi)
188 movl PTR(PA_PGD)(%ebp), %eax
189 movl %eax, CP_PA_PGD(%edi)
190 movl PTR(PA_SWAP_PAGE)(%ebp), %eax
191 movl %eax, CP_PA_SWAP_PAGE(%edi)
192 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
193
150 /* get physical address of control page now */ 194 /* get physical address of control page now */
151 /* this is impossible after page table switch */ 195 /* this is impossible after page table switch */
152 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 196 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +241,90 @@ identity_mapped:
197 xorl %eax, %eax 241 xorl %eax, %eax
198 movl %eax, %cr3 242 movl %eax, %cr3
199 243
244 movl CP_PA_SWAP_PAGE(%edi), %eax
245 pushl %eax
246 pushl %ebx
247 call swap_pages
248 addl $8, %esp
249
250 /* To be certain of avoiding problems with self-modifying code
251 * I need to execute a serializing instruction here.
252 * So I flush the TLB, it's handy, and not processor dependent.
253 */
254 xorl %eax, %eax
255 movl %eax, %cr3
256
257 /* set all of the registers to known values */
258 /* leave %esp alone */
259
260 testl %esi, %esi
261 jnz 1f
262 xorl %edi, %edi
263 xorl %eax, %eax
264 xorl %ebx, %ebx
265 xorl %ecx, %ecx
266 xorl %edx, %edx
267 xorl %esi, %esi
268 xorl %ebp, %ebp
269 ret
2701:
271 popl %edx
272 movl CP_PA_SWAP_PAGE(%edi), %esp
273 addl $PAGE_SIZE, %esp
2742:
275 call *%edx
276
277 /* get the re-entry point of the peer system */
278 movl 0(%esp), %ebp
279 call 1f
2801:
281 popl %ebx
282 subl $(1b - relocate_kernel), %ebx
283 movl CP_VA_CONTROL_PAGE(%ebx), %edi
284 lea PAGE_SIZE(%ebx), %esp
285 movl CP_PA_SWAP_PAGE(%ebx), %eax
286 movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
287 pushl %eax
288 pushl %edx
289 call swap_pages
290 addl $8, %esp
291 movl CP_PA_PGD(%ebx), %eax
292 movl %eax, %cr3
293 movl %cr0, %eax
294 orl $(1<<31), %eax
295 movl %eax, %cr0
296 lea PAGE_SIZE(%edi), %esp
297 movl %edi, %eax
298 addl $(virtual_mapped - relocate_kernel), %eax
299 pushl %eax
300 ret
301
302virtual_mapped:
303 movl CR4(%edi), %eax
304 movl %eax, %cr4
305 movl CR3(%edi), %eax
306 movl %eax, %cr3
307 movl CR0(%edi), %eax
308 movl %eax, %cr0
309 movl ESP(%edi), %esp
310 movl %ebp, %eax
311
312 popf
313 popl %ebp
314 popl %edi
315 popl %esi
316 popl %ebx
317 ret
318
200 /* Do the copies */ 319 /* Do the copies */
201 movl %ebx, %ecx 320swap_pages:
321 movl 8(%esp), %edx
322 movl 4(%esp), %ecx
323 pushl %ebp
324 pushl %ebx
325 pushl %edi
326 pushl %esi
327 movl %ecx, %ebx
202 jmp 1f 328 jmp 1f
203 329
2040: /* top, read another word from the indirection page */ 3300: /* top, read another word from the indirection page */
@@ -226,27 +352,31 @@ identity_mapped:
226 movl %ecx, %esi /* For every source page do a copy */ 352 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi 353 andl $0xfffff000, %esi
228 354
355 movl %edi, %eax
356 movl %esi, %ebp
357
358 movl %edx, %edi
229 movl $1024, %ecx 359 movl $1024, %ecx
230 rep ; movsl 360 rep ; movsl
231 jmp 0b
232
2333:
234 361
235 /* To be certain of avoiding problems with self-modifying code 362 movl %ebp, %edi
236 * I need to execute a serializing instruction here. 363 movl %eax, %esi
237 * So I flush the TLB, it's handy, and not processor dependent. 364 movl $1024, %ecx
238 */ 365 rep ; movsl
239 xorl %eax, %eax
240 movl %eax, %cr3
241 366
242 /* set all of the registers to known values */ 367 movl %eax, %edi
243 /* leave %esp alone */ 368 movl %edx, %esi
369 movl $1024, %ecx
370 rep ; movsl
244 371
245 xorl %eax, %eax 372 lea PAGE_SIZE(%ebp), %esi
246 xorl %ebx, %ebx 373 jmp 0b
247 xorl %ecx, %ecx 3743:
248 xorl %edx, %edx 375 popl %esi
249 xorl %esi, %esi 376 popl %edi
250 xorl %edi, %edi 377 popl %ebx
251 xorl %ebp, %ebp 378 popl %ebp
252 ret 379 ret
380
381 .globl kexec_control_code_size
382.set kexec_control_code_size, . - relocate_kernel
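
swap_pages is the heart of the new jump-back support: instead of only copying the new kernel's pages over the old one, it exchanges source and destination through a scratch page, so the original memory image can be restored when the peer system returns. The same three-way exchange in C; the page size is shrunk for the demo, where the assembly moves 1024 dwords (4 KiB) per page.

#include <stdio.h>
#include <string.h>

#define PAGE 8

/* src -> scratch, dst -> src, scratch -> dst: the order used above. */
static void swap_pages(char *dst, char *src, char *scratch)
{
	memcpy(scratch, src, PAGE);
	memcpy(src, dst, PAGE);
	memcpy(dst, scratch, PAGE);
}

int main(void)
{
	char a[PAGE] = "old-kern", b[PAGE] = "new-kern", tmp[PAGE];

	swap_pages(a, b, tmp);
	printf("a=%.8s b=%.8s\n", a, b);	/* a=new-kern b=old-kern */
	return 0;
}
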
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 531b55b8e81a..9838f2539dfc 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -57,12 +57,8 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/user.h> 58#include <linux/user.h>
59#include <linux/delay.h> 59#include <linux/delay.h>
60#include <linux/highmem.h>
61 60
62#include <linux/kallsyms.h> 61#include <linux/kallsyms.h>
63#include <linux/edd.h>
64#include <linux/iscsi_ibft.h>
65#include <linux/kexec.h>
66#include <linux/cpufreq.h> 62#include <linux/cpufreq.h>
67#include <linux/dma-mapping.h> 63#include <linux/dma-mapping.h>
68#include <linux/ctype.h> 64#include <linux/ctype.h>
@@ -96,7 +92,7 @@
96#include <asm/smp.h> 92#include <asm/smp.h>
97#include <asm/desc.h> 93#include <asm/desc.h>
98#include <asm/dma.h> 94#include <asm/dma.h>
99#include <asm/gart.h> 95#include <asm/iommu.h>
100#include <asm/mmu_context.h> 96#include <asm/mmu_context.h>
101#include <asm/proto.h> 97#include <asm/proto.h>
102 98
@@ -104,7 +100,6 @@
104#include <asm/paravirt.h> 100#include <asm/paravirt.h>
105 101
106#include <asm/percpu.h> 102#include <asm/percpu.h>
107#include <asm/sections.h>
108#include <asm/topology.h> 103#include <asm/topology.h>
109#include <asm/apicdef.h> 104#include <asm/apicdef.h>
110#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
@@ -450,7 +445,7 @@ static void __init reserve_early_setup_data(void)
450 * @size: Size of the crashkernel memory to reserve. 445 * @size: Size of the crashkernel memory to reserve.
451 * Returns the base address on success, and -1ULL on failure. 446 * Returns the base address on success, and -1ULL on failure.
452 */ 447 */
453unsigned long long find_and_reserve_crashkernel(unsigned long long size) 448unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
454{ 449{
455 const unsigned long long alignment = 16<<20; /* 16M */ 450 const unsigned long long alignment = 16<<20; /* 16M */
456 unsigned long long start = 0LL; 451 unsigned long long start = 0LL;
@@ -579,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg)
579early_param("elfcorehdr", setup_elfcorehdr); 574early_param("elfcorehdr", setup_elfcorehdr);
580#endif 575#endif
581 576
577static struct x86_quirks default_x86_quirks __initdata;
578
579struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
580
582/* 581/*
583 * Determine if we were loaded by an EFI loader. If so, then we have also been 582 * Determine if we were loaded by an EFI loader. If so, then we have also been
584 * passed the efi memmap, systab, etc., so we should use these data structures 583 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -598,11 +597,11 @@ void __init setup_arch(char **cmdline_p)
598 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 597 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
599 visws_early_detect(); 598 visws_early_detect();
600 pre_setup_arch_hook(); 599 pre_setup_arch_hook();
601 early_cpu_init();
602#else 600#else
603 printk(KERN_INFO "Command line: %s\n", boot_command_line); 601 printk(KERN_INFO "Command line: %s\n", boot_command_line);
604#endif 602#endif
605 603
604 early_cpu_init();
606 early_ioremap_init(); 605 early_ioremap_init();
607 606
608 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 607 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -666,14 +665,23 @@ void __init setup_arch(char **cmdline_p)
666 bss_resource.start = virt_to_phys(&__bss_start); 665 bss_resource.start = virt_to_phys(&__bss_start);
667 bss_resource.end = virt_to_phys(&__bss_stop)-1; 666 bss_resource.end = virt_to_phys(&__bss_stop)-1;
668 667
669#ifdef CONFIG_X86_64
670 early_cpu_init();
671#endif
672 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 668 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
673 *cmdline_p = command_line; 669 *cmdline_p = command_line;
674 670
675 parse_early_param(); 671 parse_early_param();
676 672
673#ifdef CONFIG_X86_64
674 check_efer();
675#endif
676
677#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
678 /*
679 * Must be before kernel pagetables are setup
680 * or fixmap area is touched.
681 */
682 vmi_init();
683#endif
684
677 /* after early param, so could get panic from serial */ 685 /* after early param, so could get panic from serial */
678 reserve_early_setup_data(); 686 reserve_early_setup_data();
679 687
@@ -681,7 +689,7 @@ void __init setup_arch(char **cmdline_p)
681#ifdef CONFIG_X86_LOCAL_APIC 689#ifdef CONFIG_X86_LOCAL_APIC
682 disable_apic = 1; 690 disable_apic = 1;
683#endif 691#endif
684 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 692 setup_clear_cpu_cap(X86_FEATURE_APIC);
685 } 693 }
686 694
687#ifdef CONFIG_PCI 695#ifdef CONFIG_PCI
@@ -734,7 +742,6 @@ void __init setup_arch(char **cmdline_p)
734#else 742#else
735 num_physpages = max_pfn; 743 num_physpages = max_pfn;
736 744
737 check_efer();
738 745
739 /* How many end-of-memory variables you have, grandma! */ 746 /* How many end-of-memory variables you have, grandma! */
740 /* need this before calling reserve_initrd */ 747 /* need this before calling reserve_initrd */
@@ -792,10 +799,6 @@ void __init setup_arch(char **cmdline_p)
792 799
793 initmem_init(0, max_pfn); 800 initmem_init(0, max_pfn);
794 801
795#ifdef CONFIG_X86_64
796 dma32_reserve_bootmem();
797#endif
798
799#ifdef CONFIG_ACPI_SLEEP 802#ifdef CONFIG_ACPI_SLEEP
800 /* 803 /*
801 * Reserve low memory region for sleep support. 804 * Reserve low memory region for sleep support.
@@ -810,21 +813,25 @@ void __init setup_arch(char **cmdline_p)
810#endif 813#endif
811 reserve_crashkernel(); 814 reserve_crashkernel();
812 815
816#ifdef CONFIG_X86_64
817 /*
818 * dma32_reserve_bootmem() allocates bootmem which may conflict
819 * with the crashkernel command line, so do that after
820 * reserve_crashkernel()
821 */
822 dma32_reserve_bootmem();
823#endif
824
813 reserve_ibft_region(); 825 reserve_ibft_region();
814 826
815#ifdef CONFIG_KVM_CLOCK 827#ifdef CONFIG_KVM_CLOCK
816 kvmclock_init(); 828 kvmclock_init();
817#endif 829#endif
818 830
819#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) 831 paravirt_pagetable_setup_start(swapper_pg_dir);
820 /*
821 * Must be after max_low_pfn is determined, and before kernel
822 * pagetables are setup.
823 */
824 vmi_init();
825#endif
826
827 paging_init(); 832 paging_init();
833 paravirt_pagetable_setup_done(swapper_pg_dir);
834 paravirt_post_allocator_init();
828 835
829#ifdef CONFIG_X86_64 836#ifdef CONFIG_X86_64
830 map_vsyscall(); 837 map_vsyscall();
@@ -854,23 +861,9 @@ void __init setup_arch(char **cmdline_p)
854 init_cpu_to_node(); 861 init_cpu_to_node();
855#endif 862#endif
856 863
857#ifdef CONFIG_X86_NUMAQ
858 /*
859 * need to check online nodes num, call it
860 * here before time_init/tsc_init
861 */
862 numaq_tsc_disable();
863#endif
864
865 init_apic_mappings(); 864 init_apic_mappings();
866 ioapic_init_mappings(); 865 ioapic_init_mappings();
867 866
868#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
869 if (def_to_bigsmp)
870 printk(KERN_WARNING "More than 8 CPUs detected and "
871 "CONFIG_X86_PC cannot handle it.\nUse "
872 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
873#endif
874 kvm_guest_init(); 867 kvm_guest_init();
875 868
876 e820_reserve_resources(); 869 e820_reserve_resources();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index cac68430d31f..76e305e064f9 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,24 +80,6 @@ static void __init setup_per_cpu_maps(void)
80#endif 80#endif
81} 81}
82 82
83#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
84cpumask_t *cpumask_of_cpu_map __read_mostly;
85EXPORT_SYMBOL(cpumask_of_cpu_map);
86
87/* requires nr_cpu_ids to be initialized */
88static void __init setup_cpumask_of_cpu(void)
89{
90 int i;
91
92 /* alloc_bootmem zeroes memory */
93 cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
94 for (i = 0; i < nr_cpu_ids; i++)
95 cpu_set(i, cpumask_of_cpu_map[i]);
96}
97#else
98static inline void setup_cpumask_of_cpu(void) { }
99#endif
100
101#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
102/* 84/*
103 * Great future not-so-futuristic plan: make i386 and x86_64 do it 85 * Great future not-so-futuristic plan: make i386 and x86_64 do it
@@ -197,9 +179,6 @@ void __init setup_per_cpu_areas(void)
197 179
198 /* Setup node to cpumask map */ 180 /* Setup node to cpumask map */
199 setup_node_to_cpumask_map(); 181 setup_node_to_cpumask_map();
200
201 /* Setup cpumask_of_cpu map */
202 setup_cpumask_of_cpu();
203} 182}
204 183
205#endif 184#endif
@@ -227,8 +206,8 @@ static void __init setup_node_to_cpumask_map(void)
227 /* allocate the map */ 206 /* allocate the map */
228 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); 207 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
229 208
230 Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", 209 pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
231 map, nr_node_ids); 210 map, nr_node_ids);
232 211
233 /* node_to_cpumask() will now work */ 212 /* node_to_cpumask() will now work */
234 node_to_cpumask_map = map; 213 node_to_cpumask_map = map;
@@ -248,7 +227,7 @@ void __cpuinit numa_set_node(int cpu, int node)
248 per_cpu(x86_cpu_to_node_map, cpu) = node; 227 per_cpu(x86_cpu_to_node_map, cpu) = node;
249 228
250 else 229 else
251 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); 230 pr_debug("Setting node for non-present cpu %d\n", cpu);
252} 231}
253 232
254void __cpuinit numa_clear_node(int cpu) 233void __cpuinit numa_clear_node(int cpu)
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d92373630963..6fb5bcdd8933 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
212 212
213badframe: 213badframe:
214 if (show_unhandled_signals && printk_ratelimit()) { 214 if (show_unhandled_signals && printk_ratelimit()) {
215 printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" 215 printk("%s%s[%d] bad frame in sigreturn frame:"
216 "%p ip:%lx sp:%lx oeax:%lx", 216 "%p ip:%lx sp:%lx oeax:%lx",
217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
218 current->comm, task_pid_nr(current), frame, regs->ip, 218 current->comm, task_pid_nr(current), frame, regs->ip,
@@ -657,18 +657,9 @@ static void do_signal(struct pt_regs *regs)
657void 657void
658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659{ 659{
660 /* Pending single-step? */
661 if (thread_info_flags & _TIF_SINGLESTEP) {
662 regs->flags |= X86_EFLAGS_TF;
663 clear_thread_flag(TIF_SINGLESTEP);
664 }
665
666 /* deal with pending signal delivery */ 660 /* deal with pending signal delivery */
667 if (thread_info_flags & _TIF_SIGPENDING) 661 if (thread_info_flags & _TIF_SIGPENDING)
668 do_signal(regs); 662 do_signal(regs);
669 663
670 if (thread_info_flags & _TIF_HRTICK_RESCHED)
671 hrtick_resched();
672
673 clear_thread_flag(TIF_IRET); 664 clear_thread_flag(TIF_IRET);
674} 665}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e7..ca316b5b742c 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -53,6 +53,68 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
53 return do_sigaltstack(uss, uoss, regs->sp); 53 return do_sigaltstack(uss, uoss, regs->sp);
54} 54}
55 55
56/*
57 * Signal frame handlers.
58 */
59
60static inline int save_i387(struct _fpstate __user *buf)
61{
62 struct task_struct *tsk = current;
63 int err = 0;
64
65 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
66 sizeof(tsk->thread.xstate->fxsave));
67
68 if ((unsigned long)buf % 16)
69 printk("save_i387: bad fpstate %p\n", buf);
70
71 if (!used_math())
72 return 0;
73 clear_used_math(); /* trigger finit */
74 if (task_thread_info(tsk)->status & TS_USEDFPU) {
75 err = save_i387_checking((struct i387_fxsave_struct __user *)
76 buf);
77 if (err)
78 return err;
79 task_thread_info(tsk)->status &= ~TS_USEDFPU;
80 stts();
81 } else {
82 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
83 sizeof(struct i387_fxsave_struct)))
84 return -1;
85 }
86 return 1;
87}
88
89/*
90 * This restores directly out of user space. Exceptions are handled.
91 */
92static inline int restore_i387(struct _fpstate __user *buf)
93{
94 struct task_struct *tsk = current;
95 int err;
96
97 if (!used_math()) {
98 err = init_fpu(tsk);
99 if (err)
100 return err;
101 }
102
103 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
104 clts();
105 task_thread_info(current)->status |= TS_USEDFPU;
106 }
107 err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
108 if (unlikely(err)) {
109 /*
110 * Encountered an error while doing the restore from the
111 * user buffer, clear the fpu state.
112 */
113 clear_fpu(tsk);
114 clear_used_math();
115 }
116 return err;
117}
56 118
57/* 119/*
58 * Do a signal return; undo the signal stack. 120 * Do a signal return; undo the signal stack.
@@ -487,12 +549,6 @@ static void do_signal(struct pt_regs *regs)
487void do_notify_resume(struct pt_regs *regs, void *unused, 549void do_notify_resume(struct pt_regs *regs, void *unused,
488 __u32 thread_info_flags) 550 __u32 thread_info_flags)
489{ 551{
490 /* Pending single-step? */
491 if (thread_info_flags & _TIF_SINGLESTEP) {
492 regs->flags |= X86_EFLAGS_TF;
493 clear_thread_flag(TIF_SINGLESTEP);
494 }
495
496#ifdef CONFIG_X86_MCE 552#ifdef CONFIG_X86_MCE
497 /* notify userspace of pending MCEs */ 553 /* notify userspace of pending MCEs */
498 if (thread_info_flags & _TIF_MCE_NOTIFY) 554 if (thread_info_flags & _TIF_MCE_NOTIFY)
@@ -502,9 +558,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
502 /* deal with pending signal delivery */ 558 /* deal with pending signal delivery */
503 if (thread_info_flags & _TIF_SIGPENDING) 559 if (thread_info_flags & _TIF_SIGPENDING)
504 do_signal(regs); 560 do_signal(regs);
505
506 if (thread_info_flags & _TIF_HRTICK_RESCHED)
507 hrtick_resched();
508} 561}
509 562
510void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 563void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
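save_i387() above uses a three-way return: 0 when the task has no FPU state to save, 1 on success, and a negative value when copying to the user frame faults; the live-FPU case (TS_USEDFPU) saves straight from the registers, while the lazy case copies the state already parked in the thread struct. A tiny self-contained model of consuming that convention (the helper names here are hypothetical, not the kernel's):

#include <stdio.h>

/* Stand-in for save_i387(): 0 = no FPU state, 1 = saved, <0 = fault. */
static int model_save_i387(int used_math, int copy_faults)
{
	if (!used_math)
		return 0;
	return copy_faults ? -1 : 1;
}

static const char *frame_fpstate(int used_math, int copy_faults)
{
	int ret = model_save_i387(used_math, copy_faults);

	if (ret < 0)
		return "SIGSEGV";	/* frame setup fails */
	return ret ? "fpstate" : "NULL";	/* frame carries state or not */
}

int main(void)
{
	printf("%s %s %s\n",
	       frame_fpstate(0, 0),	/* NULL    */
	       frame_fpstate(1, 0),	/* fpstate */
	       frame_fpstate(1, 1));	/* SIGSEGV */
	return 0;
}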
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 687376ab07e8..7985c5b3f916 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -216,7 +216,7 @@ static void __cpuinit smp_callin(void)
216 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 216 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
217 phys_id, cpuid); 217 phys_id, cpuid);
218 } 218 }
219 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); 219 pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
220 220
221 /* 221 /*
222 * STARTUP IPIs are fragile beasts as they might sometimes 222 * STARTUP IPIs are fragile beasts as they might sometimes
@@ -251,7 +251,7 @@ static void __cpuinit smp_callin(void)
251 * boards) 251 * boards)
252 */ 252 */
253 253
254 Dprintk("CALLIN, before setup_local_APIC().\n"); 254 pr_debug("CALLIN, before setup_local_APIC().\n");
255 smp_callin_clear_local_apic(); 255 smp_callin_clear_local_apic();
256 setup_local_APIC(); 256 setup_local_APIC();
257 end_local_APIC_setup(); 257 end_local_APIC_setup();
@@ -266,7 +266,7 @@ static void __cpuinit smp_callin(void)
266 local_irq_enable(); 266 local_irq_enable();
267 calibrate_delay(); 267 calibrate_delay();
268 local_irq_disable(); 268 local_irq_disable();
269 Dprintk("Stack at about %p\n", &cpuid); 269 pr_debug("Stack at about %p\n", &cpuid);
270 270
271 /* 271 /*
272 * Save our processor parameters 272 * Save our processor parameters
@@ -326,12 +326,16 @@ static void __cpuinit start_secondary(void *unused)
326 * for which cpus receive the IPI. Holding this 326 * for which cpus receive the IPI. Holding this
327 * lock helps us to not include this cpu in a currently in progress 327 * lock helps us to not include this cpu in a currently in progress
328 * smp_call_function(). 328 * smp_call_function().
329 *
330 * We need to hold vector_lock so that the set of online cpus
331 * does not change while we are assigning vectors to cpus. Holding
332 * this lock ensures we don't half assign or remove an irq from a cpu.
329 */ 333 */
330 ipi_call_lock_irq(); 334 ipi_call_lock_irq();
331#ifdef CONFIG_X86_IO_APIC 335 lock_vector_lock();
332 setup_vector_irq(smp_processor_id()); 336 __setup_vector_irq(smp_processor_id());
333#endif
334 cpu_set(smp_processor_id(), cpu_online_map); 337 cpu_set(smp_processor_id(), cpu_online_map);
338 unlock_vector_lock();
335 ipi_call_unlock_irq(); 339 ipi_call_unlock_irq();
336 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 340 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
337 341
@@ -438,7 +442,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
438 cpu_set(cpu, cpu_sibling_setup_map); 442 cpu_set(cpu, cpu_sibling_setup_map);
439 443
440 if (smp_num_siblings > 1) { 444 if (smp_num_siblings > 1) {
441 for_each_cpu_mask(i, cpu_sibling_setup_map) { 445 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
442 if (c->phys_proc_id == cpu_data(i).phys_proc_id && 446 if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
443 c->cpu_core_id == cpu_data(i).cpu_core_id) { 447 c->cpu_core_id == cpu_data(i).cpu_core_id) {
444 cpu_set(i, per_cpu(cpu_sibling_map, cpu)); 448 cpu_set(i, per_cpu(cpu_sibling_map, cpu));
@@ -461,7 +465,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
461 return; 465 return;
462 } 466 }
463 467
464 for_each_cpu_mask(i, cpu_sibling_setup_map) { 468 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
465 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 469 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
466 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 470 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
467 cpu_set(i, c->llc_shared_map); 471 cpu_set(i, c->llc_shared_map);
@@ -513,7 +517,7 @@ static void impress_friends(void)
513 /* 517 /*
514 * Allow the user to impress friends. 518 * Allow the user to impress friends.
515 */ 519 */
516 Dprintk("Before bogomips.\n"); 520 pr_debug("Before bogomips.\n");
517 for_each_possible_cpu(cpu) 521 for_each_possible_cpu(cpu)
518 if (cpu_isset(cpu, cpu_callout_map)) 522 if (cpu_isset(cpu, cpu_callout_map))
519 bogosum += cpu_data(cpu).loops_per_jiffy; 523 bogosum += cpu_data(cpu).loops_per_jiffy;
@@ -523,7 +527,7 @@ static void impress_friends(void)
523 bogosum/(500000/HZ), 527 bogosum/(500000/HZ),
524 (bogosum/(5000/HZ))%100); 528 (bogosum/(5000/HZ))%100);
525 529
526 Dprintk("Before bogocount - setting activated=1.\n"); 530 pr_debug("Before bogocount - setting activated=1.\n");
527} 531}
528 532
529static inline void __inquire_remote_apic(int apicid) 533static inline void __inquire_remote_apic(int apicid)
@@ -546,8 +550,8 @@ static inline void __inquire_remote_apic(int apicid)
546 printk(KERN_CONT 550 printk(KERN_CONT
547 "a previous APIC delivery may have failed\n"); 551 "a previous APIC delivery may have failed\n");
548 552
549 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 553 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
550 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); 554 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
551 555
552 timeout = 0; 556 timeout = 0;
553 do { 557 do {
@@ -579,29 +583,24 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
579 int maxlvt; 583 int maxlvt;
580 584
581 /* Target chip */ 585 /* Target chip */
582 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); 586 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
583 587
584 /* Boot on the stack */ 588 /* Boot on the stack */
585 /* Kick the second */ 589 /* Kick the second */
586 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 590 apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
587 591
588 Dprintk("Waiting for send to finish...\n"); 592 pr_debug("Waiting for send to finish...\n");
589 send_status = safe_apic_wait_icr_idle(); 593 send_status = safe_apic_wait_icr_idle();
590 594
591 /* 595 /*
592 * Give the other CPU some time to accept the IPI. 596 * Give the other CPU some time to accept the IPI.
593 */ 597 */
594 udelay(200); 598 udelay(200);
595 /*
596 * Due to the Pentium erratum 3AP.
597 */
598 maxlvt = lapic_get_maxlvt(); 599 maxlvt = lapic_get_maxlvt();
599 if (maxlvt > 3) { 600 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
600 apic_read_around(APIC_SPIV);
601 apic_write(APIC_ESR, 0); 601 apic_write(APIC_ESR, 0);
602 }
603 accept_status = (apic_read(APIC_ESR) & 0xEF); 602 accept_status = (apic_read(APIC_ESR) & 0xEF);
604 Dprintk("NMI sent.\n"); 603 pr_debug("NMI sent.\n");
605 604
606 if (send_status) 605 if (send_status)
607 printk(KERN_ERR "APIC never delivered???\n"); 606 printk(KERN_ERR "APIC never delivered???\n");
@@ -625,42 +624,44 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
625 return send_status; 624 return send_status;
626 } 625 }
627 626
627 maxlvt = lapic_get_maxlvt();
628
628 /* 629 /*
629 * Be paranoid about clearing APIC errors. 630 * Be paranoid about clearing APIC errors.
630 */ 631 */
631 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 632 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
632 apic_read_around(APIC_SPIV); 633 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
633 apic_write(APIC_ESR, 0); 634 apic_write(APIC_ESR, 0);
634 apic_read(APIC_ESR); 635 apic_read(APIC_ESR);
635 } 636 }
636 637
637 Dprintk("Asserting INIT.\n"); 638 pr_debug("Asserting INIT.\n");
638 639
639 /* 640 /*
640 * Turn INIT on target chip 641 * Turn INIT on target chip
641 */ 642 */
642 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 643 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
643 644
644 /* 645 /*
645 * Send IPI 646 * Send IPI
646 */ 647 */
647 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT 648 apic_write(APIC_ICR,
648 | APIC_DM_INIT); 649 APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
649 650
650 Dprintk("Waiting for send to finish...\n"); 651 pr_debug("Waiting for send to finish...\n");
651 send_status = safe_apic_wait_icr_idle(); 652 send_status = safe_apic_wait_icr_idle();
652 653
653 mdelay(10); 654 mdelay(10);
654 655
655 Dprintk("Deasserting INIT.\n"); 656 pr_debug("Deasserting INIT.\n");
656 657
657 /* Target chip */ 658 /* Target chip */
658 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 659 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
659 660
660 /* Send IPI */ 661 /* Send IPI */
661 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 662 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
662 663
663 Dprintk("Waiting for send to finish...\n"); 664 pr_debug("Waiting for send to finish...\n");
664 send_status = safe_apic_wait_icr_idle(); 665 send_status = safe_apic_wait_icr_idle();
665 666
666 mb(); 667 mb();
@@ -687,55 +688,47 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
687 /* 688 /*
688 * Run STARTUP IPI loop. 689 * Run STARTUP IPI loop.
689 */ 690 */
690 Dprintk("#startup loops: %d.\n", num_starts); 691 pr_debug("#startup loops: %d.\n", num_starts);
691
692 maxlvt = lapic_get_maxlvt();
693 692
694 for (j = 1; j <= num_starts; j++) { 693 for (j = 1; j <= num_starts; j++) {
695 Dprintk("Sending STARTUP #%d.\n", j); 694 pr_debug("Sending STARTUP #%d.\n", j);
696 apic_read_around(APIC_SPIV); 695 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
697 apic_write(APIC_ESR, 0); 696 apic_write(APIC_ESR, 0);
698 apic_read(APIC_ESR); 697 apic_read(APIC_ESR);
699 Dprintk("After apic_write.\n"); 698 pr_debug("After apic_write.\n");
700 699
701 /* 700 /*
702 * STARTUP IPI 701 * STARTUP IPI
703 */ 702 */
704 703
705 /* Target chip */ 704 /* Target chip */
706 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 705 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
707 706
708 /* Boot on the stack */ 707 /* Boot on the stack */
709 /* Kick the second */ 708 /* Kick the second */
710 apic_write_around(APIC_ICR, APIC_DM_STARTUP 709 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
711 | (start_eip >> 12));
712 710
713 /* 711 /*
714 * Give the other CPU some time to accept the IPI. 712 * Give the other CPU some time to accept the IPI.
715 */ 713 */
716 udelay(300); 714 udelay(300);
717 715
718 Dprintk("Startup point 1.\n"); 716 pr_debug("Startup point 1.\n");
719 717
720 Dprintk("Waiting for send to finish...\n"); 718 pr_debug("Waiting for send to finish...\n");
721 send_status = safe_apic_wait_icr_idle(); 719 send_status = safe_apic_wait_icr_idle();
722 720
723 /* 721 /*
724 * Give the other CPU some time to accept the IPI. 722 * Give the other CPU some time to accept the IPI.
725 */ 723 */
726 udelay(200); 724 udelay(200);
727 /* 725 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
728 * Due to the Pentium erratum 3AP.
729 */
730 if (maxlvt > 3) {
731 apic_read_around(APIC_SPIV);
732 apic_write(APIC_ESR, 0); 726 apic_write(APIC_ESR, 0);
733 }
734 accept_status = (apic_read(APIC_ESR) & 0xEF); 727 accept_status = (apic_read(APIC_ESR) & 0xEF);
735 if (send_status || accept_status) 728 if (send_status || accept_status)
736 break; 729 break;
737 } 730 }
738 Dprintk("After Startup.\n"); 731 pr_debug("After Startup.\n");
739 732
740 if (send_status) 733 if (send_status)
741 printk(KERN_ERR "APIC never delivered???\n"); 734 printk(KERN_ERR "APIC never delivered???\n");
@@ -763,12 +756,20 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
763} 756}
764 757
765#ifdef CONFIG_X86_64 758#ifdef CONFIG_X86_64
759
760/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
761static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
762{
763 if (!after_bootmem)
764 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
765}
766
766/* 767/*
767 * Allocate node local memory for the AP pda. 768 * Allocate node local memory for the AP pda.
768 * 769 *
769 * Must be called after the _cpu_pda pointer table is initialized. 770 * Must be called after the _cpu_pda pointer table is initialized.
770 */ 771 */
771static int __cpuinit get_local_pda(int cpu) 772int __cpuinit get_local_pda(int cpu)
772{ 773{
773 struct x8664_pda *oldpda, *newpda; 774 struct x8664_pda *oldpda, *newpda;
774 unsigned long size = sizeof(struct x8664_pda); 775 unsigned long size = sizeof(struct x8664_pda);
@@ -791,8 +792,7 @@ static int __cpuinit get_local_pda(int cpu)
791 792
792 if (oldpda) { 793 if (oldpda) {
793 memcpy(newpda, oldpda, size); 794 memcpy(newpda, oldpda, size);
794 if (!after_bootmem) 795 free_bootmem_pda(oldpda);
795 free_bootmem((unsigned long)oldpda, size);
796 } 796 }
797 797
798 newpda->in_bootmem = 0; 798 newpda->in_bootmem = 0;
@@ -886,7 +886,7 @@ do_rest:
886 886
887 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 887 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
888 888
889 Dprintk("Setting warm reset code and vector.\n"); 889 pr_debug("Setting warm reset code and vector.\n");
890 890
891 store_NMI_vector(&nmi_high, &nmi_low); 891 store_NMI_vector(&nmi_high, &nmi_low);
892 892
@@ -907,9 +907,9 @@ do_rest:
907 /* 907 /*
908 * allow APs to start initializing. 908 * allow APs to start initializing.
909 */ 909 */
910 Dprintk("Before Callout %d.\n", cpu); 910 pr_debug("Before Callout %d.\n", cpu);
911 cpu_set(cpu, cpu_callout_map); 911 cpu_set(cpu, cpu_callout_map);
912 Dprintk("After Callout %d.\n", cpu); 912 pr_debug("After Callout %d.\n", cpu);
913 913
914 /* 914 /*
915 * Wait 5s total for a response 915 * Wait 5s total for a response
@@ -922,10 +922,10 @@ do_rest:
922 922
923 if (cpu_isset(cpu, cpu_callin_map)) { 923 if (cpu_isset(cpu, cpu_callin_map)) {
924 /* number CPUs logically, starting from 1 (BSP is 0) */ 924 /* number CPUs logically, starting from 1 (BSP is 0) */
925 Dprintk("OK.\n"); 925 pr_debug("OK.\n");
926 printk(KERN_INFO "CPU%d: ", cpu); 926 printk(KERN_INFO "CPU%d: ", cpu);
927 print_cpu_info(&cpu_data(cpu)); 927 print_cpu_info(&cpu_data(cpu));
928 Dprintk("CPU has booted.\n"); 928 pr_debug("CPU has booted.\n");
929 } else { 929 } else {
930 boot_error = 1; 930 boot_error = 1;
931 if (*((volatile unsigned char *)trampoline_base) 931 if (*((volatile unsigned char *)trampoline_base)
@@ -970,7 +970,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
970 970
971 WARN_ON(irqs_disabled()); 971 WARN_ON(irqs_disabled());
972 972
973 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); 973 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
974 974
975 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 975 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
976 !physid_isset(apicid, phys_cpu_present_map)) { 976 !physid_isset(apicid, phys_cpu_present_map)) {
@@ -982,7 +982,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
982 * Already booted CPU? 982 * Already booted CPU?
983 */ 983 */
984 if (cpu_isset(cpu, cpu_callin_map)) { 984 if (cpu_isset(cpu, cpu_callin_map)) {
985 Dprintk("do_boot_cpu %d Already started\n", cpu); 985 pr_debug("do_boot_cpu %d Already started\n", cpu);
986 return -ENOSYS; 986 return -ENOSYS;
987 } 987 }
988 988
@@ -1009,7 +1009,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1009 err = do_boot_cpu(apicid, cpu); 1009 err = do_boot_cpu(apicid, cpu);
1010#endif 1010#endif
1011 if (err) { 1011 if (err) {
1012 Dprintk("do_boot_cpu failed %d\n", err); 1012 pr_debug("do_boot_cpu failed %d\n", err);
1013 return -EIO; 1013 return -EIO;
1014 } 1014 }
1015 1015
@@ -1055,6 +1055,34 @@ static __init void disable_smp(void)
1055static int __init smp_sanity_check(unsigned max_cpus) 1055static int __init smp_sanity_check(unsigned max_cpus)
1056{ 1056{
1057 preempt_disable(); 1057 preempt_disable();
1058
1059#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
1060 if (def_to_bigsmp && nr_cpu_ids > 8) {
1061 unsigned int cpu;
1062 unsigned nr;
1063
1064 printk(KERN_WARNING
1065 "More than 8 CPUs detected - skipping them.\n"
1066 "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
1067
1068 nr = 0;
1069 for_each_present_cpu(cpu) {
1070 if (nr >= 8)
1071 cpu_clear(cpu, cpu_present_map);
1072 nr++;
1073 }
1074
1075 nr = 0;
1076 for_each_possible_cpu(cpu) {
1077 if (nr >= 8)
1078 cpu_clear(cpu, cpu_possible_map);
1079 nr++;
1080 }
1081
1082 nr_cpu_ids = 8;
1083 }
1084#endif
1085
1058 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 1086 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1059 printk(KERN_WARNING "weird, boot CPU (#%d) not listed" 1087 printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
1060 "by the BIOS.\n", hard_smp_processor_id()); 1088 "by the BIOS.\n", hard_smp_processor_id());
@@ -1193,6 +1221,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1193 printk(KERN_INFO "CPU%d: ", 0); 1221 printk(KERN_INFO "CPU%d: ", 0);
1194 print_cpu_info(&cpu_data(0)); 1222 print_cpu_info(&cpu_data(0));
1195 setup_boot_clock(); 1223 setup_boot_clock();
1224
1225 if (is_uv_system())
1226 uv_system_init();
1196out: 1227out:
1197 preempt_enable(); 1228 preempt_enable();
1198} 1229}
@@ -1213,7 +1244,7 @@ void __init native_smp_prepare_boot_cpu(void)
1213 1244
1214void __init native_smp_cpus_done(unsigned int max_cpus) 1245void __init native_smp_cpus_done(unsigned int max_cpus)
1215{ 1246{
1216 Dprintk("Boot done.\n"); 1247 pr_debug("Boot done.\n");
1217 1248
1218 impress_friends(); 1249 impress_friends();
1219 smp_checks(); 1250 smp_checks();
@@ -1230,7 +1261,7 @@ static void remove_siblinginfo(int cpu)
1230 int sibling; 1261 int sibling;
1231 struct cpuinfo_x86 *c = &cpu_data(cpu); 1262 struct cpuinfo_x86 *c = &cpu_data(cpu);
1232 1263
1233 for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { 1264 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1234 cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); 1265 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1235 /*/ 1266 /*/
1236 * last thread sibling in this cpu core going down 1267 * last thread sibling in this cpu core going down
@@ -1239,7 +1270,7 @@ static void remove_siblinginfo(int cpu)
1239 cpu_data(sibling).booted_cores--; 1270 cpu_data(sibling).booted_cores--;
1240 } 1271 }
1241 1272
1242 for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) 1273 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1243 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); 1274 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1244 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 1275 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1245 cpus_clear(per_cpu(cpu_core_map, cpu)); 1276 cpus_clear(per_cpu(cpu_core_map, cpu));
@@ -1311,7 +1342,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1311 cpu_clear(cpu, cpu_callout_map); 1342 cpu_clear(cpu, cpu_callout_map);
1312 cpu_clear(cpu, cpu_callin_map); 1343 cpu_clear(cpu, cpu_callin_map);
1313 /* was set by cpu_init() */ 1344 /* was set by cpu_init() */
1314 clear_bit(cpu, (unsigned long *)&cpu_initialized); 1345 cpu_clear(cpu, cpu_initialized);
1315 numa_remove_cpu(cpu); 1346 numa_remove_cpu(cpu);
1316} 1347}
1317 1348
@@ -1347,7 +1378,9 @@ int __cpu_disable(void)
1347 remove_siblinginfo(cpu); 1378 remove_siblinginfo(cpu);
1348 1379
1349 /* It's now safe to remove this processor from the online map */ 1380 /* It's now safe to remove this processor from the online map */
1381 lock_vector_lock();
1350 remove_cpu_from_maps(cpu); 1382 remove_cpu_from_maps(cpu);
1383 unlock_vector_lock();
1351 fixup_irqs(cpu_online_map); 1384 fixup_irqs(cpu_online_map);
1352 return 0; 1385 return 0;
1353} 1386}
@@ -1381,16 +1414,3 @@ void __cpu_die(unsigned int cpu)
1381 BUG(); 1414 BUG();
1382} 1415}
1383#endif 1416#endif
1384
1385/*
1386 * If the BIOS enumerates physical processors before logical,
1387 * maxcpus=N at enumeration-time can be used to disable HT.
1388 */
1389static int __init parse_maxcpus(char *arg)
1390{
1391 extern unsigned int maxcpus;
1392
1393 maxcpus = simple_strtoul(arg, NULL, 0);
1394 return 0;
1395}
1396early_param("maxcpus", parse_maxcpus);
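The new smp_sanity_check() hunk above caps a CONFIG_X86_PC kernel at 8 CPUs by clearing every present and possible CPU past the eighth. A self-contained model of that trimming over a plain bitmask (the kernel operates on cpumask_t, not an unsigned long):

#include <stdio.h>

/*
 * Clear every bit past the first 'limit' set bits, mirroring the
 * for_each_present_cpu()/cpu_clear() loops in the hunk above.
 */
static unsigned long trim_map(unsigned long map, int limit)
{
	int cpu, nr = 0;

	for (cpu = 0; cpu < (int)(8 * sizeof(map)); cpu++) {
		if (!(map & (1UL << cpu)))
			continue;
		if (nr >= limit)
			map &= ~(1UL << cpu);
		nr++;
	}
	return map;
}

int main(void)
{
	/* 16 CPUs present -> only CPUs 0..7 stay */
	printf("%#lx\n", trim_map(0xffff, 8));	/* prints 0xff */
	return 0;
}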
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 99941b37eca0..397e309839dd 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -8,18 +8,21 @@
8DEFINE_PER_CPU(unsigned long, this_cpu_off); 8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off); 9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10 10
11/* Initialize the CPU's GDT. This is either the boot CPU doing itself 11/*
12 (still using the master per-cpu area), or a CPU doing it for a 12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 secondary which will soon come up. */ 13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
14__cpuinit void init_gdt(int cpu) 16__cpuinit void init_gdt(int cpu)
15{ 17{
16 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 18 struct desc_struct gdt;
17 19
18 pack_descriptor(&gdt[GDT_ENTRY_PERCPU], 20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
19 __per_cpu_offset[cpu], 0xFFFFF,
20 0x2 | DESCTYPE_S, 0x8); 21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
21 23
22 gdt[GDT_ENTRY_PERCPU].s = 1; 24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
23 26
24 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; 27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
25 per_cpu(cpu_number, cpu) = cpu; 28 per_cpu(cpu_number, cpu) = cpu;
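The rewritten init_gdt() above no longer pokes the GDT array directly: it packs the per-cpu segment descriptor into a local desc_struct and installs it with write_gdt_entry(), so paravirtualized GDT writes take the proper hook. For reference, a self-contained sketch of how such an 8-byte descriptor is packed (generic x86 segment-descriptor layout; this is not the kernel's pack_descriptor signature):

#include <stdio.h>
#include <stdint.h>

/*
 * Pack an x86 segment descriptor: limit[15:0], base[23:0], the
 * type/S/DPL/P byte, limit[19:16], the AVL/L/DB/G nibble, base[31:24].
 */
static uint64_t pack_descriptor(uint32_t base, uint32_t limit,
				uint8_t type, uint8_t flags)
{
	uint64_t d = 0;

	d |= limit & 0xffff;				/* limit 15:0  */
	d |= (uint64_t)(base & 0xffffff) << 16;		/* base 23:0   */
	d |= (uint64_t)type << 40;			/* type/S/DPL/P */
	d |= (uint64_t)((limit >> 16) & 0xf) << 48;	/* limit 19:16 */
	d |= (uint64_t)(flags & 0xf) << 52;		/* AVL/L/DB/G  */
	d |= (uint64_t)(base >> 24) << 56;		/* base 31:24  */
	return d;
}

int main(void)
{
	/* present, writable data segment, 4k granularity, 32-bit */
	printf("%#018llx\n",
	       (unsigned long long)pack_descriptor(0x12340000, 0xfffff,
						   0x92, 0xc));
	return 0;
}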
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
deleted file mode 100644
index 8b137891791f..000000000000
--- a/arch/x86/kernel/smpcommon_32.c
+++ /dev/null
@@ -1 +0,0 @@
1
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 92c20fee6781..e8b9863ef8c4 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
105static int enable_single_step(struct task_struct *child) 105static int enable_single_step(struct task_struct *child)
106{ 106{
107 struct pt_regs *regs = task_pt_regs(child); 107 struct pt_regs *regs = task_pt_regs(child);
108 unsigned long oflags;
109
110 /*
111 * If we stepped into a sysenter/syscall insn, it trapped in
112 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
113 * If user-mode had set TF itself, then it's still clear from
114 * do_debug() and we need to set it again to restore the user
115 * state so we don't wrongly set TIF_FORCED_TF below.
116 * If enable_single_step() was used last and that is what
117 * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
118 * already set and our bookkeeping is fine.
119 */
120 if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
121 regs->flags |= X86_EFLAGS_TF;
108 122
109 /* 123 /*
110 * Always set TIF_SINGLESTEP - this guarantees that 124 * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
113 */ 127 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP); 128 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115 129
116 /* 130 oflags = regs->flags;
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121 131
122 /* Set TF on the kernel stack.. */ 132 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF; 133 regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
126 * ..but if TF is changed by the instruction we will trace, 136 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we 137 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later. 138 * won't clear it by hand later.
139 *
140 * Note that if we don't actually execute the popf because
141 * of a signal arriving right now or suchlike, we will lose
142 * track of the fact that it really was "us" that set it.
129 */ 143 */
130 if (is_setting_trap_flag(child, regs)) 144 if (is_setting_trap_flag(child, regs)) {
145 clear_tsk_thread_flag(child, TIF_FORCED_TF);
131 return 0; 146 return 0;
147 }
148
149 /*
150 * If TF was already set, check whether it was us who set it.
151 * If not, we should never attempt a block step.
152 */
153 if (oflags & X86_EFLAGS_TF)
154 return test_tsk_thread_flag(child, TIF_FORCED_TF);
132 155
133 set_tsk_thread_flag(child, TIF_FORCED_TF); 156 set_tsk_thread_flag(child, TIF_FORCED_TF);
134 157
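The enable_single_step() rework above is all bookkeeping about who owns the trap flag: if the traced instruction itself sets TF, or user space had TF set on its own, the kernel must not later clear it behind the program's back. A self-contained model of that decision (TIF_* flags reduced to bools; names mirror the diff but nothing here is kernel code):

#include <stdbool.h>
#include <stdio.h>

#define X86_EFLAGS_TF 0x100UL

struct task_model {
	unsigned long flags;	/* stands in for regs->flags */
	bool tif_singlestep;	/* TIF_SINGLESTEP */
	bool tif_forced_tf;	/* TIF_FORCED_TF  */
};

/*
 * Model of the reworked enable_single_step() tail: returns nonzero
 * when TF is known to be the kernel's doing (block stepping is safe).
 */
static int model_enable_single_step(struct task_model *t, bool insn_sets_tf)
{
	unsigned long oflags;

	if (t->tif_singlestep)		/* do_debug() ate TF: put it back */
		t->flags |= X86_EFLAGS_TF;
	t->tif_singlestep = true;

	oflags = t->flags;
	t->flags |= X86_EFLAGS_TF;	/* arm the trap flag */

	if (insn_sets_tf) {		/* traced insn sets TF itself */
		t->tif_forced_tf = false;
		return 0;		/* not ours: never clear it later */
	}
	if (oflags & X86_EFLAGS_TF)	/* TF pre-set: ours only if forced */
		return t->tif_forced_tf;

	t->tif_forced_tf = true;	/* we own TF from here on */
	return 1;
}

int main(void)
{
	struct task_model t = { 0, false, false };

	printf("%d\n", model_enable_single_step(&t, false)); /* 1: kernel owns TF */
	printf("%d\n", model_enable_single_step(&t, false)); /* 1: still forced  */
	return 0;
}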
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index adff5562f5fd..d44395ff34c3 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -326,3 +326,9 @@ ENTRY(sys_call_table)
326 .long sys_fallocate 326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */ 327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime 328 .long sys_timerfd_gettime
329 .long sys_signalfd4
330 .long sys_eventfd2
331 .long sys_epoll_create1
332 .long sys_dup3 /* 330 */
333 .long sys_pipe2
334 .long sys_inotify_init1
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 059ca6ee59b4..ffe3c664afc0 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -129,6 +129,7 @@ void __init hpet_time_init(void)
129 */ 129 */
130void __init time_init(void) 130void __init time_init(void)
131{ 131{
132 pre_time_init_hook();
132 tsc_init(); 133 tsc_init();
133 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
134} 135}
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d0fbb7712ab0..8b8c0d6640fa 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -17,6 +17,7 @@
17#include <asm/genapic.h> 17#include <asm/genapic.h>
18#include <asm/idle.h> 18#include <asm/idle.h>
19#include <asm/tsc.h> 19#include <asm/tsc.h>
20#include <asm/irq_vectors.h>
20 21
21#include <mach_apic.h> 22#include <mach_apic.h>
22 23
@@ -783,7 +784,7 @@ static int __init uv_bau_init(void)
783 uv_init_blade(blade, node, cur_cpu); 784 uv_init_blade(blade, node, cur_cpu);
784 cur_cpu += uv_blade_nr_possible_cpus(blade); 785 cur_cpu += uv_blade_nr_possible_cpus(blade);
785 } 786 }
786 set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); 787 alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
787 uv_enable_timeouts(); 788 uv_enable_timeouts();
788 789
789 return 0; 790 return 0;
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 8a768973c4f0..03df8e45e5a1 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -58,6 +58,7 @@
58#include <asm/nmi.h> 58#include <asm/nmi.h>
59#include <asm/smp.h> 59#include <asm/smp.h>
60#include <asm/io.h> 60#include <asm/io.h>
61#include <asm/traps.h>
61 62
62#include "mach_traps.h" 63#include "mach_traps.h"
63 64
@@ -77,26 +78,6 @@ char ignore_fpu_irq;
77gate_desc idt_table[256] 78gate_desc idt_table[256]
78 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
79 80
80asmlinkage void divide_error(void);
81asmlinkage void debug(void);
82asmlinkage void nmi(void);
83asmlinkage void int3(void);
84asmlinkage void overflow(void);
85asmlinkage void bounds(void);
86asmlinkage void invalid_op(void);
87asmlinkage void device_not_available(void);
88asmlinkage void coprocessor_segment_overrun(void);
89asmlinkage void invalid_TSS(void);
90asmlinkage void segment_not_present(void);
91asmlinkage void stack_segment(void);
92asmlinkage void general_protection(void);
93asmlinkage void page_fault(void);
94asmlinkage void coprocessor_error(void);
95asmlinkage void simd_coprocessor_error(void);
96asmlinkage void alignment_check(void);
97asmlinkage void spurious_interrupt_bug(void);
98asmlinkage void machine_check(void);
99
100int panic_on_unrecovered_nmi; 81int panic_on_unrecovered_nmi;
101int kstack_depth_to_print = 24; 82int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64; 83static unsigned int code_bytes = 64;
@@ -256,7 +237,7 @@ static const struct stacktrace_ops print_trace_ops = {
256 237
257static void 238static void
258show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 239show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
259 unsigned long *stack, unsigned long bp, char *log_lvl) 240 unsigned long *stack, unsigned long bp, char *log_lvl)
260{ 241{
261 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 242 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
262 printk("%s =======================\n", log_lvl); 243 printk("%s =======================\n", log_lvl);
@@ -383,6 +364,54 @@ int is_valid_bugaddr(unsigned long ip)
383 return ud2 == 0x0b0f; 364 return ud2 == 0x0b0f;
384} 365}
385 366
367static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
368static int die_owner = -1;
369static unsigned int die_nest_count;
370
371unsigned __kprobes long oops_begin(void)
372{
373 unsigned long flags;
374
375 oops_enter();
376
377 if (die_owner != raw_smp_processor_id()) {
378 console_verbose();
379 raw_local_irq_save(flags);
380 __raw_spin_lock(&die_lock);
381 die_owner = smp_processor_id();
382 die_nest_count = 0;
383 bust_spinlocks(1);
384 } else {
385 raw_local_irq_save(flags);
386 }
387 die_nest_count++;
388 return flags;
389}
390
391void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
392{
393 bust_spinlocks(0);
394 die_owner = -1;
395 add_taint(TAINT_DIE);
396 __raw_spin_unlock(&die_lock);
397 raw_local_irq_restore(flags);
398
399 if (!regs)
400 return;
401
402 if (kexec_should_crash(current))
403 crash_kexec(regs);
404
405 if (in_interrupt())
406 panic("Fatal exception in interrupt");
407
408 if (panic_on_oops)
409 panic("Fatal exception");
410
411 oops_exit();
412 do_exit(signr);
413}
414
386int __kprobes __die(const char *str, struct pt_regs *regs, long err) 415int __kprobes __die(const char *str, struct pt_regs *regs, long err)
387{ 416{
388 unsigned short ss; 417 unsigned short ss;
@@ -423,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
423 */ 452 */
424void die(const char *str, struct pt_regs *regs, long err) 453void die(const char *str, struct pt_regs *regs, long err)
425{ 454{
426 static struct { 455 unsigned long flags = oops_begin();
427 raw_spinlock_t lock;
428 u32 lock_owner;
429 int lock_owner_depth;
430 } die = {
431 .lock = __RAW_SPIN_LOCK_UNLOCKED,
432 .lock_owner = -1,
433 .lock_owner_depth = 0
434 };
435 unsigned long flags;
436
437 oops_enter();
438
439 if (die.lock_owner != raw_smp_processor_id()) {
440 console_verbose();
441 raw_local_irq_save(flags);
442 __raw_spin_lock(&die.lock);
443 die.lock_owner = smp_processor_id();
444 die.lock_owner_depth = 0;
445 bust_spinlocks(1);
446 } else {
447 raw_local_irq_save(flags);
448 }
449 456
450 if (++die.lock_owner_depth < 3) { 457 if (die_nest_count < 3) {
451 report_bug(regs->ip, regs); 458 report_bug(regs->ip, regs);
452 459
453 if (__die(str, regs, err)) 460 if (__die(str, regs, err))
@@ -456,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err)
456 printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); 463 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
457 } 464 }
458 465
459 bust_spinlocks(0); 466 oops_end(flags, regs, SIGSEGV);
460 die.lock_owner = -1;
461 add_taint(TAINT_DIE);
462 __raw_spin_unlock(&die.lock);
463 raw_local_irq_restore(flags);
464
465 if (!regs)
466 return;
467
468 if (kexec_should_crash(current))
469 crash_kexec(regs);
470
471 if (in_interrupt())
472 panic("Fatal exception in interrupt");
473
474 if (panic_on_oops)
475 panic("Fatal exception");
476
477 oops_exit();
478 do_exit(SIGSEGV);
479} 467}
480 468
481static inline void 469static inline void
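The die() rework above lifts the lock out into oops_begin()/oops_end(), the same re-entrant pattern traps_64.c already had: the first oops on a CPU grabs the lock, a nested oops on that CPU only bumps die_nest_count (and die() suppresses output past a depth of 3). A self-contained pthread model of the ownership logic (threads stand in for CPUs; the IRQ disabling and bust_spinlocks work is elided):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t die_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t die_owner;
static int die_owner_set;
static unsigned int die_nest_count;

/*
 * Model of oops_begin(): the first oops on a CPU takes the lock and
 * resets the nest count; a recursive oops on the same CPU only bumps
 * the count. The unlocked owner check is safe for the same reason as
 * in the kernel: only self-recursion can observe its own ownership.
 */
static void model_oops_begin(void)
{
	if (!die_owner_set || !pthread_equal(die_owner, pthread_self())) {
		pthread_mutex_lock(&die_lock);
		die_owner = pthread_self();
		die_owner_set = 1;
		die_nest_count = 0;
	}
	die_nest_count++;
}

/* Model of oops_end(): give up ownership, let the next CPU report. */
static void model_oops_end(void)
{
	die_owner_set = 0;
	pthread_mutex_unlock(&die_lock);
}

int main(void)
{
	model_oops_begin();
	model_oops_begin();			/* recursive die(): no deadlock */
	printf("nest=%u\n", die_nest_count);	/* nest=2 */
	model_oops_end();
	return 0;
}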
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 2696a6837782..513caaca7115 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -51,30 +51,10 @@
51#include <asm/pgalloc.h> 51#include <asm/pgalloc.h>
52#include <asm/proto.h> 52#include <asm/proto.h>
53#include <asm/pda.h> 53#include <asm/pda.h>
54#include <asm/traps.h>
54 55
55#include <mach_traps.h> 56#include <mach_traps.h>
56 57
57asmlinkage void divide_error(void);
58asmlinkage void debug(void);
59asmlinkage void nmi(void);
60asmlinkage void int3(void);
61asmlinkage void overflow(void);
62asmlinkage void bounds(void);
63asmlinkage void invalid_op(void);
64asmlinkage void device_not_available(void);
65asmlinkage void double_fault(void);
66asmlinkage void coprocessor_segment_overrun(void);
67asmlinkage void invalid_TSS(void);
68asmlinkage void segment_not_present(void);
69asmlinkage void stack_segment(void);
70asmlinkage void general_protection(void);
71asmlinkage void page_fault(void);
72asmlinkage void coprocessor_error(void);
73asmlinkage void simd_coprocessor_error(void);
74asmlinkage void alignment_check(void);
75asmlinkage void spurious_interrupt_bug(void);
76asmlinkage void machine_check(void);
77
78int panic_on_unrecovered_nmi; 58int panic_on_unrecovered_nmi;
79int kstack_depth_to_print = 12; 59int kstack_depth_to_print = 12;
80static unsigned int code_bytes = 64; 60static unsigned int code_bytes = 64;
@@ -355,17 +335,24 @@ static const struct stacktrace_ops print_trace_ops = {
355 .address = print_trace_address, 335 .address = print_trace_address,
356}; 336};
357 337
358void show_trace(struct task_struct *task, struct pt_regs *regs, 338static void
359 unsigned long *stack, unsigned long bp) 339show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
340 unsigned long *stack, unsigned long bp, char *log_lvl)
360{ 341{
361 printk("\nCall Trace:\n"); 342 printk("\nCall Trace:\n");
362 dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); 343 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
363 printk("\n"); 344 printk("\n");
364} 345}
365 346
347void show_trace(struct task_struct *task, struct pt_regs *regs,
348 unsigned long *stack, unsigned long bp)
349{
350 show_trace_log_lvl(task, regs, stack, bp, "");
351}
352
366static void 353static void
367_show_stack(struct task_struct *task, struct pt_regs *regs, 354show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
368 unsigned long *sp, unsigned long bp) 355 unsigned long *sp, unsigned long bp, char *log_lvl)
369{ 356{
370 unsigned long *stack; 357 unsigned long *stack;
371 int i; 358 int i;
@@ -399,12 +386,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs,
399 printk(" %016lx", *stack++); 386 printk(" %016lx", *stack++);
400 touch_nmi_watchdog(); 387 touch_nmi_watchdog();
401 } 388 }
402 show_trace(task, regs, sp, bp); 389 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
403} 390}
404 391
405void show_stack(struct task_struct *task, unsigned long *sp) 392void show_stack(struct task_struct *task, unsigned long *sp)
406{ 393{
407 _show_stack(task, NULL, sp, 0); 394 show_stack_log_lvl(task, NULL, sp, 0, "");
408} 395}
409 396
410/* 397/*
@@ -454,7 +441,8 @@ void show_registers(struct pt_regs *regs)
454 u8 *ip; 441 u8 *ip;
455 442
456 printk("Stack: "); 443 printk("Stack: ");
457 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); 444 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
445 regs->bp, "");
458 printk("\n"); 446 printk("\n");
459 447
460 printk(KERN_EMERG "Code: "); 448 printk(KERN_EMERG "Code: ");
@@ -518,7 +506,7 @@ unsigned __kprobes long oops_begin(void)
518} 506}
519 507
520void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 508void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
521{ 509{
522 die_owner = -1; 510 die_owner = -1;
523 bust_spinlocks(0); 511 bust_spinlocks(0);
524 die_nest_count--; 512 die_nest_count--;
@@ -1143,7 +1131,14 @@ asmlinkage void math_state_restore(void)
1143 } 1131 }
1144 1132
1145 clts(); /* Allow maths ops (or we recurse) */ 1133 clts(); /* Allow maths ops (or we recurse) */
1146 restore_fpu_checking(&me->thread.xstate->fxsave); 1134 /*
1135 * Paranoid restore. Send a SIGSEGV if we fail to restore the state.
1136 */
1137 if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
1138 stts();
1139 force_sig(SIGSEGV, me);
1140 return;
1141 }
1147 task_thread_info(me)->status |= TS_USEDFPU; 1142 task_thread_info(me)->status |= TS_USEDFPU;
1148 me->fpu_counter++; 1143 me->fpu_counter++;
1149} 1144}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7603c0553909..8f98e9de1b82 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
104/* 104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance 105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */ 106 */
107static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) 107static u64 tsc_read_refs(u64 *pm, u64 *hpet)
108{ 108{
109 u64 t1, t2; 109 u64 t1, t2;
110 int i; 110 int i;
@@ -122,80 +122,216 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
122 return ULLONG_MAX; 122 return ULLONG_MAX;
123} 123}
124 124
125/** 125/*
126 * native_calibrate_tsc - calibrate the tsc on boot 126 * Try to calibrate the TSC against the Programmable
127 * Interrupt Timer and return the frequency of the TSC
128 * in kHz.
129 *
130 * Return ULONG_MAX on failure to calibrate.
127 */ 131 */
128unsigned long native_calibrate_tsc(void) 132static unsigned long pit_calibrate_tsc(void)
129{ 133{
130 unsigned long flags; 134 u64 tsc, t1, t2, delta;
131 u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2; 135 unsigned long tscmin, tscmax;
132 int hpet = is_hpet_enabled(); 136 int pitcnt;
133 unsigned int tsc_khz_val = 0;
134
135 local_irq_save(flags);
136
137 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
138 137
138 /* Set the Gate high, disable speaker */
139 outb((inb(0x61) & ~0x02) | 0x01, 0x61); 139 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
140 140
141 /*
142 * Setup CTC channel 2* for mode 0, (interrupt on terminal
143 * count mode), binary count. Set the latch register to 50ms
144 * (LSB then MSB) to begin countdown.
145 */
141 outb(0xb0, 0x43); 146 outb(0xb0, 0x43);
142 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 147 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
143 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 148 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
144 tr1 = get_cycles();
145 while ((inb(0x61) & 0x20) == 0);
146 tr2 = get_cycles();
147 149
148 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 150 tsc = t1 = t2 = get_cycles();
149 151
150 local_irq_restore(flags); 152 pitcnt = 0;
153 tscmax = 0;
154 tscmin = ULONG_MAX;
155 while ((inb(0x61) & 0x20) == 0) {
156 t2 = get_cycles();
157 delta = t2 - tsc;
158 tsc = t2;
159 if ((unsigned long) delta < tscmin)
160 tscmin = (unsigned int) delta;
161 if ((unsigned long) delta > tscmax)
162 tscmax = (unsigned int) delta;
163 pitcnt++;
164 }
151 165
152 /* 166 /*
153 * Preset the result with the raw and inaccurate PIT 167 * Sanity checks:
154 * calibration value 168 *
169 * If we were not able to read the PIT more than 5000
170 * times, then we have been hit by a massive SMI
171 *
172 * If the maximum is 10 times larger than the minimum,
173 * then we got hit by an SMI as well.
155 */ 174 */
156 delta = (tr2 - tr1); 175 if (pitcnt < 5000 || tscmax > 10 * tscmin)
176 return ULONG_MAX;
177
178 /* Calculate the PIT value */
179 delta = t2 - t1;
157 do_div(delta, 50); 180 do_div(delta, 50);
158 tsc_khz_val = delta; 181 return delta;
182}
183
184
185/**
186 * native_calibrate_tsc - calibrate the tsc on boot
187 */
188unsigned long native_calibrate_tsc(void)
189{
190 u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
191 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
192 unsigned long flags;
193 int hpet = is_hpet_enabled(), i;
194
195 /*
196 * Run 5 calibration loops to get the lowest frequency value
197 * (the best estimate). We use two different calibration modes
198 * here:
199 *
200 * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
201 * load a timeout of 50ms. We read the time right after we
202 * started the timer and wait until the PIT count down reaches
203 * zero. In each wait loop iteration we read the TSC and check
204 * the delta to the previous read. We keep track of the min
205 * and max values of that delta. The delta is mostly defined
206 * by the IO time of the PIT access, so we can detect when a
207 * SMI/SMM disturbance happend between the two reads. If the
208 * maximum time is significantly larger than the minimum time,
209 * then we discard the result and have another try.
210 *
211 * 2) Reference counter. If available we use the HPET or the
212 * PMTIMER as a reference to check the sanity of that value.
213 * We use separate TSC readouts and check inside of the
214 reference read for an SMI/SMM disturbance. We discard
215 * disturbed values here as well. We do that around the PIT
216 * calibration delay loop as we have to wait for a certain
217 * amount of time anyway.
218 */
219 for (i = 0; i < 5; i++) {
220 unsigned long tsc_pit_khz;
221
222 /*
223 * Read the start value and the reference count of
224 * hpet/pmtimer when available. Then do the PIT
225 * calibration, which will take at least 50ms, and
226 * read the end value.
227 */
228 local_irq_save(flags);
229 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
230 tsc_pit_khz = pit_calibrate_tsc();
231 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
232 local_irq_restore(flags);
233
234 /* Pick the lowest PIT TSC calibration so far */
235 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
236
237 /* hpet or pmtimer available ? */
238 if (!hpet && !pm1 && !pm2)
239 continue;
240
241 /* Check, whether the sampling was disturbed by an SMI */
242 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
243 continue;
244
245 tsc2 = (tsc2 - tsc1) * 1000000LL;
246
247 if (hpet) {
248 if (hpet2 < hpet1)
249 hpet2 += 0x100000000ULL;
250 hpet2 -= hpet1;
251 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
252 do_div(tsc1, 1000000);
253 } else {
254 if (pm2 < pm1)
255 pm2 += (u64)ACPI_PM_OVRRUN;
256 pm2 -= pm1;
257 tsc1 = pm2 * 1000000000LL;
258 do_div(tsc1, PMTMR_TICKS_PER_SEC);
259 }
260
261 do_div(tsc2, tsc1);
262 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
263 }
264
265 /*
266 * Now check the results.
267 */
268 if (tsc_pit_min == ULONG_MAX) {
269 /* PIT gave no useful value */
270 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
271
272 /* We don't have an alternative source, disable TSC */
273 if (!hpet && !pm1 && !pm2) {
274 printk("TSC: No reference (HPET/PMTIMER) available\n");
275 return 0;
276 }
277
278 /* The alternative source failed as well, disable TSC */
279 if (tsc_ref_min == ULONG_MAX) {
280 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
281 "failed due to SMI disturbance.\n");
282 return 0;
283 }
284
285 /* Use the alternative source */
286 printk(KERN_INFO "TSC: using %s reference calibration\n",
287 hpet ? "HPET" : "PMTIMER");
288
289 return tsc_ref_min;
290 }
159 291
160 /* hpet or pmtimer available ? */ 292 /* We don't have an alternative source, use the PIT calibration value */
161 if (!hpet && !pm1 && !pm2) { 293 if (!hpet && !pm1 && !pm2) {
162 printk(KERN_INFO "TSC calibrated against PIT\n"); 294 printk(KERN_INFO "TSC: Using PIT calibration value\n");
163 goto out; 295 return tsc_pit_min;
164 } 296 }
165 297
166 /* Check, whether the sampling was disturbed by an SMI */ 298 /* The alternative source failed, use the PIT calibration value */
167 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { 299 if (tsc_ref_min == ULONG_MAX) {
168 printk(KERN_WARNING "TSC calibration disturbed by SMI, " 300 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due "
169 "using PIT calibration result\n"); 301 "to SMI disturbance. Using PIT calibration\n");
170 goto out; 302 return tsc_pit_min;
171 } 303 }
172 304
173 tsc2 = (tsc2 - tsc1) * 1000000LL; 305 /* Check the reference deviation */
174 306 delta = ((u64) tsc_pit_min) * 100;
175 if (hpet) { 307 do_div(delta, tsc_ref_min);
176 printk(KERN_INFO "TSC calibrated against HPET\n"); 308
177 if (hpet2 < hpet1) 309 /*
178 hpet2 += 0x100000000ULL; 310 * If both calibration results are inside a 5% window, then we
179 hpet2 -= hpet1; 311 * use the lower of the two frequencies, as it is probably the
180 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); 312 * closest estimate.
181 do_div(tsc1, 1000000); 313 */
182 } else { 314 if (delta >= 95 && delta <= 105) {
183 printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); 315 printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
184 if (pm2 < pm1) 316 hpet ? "HPET" : "PMTIMER");
185 pm2 += (u64)ACPI_PM_OVRRUN; 317 printk(KERN_INFO "TSC: using %s calibration value\n",
186 pm2 -= pm1; 318 tsc_pit_min <= tsc_ref_min ? "PIT" :
187 tsc1 = pm2 * 1000000000LL; 319 hpet ? "HPET" : "PMTIMER");
188 do_div(tsc1, PMTMR_TICKS_PER_SEC); 320 return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
189 } 321 }
190 322
191 do_div(tsc2, tsc1); 323 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
192 tsc_khz_val = tsc2; 324 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
193 325
194out: 326 /*
195 return tsc_khz_val; 327 * The calibration values differ too much. In doubt, we use
328 * the PIT value as we know that there are PMTIMERs around
329 * running at double speed.
330 */
331 printk(KERN_INFO "TSC: Using PIT calibration value\n");
332 return tsc_pit_min;
196} 333}
197 334
198
199#ifdef CONFIG_X86_32 335#ifdef CONFIG_X86_32
200/* Only called from the Powernow K7 cpu freq driver */ 336/* Only called from the Powernow K7 cpu freq driver */
201int recalibrate_cpu_khz(void) 337int recalibrate_cpu_khz(void)
@@ -314,7 +450,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
314 mark_tsc_unstable("cpufreq changes"); 450 mark_tsc_unstable("cpufreq changes");
315 } 451 }
316 452
317 set_cyc2ns_scale(tsc_khz_ref, freq->cpu); 453 set_cyc2ns_scale(tsc_khz, freq->cpu);
318 454
319 return 0; 455 return 0;
320} 456}
@@ -325,6 +461,10 @@ static struct notifier_block time_cpufreq_notifier_block = {
325 461
326static int __init cpufreq_tsc(void) 462static int __init cpufreq_tsc(void)
327{ 463{
464 if (!cpu_has_tsc)
465 return 0;
466 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
467 return 0;
328 cpufreq_register_notifier(&time_cpufreq_notifier_block, 468 cpufreq_register_notifier(&time_cpufreq_notifier_block,
329 CPUFREQ_TRANSITION_NOTIFIER); 469 CPUFREQ_TRANSITION_NOTIFIER);
330 return 0; 470 return 0;
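
The 5% cross-check implemented above is plain integer arithmetic. A minimal
stand-alone sketch of the same acceptance test (the function name and the
sample values are illustrative, not kernel code):

	#include <stdio.h>

	/* Sketch of the PIT-vs-reference test: delta mirrors
	 * delta = tsc_pit_min * 100; do_div(delta, tsc_ref_min); */
	static unsigned long pick_tsc_khz(unsigned long pit_khz,
					  unsigned long ref_khz)
	{
		unsigned long delta = pit_khz * 100UL / ref_khz;

		/* Both results inside a 5% window: trust the lower one */
		if (delta >= 95 && delta <= 105)
			return pit_khz <= ref_khz ? pit_khz : ref_khz;

		/* Otherwise prefer PIT: some PMTIMERs run at double speed */
		return pit_khz;
	}

	int main(void)
	{
		printf("%lu\n", pick_tsc_khz(2394012, 2393987)); /* agree: lower wins */
		printf("%lu\n", pick_tsc_khz(2394012, 1197006)); /* 2x off: PIT wins */
		return 0;
	}
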
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0577825cf89b..9ffb01c31c40 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -88,11 +88,9 @@ static __cpuinit void check_tsc_warp(void)
88 __raw_spin_unlock(&sync_lock); 88 __raw_spin_unlock(&sync_lock);
89 } 89 }
90 } 90 }
91 if (!(now-start)) { 91 WARN(!(now-start),
92 printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", 92 "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
93 now-start, end-start); 93 now-start, end-start);
94 WARN_ON(1);
95 }
96} 94}
97 95
98/* 96/*
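
The tsc_sync change above leans on WARN(condition, fmt, ...), which prints the
message plus a backtrace only when the condition is true, so the open-coded
test/printk/WARN_ON(1) triple collapses into one call. An out-of-context
illustration of the two shapes:

	/* before: three steps, warning unconditional inside the branch */
	if (!(now - start)) {
		printk("Warning: zero tsc calibration delta: %Ld\n", now - start);
		WARN_ON(1);
	}

	/* after: one conditional call; nothing is printed when the
	 * delta is non-zero */
	WARN(!(now - start),
	     "Warning: zero tsc calibration delta: %Ld\n", now - start);
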
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index e94bdb6add1d..594ef47f0a63 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -73,7 +73,7 @@ int is_visws_box(void)
73 return visws_board_type >= 0; 73 return visws_board_type >= 0;
74} 74}
75 75
76static int __init visws_time_init_quirk(void) 76static int __init visws_time_init(void)
77{ 77{
78 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 78 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
79 79
@@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void)
93 return 0; 93 return 0;
94} 94}
95 95
96static int __init visws_pre_intr_init_quirk(void) 96static int __init visws_pre_intr_init(void)
97{ 97{
98 init_VISWS_APIC_irqs(); 98 init_VISWS_APIC_irqs();
99 99
@@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
114 114
115long long mem_size __initdata = 0; 115long long mem_size __initdata = 0;
116 116
117static char * __init visws_memory_setup_quirk(void) 117static char * __init visws_memory_setup(void)
118{ 118{
119 long long gfx_mem_size = 8 * MB; 119 long long gfx_mem_size = 8 * MB;
120 120
@@ -176,7 +176,7 @@ static void visws_machine_power_off(void)
176 outl(PIIX_SPECIAL_STOP, 0xCFC); 176 outl(PIIX_SPECIAL_STOP, 0xCFC);
177} 177}
178 178
179static int __init visws_get_smp_config_quirk(unsigned int early) 179static int __init visws_get_smp_config(unsigned int early)
180{ 180{
181 /* 181 /*
182 * Prevent MP-table parsing by the generic code: 182 * Prevent MP-table parsing by the generic code:
@@ -184,15 +184,13 @@ static int __init visws_get_smp_config_quirk(unsigned int early)
184 return 1; 184 return 1;
185} 185}
186 186
187extern unsigned int __cpuinitdata maxcpus;
188
189/* 187/*
190 * The Visual Workstation is Intel MP compliant in the hardware 188 * The Visual Workstation is Intel MP compliant in the hardware
191 * sense, but it doesn't have a BIOS(-configuration table). 189 * sense, but it doesn't have a BIOS(-configuration table).
192 * No problem for Linux. 190 * No problem for Linux.
193 */ 191 */
194 192
195static void __init MP_processor_info (struct mpc_config_processor *m) 193static void __init MP_processor_info(struct mpc_config_processor *m)
196{ 194{
197 int ver, logical_apicid; 195 int ver, logical_apicid;
198 physid_mask_t apic_cpus; 196 physid_mask_t apic_cpus;
@@ -232,7 +230,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
232 apic_version[m->mpc_apicid] = ver; 230 apic_version[m->mpc_apicid] = ver;
233} 231}
234 232
235int __init visws_find_smp_config_quirk(unsigned int reserve) 233static int __init visws_find_smp_config(unsigned int reserve)
236{ 234{
237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); 235 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 236 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -244,8 +242,8 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
244 ncpus = CO_CPU_MAX; 242 ncpus = CO_CPU_MAX;
245 } 243 }
246 244
247 if (ncpus > maxcpus) 245 if (ncpus > setup_max_cpus)
248 ncpus = maxcpus; 246 ncpus = setup_max_cpus;
249 247
250#ifdef CONFIG_X86_LOCAL_APIC 248#ifdef CONFIG_X86_LOCAL_APIC
251 smp_found_config = 1; 249 smp_found_config = 1;
@@ -258,7 +256,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
258 return 1; 256 return 1;
259} 257}
260 258
261extern int visws_trap_init_quirk(void); 259static int visws_trap_init(void);
260
261static struct x86_quirks visws_x86_quirks __initdata = {
262 .arch_time_init = visws_time_init,
263 .arch_pre_intr_init = visws_pre_intr_init,
264 .arch_memory_setup = visws_memory_setup,
265 .arch_intr_init = NULL,
266 .arch_trap_init = visws_trap_init,
267 .mach_get_smp_config = visws_get_smp_config,
268 .mach_find_smp_config = visws_find_smp_config,
269};
262 270
263void __init visws_early_detect(void) 271void __init visws_early_detect(void)
264{ 272{
@@ -272,16 +280,10 @@ void __init visws_early_detect(void)
272 280
273 /* 281 /*
274 * Install special quirks for timer, interrupt and memory setup: 282 * Install special quirks for timer, interrupt and memory setup:
275 */
276 arch_time_init_quirk = visws_time_init_quirk;
277 arch_pre_intr_init_quirk = visws_pre_intr_init_quirk;
278 arch_memory_setup_quirk = visws_memory_setup_quirk;
279
280 /*
281 * Fall back to generic behavior for traps: 283 * Fall back to generic behavior for traps:
284 * Override generic MP-table parsing:
282 */ 285 */
283 arch_intr_init_quirk = NULL; 286 x86_quirks = &visws_x86_quirks;
284 arch_trap_init_quirk = visws_trap_init_quirk;
285 287
286 /* 288 /*
287 * Install reboot quirks: 289 * Install reboot quirks:
@@ -294,12 +296,6 @@ void __init visws_early_detect(void)
294 */ 296 */
295 no_broadcast = 0; 297 no_broadcast = 0;
296 298
297 /*
298 * Override generic MP-table parsing:
299 */
300 mach_get_smp_config_quirk = visws_get_smp_config_quirk;
301 mach_find_smp_config_quirk = visws_find_smp_config_quirk;
302
303#ifdef CONFIG_X86_IO_APIC 299#ifdef CONFIG_X86_IO_APIC
304 /* 300 /*
305 * Turn off IO-APIC detection and initialization: 301 * Turn off IO-APIC detection and initialization:
@@ -426,7 +422,7 @@ static __init void cobalt_init(void)
426 co_apic_read(CO_APIC_ID)); 422 co_apic_read(CO_APIC_ID));
427} 423}
428 424
429int __init visws_trap_init_quirk(void) 425static int __init visws_trap_init(void)
430{ 426{
431 lithium_init(); 427 lithium_init();
432 cobalt_init(); 428 cobalt_init();
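
The visws conversion above replaces individually assigned arch_*_quirk
function pointers with one x86_quirks hook table installed at early-detect
time. The consuming side is not part of this hunk; the sketch below shows the
usual call pattern for such a table and is an assumption about its shape, not
the actual generic code:

	/* Hypothetical consumer of an x86_quirks-style hook table. */
	struct hooks {
		int (*time_init)(void);       /* non-zero: platform handled it */
		int (*pre_intr_init)(void);
		char *(*memory_setup)(void);
	};

	static struct hooks default_hooks;    /* all-NULL: generic path */
	static struct hooks *active_hooks = &default_hooks;

	static void do_time_init(void)
	{
		/* Try the platform hook first; fall back to generic code
		 * when the hook is absent or declines. */
		if (active_hooks->time_init && active_hooks->time_init())
			return;
		/* ... generic timer setup ... */
	}
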
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b15346092b7b..6ca515d6db54 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -37,6 +37,7 @@
37#include <asm/timer.h> 37#include <asm/timer.h>
38#include <asm/vmi_time.h> 38#include <asm/vmi_time.h>
39#include <asm/kmap_types.h> 39#include <asm/kmap_types.h>
40#include <asm/setup.h>
40 41
41/* Convenient for calling VMI functions indirectly in the ROM */ 42/* Convenient for calling VMI functions indirectly in the ROM */
42typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); 43typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
@@ -683,7 +684,7 @@ void vmi_bringup(void)
683{ 684{
684 /* We must establish the lowmem mapping for MMU ops to work */ 685 /* We must establish the lowmem mapping for MMU ops to work */
685 if (vmi_ops.set_linear_mapping) 686 if (vmi_ops.set_linear_mapping)
686 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); 687 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
687} 688}
688 689
689/* 690/*
@@ -906,7 +907,6 @@ static inline int __init activate_vmi(void)
906#ifdef CONFIG_X86_LOCAL_APIC 907#ifdef CONFIG_X86_LOCAL_APIC
907 para_fill(pv_apic_ops.apic_read, APICRead); 908 para_fill(pv_apic_ops.apic_read, APICRead);
908 para_fill(pv_apic_ops.apic_write, APICWrite); 909 para_fill(pv_apic_ops.apic_write, APICWrite);
909 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
910#endif 910#endif
911 911
912 /* 912 /*
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index cdb2363697d2..af5bdad84604 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -209,3 +209,11 @@ SECTIONS
209 209
210 DWARF_DEBUG 210 DWARF_DEBUG
211} 211}
212
213#ifdef CONFIG_KEXEC
214/* Link time checks */
215#include <asm/kexec.h>
216
217ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
218 "kexec control code size is too big")
219#endif
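
ASSERT(expression, message) in a GNU ld script aborts the link with the given
message when the expression evaluates to zero, which is what the kexec size
check above exploits. A stand-alone fragment using the same construct (the
section name and budget are made up):

	SECTIONS
	{
		.data : { *(.data) }
	}

	/* Fail the build, not the boot, if the section outgrows its budget */
	ASSERT(SIZEOF(.data) <= 0x10000, ".data exceeds its 64 KiB budget")
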
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c7..7766d36983fc 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 62}
63 63
64static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, 64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 65 unsigned long addr, unsigned len)
66{ 66{
67 switch (type) { 67 switch (type) {
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fabc5f3b..ce3251ce5504 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM 22 depends on HAVE_KVM
23 select PREEMPT_NOTIFIERS 23 select PREEMPT_NOTIFIERS
24 select MMU_NOTIFIER
24 select ANON_INODES 25 select ANON_INODES
25 ---help--- 26 ---help---
26 Support hosting fully virtualized guest machines using hardware 27 Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c97d35c218db..d0e940bb6f40 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,7 +2,8 @@
2# Makefile for Kernel-based Virtual Machine module 2# Makefile for Kernel-based Virtual Machine module
3# 3#
4 4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
6 coalesced_mmio.o)
6ifeq ($(CONFIG_KVM_TRACE),y) 7ifeq ($(CONFIG_KVM_TRACE),y)
7common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) 8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
8endif 9endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 3829aa7b663f..c0f7872a9124 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -91,7 +91,7 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
91 c->gate = val; 91 c->gate = val;
92} 92}
93 93
94int pit_get_gate(struct kvm *kvm, int channel) 94static int pit_get_gate(struct kvm *kvm, int channel)
95{ 95{
96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
97 97
@@ -193,19 +193,16 @@ static void pit_latch_status(struct kvm *kvm, int channel)
193 } 193 }
194} 194}
195 195
196int __pit_timer_fn(struct kvm_kpit_state *ps) 196static int __pit_timer_fn(struct kvm_kpit_state *ps)
197{ 197{
198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; 198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
199 struct kvm_kpit_timer *pt = &ps->pit_timer; 199 struct kvm_kpit_timer *pt = &ps->pit_timer;
200 200
201 atomic_inc(&pt->pending); 201 if (!atomic_inc_and_test(&pt->pending))
202 smp_mb__after_atomic_inc();
203 if (vcpu0) {
204 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); 202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
205 if (waitqueue_active(&vcpu0->wq)) { 203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
206 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
207 wake_up_interruptible(&vcpu0->wq); 205 wake_up_interruptible(&vcpu0->wq);
208 }
209 } 206 }
210 207
211 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
@@ -308,6 +305,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
308 create_pit_timer(&ps->pit_timer, val, 0); 305 create_pit_timer(&ps->pit_timer, val, 0);
309 break; 306 break;
310 case 2: 307 case 2:
308 case 3:
311 create_pit_timer(&ps->pit_timer, val, 1); 309 create_pit_timer(&ps->pit_timer, val, 1);
312 break; 310 break;
313 default: 311 default:
@@ -459,7 +457,8 @@ static void pit_ioport_read(struct kvm_io_device *this,
459 mutex_unlock(&pit_state->lock); 457 mutex_unlock(&pit_state->lock);
460} 458}
461 459
462static int pit_in_range(struct kvm_io_device *this, gpa_t addr) 460static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
461 int len, int is_write)
463{ 462{
464 return ((addr >= KVM_PIT_BASE_ADDRESS) && 463 return ((addr >= KVM_PIT_BASE_ADDRESS) &&
465 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); 464 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
@@ -500,7 +499,8 @@ static void speaker_ioport_read(struct kvm_io_device *this,
500 mutex_unlock(&pit_state->lock); 499 mutex_unlock(&pit_state->lock);
501} 500}
502 501
503static int speaker_in_range(struct kvm_io_device *this, gpa_t addr) 502static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
503 int len, int is_write)
504{ 504{
505 return (addr == KVM_SPEAKER_BASE_ADDRESS); 505 return (addr == KVM_SPEAKER_BASE_ADDRESS);
506} 506}
@@ -575,7 +575,7 @@ void kvm_free_pit(struct kvm *kvm)
575 } 575 }
576} 576}
577 577
578void __inject_pit_timer_intr(struct kvm *kvm) 578static void __inject_pit_timer_intr(struct kvm *kvm)
579{ 579{
580 mutex_lock(&kvm->lock); 580 mutex_lock(&kvm->lock);
581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); 581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index ab29cf2def47..c31164e8aa46 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -130,8 +130,10 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
130{ 130{
131 struct kvm_pic *s = opaque; 131 struct kvm_pic *s = opaque;
132 132
133 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 133 if (irq >= 0 && irq < PIC_NUM_PINS) {
134 pic_update_irq(s); 134 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
135 pic_update_irq(s);
136 }
135} 137}
136 138
137/* 139/*
@@ -346,7 +348,8 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
346 return s->elcr; 348 return s->elcr;
347} 349}
348 350
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) 351static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
352 int len, int is_write)
350{ 353{
351 switch (addr) { 354 switch (addr) {
352 case 0x20: 355 case 0x20:
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2a15be2275c0..7ca47cbb48bb 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -30,6 +30,8 @@
30#include "ioapic.h" 30#include "ioapic.h"
31#include "lapic.h" 31#include "lapic.h"
32 32
33#define PIC_NUM_PINS 16
34
33struct kvm; 35struct kvm;
34struct kvm_vcpu; 36struct kvm_vcpu;
35 37
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ebc03f5ae162..73f43de69f67 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -356,8 +356,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
356 case APIC_DM_SMI: 356 case APIC_DM_SMI:
357 printk(KERN_DEBUG "Ignoring guest SMI\n"); 357 printk(KERN_DEBUG "Ignoring guest SMI\n");
358 break; 358 break;
359
359 case APIC_DM_NMI: 360 case APIC_DM_NMI:
360 printk(KERN_DEBUG "Ignoring guest NMI\n"); 361 kvm_inject_nmi(vcpu);
361 break; 362 break;
362 363
363 case APIC_DM_INIT: 364 case APIC_DM_INIT:
@@ -572,6 +573,8 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
572{ 573{
573 u32 val = 0; 574 u32 val = 0;
574 575
576 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
577
575 if (offset >= LAPIC_MMIO_LENGTH) 578 if (offset >= LAPIC_MMIO_LENGTH)
576 return 0; 579 return 0;
577 580
@@ -695,6 +698,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
695 698
696 offset &= 0xff0; 699 offset &= 0xff0;
697 700
701 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
702
698 switch (offset) { 703 switch (offset) {
699 case APIC_ID: /* Local APIC ID */ 704 case APIC_ID: /* Local APIC ID */
700 apic_set_reg(apic, APIC_ID, val); 705 apic_set_reg(apic, APIC_ID, val);
@@ -780,7 +785,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
780 785
781} 786}
782 787
783static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) 788static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
789 int len, int size)
784{ 790{
785 struct kvm_lapic *apic = (struct kvm_lapic *)this->private; 791 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
786 int ret = 0; 792 int ret = 0;
@@ -939,8 +945,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
939 int result = 0; 945 int result = 0;
940 wait_queue_head_t *q = &apic->vcpu->wq; 946 wait_queue_head_t *q = &apic->vcpu->wq;
941 947
942 atomic_inc(&apic->timer.pending); 948 if (!atomic_inc_and_test(&apic->timer.pending))
943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); 949 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
944 if (waitqueue_active(q)) { 950 if (waitqueue_active(q)) {
945 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 951 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
946 wake_up_interruptible(q); 952 wake_up_interruptible(q);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 676c396c9cee..81858881287e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,6 +31,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu);
31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
34u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
34 35
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 36int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 37int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7e7c3969f7a2..3da2508eb22a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -66,7 +66,8 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
66#endif 66#endif
67 67
68#if defined(MMU_DEBUG) || defined(AUDIT) 68#if defined(MMU_DEBUG) || defined(AUDIT)
69static int dbg = 1; 69static int dbg = 0;
70module_param(dbg, bool, 0644);
70#endif 71#endif
71 72
72#ifndef MMU_DEBUG 73#ifndef MMU_DEBUG
@@ -652,6 +653,88 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
652 account_shadowed(kvm, gfn); 653 account_shadowed(kvm, gfn);
653} 654}
654 655
656static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
657{
658 u64 *spte;
659 int need_tlb_flush = 0;
660
661 while ((spte = rmap_next(kvm, rmapp, NULL))) {
662 BUG_ON(!(*spte & PT_PRESENT_MASK));
663 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
664 rmap_remove(kvm, spte);
665 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
666 need_tlb_flush = 1;
667 }
668 return need_tlb_flush;
669}
670
671static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
672 int (*handler)(struct kvm *kvm, unsigned long *rmapp))
673{
674 int i;
675 int retval = 0;
676
677 /*
678 * If mmap_sem isn't taken, we can walk the memslots with only
679 * the mmu_lock by skipping over the slots with userspace_addr == 0.
680 */
681 for (i = 0; i < kvm->nmemslots; i++) {
682 struct kvm_memory_slot *memslot = &kvm->memslots[i];
683 unsigned long start = memslot->userspace_addr;
684 unsigned long end;
685
686 /* mmu_lock protects userspace_addr */
687 if (!start)
688 continue;
689
690 end = start + (memslot->npages << PAGE_SHIFT);
691 if (hva >= start && hva < end) {
692 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
693 retval |= handler(kvm, &memslot->rmap[gfn_offset]);
694 retval |= handler(kvm,
695 &memslot->lpage_info[
696 gfn_offset /
697 KVM_PAGES_PER_HPAGE].rmap_pde);
698 }
699 }
700
701 return retval;
702}
703
704int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
705{
706 return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
707}
708
709static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
710{
711 u64 *spte;
712 int young = 0;
713
714 /* always return old for EPT */
715 if (!shadow_accessed_mask)
716 return 0;
717
718 spte = rmap_next(kvm, rmapp, NULL);
719 while (spte) {
720 int _young;
721 u64 _spte = *spte;
722 BUG_ON(!(_spte & PT_PRESENT_MASK));
723 _young = _spte & PT_ACCESSED_MASK;
724 if (_young) {
725 young = 1;
726 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
727 }
728 spte = rmap_next(kvm, rmapp, spte);
729 }
730 return young;
731}
732
733int kvm_age_hva(struct kvm *kvm, unsigned long hva)
734{
735 return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
736}
737
655#ifdef MMU_DEBUG 738#ifdef MMU_DEBUG
656static int is_empty_shadow_page(u64 *spt) 739static int is_empty_shadow_page(u64 *spt)
657{ 740{
@@ -776,6 +859,15 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
776 BUG(); 859 BUG();
777} 860}
778 861
862static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
863 struct kvm_mmu_page *sp)
864{
865 int i;
866
867 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
868 sp->spt[i] = shadow_trap_nonpresent_pte;
869}
870
779static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 871static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
780{ 872{
781 unsigned index; 873 unsigned index;
@@ -841,7 +933,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
841 hlist_add_head(&sp->hash_link, bucket); 933 hlist_add_head(&sp->hash_link, bucket);
842 if (!metaphysical) 934 if (!metaphysical)
843 rmap_write_protect(vcpu->kvm, gfn); 935 rmap_write_protect(vcpu->kvm, gfn);
844 vcpu->arch.mmu.prefetch_page(vcpu, sp); 936 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
937 vcpu->arch.mmu.prefetch_page(vcpu, sp);
938 else
939 nonpaging_prefetch_page(vcpu, sp);
845 return sp; 940 return sp;
846} 941}
847 942
@@ -917,14 +1012,17 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
917 } 1012 }
918 kvm_mmu_page_unlink_children(kvm, sp); 1013 kvm_mmu_page_unlink_children(kvm, sp);
919 if (!sp->root_count) { 1014 if (!sp->root_count) {
920 if (!sp->role.metaphysical) 1015 if (!sp->role.metaphysical && !sp->role.invalid)
921 unaccount_shadowed(kvm, sp->gfn); 1016 unaccount_shadowed(kvm, sp->gfn);
922 hlist_del(&sp->hash_link); 1017 hlist_del(&sp->hash_link);
923 kvm_mmu_free_page(kvm, sp); 1018 kvm_mmu_free_page(kvm, sp);
924 } else { 1019 } else {
1020 int invalid = sp->role.invalid;
925 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1021 list_move(&sp->link, &kvm->arch.active_mmu_pages);
926 sp->role.invalid = 1; 1022 sp->role.invalid = 1;
927 kvm_reload_remote_mmus(kvm); 1023 kvm_reload_remote_mmus(kvm);
1024 if (!sp->role.metaphysical && !invalid)
1025 unaccount_shadowed(kvm, sp->gfn);
928 } 1026 }
929 kvm_mmu_reset_last_pte_updated(kvm); 1027 kvm_mmu_reset_last_pte_updated(kvm);
930} 1028}
@@ -1103,7 +1201,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1103 mark_page_dirty(vcpu->kvm, gfn); 1201 mark_page_dirty(vcpu->kvm, gfn);
1104 1202
1105 pgprintk("%s: setting spte %llx\n", __func__, spte); 1203 pgprintk("%s: setting spte %llx\n", __func__, spte);
1106 pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n", 1204 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1107 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", 1205 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1108 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); 1206 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
1109 set_shadow_pte(shadow_pte, spte); 1207 set_shadow_pte(shadow_pte, spte);
@@ -1122,8 +1220,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1122 else 1220 else
1123 kvm_release_pfn_clean(pfn); 1221 kvm_release_pfn_clean(pfn);
1124 } 1222 }
1125 if (!ptwrite || !*ptwrite) 1223 if (speculative) {
1126 vcpu->arch.last_pte_updated = shadow_pte; 1224 vcpu->arch.last_pte_updated = shadow_pte;
1225 vcpu->arch.last_pte_gfn = gfn;
1226 }
1127} 1227}
1128 1228
1129static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 1229static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -1171,9 +1271,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1171 return -ENOMEM; 1271 return -ENOMEM;
1172 } 1272 }
1173 1273
1174 table[index] = __pa(new_table->spt) 1274 set_shadow_pte(&table[index],
1175 | PT_PRESENT_MASK | PT_WRITABLE_MASK 1275 __pa(new_table->spt)
1176 | shadow_user_mask | shadow_x_mask; 1276 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1277 | shadow_user_mask | shadow_x_mask);
1177 } 1278 }
1178 table_addr = table[index] & PT64_BASE_ADDR_MASK; 1279 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1179 } 1280 }
@@ -1184,6 +1285,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1184 int r; 1285 int r;
1185 int largepage = 0; 1286 int largepage = 0;
1186 pfn_t pfn; 1287 pfn_t pfn;
1288 unsigned long mmu_seq;
1187 1289
1188 down_read(&current->mm->mmap_sem); 1290 down_read(&current->mm->mmap_sem);
1189 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1291 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1191,6 +1293,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1191 largepage = 1; 1293 largepage = 1;
1192 } 1294 }
1193 1295
1296 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1297 /* implicit mb(), we'll read before PT lock is unlocked */
1194 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1298 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1195 up_read(&current->mm->mmap_sem); 1299 up_read(&current->mm->mmap_sem);
1196 1300
@@ -1201,6 +1305,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1201 } 1305 }
1202 1306
1203 spin_lock(&vcpu->kvm->mmu_lock); 1307 spin_lock(&vcpu->kvm->mmu_lock);
1308 if (mmu_notifier_retry(vcpu, mmu_seq))
1309 goto out_unlock;
1204 kvm_mmu_free_some_pages(vcpu); 1310 kvm_mmu_free_some_pages(vcpu);
1205 r = __direct_map(vcpu, v, write, largepage, gfn, pfn, 1311 r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
1206 PT32E_ROOT_LEVEL); 1312 PT32E_ROOT_LEVEL);
@@ -1208,18 +1314,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1208 1314
1209 1315
1210 return r; 1316 return r;
1211}
1212
1213 1317
1214static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, 1318out_unlock:
1215 struct kvm_mmu_page *sp) 1319 spin_unlock(&vcpu->kvm->mmu_lock);
1216{ 1320 kvm_release_pfn_clean(pfn);
1217 int i; 1321 return 0;
1218
1219 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1220 sp->spt[i] = shadow_trap_nonpresent_pte;
1221} 1322}
1222 1323
1324
1223static void mmu_free_roots(struct kvm_vcpu *vcpu) 1325static void mmu_free_roots(struct kvm_vcpu *vcpu)
1224{ 1326{
1225 int i; 1327 int i;
@@ -1335,6 +1437,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1335 int r; 1437 int r;
1336 int largepage = 0; 1438 int largepage = 0;
1337 gfn_t gfn = gpa >> PAGE_SHIFT; 1439 gfn_t gfn = gpa >> PAGE_SHIFT;
1440 unsigned long mmu_seq;
1338 1441
1339 ASSERT(vcpu); 1442 ASSERT(vcpu);
1340 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 1443 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1348,6 +1451,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1348 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1451 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1349 largepage = 1; 1452 largepage = 1;
1350 } 1453 }
1454 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1455 /* implicit mb(), we'll read before PT lock is unlocked */
1351 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1456 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1352 up_read(&current->mm->mmap_sem); 1457 up_read(&current->mm->mmap_sem);
1353 if (is_error_pfn(pfn)) { 1458 if (is_error_pfn(pfn)) {
@@ -1355,12 +1460,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1355 return 1; 1460 return 1;
1356 } 1461 }
1357 spin_lock(&vcpu->kvm->mmu_lock); 1462 spin_lock(&vcpu->kvm->mmu_lock);
1463 if (mmu_notifier_retry(vcpu, mmu_seq))
1464 goto out_unlock;
1358 kvm_mmu_free_some_pages(vcpu); 1465 kvm_mmu_free_some_pages(vcpu);
1359 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 1466 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1360 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); 1467 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
1361 spin_unlock(&vcpu->kvm->mmu_lock); 1468 spin_unlock(&vcpu->kvm->mmu_lock);
1362 1469
1363 return r; 1470 return r;
1471
1472out_unlock:
1473 spin_unlock(&vcpu->kvm->mmu_lock);
1474 kvm_release_pfn_clean(pfn);
1475 return 0;
1364} 1476}
1365 1477
1366static void nonpaging_free(struct kvm_vcpu *vcpu) 1478static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1660,6 +1772,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1660 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1772 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1661 vcpu->arch.update_pte.largepage = 1; 1773 vcpu->arch.update_pte.largepage = 1;
1662 } 1774 }
1775 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
1776 /* implicit mb(), we'll read before PT lock is unlocked */
1663 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1777 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1664 up_read(&current->mm->mmap_sem); 1778 up_read(&current->mm->mmap_sem);
1665 1779
@@ -1671,6 +1785,18 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1671 vcpu->arch.update_pte.pfn = pfn; 1785 vcpu->arch.update_pte.pfn = pfn;
1672} 1786}
1673 1787
1788static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
1789{
1790 u64 *spte = vcpu->arch.last_pte_updated;
1791
1792 if (spte
1793 && vcpu->arch.last_pte_gfn == gfn
1794 && shadow_accessed_mask
1795 && !(*spte & shadow_accessed_mask)
1796 && is_shadow_present_pte(*spte))
1797 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
1798}
1799
1674void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 1800void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1675 const u8 *new, int bytes) 1801 const u8 *new, int bytes)
1676{ 1802{
@@ -1694,6 +1820,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1694 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 1820 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
1695 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 1821 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1696 spin_lock(&vcpu->kvm->mmu_lock); 1822 spin_lock(&vcpu->kvm->mmu_lock);
1823 kvm_mmu_access_page(vcpu, gfn);
1697 kvm_mmu_free_some_pages(vcpu); 1824 kvm_mmu_free_some_pages(vcpu);
1698 ++vcpu->kvm->stat.mmu_pte_write; 1825 ++vcpu->kvm->stat.mmu_pte_write;
1699 kvm_mmu_audit(vcpu, "pre pte write"); 1826 kvm_mmu_audit(vcpu, "pre pte write");
@@ -1791,6 +1918,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1791 spin_unlock(&vcpu->kvm->mmu_lock); 1918 spin_unlock(&vcpu->kvm->mmu_lock);
1792 return r; 1919 return r;
1793} 1920}
1921EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
1794 1922
1795void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 1923void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1796{ 1924{
@@ -1847,6 +1975,12 @@ void kvm_enable_tdp(void)
1847} 1975}
1848EXPORT_SYMBOL_GPL(kvm_enable_tdp); 1976EXPORT_SYMBOL_GPL(kvm_enable_tdp);
1849 1977
1978void kvm_disable_tdp(void)
1979{
1980 tdp_enabled = false;
1981}
1982EXPORT_SYMBOL_GPL(kvm_disable_tdp);
1983
1850static void free_mmu_pages(struct kvm_vcpu *vcpu) 1984static void free_mmu_pages(struct kvm_vcpu *vcpu)
1851{ 1985{
1852 struct kvm_mmu_page *sp; 1986 struct kvm_mmu_page *sp;
@@ -1948,7 +2082,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
1948 kvm_flush_remote_tlbs(kvm); 2082 kvm_flush_remote_tlbs(kvm);
1949} 2083}
1950 2084
1951void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) 2085static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
1952{ 2086{
1953 struct kvm_mmu_page *page; 2087 struct kvm_mmu_page *page;
1954 2088
@@ -1968,6 +2102,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1968 list_for_each_entry(kvm, &vm_list, vm_list) { 2102 list_for_each_entry(kvm, &vm_list, vm_list) {
1969 int npages; 2103 int npages;
1970 2104
2105 if (!down_read_trylock(&kvm->slots_lock))
2106 continue;
1971 spin_lock(&kvm->mmu_lock); 2107 spin_lock(&kvm->mmu_lock);
1972 npages = kvm->arch.n_alloc_mmu_pages - 2108 npages = kvm->arch.n_alloc_mmu_pages -
1973 kvm->arch.n_free_mmu_pages; 2109 kvm->arch.n_free_mmu_pages;
@@ -1980,6 +2116,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1980 nr_to_scan--; 2116 nr_to_scan--;
1981 2117
1982 spin_unlock(&kvm->mmu_lock); 2118 spin_unlock(&kvm->mmu_lock);
2119 up_read(&kvm->slots_lock);
1983 } 2120 }
1984 if (kvm_freed) 2121 if (kvm_freed)
1985 list_move_tail(&kvm_freed->vm_list, &vm_list); 2122 list_move_tail(&kvm_freed->vm_list, &vm_list);
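
Every mmu_seq/mmu_notifier_retry() pair added in this file follows the same
shape: sample the notifier sequence count before the sleepable gfn_to_pfn()
lookup, then, once mmu_lock is held, bail out if an invalidation ran in
between. A condensed sketch of that pattern, with the actual fault-path
details elided:

	static int map_one_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
	{
		unsigned long mmu_seq;
		pfn_t pfn;

		mmu_seq = vcpu->kvm->mmu_notifier_seq;
		/* implicit mb(): the seq read must not be reordered
		 * past the translation below */
		pfn = gfn_to_pfn(vcpu->kvm, gfn);	/* may sleep */

		spin_lock(&vcpu->kvm->mmu_lock);
		if (mmu_notifier_retry(vcpu, mmu_seq)) {
			/* An invalidate notifier fired while we slept;
			 * the pfn may be stale. Drop it and let the
			 * guest re-fault instead of mapping it. */
			spin_unlock(&vcpu->kvm->mmu_lock);
			kvm_release_pfn_clean(pfn);
			return 0;
		}
		/* ... install the mapping under mmu_lock ... */
		spin_unlock(&vcpu->kvm->mmu_lock);
		return 1;
	}
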
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1730757bbc7a..258e5d56298e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -15,7 +15,8 @@
15#define PT_USER_MASK (1ULL << 2) 15#define PT_USER_MASK (1ULL << 2)
16#define PT_PWT_MASK (1ULL << 3) 16#define PT_PWT_MASK (1ULL << 3)
17#define PT_PCD_MASK (1ULL << 4) 17#define PT_PCD_MASK (1ULL << 4)
18#define PT_ACCESSED_MASK (1ULL << 5) 18#define PT_ACCESSED_SHIFT 5
19#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
19#define PT_DIRTY_MASK (1ULL << 6) 20#define PT_DIRTY_MASK (1ULL << 6)
20#define PT_PAGE_SIZE_MASK (1ULL << 7) 21#define PT_PAGE_SIZE_MASK (1ULL << 7)
21#define PT_PAT_MASK (1ULL << 7) 22#define PT_PAT_MASK (1ULL << 7)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 934c7b619396..4a814bff21f2 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
263 pfn = vcpu->arch.update_pte.pfn; 263 pfn = vcpu->arch.update_pte.pfn;
264 if (is_error_pfn(pfn)) 264 if (is_error_pfn(pfn))
265 return; 265 return;
266 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
267 return;
266 kvm_get_pfn(pfn); 268 kvm_get_pfn(pfn);
267 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
268 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), 270 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -343,7 +345,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
343 shadow_addr = __pa(shadow_page->spt); 345 shadow_addr = __pa(shadow_page->spt);
344 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK 346 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
345 | PT_WRITABLE_MASK | PT_USER_MASK; 347 | PT_WRITABLE_MASK | PT_USER_MASK;
346 *shadow_ent = shadow_pte; 348 set_shadow_pte(shadow_ent, shadow_pte);
347 } 349 }
348 350
349 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, 351 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
@@ -380,6 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
380 int r; 382 int r;
381 pfn_t pfn; 383 pfn_t pfn;
382 int largepage = 0; 384 int largepage = 0;
385 unsigned long mmu_seq;
383 386
384 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 387 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
385 kvm_mmu_audit(vcpu, "pre page fault"); 388 kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,6 +416,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
413 largepage = 1; 416 largepage = 1;
414 } 417 }
415 } 418 }
419 mmu_seq = vcpu->kvm->mmu_notifier_seq;
420 /* implicit mb(), we'll read before PT lock is unlocked */
416 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 421 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
417 up_read(&current->mm->mmap_sem); 422 up_read(&current->mm->mmap_sem);
418 423
@@ -424,6 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
424 } 429 }
425 430
426 spin_lock(&vcpu->kvm->mmu_lock); 431 spin_lock(&vcpu->kvm->mmu_lock);
432 if (mmu_notifier_retry(vcpu, mmu_seq))
433 goto out_unlock;
427 kvm_mmu_free_some_pages(vcpu); 434 kvm_mmu_free_some_pages(vcpu);
428 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 435 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
429 largepage, &write_pt, pfn); 436 largepage, &write_pt, pfn);
@@ -439,6 +446,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
439 spin_unlock(&vcpu->kvm->mmu_lock); 446 spin_unlock(&vcpu->kvm->mmu_lock);
440 447
441 return write_pt; 448 return write_pt;
449
450out_unlock:
451 spin_unlock(&vcpu->kvm->mmu_lock);
452 kvm_release_pfn_clean(pfn);
453 return 0;
442} 454}
443 455
444static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 456static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -460,8 +472,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
460static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, 472static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
461 struct kvm_mmu_page *sp) 473 struct kvm_mmu_page *sp)
462{ 474{
463 int i, offset = 0, r = 0; 475 int i, j, offset, r;
464 pt_element_t pt; 476 pt_element_t pt[256 / sizeof(pt_element_t)];
477 gpa_t pte_gpa;
465 478
466 if (sp->role.metaphysical 479 if (sp->role.metaphysical
467 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { 480 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
@@ -469,19 +482,20 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
469 return; 482 return;
470 } 483 }
471 484
472 if (PTTYPE == 32) 485 pte_gpa = gfn_to_gpa(sp->gfn);
486 if (PTTYPE == 32) {
473 offset = sp->role.quadrant << PT64_LEVEL_BITS; 487 offset = sp->role.quadrant << PT64_LEVEL_BITS;
488 pte_gpa += offset * sizeof(pt_element_t);
489 }
474 490
475 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 491 for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
476 gpa_t pte_gpa = gfn_to_gpa(sp->gfn); 492 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
477 pte_gpa += (i+offset) * sizeof(pt_element_t); 493 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
478 494 for (j = 0; j < ARRAY_SIZE(pt); ++j)
479 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, 495 if (r || is_present_pte(pt[j]))
480 sizeof(pt_element_t)); 496 sp->spt[i+j] = shadow_trap_nonpresent_pte;
481 if (r || is_present_pte(pt)) 497 else
482 sp->spt[i] = shadow_trap_nonpresent_pte; 498 sp->spt[i+j] = shadow_notrap_nonpresent_pte;
483 else
484 sp->spt[i] = shadow_notrap_nonpresent_pte;
485 } 499 }
486} 500}
487 501
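
The prefetch_page rewrite above batches guest PTE reads: with 8-byte PT64
entries, the 256-byte buffer holds 32 entries, so a 512-entry page takes 16
kvm_read_guest_atomic() calls instead of 512. A user-space sketch of the
batching loop, with memcpy() standing in for the guest read:

	#include <string.h>

	typedef unsigned long long pt_element_t;	/* 8 bytes for PT64 */
	#define ENT_PER_PAGE 512

	static void prefetch_batched(const pt_element_t *guest_pt,
				     pt_element_t *shadow_pt)
	{
		pt_element_t pt[256 / sizeof(pt_element_t)];	/* 32 entries */
		size_t i, j, batch = sizeof(pt) / sizeof(pt[0]);

		for (i = 0; i < ENT_PER_PAGE; i += batch) {
			/* one bulk copy replaces 32 single-entry reads */
			memcpy(pt, &guest_pt[i], sizeof(pt));
			for (j = 0; j < batch; ++j)
				shadow_pt[i + j] = pt[j]; /* trap/notrap
							     decision here */
		}
	}
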
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6b0d5fa5bab3..8233b86c778c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -27,6 +27,8 @@
27 27
28#include <asm/desc.h> 28#include <asm/desc.h>
29 29
30#define __ex(x) __kvm_handle_fault_on_reboot(x)
31
30MODULE_AUTHOR("Qumranet"); 32MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL"); 33MODULE_LICENSE("GPL");
32 34
@@ -60,6 +62,7 @@ static int npt = 1;
60module_param(npt, int, S_IRUGO); 62module_param(npt, int, S_IRUGO);
61 63
62static void kvm_reput_irq(struct vcpu_svm *svm); 64static void kvm_reput_irq(struct vcpu_svm *svm);
65static void svm_flush_tlb(struct kvm_vcpu *vcpu);
63 66
64static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 67static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
65{ 68{
@@ -129,17 +132,17 @@ static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
129 132
130static inline void clgi(void) 133static inline void clgi(void)
131{ 134{
132 asm volatile (SVM_CLGI); 135 asm volatile (__ex(SVM_CLGI));
133} 136}
134 137
135static inline void stgi(void) 138static inline void stgi(void)
136{ 139{
137 asm volatile (SVM_STGI); 140 asm volatile (__ex(SVM_STGI));
138} 141}
139 142
140static inline void invlpga(unsigned long addr, u32 asid) 143static inline void invlpga(unsigned long addr, u32 asid)
141{ 144{
142 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); 145 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
143} 146}
144 147
145static inline unsigned long kvm_read_cr2(void) 148static inline unsigned long kvm_read_cr2(void)
@@ -270,19 +273,11 @@ static int has_svm(void)
270 273
271static void svm_hardware_disable(void *garbage) 274static void svm_hardware_disable(void *garbage)
272{ 275{
273 struct svm_cpu_data *svm_data 276 uint64_t efer;
274 = per_cpu(svm_data, raw_smp_processor_id());
275
276 if (svm_data) {
277 uint64_t efer;
278 277
279 wrmsrl(MSR_VM_HSAVE_PA, 0); 278 wrmsrl(MSR_VM_HSAVE_PA, 0);
280 rdmsrl(MSR_EFER, efer); 279 rdmsrl(MSR_EFER, efer);
281 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); 280 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
282 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
283 __free_page(svm_data->save_area);
284 kfree(svm_data);
285 }
286} 281}
287 282
288static void svm_hardware_enable(void *garbage) 283static void svm_hardware_enable(void *garbage)
@@ -321,6 +316,19 @@ static void svm_hardware_enable(void *garbage)
321 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 316 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
322} 317}
323 318
319static void svm_cpu_uninit(int cpu)
320{
321 struct svm_cpu_data *svm_data
322 = per_cpu(svm_data, raw_smp_processor_id());
323
324 if (!svm_data)
325 return;
326
327 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
328 __free_page(svm_data->save_area);
329 kfree(svm_data);
330}
331
324static int svm_cpu_init(int cpu) 332static int svm_cpu_init(int cpu)
325{ 333{
326 struct svm_cpu_data *svm_data; 334 struct svm_cpu_data *svm_data;
@@ -446,7 +454,8 @@ static __init int svm_hardware_setup(void)
446 if (npt_enabled) { 454 if (npt_enabled) {
447 printk(KERN_INFO "kvm: Nested Paging enabled\n"); 455 printk(KERN_INFO "kvm: Nested Paging enabled\n");
448 kvm_enable_tdp(); 456 kvm_enable_tdp();
449 } 457 } else
458 kvm_disable_tdp();
450 459
451 return 0; 460 return 0;
452 461
@@ -458,6 +467,11 @@ err:
458 467
459static __exit void svm_hardware_unsetup(void) 468static __exit void svm_hardware_unsetup(void)
460{ 469{
470 int cpu;
471
472 for_each_online_cpu(cpu)
473 svm_cpu_uninit(cpu);
474
461 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 475 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
462 iopm_base = 0; 476 iopm_base = 0;
463} 477}
@@ -707,10 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
707 rdtscll(vcpu->arch.host_tsc); 721 rdtscll(vcpu->arch.host_tsc);
708} 722}
709 723
710static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
711{
712}
713
714static void svm_cache_regs(struct kvm_vcpu *vcpu) 724static void svm_cache_regs(struct kvm_vcpu *vcpu)
715{ 725{
716 struct vcpu_svm *svm = to_svm(vcpu); 726 struct vcpu_svm *svm = to_svm(vcpu);
@@ -869,6 +879,10 @@ set:
869static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 879static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
870{ 880{
871 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; 881 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
882 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
883
884 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
885 force_new_asid(vcpu);
872 886
873 vcpu->arch.cr4 = cr4; 887 vcpu->arch.cr4 = cr4;
874 if (!npt_enabled) 888 if (!npt_enabled)
@@ -949,7 +963,9 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
949 963
950static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 964static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
951{ 965{
952 return to_svm(vcpu)->db_regs[dr]; 966 unsigned long val = to_svm(vcpu)->db_regs[dr];
967 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
968 return val;
953} 969}
954 970
955static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 971static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
@@ -997,13 +1013,35 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
997 struct kvm *kvm = svm->vcpu.kvm; 1013 struct kvm *kvm = svm->vcpu.kvm;
998 u64 fault_address; 1014 u64 fault_address;
999 u32 error_code; 1015 u32 error_code;
1016 bool event_injection = false;
1000 1017
1001 if (!irqchip_in_kernel(kvm) && 1018 if (!irqchip_in_kernel(kvm) &&
1002 is_external_interrupt(exit_int_info)) 1019 is_external_interrupt(exit_int_info)) {
1020 event_injection = true;
1003 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); 1021 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
1022 }
1004 1023
1005 fault_address = svm->vmcb->control.exit_info_2; 1024 fault_address = svm->vmcb->control.exit_info_2;
1006 error_code = svm->vmcb->control.exit_info_1; 1025 error_code = svm->vmcb->control.exit_info_1;
1026
1027 if (!npt_enabled)
1028 KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
1029 (u32)fault_address, (u32)(fault_address >> 32),
1030 handler);
1031 else
1032 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
1033 (u32)fault_address, (u32)(fault_address >> 32),
1034 handler);
1035 /*
1036 * FIXME: This shouldn't be necessary here, but there is a flush
1037 * missing in the MMU code. Until we find this bug, flush the
1038 * complete TLB here on an NPF
1039 */
1040 if (npt_enabled)
1041 svm_flush_tlb(&svm->vcpu);
1042
1043 if (event_injection)
1044 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1007 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1045 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1008} 1046}
1009 1047
@@ -1081,6 +1119,19 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1081 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1119 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1082} 1120}
1083 1121
1122static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1123{
1124 KVMTRACE_0D(NMI, &svm->vcpu, handler);
1125 return 1;
1126}
1127
1128static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1129{
1130 ++svm->vcpu.stat.irq_exits;
1131 KVMTRACE_0D(INTR, &svm->vcpu, handler);
1132 return 1;
1133}
1134
1084static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1135static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1085{ 1136{
1086 return 1; 1137 return 1;
@@ -1219,6 +1270,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1219 if (svm_get_msr(&svm->vcpu, ecx, &data)) 1270 if (svm_get_msr(&svm->vcpu, ecx, &data))
1220 kvm_inject_gp(&svm->vcpu, 0); 1271 kvm_inject_gp(&svm->vcpu, 0);
1221 else { 1272 else {
1273 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
1274 (u32)(data >> 32), handler);
1275
1222 svm->vmcb->save.rax = data & 0xffffffff; 1276 svm->vmcb->save.rax = data & 0xffffffff;
1223 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 1277 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1224 svm->next_rip = svm->vmcb->save.rip + 2; 1278 svm->next_rip = svm->vmcb->save.rip + 2;
@@ -1284,16 +1338,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1284 case MSR_K7_EVNTSEL1: 1338 case MSR_K7_EVNTSEL1:
1285 case MSR_K7_EVNTSEL2: 1339 case MSR_K7_EVNTSEL2:
1286 case MSR_K7_EVNTSEL3: 1340 case MSR_K7_EVNTSEL3:
1341 case MSR_K7_PERFCTR0:
1342 case MSR_K7_PERFCTR1:
1343 case MSR_K7_PERFCTR2:
1344 case MSR_K7_PERFCTR3:
1287 /* 1345 /*
1288 * only support writing 0 to the performance counters for now 1346 * Just discard all writes to the performance counters; this
1289 * to make Windows happy. Should be replaced by a real 1347 * should keep both older linux and windows 64-bit guests
1290 * performance counter emulation later. 1348 * happy
1291 */ 1349 */
1292 if (data != 0) 1350 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
1293 goto unhandled; 1351
1294 break; 1352 break;
1295 default: 1353 default:
1296 unhandled:
1297 return kvm_set_msr_common(vcpu, ecx, data); 1354 return kvm_set_msr_common(vcpu, ecx, data);
1298 } 1355 }
1299 return 0; 1356 return 0;
@@ -1304,6 +1361,10 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1304 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1361 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1305 u64 data = (svm->vmcb->save.rax & -1u) 1362 u64 data = (svm->vmcb->save.rax & -1u)
1306 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 1363 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1364
1365 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
1366 handler);
1367
1307 svm->next_rip = svm->vmcb->save.rip + 2; 1368 svm->next_rip = svm->vmcb->save.rip + 2;
1308 if (svm_set_msr(&svm->vcpu, ecx, data)) 1369 if (svm_set_msr(&svm->vcpu, ecx, data))
1309 kvm_inject_gp(&svm->vcpu, 0); 1370 kvm_inject_gp(&svm->vcpu, 0);
@@ -1323,6 +1384,8 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1323static int interrupt_window_interception(struct vcpu_svm *svm, 1384static int interrupt_window_interception(struct vcpu_svm *svm,
1324 struct kvm_run *kvm_run) 1385 struct kvm_run *kvm_run)
1325{ 1386{
1387 KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
1388
1326 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1389 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1327 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 1390 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1328 /* 1391 /*
@@ -1364,8 +1427,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1364 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 1427 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1365 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 1428 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1366 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 1429 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
1367 [SVM_EXIT_INTR] = nop_on_interception, 1430 [SVM_EXIT_INTR] = intr_interception,
1368 [SVM_EXIT_NMI] = nop_on_interception, 1431 [SVM_EXIT_NMI] = nmi_interception,
1369 [SVM_EXIT_SMI] = nop_on_interception, 1432 [SVM_EXIT_SMI] = nop_on_interception,
1370 [SVM_EXIT_INIT] = nop_on_interception, 1433 [SVM_EXIT_INIT] = nop_on_interception,
1371 [SVM_EXIT_VINTR] = interrupt_window_interception, 1434 [SVM_EXIT_VINTR] = interrupt_window_interception,
@@ -1397,6 +1460,9 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1397 struct vcpu_svm *svm = to_svm(vcpu); 1460 struct vcpu_svm *svm = to_svm(vcpu);
1398 u32 exit_code = svm->vmcb->control.exit_code; 1461 u32 exit_code = svm->vmcb->control.exit_code;
1399 1462
1463 KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
1464 (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
1465
1400 if (npt_enabled) { 1466 if (npt_enabled) {
1401 int mmu_reload = 0; 1467 int mmu_reload = 0;
1402 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { 1468 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -1470,6 +1536,8 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1470{ 1536{
1471 struct vmcb_control_area *control; 1537 struct vmcb_control_area *control;
1472 1538
1539 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
1540
1473 control = &svm->vmcb->control; 1541 control = &svm->vmcb->control;
1474 control->int_vector = irq; 1542 control->int_vector = irq;
1475 control->int_ctl &= ~V_INTR_PRIO_MASK; 1543 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1660,9 +1728,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1660 sync_lapic_to_cr8(vcpu); 1728 sync_lapic_to_cr8(vcpu);
1661 1729
1662 save_host_msrs(vcpu); 1730 save_host_msrs(vcpu);
1663 fs_selector = read_fs(); 1731 fs_selector = kvm_read_fs();
1664 gs_selector = read_gs(); 1732 gs_selector = kvm_read_gs();
1665 ldt_selector = read_ldt(); 1733 ldt_selector = kvm_read_ldt();
1666 svm->host_cr2 = kvm_read_cr2(); 1734 svm->host_cr2 = kvm_read_cr2();
1667 svm->host_dr6 = read_dr6(); 1735 svm->host_dr6 = read_dr6();
1668 svm->host_dr7 = read_dr7(); 1736 svm->host_dr7 = read_dr7();
@@ -1716,17 +1784,17 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1716 /* Enter guest mode */ 1784 /* Enter guest mode */
1717 "push %%rax \n\t" 1785 "push %%rax \n\t"
1718 "mov %c[vmcb](%[svm]), %%rax \n\t" 1786 "mov %c[vmcb](%[svm]), %%rax \n\t"
1719 SVM_VMLOAD "\n\t" 1787 __ex(SVM_VMLOAD) "\n\t"
1720 SVM_VMRUN "\n\t" 1788 __ex(SVM_VMRUN) "\n\t"
1721 SVM_VMSAVE "\n\t" 1789 __ex(SVM_VMSAVE) "\n\t"
1722 "pop %%rax \n\t" 1790 "pop %%rax \n\t"
1723#else 1791#else
1724 /* Enter guest mode */ 1792 /* Enter guest mode */
1725 "push %%eax \n\t" 1793 "push %%eax \n\t"
1726 "mov %c[vmcb](%[svm]), %%eax \n\t" 1794 "mov %c[vmcb](%[svm]), %%eax \n\t"
1727 SVM_VMLOAD "\n\t" 1795 __ex(SVM_VMLOAD) "\n\t"
1728 SVM_VMRUN "\n\t" 1796 __ex(SVM_VMRUN) "\n\t"
1729 SVM_VMSAVE "\n\t" 1797 __ex(SVM_VMSAVE) "\n\t"
1730 "pop %%eax \n\t" 1798 "pop %%eax \n\t"
1731#endif 1799#endif
1732 1800
@@ -1795,9 +1863,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1795 write_dr7(svm->host_dr7); 1863 write_dr7(svm->host_dr7);
1796 kvm_write_cr2(svm->host_cr2); 1864 kvm_write_cr2(svm->host_cr2);
1797 1865
1798 load_fs(fs_selector); 1866 kvm_load_fs(fs_selector);
1799 load_gs(gs_selector); 1867 kvm_load_gs(gs_selector);
1800 load_ldt(ldt_selector); 1868 kvm_load_ldt(ldt_selector);
1801 load_host_msrs(vcpu); 1869 load_host_msrs(vcpu);
1802 1870
1803 reload_tss(vcpu); 1871 reload_tss(vcpu);
@@ -1889,7 +1957,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1889 .prepare_guest_switch = svm_prepare_guest_switch, 1957 .prepare_guest_switch = svm_prepare_guest_switch,
1890 .vcpu_load = svm_vcpu_load, 1958 .vcpu_load = svm_vcpu_load,
1891 .vcpu_put = svm_vcpu_put, 1959 .vcpu_put = svm_vcpu_put,
1892 .vcpu_decache = svm_vcpu_decache,
1893 1960
1894 .set_guest_debug = svm_guest_debug, 1961 .set_guest_debug = svm_guest_debug,
1895 .get_msr = svm_get_msr, 1962 .get_msr = svm_get_msr,
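
The __ex() wrapper threaded through the SVM (and, below, VMX) asm maps to
__kvm_handle_fault_on_reboot(), which attaches an exception-table fixup to the
wrapped instruction so that a virtualization instruction faulting after the
hypervisor has been torn down on reboot is caught instead of oopsing. The
sketch below is a simplified 32-bit reconstruction of the idea; the label
numbers and handler name are illustrative, not the exact macro:

	#define __ex(insn)						\
		"666: " insn "\n\t"					\
		".pushsection __ex_table, \"a\"\n\t"			\
		" .long 666b, 667f\n\t" /* real macro sizes this per arch */ \
		".popsection\n\t"					\
		".pushsection .fixup, \"ax\"\n"				\
		"667: jmp handle_spurious_virt_fault\n\t"		\
		".popsection"

	static inline void clgi(void)
	{
		/* SVM_CLGI normally expands to raw opcode bytes */
		asm volatile(__ex(".byte 0x0f, 0x01, 0xdd"));
	}
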
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 10ce6ee4c491..7041cc52b562 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -30,6 +30,8 @@
 #include <asm/io.h>
 #include <asm/desc.h>
 
+#define __ex(x) __kvm_handle_fault_on_reboot(x)
+
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
@@ -53,6 +55,7 @@ struct vmcs {
 
 struct vcpu_vmx {
 	struct kvm_vcpu vcpu;
+	struct list_head local_vcpus_link;
 	int launched;
 	u8 fail;
 	u32 idt_vectoring_info;
@@ -88,9 +91,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 }
 
 static int init_rmode(struct kvm *kvm);
+static u64 construct_eptp(unsigned long root_hpa);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
 
 static struct page *vmx_io_bitmap_a;
 static struct page *vmx_io_bitmap_b;
@@ -260,6 +265,11 @@ static inline int cpu_has_vmx_vpid(void)
 		SECONDARY_EXEC_ENABLE_VPID);
 }
 
+static inline int cpu_has_virtual_nmis(void)
+{
+	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -278,7 +288,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
 		u64 gva;
 	} operand = { vpid, 0, gva };
 
-	asm volatile (ASM_VMX_INVVPID
+	asm volatile (__ex(ASM_VMX_INVVPID)
 		      /* CF==1 or ZF==1 --> rc = -1 */
 		      "; ja 1f ; ud2 ; 1:"
 		      : : "a"(&operand), "c"(ext) : "cc", "memory");
@@ -290,7 +300,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
 		u64 eptp, gpa;
 	} operand = {eptp, gpa};
 
-	asm volatile (ASM_VMX_INVEPT
+	asm volatile (__ex(ASM_VMX_INVEPT)
 		      /* CF==1 or ZF==1 --> rc = -1 */
 		      "; ja 1f ; ud2 ; 1:\n"
 		      : : "a" (&operand), "c" (ext) : "cc", "memory");
@@ -311,7 +321,7 @@ static void vmcs_clear(struct vmcs *vmcs)
 	u64 phys_addr = __pa(vmcs);
 	u8 error;
 
-	asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
+	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
 		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
 		      : "cc", "memory");
 	if (error)
@@ -329,6 +339,9 @@ static void __vcpu_clear(void *arg)
 	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
 		per_cpu(current_vmcs, cpu) = NULL;
 	rdtscll(vmx->vcpu.arch.host_tsc);
+	list_del(&vmx->local_vcpus_link);
+	vmx->vcpu.cpu = -1;
+	vmx->launched = 0;
 }
 
 static void vcpu_clear(struct vcpu_vmx *vmx)
@@ -336,7 +349,6 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
 	if (vmx->vcpu.cpu == -1)
 		return;
 	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
-	vmx->launched = 0;
 }
 
 static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
@@ -378,7 +390,7 @@ static unsigned long vmcs_readl(unsigned long field)
 {
 	unsigned long value;
 
-	asm volatile (ASM_VMX_VMREAD_RDX_RAX
+	asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
 		      : "=a"(value) : "d"(field) : "cc");
 	return value;
 }
@@ -413,7 +425,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
 {
 	u8 error;
 
-	asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
+	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
 		      : "=q"(error) : "a"(value), "d"(field) : "cc");
 	if (unlikely(error))
 		vmwrite_error(field, value);
@@ -431,10 +443,8 @@ static void vmcs_write32(unsigned long field, u32 value)
 
 static void vmcs_write64(unsigned long field, u64 value)
 {
-#ifdef CONFIG_X86_64
-	vmcs_writel(field, value);
-#else
 	vmcs_writel(field, value);
+#ifndef CONFIG_X86_64
 	asm volatile ("");
 	vmcs_writel(field+1, value >> 32);
 #endif
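The rewritten vmcs_write64() always performs the low 32-bit write and, on 32-bit hosts only, follows it with a write of the high word to the adjacent field (field+1); the empty asm volatile between the two stores acts as a compiler barrier. A minimal user-space sketch of the same split, with a hypothetical store32() standing in for vmcs_writel():

    #include <assert.h>
    #include <stdint.h>

    static uint32_t fields[2];  /* stand-in for an even/odd VMCS field pair */

    static void store32(int field, uint32_t value)  /* plays the role of vmcs_writel() */
    {
            fields[field] = value;
    }

    static void store64(int field, uint64_t value)
    {
            store32(field, (uint32_t)value); /* low word into the base field */
            store32(field + 1, value >> 32); /* high word into field+1 */
    }

    int main(void)
    {
            store64(0, 0x1122334455667788ULL);
            assert(fields[0] == 0x55667788 && fields[1] == 0x11223344);
            return 0;
    }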
@@ -474,7 +484,7 @@ static void reload_tss(void)
 	struct descriptor_table gdt;
 	struct desc_struct *descs;
 
-	get_gdt(&gdt);
+	kvm_get_gdt(&gdt);
 	descs = (void *)gdt.base;
 	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
 	load_TR_desc();
@@ -530,9 +540,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
 	 * allow segment selectors with cpl > 0 or ti == 1.
 	 */
-	vmx->host_state.ldt_sel = read_ldt();
+	vmx->host_state.ldt_sel = kvm_read_ldt();
 	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
-	vmx->host_state.fs_sel = read_fs();
+	vmx->host_state.fs_sel = kvm_read_fs();
 	if (!(vmx->host_state.fs_sel & 7)) {
 		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
 		vmx->host_state.fs_reload_needed = 0;
@@ -540,7 +550,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 		vmcs_write16(HOST_FS_SELECTOR, 0);
 		vmx->host_state.fs_reload_needed = 1;
 	}
-	vmx->host_state.gs_sel = read_gs();
+	vmx->host_state.gs_sel = kvm_read_gs();
 	if (!(vmx->host_state.gs_sel & 7))
 		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
 	else {
@@ -576,15 +586,15 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 	++vmx->vcpu.stat.host_state_reload;
 	vmx->host_state.loaded = 0;
 	if (vmx->host_state.fs_reload_needed)
-		load_fs(vmx->host_state.fs_sel);
+		kvm_load_fs(vmx->host_state.fs_sel);
 	if (vmx->host_state.gs_ldt_reload_needed) {
-		load_ldt(vmx->host_state.ldt_sel);
+		kvm_load_ldt(vmx->host_state.ldt_sel);
 		/*
 		 * If we have to reload gs, we must take care to
 		 * preserve our gs base.
 		 */
 		local_irq_save(flags);
-		load_gs(vmx->host_state.gs_sel);
+		kvm_load_gs(vmx->host_state.gs_sel);
 #ifdef CONFIG_X86_64
 		wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
 #endif
@@ -617,13 +627,17 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		vcpu_clear(vmx);
 		kvm_migrate_timers(vcpu);
 		vpid_sync_vcpu_all(vmx);
+		local_irq_disable();
+		list_add(&vmx->local_vcpus_link,
+			 &per_cpu(vcpus_on_cpu, cpu));
+		local_irq_enable();
 	}
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
 		u8 error;
 
 		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
+		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
 			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
 			      : "cc");
 		if (error)
@@ -640,8 +654,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	 * Linux uses per-cpu TSS and GDT, so set these when switching
 	 * processors.
 	 */
-	vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
-	get_gdt(&dt);
+	vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
+	kvm_get_gdt(&dt);
 	vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
 
 	rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
@@ -684,11 +698,6 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 	update_exception_bitmap(vcpu);
 }
 
-static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
-{
-	vcpu_clear(to_vmx(vcpu));
-}
-
 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 {
 	return vmcs_readl(GUEST_RFLAGS);
@@ -913,6 +922,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 	case MSR_IA32_TIME_STAMP_COUNTER:
 		guest_write_tsc(data);
 		break;
+	case MSR_P6_PERFCTR0:
+	case MSR_P6_PERFCTR1:
+	case MSR_P6_EVNTSEL0:
+	case MSR_P6_EVNTSEL1:
+		/*
+		 * Just discard all writes to the performance counters; this
+		 * should keep both older linux and windows 64-bit guests
+		 * happy
+		 */
+		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
+
+		break;
 	default:
 		vmx_load_host_state(vmx);
 		msr = find_msr_entry(vmx, msr_index);
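The new MSR_P6_* cases make vmx_set_msr() accept and discard guest writes to the legacy performance-counter MSRs rather than falling through to the default path, so guests that program the counters at boot are not faulted. A rough sketch of the accept-and-ignore pattern; the MSR indices and names below are illustrative stand-ins, not taken from the patch:

    #include <stdint.h>

    enum { MSR_CTR0 = 0xc1, MSR_CTR1 = 0xc2, MSR_SEL0 = 0x186, MSR_SEL1 = 0x187 };

    /* 0 = handled; nonzero would make the caller inject #GP into the guest */
    static int wrmsr_sketch(uint32_t msr, uint64_t data)
    {
            switch (msr) {
            case MSR_CTR0:
            case MSR_CTR1:
            case MSR_SEL0:
            case MSR_SEL1:
                    (void)data;     /* not emulated: swallow the write silently */
                    return 0;
            default:
                    return 1;       /* unknown MSR: let the caller reject it */
            }
    }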
@@ -1022,6 +1043,7 @@ static void hardware_enable(void *garbage)
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 	u64 old;
 
+	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 	if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
 		    MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
@@ -1032,13 +1054,25 @@ static void hardware_enable(void *garbage)
 		       MSR_IA32_FEATURE_CONTROL_LOCKED |
 		       MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
-	asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
+	asm volatile (ASM_VMX_VMXON_RAX
+		      : : "a"(&phys_addr), "m"(phys_addr)
 		      : "memory", "cc");
 }
 
+static void vmclear_local_vcpus(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct vcpu_vmx *vmx, *n;
+
+	list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
+				 local_vcpus_link)
+		__vcpu_clear(vmx);
+}
+
 static void hardware_disable(void *garbage)
 {
-	asm volatile (ASM_VMX_VMXOFF : : : "cc");
+	vmclear_local_vcpus();
+	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
 	write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
 
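hardware_disable() may not execute VMXOFF while any VMCS is still loaded on the CPU, so vmx_vcpu_load() now links each vcpu into a per-CPU list and vmclear_local_vcpus() drains it first. The walk uses the _safe list iterator because __vcpu_clear() unlinks the current entry. A self-contained sketch of that drain-while-unlinking shape on a plain singly linked list (all names hypothetical):

    #include <stddef.h>

    struct vcpu {
            struct vcpu *next;
            int active;
    };

    static struct vcpu *cpu_list;   /* per-CPU list head in the real code */

    static void drain_list(void)
    {
            struct vcpu *v, *n;

            for (v = cpu_list; v; v = n) {
                    n = v->next;    /* cache the successor first, as list_for_each_entry_safe() does */
                    v->next = NULL; /* unlink; the real __vcpu_clear() does list_del() */
                    v->active = 0;
            }
            cpu_list = NULL;
    }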
@@ -1072,7 +1106,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	u32 _vmentry_control = 0;
 
 	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-	opt = 0;
+	opt = PIN_BASED_VIRTUAL_NMIS;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
 				&_pin_based_exec_control) < 0)
 		return -EIO;
@@ -1389,6 +1423,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	vpid_sync_vcpu_all(to_vmx(vcpu));
+	if (vm_need_ept())
+		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
 }
 
 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
@@ -1420,7 +1456,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
 	if (!(cr0 & X86_CR0_PG)) {
 		/* From paging/starting to nonpaging */
 		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
-			     vmcs_config.cpu_based_exec_ctrl |
+			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
 			     (CPU_BASED_CR3_LOAD_EXITING |
 			      CPU_BASED_CR3_STORE_EXITING));
 		vcpu->arch.cr0 = cr0;
@@ -1430,7 +1466,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
 	} else if (!is_paging(vcpu)) {
 		/* From nonpaging to paging */
 		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
-			     vmcs_config.cpu_based_exec_ctrl &
+			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
 			     ~(CPU_BASED_CR3_LOAD_EXITING |
 			       CPU_BASED_CR3_STORE_EXITING));
 		vcpu->arch.cr0 = cr0;
@@ -1821,7 +1857,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
-void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
+static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
 {
 	void *va;
 
@@ -1907,8 +1943,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
 	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-	vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
-	vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
+	vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs());    /* 22.2.4 */
+	vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs());    /* 22.2.4 */
 	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 #ifdef CONFIG_X86_64
 	rdmsrl(MSR_FS_BASE, a);
@@ -1922,7 +1958,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
-	get_idt(&dt);
+	kvm_get_idt(&dt);
 	vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
 
 	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
@@ -2114,6 +2150,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 			irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
 
+static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+{
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+	vcpu->arch.nmi_pending = 0;
+}
+
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 {
 	int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2255,6 +2298,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 		KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
 			    (u32)((u64)cr2 >> 32), handler);
+		if (vect_info & VECTORING_INFO_VALID_MASK)
+			kvm_mmu_unprotect_page_virt(vcpu, cr2);
 		return kvm_mmu_page_fault(vcpu, cr2, error_code);
 	}
 
@@ -2554,8 +2599,6 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
 	offset = exit_qualification & 0xffful;
 
-	KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler);
-
 	er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
 
 	if (er != EMULATE_DONE) {
@@ -2639,6 +2682,19 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	u32 cpu_based_vm_exec_control;
+
+	/* clear pending NMI */
+	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+	++vcpu->stat.nmi_window_exits;
+
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -2649,6 +2705,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
 	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
 	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
 	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
+	[EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
 	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
 	[EXIT_REASON_CR_ACCESS]               = handle_cr,
 	[EXIT_REASON_DR_ACCESS]               = handle_dr,
@@ -2736,17 +2793,52 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 }
 
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+	u32 cpu_based_vm_exec_control;
+
+	if (!cpu_has_virtual_nmis())
+		return;
+
+	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
+{
+	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	return !(guest_intr & (GUEST_INTR_STATE_NMI |
+			       GUEST_INTR_STATE_MOV_SS |
+			       GUEST_INTR_STATE_STI));
+}
+
+static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
+{
+	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
+				GUEST_INTR_STATE_STI)) &&
+		(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
+}
+
+static void enable_intr_window(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.nmi_pending)
+		enable_nmi_window(vcpu);
+	else if (kvm_cpu_has_interrupt(vcpu))
+		enable_irq_window(vcpu);
+}
+
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 idtv_info_field, intr_info_field;
-	int has_ext_irq, interrupt_window_open;
+	u32 idtv_info_field, intr_info_field, exit_intr_info_field;
 	int vector;
 
 	update_tpr_threshold(vcpu);
 
-	has_ext_irq = kvm_cpu_has_interrupt(vcpu);
 	intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO);
 	idtv_info_field = vmx->idt_vectoring_info;
 	if (intr_info_field & INTR_INFO_VALID_MASK) {
 		if (idtv_info_field & INTR_INFO_VALID_MASK) {
@@ -2754,8 +2846,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 			if (printk_ratelimit())
 				printk(KERN_ERR "Fault when IDT_Vectoring\n");
 		}
-		if (has_ext_irq)
-			enable_irq_window(vcpu);
+		enable_intr_window(vcpu);
 		return;
 	}
 	if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
@@ -2765,30 +2856,56 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 			u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
 
 			vmx_inject_irq(vcpu, vect);
-			if (unlikely(has_ext_irq))
-				enable_irq_window(vcpu);
+			enable_intr_window(vcpu);
 			return;
 		}
 
 		KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
 
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
+		/*
+		 * SDM 3: 25.7.1.2
+		 * Clear bit "block by NMI" before VM entry if a NMI delivery
+		 * faulted.
+		 */
+		if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
+		    == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis())
+			vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+					vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+					~GUEST_INTR_STATE_NMI);
+
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
+				& ~INTR_INFO_RESVD_BITS_MASK);
 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
 			vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
 
 		if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
 			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
 				vmcs_read32(IDT_VECTORING_ERROR_CODE));
-		if (unlikely(has_ext_irq))
-			enable_irq_window(vcpu);
+		enable_intr_window(vcpu);
 		return;
 	}
-	if (!has_ext_irq)
+	if (cpu_has_virtual_nmis()) {
+		/*
+		 * SDM 3: 25.7.1.2
+		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
+		 * a guest IRET fault.
+		 */
+		if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) &&
+		    (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8)
+			vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+					vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) |
+					GUEST_INTR_STATE_NMI);
+		else if (vcpu->arch.nmi_pending) {
+			if (vmx_nmi_enabled(vcpu))
+				vmx_inject_nmi(vcpu);
+			enable_intr_window(vcpu);
+			return;
+		}
+
+	}
+	if (!kvm_cpu_has_interrupt(vcpu))
 		return;
-	interrupt_window_open =
-		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
-	if (interrupt_window_open) {
+	if (vmx_irq_enabled(vcpu)) {
 		vector = kvm_cpu_get_interrupt(vcpu);
 		vmx_inject_irq(vcpu, vector);
 		kvm_timer_intr_post(vcpu, vector);
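The vmx_nmi_enabled()/vmx_irq_enabled() helpers introduced above reduce injection policy to bit tests on the guest interruptibility state: an NMI is deliverable only when the NMI, MOV-SS and STI blocking bits are all clear, and an external interrupt additionally needs EFLAGS.IF. A compact sketch of that gating, reusing the GUEST_INTR_STATE_* values this patch adds to vmx.h (X86_EFLAGS_IF is bit 9):

    #include <stdint.h>

    #define STATE_STI     0x00000001u
    #define STATE_MOV_SS  0x00000002u
    #define STATE_NMI     0x00000008u
    #define FLAG_IF       (1u << 9)

    static int nmi_window_open(uint32_t intr_state)
    {
            return !(intr_state & (STATE_NMI | STATE_MOV_SS | STATE_STI));
    }

    static int irq_window_open(uint32_t intr_state, uint32_t rflags)
    {
            return !(intr_state & (STATE_MOV_SS | STATE_STI)) &&
                   (rflags & FLAG_IF);
    }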
@@ -2838,7 +2955,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		"push %%edx; push %%ebp;"
 		"push %%ecx \n\t"
 #endif
-		ASM_VMX_VMWRITE_RSP_RDX "\n\t"
+		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
 		/* Check if vmlaunch of vmresume is needed */
 		"cmpl $0, %c[launched](%0) \n\t"
 		/* Load guest registers.  Don't clobber flags. */
@@ -2873,9 +2990,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 #endif
 		/* Enter guest mode */
 		"jne .Llaunched \n\t"
-		ASM_VMX_VMLAUNCH "\n\t"
+		__ex(ASM_VMX_VMLAUNCH) "\n\t"
 		"jmp .Lkvm_vmx_return \n\t"
-		".Llaunched: " ASM_VMX_VMRESUME "\n\t"
+		".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
 		".Lkvm_vmx_return: "
 		/* Save guest registers, load host registers, keep flags */
 #ifdef CONFIG_X86_64
@@ -2949,7 +3066,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	fixup_rmode_irq(vmx);
 
 	vcpu->arch.interrupt_window_open =
-		(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
+		(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+		 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
 
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
@@ -2957,7 +3075,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
 	/* We need to handle NMIs before interrupts are enabled */
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
+	    (intr_info & INTR_INFO_VALID_MASK)) {
 		KVMTRACE_0D(NMI, vcpu, handler);
 		asm("int $2");
 	}
@@ -2968,7 +3087,7 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	if (vmx->vmcs) {
-		on_each_cpu(__vcpu_clear, vmx, 1);
+		vcpu_clear(vmx);
 		free_vmcs(vmx->vmcs);
 		vmx->vmcs = NULL;
 	}
@@ -2999,15 +3118,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		return ERR_PTR(-ENOMEM);
 
 	allocate_vpid(vmx);
-	if (id == 0 && vm_need_ept()) {
-		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
-			VMX_EPT_WRITABLE_MASK |
-			VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
-		kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
-			VMX_EPT_FAKE_DIRTY_MASK, 0ull,
-			VMX_EPT_EXECUTABLE_MASK);
-		kvm_enable_tdp();
-	}
 
 	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
 	if (err)
@@ -3095,7 +3205,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.prepare_guest_switch = vmx_save_host_state,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
-	.vcpu_decache = vmx_vcpu_decache,
 
 	.set_guest_debug = set_guest_debug,
 	.guest_debug_pre = kvm_guest_debug_pre,
@@ -3187,8 +3296,16 @@ static int __init vmx_init(void)
 	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
 	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
 
-	if (cpu_has_vmx_ept())
+	if (vm_need_ept()) {
 		bypass_guest_pf = 0;
+		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
+			VMX_EPT_WRITABLE_MASK |
+			VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
+		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
+				VMX_EPT_EXECUTABLE_MASK);
+		kvm_enable_tdp();
+	} else
+		kvm_disable_tdp();
 
 	if (bypass_guest_pf)
 		kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
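Moving kvm_mmu_set_base_ptes() and kvm_enable_tdp() from vcpu creation into vmx_init() configures the MMU masks once per module load, and the non-EPT path now calls kvm_disable_tdp() explicitly. The base-PTE argument is plain bit composition; a sketch of it, assuming (as the VMX_EPT_* names suggest) read at bit 0, write at bit 1 and a memory-type field starting at bit 3:

    #include <stdint.h>

    #define EPT_READ      0x1ull
    #define EPT_WRITE     0x2ull
    #define EPT_MT_SHIFT  3     /* memory-type field position: an assumption here */
    #define EPT_MT_WB     6ull  /* write-back as the default type: an assumption here */

    static uint64_t ept_base_ptes(void)
    {
            /* 0x1 | 0x2 | (6 << 3) = 0x33 */
            return EPT_READ | EPT_WRITE | (EPT_MT_WB << EPT_MT_SHIFT);
    }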
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 79d94c610dfe..23e8373507ad 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -40,6 +40,7 @@
 #define CPU_BASED_CR8_LOAD_EXITING              0x00080000
 #define CPU_BASED_CR8_STORE_EXITING             0x00100000
 #define CPU_BASED_TPR_SHADOW                    0x00200000
+#define CPU_BASED_VIRTUAL_NMI_PENDING           0x00400000
 #define CPU_BASED_MOV_DR_EXITING                0x00800000
 #define CPU_BASED_UNCOND_IO_EXITING             0x01000000
 #define CPU_BASED_USE_IO_BITMAPS                0x02000000
@@ -216,7 +217,7 @@ enum vmcs_field {
 #define EXIT_REASON_TRIPLE_FAULT        2
 
 #define EXIT_REASON_PENDING_INTERRUPT   7
-
+#define EXIT_REASON_NMI_WINDOW          8
 #define EXIT_REASON_TASK_SWITCH         9
 #define EXIT_REASON_CPUID               10
 #define EXIT_REASON_HLT                 12
@@ -251,7 +252,9 @@ enum vmcs_field {
 #define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
 #define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
 #define INTR_INFO_DELIVER_CODE_MASK     0x800           /* 11 */
+#define INTR_INFO_UNBLOCK_NMI           0x1000          /* 12 */
 #define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
+#define INTR_INFO_RESVD_BITS_MASK       0x7ffff000
 
 #define VECTORING_INFO_VECTOR_MASK      INTR_INFO_VECTOR_MASK
 #define VECTORING_INFO_TYPE_MASK        INTR_INFO_INTR_TYPE_MASK
@@ -259,9 +262,16 @@ enum vmcs_field {
 #define VECTORING_INFO_VALID_MASK       INTR_INFO_VALID_MASK
 
 #define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
+#define INTR_TYPE_NMI_INTR              (2 << 8) /* NMI */
 #define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
 #define INTR_TYPE_SOFT_INTR             (4 << 8) /* software interrupt */
 
+/* GUEST_INTERRUPTIBILITY_INFO flags. */
+#define GUEST_INTR_STATE_STI            0x00000001
+#define GUEST_INTR_STATE_MOV_SS         0x00000002
+#define GUEST_INTR_STATE_SMI            0x00000004
+#define GUEST_INTR_STATE_NMI            0x00000008
+
 /*
  * Exit Qualifications for MOV for Control Register Access
 */
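The interrupt-information format these masks describe packs the vector into bits 7:0, the event type into bits 10:8 and a valid flag into bit 31, so decoding is mask-and-shift. A short sketch using the constants defined above:

    #include <stdint.h>

    #define VECTOR_MASK  0xffu         /* bits 7:0 */
    #define TYPE_MASK    0x700u        /* bits 10:8 */
    #define VALID_MASK   0x80000000u   /* bit 31 */
    #define TYPE_NMI     (2u << 8)     /* INTR_TYPE_NMI_INTR */

    static int is_valid_nmi(uint32_t intr_info)
    {
            return (intr_info & VALID_MASK) && (intr_info & TYPE_MASK) == TYPE_NMI;
    }

    static unsigned int vector_of(uint32_t intr_info)
    {
            return intr_info & VECTOR_MASK;
    }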
@@ -360,8 +370,6 @@ enum vmcs_field {
 #define VMX_EPT_READABLE_MASK                   0x1ull
 #define VMX_EPT_WRITABLE_MASK                   0x2ull
 #define VMX_EPT_EXECUTABLE_MASK                 0x4ull
-#define VMX_EPT_FAKE_ACCESSED_MASK              (1ull << 62)
-#define VMX_EPT_FAKE_DIRTY_MASK                 (1ull << 63)
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR         0xfffbc000ul
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0faa2546b1cd..0d682fc6aeb3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -72,6 +72,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmio_exits", VCPU_STAT(mmio_exits) },
 	{ "signal_exits", VCPU_STAT(signal_exits) },
 	{ "irq_window", VCPU_STAT(irq_window_exits) },
+	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
 	{ "halt_exits", VCPU_STAT(halt_exits) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "hypercalls", VCPU_STAT(hypercalls) },
@@ -173,6 +174,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 }
 
+void kvm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.nmi_pending = 1;
+}
+EXPORT_SYMBOL_GPL(kvm_inject_nmi);
+
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
 	WARN_ON(vcpu->arch.exception.pending);
@@ -604,6 +611,38 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 }
 
+static bool msr_mtrr_valid(unsigned msr)
+{
+	switch (msr) {
+	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
+	case MSR_MTRRfix64K_00000:
+	case MSR_MTRRfix16K_80000:
+	case MSR_MTRRfix16K_A0000:
+	case MSR_MTRRfix4K_C0000:
+	case MSR_MTRRfix4K_C8000:
+	case MSR_MTRRfix4K_D0000:
+	case MSR_MTRRfix4K_D8000:
+	case MSR_MTRRfix4K_E0000:
+	case MSR_MTRRfix4K_E8000:
+	case MSR_MTRRfix4K_F0000:
+	case MSR_MTRRfix4K_F8000:
+	case MSR_MTRRdefType:
+	case MSR_IA32_CR_PAT:
+		return true;
+	case 0x2f8:
+		return true;
+	}
+	return false;
+}
+
+static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+	if (!msr_mtrr_valid(msr))
+		return 1;
+
+	vcpu->arch.mtrr[msr - 0x200] = data;
+	return 0;
+}
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
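msr_mtrr_valid() accepts the variable-range window 0x200 .. 0x200 + 2*KVM_NR_VAR_MTRR - 1 — two MSRs per variable range, a PhysBase/PhysMask pair — plus the fixed-range and control registers, and the set/get helpers then index the per-vcpu mtrr[] array by msr - 0x200. A sketch of the range arithmetic, assuming eight variable ranges as on typical parts:

    #include <stdbool.h>
    #include <stdint.h>

    #define NR_VAR_MTRR 8       /* assumption; KVM_NR_VAR_MTRR plays this role */
    #define MTRR_BASE   0x200u

    static bool is_var_mtrr(uint32_t msr)
    {
            /* PhysBase(n) = 0x200 + 2n, PhysMask(n) = 0x201 + 2n */
            return msr >= MTRR_BASE && msr < MTRR_BASE + 2 * NR_VAR_MTRR;
    }

    static unsigned int mtrr_slot(uint32_t msr)
    {
            return msr - MTRR_BASE; /* index into the per-vcpu array */
    }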
@@ -625,8 +664,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		break;
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_UCODE_WRITE:
-	case 0x200 ... 0x2ff: /* MTRRs */
 		break;
+	case 0x200 ... 0x2ff:
+		return set_msr_mtrr(vcpu, msr, data);
 	case MSR_IA32_APICBASE:
 		kvm_set_apic_base(vcpu, data);
 		break;
@@ -684,6 +724,15 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
 }
 
+static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+	if (!msr_mtrr_valid(msr))
+		return 1;
+
+	*pdata = vcpu->arch.mtrr[msr - 0x200];
+	return 0;
+}
+
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
 	u64 data;
@@ -705,11 +754,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MC0_MISC+16:
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_EBL_CR_POWERON:
-		/* MTRR registers */
-	case 0xfe:
-	case 0x200 ... 0x2ff:
 		data = 0;
 		break;
+	case MSR_MTRRcap:
+		data = 0x500 | KVM_NR_VAR_MTRR;
+		break;
+	case 0x200 ... 0x2ff:
+		return get_msr_mtrr(vcpu, msr, pdata);
 	case 0xcd: /* fsb frequency */
 		data = 3;
 		break;
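The synthetic MTRRcap value 0x500 | KVM_NR_VAR_MTRR reads as: bit 10 set (write-combining supported), bit 8 set (fixed-range MTRRs supported), and the variable-range count in the low byte — with eight variable ranges the guest sees 0x508. A one-line check of that encoding (bit positions per the usual MTRRcap layout; treat this as my reading of the constant):

    #include <assert.h>

    #define CAP_FIX (1u << 8)   /* fixed-range MTRRs supported */
    #define CAP_WC  (1u << 10)  /* write-combining type supported */

    int main(void)
    {
            unsigned int cap = 0x500 | 8;   /* the value returned for MSR_MTRRcap above */
            assert(cap == (CAP_WC | CAP_FIX | 8) && cap == 0x508);
            return 0;
    }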
@@ -817,41 +868,6 @@ out:
 	return r;
 }
 
-/*
- * Make sure that a cpu that is being hot-unplugged does not have any vcpus
- * cached on it.
- */
-void decache_vcpus_on_cpu(int cpu)
-{
-	struct kvm *vm;
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	spin_lock(&kvm_lock);
-	list_for_each_entry(vm, &vm_list, vm_list)
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = vm->vcpus[i];
-			if (!vcpu)
-				continue;
-			/*
-			 * If the vcpu is locked, then it is running on some
-			 * other cpu and therefore it is not cached on the
-			 * cpu in question.
-			 *
-			 * If it's not locked, check the last cpu it executed
-			 * on.
-			 */
-			if (mutex_trylock(&vcpu->mutex)) {
-				if (vcpu->cpu == cpu) {
-					kvm_x86_ops->vcpu_decache(vcpu);
-					vcpu->cpu = -1;
-				}
-				mutex_unlock(&vcpu->mutex);
-			}
-		}
-	spin_unlock(&kvm_lock);
-}
-
 int kvm_dev_ioctl_check_extension(long ext)
 {
 	int r;
@@ -867,8 +883,12 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PIT:
 	case KVM_CAP_NOP_IO_DELAY:
 	case KVM_CAP_MP_STATE:
+	case KVM_CAP_SYNC_MMU:
 		r = 1;
 		break;
+	case KVM_CAP_COALESCED_MMIO:
+		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
+		break;
 	case KVM_CAP_VAPIC:
 		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
 		break;
@@ -1476,6 +1496,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
 		goto out;
 
 	down_write(&kvm->slots_lock);
+	spin_lock(&kvm->mmu_lock);
 
 	p = &kvm->arch.aliases[alias->slot];
 	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1487,6 +1508,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
 			break;
 	kvm->arch.naliases = n;
 
+	spin_unlock(&kvm->mmu_lock);
 	kvm_mmu_zap_all(kvm);
 
 	up_write(&kvm->slots_lock);
@@ -1781,13 +1803,14 @@ static void kvm_init_msr_list(void)
  * Only apic need an MMIO device hook, so shortcut now..
 */
 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
-						gpa_t addr)
+						gpa_t addr, int len,
+						int is_write)
 {
 	struct kvm_io_device *dev;
 
 	if (vcpu->arch.apic) {
 		dev = &vcpu->arch.apic->dev;
-		if (dev->in_range(dev, addr))
+		if (dev->in_range(dev, addr, len, is_write))
 			return dev;
 	}
 	return NULL;
@@ -1795,13 +1818,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
 
 
 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-						gpa_t addr)
+						gpa_t addr, int len,
+						int is_write)
 {
 	struct kvm_io_device *dev;
 
-	dev = vcpu_find_pervcpu_dev(vcpu, addr);
+	dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
 	if (dev == NULL)
-		dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+		dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
+					  is_write);
 	return dev;
 }
 
@@ -1869,7 +1894,7 @@ mmio:
 	 * Is this MMIO handled locally?
 	 */
 	mutex_lock(&vcpu->kvm->lock);
-	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
 	if (mmio_dev) {
 		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
 		mutex_unlock(&vcpu->kvm->lock);
@@ -1924,7 +1949,7 @@ mmio:
 	 * Is this MMIO handled locally?
 	 */
 	mutex_lock(&vcpu->kvm->lock);
-	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
 	if (mmio_dev) {
 		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
 		mutex_unlock(&vcpu->kvm->lock);
@@ -2020,6 +2045,7 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 
 int emulate_clts(struct kvm_vcpu *vcpu)
 {
+	KVMTRACE_0D(CLTS, vcpu, handler);
 	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
 	return X86EMUL_CONTINUE;
 }
@@ -2053,21 +2079,19 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
 
 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 {
-	static int reported;
 	u8 opcodes[4];
 	unsigned long rip = vcpu->arch.rip;
 	unsigned long rip_linear;
 
-	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
-	if (reported)
+	if (!printk_ratelimit())
 		return;
 
+	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
+
 	emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
 
 	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
 	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
-	reported = 1;
 }
 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
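kvm_report_emulation_failure() previously latched a static 'reported' flag, so only the first failure per module lifetime was ever printed; gating on printk_ratelimit() instead reports every failure at a bounded rate, and bailing out before computing rip_linear avoids the guest opcode fetch for messages that would be dropped anyway. A minimal sketch of the same check-before-work shape, with a toy counter standing in for the kernel's rate limiter:

    #include <stdio.h>

    static int tokens = 5;      /* toy stand-in for printk_ratelimit()'s bucket */

    static int ratelimit_ok(void)
    {
            return tokens > 0 ? tokens-- : 0;
    }

    static void report_failure(const char *context)
    {
            if (!ratelimit_ok())
                    return;     /* suppressed: skip the expensive diagnostics entirely */

            fprintf(stderr, "emulation failed (%s)\n", context);
    }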
@@ -2105,27 +2129,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 			? X86EMUL_MODE_PROT64 : cs_db
 			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 
-		if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
-			vcpu->arch.emulate_ctxt.cs_base = 0;
-			vcpu->arch.emulate_ctxt.ds_base = 0;
-			vcpu->arch.emulate_ctxt.es_base = 0;
-			vcpu->arch.emulate_ctxt.ss_base = 0;
-		} else {
-			vcpu->arch.emulate_ctxt.cs_base =
-					get_segment_base(vcpu, VCPU_SREG_CS);
-			vcpu->arch.emulate_ctxt.ds_base =
-					get_segment_base(vcpu, VCPU_SREG_DS);
-			vcpu->arch.emulate_ctxt.es_base =
-					get_segment_base(vcpu, VCPU_SREG_ES);
-			vcpu->arch.emulate_ctxt.ss_base =
-					get_segment_base(vcpu, VCPU_SREG_SS);
-		}
-
-		vcpu->arch.emulate_ctxt.gs_base =
-				get_segment_base(vcpu, VCPU_SREG_GS);
-		vcpu->arch.emulate_ctxt.fs_base =
-				get_segment_base(vcpu, VCPU_SREG_FS);
-
 	r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 
 	/* Reject the instructions other than VMCALL/VMMCALL when
@@ -2300,9 +2303,10 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
 }
 
 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
-					       gpa_t addr)
+					       gpa_t addr, int len,
+					       int is_write)
 {
-	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
+	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
 }
 
 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
@@ -2331,11 +2335,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 
 	kvm_x86_ops->cache_regs(vcpu);
 	memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
-	kvm_x86_ops->decache_regs(vcpu);
 
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
 
-	pio_dev = vcpu_find_pio_dev(vcpu, port);
+	pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
 	if (pio_dev) {
 		kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
 		complete_pio(vcpu);
@@ -2417,7 +2420,9 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 		}
 	}
 
-	pio_dev = vcpu_find_pio_dev(vcpu, port);
+	pio_dev = vcpu_find_pio_dev(vcpu, port,
+				    vcpu->arch.pio.cur_count,
+				    !vcpu->arch.pio.in);
 	if (!vcpu->arch.pio.in) {
 		/* string PIO write */
 		ret = pio_copy_data(vcpu);
@@ -2600,27 +2605,41 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
 
 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
 {
+	unsigned long value;
+
 	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
 	switch (cr) {
 	case 0:
-		return vcpu->arch.cr0;
+		value = vcpu->arch.cr0;
+		break;
 	case 2:
-		return vcpu->arch.cr2;
+		value = vcpu->arch.cr2;
+		break;
 	case 3:
-		return vcpu->arch.cr3;
+		value = vcpu->arch.cr3;
+		break;
 	case 4:
-		return vcpu->arch.cr4;
+		value = vcpu->arch.cr4;
+		break;
 	case 8:
-		return kvm_get_cr8(vcpu);
+		value = kvm_get_cr8(vcpu);
+		break;
 	default:
 		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
 		return 0;
 	}
+	KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
+		    (u32)((u64)value >> 32), handler);
+
+	return value;
 }
 
 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
 		     unsigned long *rflags)
 {
+	KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
+		    (u32)((u64)val >> 32), handler);
+
 	switch (cr) {
 	case 0:
 		kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -2771,8 +2790,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
 	if (!apic || !apic->vapic_addr)
 		return;
 
+	down_read(&vcpu->kvm->slots_lock);
 	kvm_release_page_dirty(apic->vapic_page);
 	mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+	up_read(&vcpu->kvm->slots_lock);
 }
 
 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -2928,9 +2949,7 @@ out:
 
 	post_kvm_run_save(vcpu, kvm_run);
 
-	down_read(&vcpu->kvm->slots_lock);
 	vapic_exit(vcpu);
-	up_read(&vcpu->kvm->slots_lock);
 
 	return r;
 }
@@ -2942,15 +2961,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	vcpu_load(vcpu);
 
+	if (vcpu->sigset_active)
+		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		kvm_vcpu_block(vcpu);
-		vcpu_put(vcpu);
-		return -EAGAIN;
+		r = -EAGAIN;
+		goto out;
 	}
 
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
-
 	/* re-sync apic's tpr */
 	if (!irqchip_in_kernel(vcpu->kvm))
 		kvm_set_cr8(vcpu, kvm_run->cr8);
@@ -3070,8 +3089,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	return 0;
 }
 
-static void get_segment(struct kvm_vcpu *vcpu,
-			struct kvm_segment *var, int seg)
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+		     struct kvm_segment *var, int seg)
 {
 	kvm_x86_ops->get_segment(vcpu, var, seg);
 }
@@ -3080,7 +3099,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 {
 	struct kvm_segment cs;
 
-	get_segment(vcpu, &cs, VCPU_SREG_CS);
+	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
 	*db = cs.db;
 	*l = cs.l;
 }
@@ -3094,15 +3113,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 
 	vcpu_load(vcpu);
 
-	get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
-	get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
-	get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
-	get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
-	get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
-	get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
 
-	get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
-	get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
 	kvm_x86_ops->get_idt(vcpu, &dt);
 	sregs->idt.limit = dt.limit;
@@ -3154,7 +3173,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static void set_segment(struct kvm_vcpu *vcpu,
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
 			struct kvm_segment *var, int seg)
 {
 	kvm_x86_ops->set_segment(vcpu, var, seg);
@@ -3168,6 +3187,10 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
 	kvm_desct->base |= seg_desc->base2 << 24;
 	kvm_desct->limit = seg_desc->limit0;
 	kvm_desct->limit |= seg_desc->limit << 16;
+	if (seg_desc->g) {
+		kvm_desct->limit <<= 12;
+		kvm_desct->limit |= 0xfff;
+	}
 	kvm_desct->selector = selector;
 	kvm_desct->type = seg_desc->type;
 	kvm_desct->present = seg_desc->p;
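The added granularity handling follows the segment-descriptor format: with the g bit set, the 20-bit limit counts 4 KiB pages, so the byte-granular limit is (limit << 12) | 0xfff — exactly what the new lines compute. A worked check:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t byte_limit(uint32_t raw_limit, int g)
    {
            if (g)
                    return (raw_limit << 12) | 0xfff;   /* page-granular: scale to bytes */
            return raw_limit;                           /* byte-granular */
    }

    int main(void)
    {
            /* flat 4 GiB segment: raw limit 0xfffff with g=1 -> 0xffffffff */
            assert(byte_limit(0xfffff, 1) == 0xffffffffu);
            return 0;
    }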
@@ -3191,7 +3214,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
 	if (selector & 1 << 2) {
 		struct kvm_segment kvm_seg;
 
-		get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
+		kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
 
 		if (kvm_seg.unusable)
 			dtable->limit = 0;
@@ -3207,6 +3230,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 					 struct desc_struct *seg_desc)
 {
+	gpa_t gpa;
 	struct descriptor_table dtable;
 	u16 index = selector >> 3;
 
@@ -3216,13 +3240,16 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
 		return 1;
 	}
-	return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
+	gpa += index * 8;
+	return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
 }
 
 /* allowed just for 8 bytes segments */
 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 					 struct desc_struct *seg_desc)
 {
+	gpa_t gpa;
 	struct descriptor_table dtable;
 	u16 index = selector >> 3;
 
@@ -3230,7 +3257,9 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 
 	if (dtable.limit < index * 8 + 7)
 		return 1;
-	return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
+	gpa += index * 8;
+	return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
 }
 
 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
@@ -3242,62 +3271,14 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
 	base_addr |= (seg_desc->base1 << 16);
 	base_addr |= (seg_desc->base2 << 24);
 
-	return base_addr;
-}
-
-static int load_tss_segment32(struct kvm_vcpu *vcpu,
-			  struct desc_struct *seg_desc,
-			  struct tss_segment_32 *tss)
-{
-	u32 base_addr;
-
-	base_addr = get_tss_base_addr(vcpu, seg_desc);
-
-	return kvm_read_guest(vcpu->kvm, base_addr, tss,
-			      sizeof(struct tss_segment_32));
-}
-
-static int save_tss_segment32(struct kvm_vcpu *vcpu,
-			  struct desc_struct *seg_desc,
-			  struct tss_segment_32 *tss)
-{
-	u32 base_addr;
-
-	base_addr = get_tss_base_addr(vcpu, seg_desc);
-
-	return kvm_write_guest(vcpu->kvm, base_addr, tss,
-			       sizeof(struct tss_segment_32));
-}
-
-static int load_tss_segment16(struct kvm_vcpu *vcpu,
-			  struct desc_struct *seg_desc,
-			  struct tss_segment_16 *tss)
-{
-	u32 base_addr;
-
-	base_addr = get_tss_base_addr(vcpu, seg_desc);
-
-	return kvm_read_guest(vcpu->kvm, base_addr, tss,
-			      sizeof(struct tss_segment_16));
-}
-
-static int save_tss_segment16(struct kvm_vcpu *vcpu,
-			  struct desc_struct *seg_desc,
-			  struct tss_segment_16 *tss)
-{
-	u32 base_addr;
-
-	base_addr = get_tss_base_addr(vcpu, seg_desc);
-
-	return kvm_write_guest(vcpu->kvm, base_addr, tss,
-			       sizeof(struct tss_segment_16));
-}
+	return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
 }
 
 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
 {
 	struct kvm_segment kvm_seg;
 
-	get_segment(vcpu, &kvm_seg, seg);
+	kvm_get_segment(vcpu, &kvm_seg, seg);
 	return kvm_seg.selector;
 }
@@ -3313,8 +3294,8 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-				   int type_bits, int seg)
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+				int type_bits, int seg)
 {
 	struct kvm_segment kvm_seg;
 
@@ -3327,7 +3308,7 @@ static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 	if (!kvm_seg.s)
 		kvm_seg.unusable = 1;
 
-	set_segment(vcpu, &kvm_seg, seg);
+	kvm_set_segment(vcpu, &kvm_seg, seg);
 	return 0;
 }
 
@@ -3373,25 +3354,25 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3373 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; 3354 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
3374 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; 3355 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
3375 3356
3376 if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3357 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3377 return 1; 3358 return 1;
3378 3359
3379 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3360 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3380 return 1; 3361 return 1;
3381 3362
3382 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3363 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3383 return 1; 3364 return 1;
3384 3365
3385 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3366 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3386 return 1; 3367 return 1;
3387 3368
3388 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3369 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3389 return 1; 3370 return 1;
3390 3371
3391 if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3372 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3392 return 1; 3373 return 1;
3393 3374
3394 if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3375 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3395 return 1; 3376 return 1;
3396 return 0; 3377 return 0;
3397} 3378}
@@ -3432,38 +3413,44 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3432 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; 3413 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
3433 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; 3414 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
3434 3415
3435 if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3416 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3436 return 1; 3417 return 1;
3437 3418
3438 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3419 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3439 return 1; 3420 return 1;
3440 3421
3441 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3422 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3442 return 1; 3423 return 1;
3443 3424
3444 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3425 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3445 return 1; 3426 return 1;
3446 3427
3447 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3428 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3448 return 1; 3429 return 1;
3449 return 0; 3430 return 0;
3450} 3431}
3451 3432
3452int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3433static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3453 struct desc_struct *cseg_desc, 3434 u32 old_tss_base,
3454 struct desc_struct *nseg_desc) 3435 struct desc_struct *nseg_desc)
3455{ 3436{
3456 struct tss_segment_16 tss_segment_16; 3437 struct tss_segment_16 tss_segment_16;
3457 int ret = 0; 3438 int ret = 0;
3458 3439
3459 if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) 3440 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3441 sizeof tss_segment_16))
3460 goto out; 3442 goto out;
3461 3443
3462 save_state_to_tss16(vcpu, &tss_segment_16); 3444 save_state_to_tss16(vcpu, &tss_segment_16);
3463 save_tss_segment16(vcpu, cseg_desc, &tss_segment_16);
3464 3445
3465 if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) 3446 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3447 sizeof tss_segment_16))
3448 goto out;
3449
3450 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3451 &tss_segment_16, sizeof tss_segment_16))
3466 goto out; 3452 goto out;
3453
3467 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3454 if (load_state_from_tss16(vcpu, &tss_segment_16))
3468 goto out; 3455 goto out;
3469 3456
@@ -3472,21 +3459,27 @@ out:
3472 return ret; 3459 return ret;
3473} 3460}
3474 3461
3475int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3462static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3476 struct desc_struct *cseg_desc, 3463 u32 old_tss_base,
3477 struct desc_struct *nseg_desc) 3464 struct desc_struct *nseg_desc)
3478{ 3465{
3479 struct tss_segment_32 tss_segment_32; 3466 struct tss_segment_32 tss_segment_32;
3480 int ret = 0; 3467 int ret = 0;
3481 3468
3482 if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) 3469 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3470 sizeof tss_segment_32))
3483 goto out; 3471 goto out;
3484 3472
3485 save_state_to_tss32(vcpu, &tss_segment_32); 3473 save_state_to_tss32(vcpu, &tss_segment_32);
3486 save_tss_segment32(vcpu, cseg_desc, &tss_segment_32);
3487 3474
3488 if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) 3475 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3476 sizeof tss_segment_32))
3477 goto out;
3478
3479 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3480 &tss_segment_32, sizeof tss_segment_32))
3489 goto out; 3481 goto out;
3482
3490 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3483 if (load_state_from_tss32(vcpu, &tss_segment_32))
3491 goto out; 3484 goto out;
3492 3485
@@ -3501,16 +3494,20 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3501 struct desc_struct cseg_desc; 3494 struct desc_struct cseg_desc;
3502 struct desc_struct nseg_desc; 3495 struct desc_struct nseg_desc;
3503 int ret = 0; 3496 int ret = 0;
3497 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3498 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
3504 3499
3505 get_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3500 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
3506 3501
3502 /* FIXME: Handle errors. Failure to read either TSS or their
3503 * descriptors should generate a page fault.
3504 */
3507 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3505 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3508 goto out; 3506 goto out;
3509 3507
3510 if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) 3508 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
3511 goto out; 3509 goto out;
3512 3510
3513
3514 if (reason != TASK_SWITCH_IRET) { 3511 if (reason != TASK_SWITCH_IRET) {
3515 int cpl; 3512 int cpl;
3516 3513
@@ -3528,8 +3525,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3528 3525
3529 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 3526 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3530 cseg_desc.type &= ~(1 << 1); //clear the B flag 3527 cseg_desc.type &= ~(1 << 1); //clear the B flag
3531 save_guest_segment_descriptor(vcpu, tr_seg.selector, 3528 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
3532 &cseg_desc);
3533 } 3529 }
3534 3530
3535 if (reason == TASK_SWITCH_IRET) { 3531 if (reason == TASK_SWITCH_IRET) {
@@ -3541,10 +3537,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3541 kvm_x86_ops->cache_regs(vcpu); 3537 kvm_x86_ops->cache_regs(vcpu);
3542 3538
3543 if (nseg_desc.type & 8) 3539 if (nseg_desc.type & 8)
3544 ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, 3540 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
3545 &nseg_desc); 3541 &nseg_desc);
3546 else 3542 else
3547 ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, 3543 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
3548 &nseg_desc); 3544 &nseg_desc);
3549 3545
3550 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 3546 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
@@ -3561,7 +3557,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3561 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 3557 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3562 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 3558 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3563 tr_seg.type = 11; 3559 tr_seg.type = 11;
3564 set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3560 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3565out: 3561out:
3566 kvm_x86_ops->decache_regs(vcpu); 3562 kvm_x86_ops->decache_regs(vcpu);
3567 return ret; 3563 return ret;
@@ -3628,15 +3624,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3628 } 3624 }
3629 } 3625 }
3630 3626
3631 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3627 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3632 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3628 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3633 set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3629 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3634 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3630 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3635 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3631 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3636 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3632 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3637 3633
3638 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3634 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3639 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3635 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3640 3636
3641 vcpu_put(vcpu); 3637 vcpu_put(vcpu);
3642 3638
@@ -3751,14 +3747,14 @@ void fx_init(struct kvm_vcpu *vcpu)
3751 * allocate ram with GFP_KERNEL. 3747 * allocate ram with GFP_KERNEL.
3752 */ 3748 */
3753 if (!used_math()) 3749 if (!used_math())
3754 fx_save(&vcpu->arch.host_fx_image); 3750 kvm_fx_save(&vcpu->arch.host_fx_image);
3755 3751
3756 /* Initialize guest FPU by resetting ours and saving into guest's */ 3752 /* Initialize guest FPU by resetting ours and saving into guest's */
3757 preempt_disable(); 3753 preempt_disable();
3758 fx_save(&vcpu->arch.host_fx_image); 3754 kvm_fx_save(&vcpu->arch.host_fx_image);
3759 fx_finit(); 3755 kvm_fx_finit();
3760 fx_save(&vcpu->arch.guest_fx_image); 3756 kvm_fx_save(&vcpu->arch.guest_fx_image);
3761 fx_restore(&vcpu->arch.host_fx_image); 3757 kvm_fx_restore(&vcpu->arch.host_fx_image);
3762 preempt_enable(); 3758 preempt_enable();
3763 3759
3764 vcpu->arch.cr0 |= X86_CR0_ET; 3760 vcpu->arch.cr0 |= X86_CR0_ET;
@@ -3775,8 +3771,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3775 return; 3771 return;
3776 3772
3777 vcpu->guest_fpu_loaded = 1; 3773 vcpu->guest_fpu_loaded = 1;
3778 fx_save(&vcpu->arch.host_fx_image); 3774 kvm_fx_save(&vcpu->arch.host_fx_image);
3779 fx_restore(&vcpu->arch.guest_fx_image); 3775 kvm_fx_restore(&vcpu->arch.guest_fx_image);
3780} 3776}
3781EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 3777EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3782 3778
@@ -3786,8 +3782,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3786 return; 3782 return;
3787 3783
3788 vcpu->guest_fpu_loaded = 0; 3784 vcpu->guest_fpu_loaded = 0;
3789 fx_save(&vcpu->arch.guest_fx_image); 3785 kvm_fx_save(&vcpu->arch.guest_fx_image);
3790 fx_restore(&vcpu->arch.host_fx_image); 3786 kvm_fx_restore(&vcpu->arch.host_fx_image);
3791 ++vcpu->stat.fpu_reload; 3787 ++vcpu->stat.fpu_reload;
3792} 3788}
3793EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 3789EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
@@ -3979,16 +3975,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3979 */ 3975 */
3980 if (!user_alloc) { 3976 if (!user_alloc) {
3981 if (npages && !old.rmap) { 3977 if (npages && !old.rmap) {
3978 unsigned long userspace_addr;
3979
3982 down_write(&current->mm->mmap_sem); 3980 down_write(&current->mm->mmap_sem);
3983 memslot->userspace_addr = do_mmap(NULL, 0, 3981 userspace_addr = do_mmap(NULL, 0,
3984 npages * PAGE_SIZE, 3982 npages * PAGE_SIZE,
3985 PROT_READ | PROT_WRITE, 3983 PROT_READ | PROT_WRITE,
3986 MAP_SHARED | MAP_ANONYMOUS, 3984 MAP_SHARED | MAP_ANONYMOUS,
3987 0); 3985 0);
3988 up_write(&current->mm->mmap_sem); 3986 up_write(&current->mm->mmap_sem);
3989 3987
3990 if (IS_ERR((void *)memslot->userspace_addr)) 3988 if (IS_ERR((void *)userspace_addr))
3991 return PTR_ERR((void *)memslot->userspace_addr); 3989 return PTR_ERR((void *)userspace_addr);
3990
3991 /* set userspace_addr atomically for kvm_hva_to_rmapp */
3992 spin_lock(&kvm->mmu_lock);
3993 memslot->userspace_addr = userspace_addr;
3994 spin_unlock(&kvm->mmu_lock);
3992 } else { 3995 } else {
3993 if (!old.user_alloc && old.rmap) { 3996 if (!old.user_alloc && old.rmap) {
3994 int ret; 3997 int ret;
@@ -4016,6 +4019,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4016 return 0; 4019 return 0;
4017} 4020}
4018 4021
4022void kvm_arch_flush_shadow(struct kvm *kvm)
4023{
4024 kvm_mmu_zap_all(kvm);
4025}
4026
4019int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4027int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4020{ 4028{
4021 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4029 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
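The x86.c hunks above share one theme: guest descriptor tables and TSS images were previously read and written at their guest-virtual addresses, which only worked while the guest mapped them 1:1; every access now translates through the guest MMU first. A minimal sketch of that pattern, assuming only the gva_to_gpa()/kvm_read_guest() signatures visible in the hunks (read_guest_virt() itself is a hypothetical helper, not part of the patch):

/*
 * Sketch only: the gva->gpa translation step the hunks above add in
 * front of every descriptor-table and TSS access.
 */
#include <linux/kvm_host.h>

static int read_guest_virt(struct kvm_vcpu *vcpu, gva_t gva,
			   void *data, unsigned long len)
{
	/* translate through the guest page tables first ... */
	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);

	/* ... then access guest memory by guest-physical address */
	return kvm_read_guest(vcpu->kvm, gpa, data, len);
}

A hardened version would also check for a failed translation before issuing the read; the FIXME added to kvm_task_switch() explicitly defers that error handling.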
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 932f216d890c..f2f90468f8b1 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -121,7 +121,7 @@ static u16 opcode_table[256] = {
121 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 121 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
122 0, 0, 0, 0, 122 0, 0, 0, 0,
123 /* 0x68 - 0x6F */ 123 /* 0x68 - 0x6F */
124 0, 0, ImplicitOps | Mov | Stack, 0, 124 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
125 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 125 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
126 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 126 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
127 /* 0x70 - 0x77 */ 127 /* 0x70 - 0x77 */
@@ -138,9 +138,11 @@ static u16 opcode_table[256] = {
138 /* 0x88 - 0x8F */ 138 /* 0x88 - 0x8F */
139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
141 0, ModRM | DstReg, 0, Group | Group1A, 141 DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
142 /* 0x90 - 0x9F */ 142 DstReg | SrcMem | ModRM | Mov, Group | Group1A,
143 0, 0, 0, 0, 0, 0, 0, 0, 143 /* 0x90 - 0x97 */
144 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
145 /* 0x98 - 0x9F */
144 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 146 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
145 /* 0xA0 - 0xA7 */ 147 /* 0xA0 - 0xA7 */
146 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 148 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -152,7 +154,8 @@ static u16 opcode_table[256] = {
152 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 154 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
153 ByteOp | ImplicitOps | String, ImplicitOps | String, 155 ByteOp | ImplicitOps | String, ImplicitOps | String,
154 /* 0xB0 - 0xBF */ 156 /* 0xB0 - 0xBF */
155 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 157 0, 0, 0, 0, 0, 0, 0, 0,
158 DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
156 /* 0xC0 - 0xC7 */ 159 /* 0xC0 - 0xC7 */
157 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 160 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
158 0, ImplicitOps | Stack, 0, 0, 161 0, ImplicitOps | Stack, 0, 0,
@@ -168,7 +171,8 @@ static u16 opcode_table[256] = {
168 /* 0xE0 - 0xE7 */ 171 /* 0xE0 - 0xE7 */
169 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0,
170 /* 0xE8 - 0xEF */ 173 /* 0xE8 - 0xEF */
171 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 174 ImplicitOps | Stack, SrcImm | ImplicitOps,
175 ImplicitOps, SrcImmByte | ImplicitOps,
172 0, 0, 0, 0, 176 0, 0, 0, 0,
173 /* 0xF0 - 0xF7 */ 177 /* 0xF0 - 0xF7 */
174 0, 0, 0, 0, 178 0, 0, 0, 0,
@@ -215,7 +219,7 @@ static u16 twobyte_table[256] = {
215 /* 0xA0 - 0xA7 */ 219 /* 0xA0 - 0xA7 */
216 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 220 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
217 /* 0xA8 - 0xAF */ 221 /* 0xA8 - 0xAF */
218 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 222 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
219 /* 0xB0 - 0xB7 */ 223 /* 0xB0 - 0xB7 */
220 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 224 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
221 DstMem | SrcReg | ModRM | BitOp, 225 DstMem | SrcReg | ModRM | BitOp,
@@ -518,6 +522,39 @@ static inline void jmp_rel(struct decode_cache *c, int rel)
518 register_address_increment(c, &c->eip, rel); 522 register_address_increment(c, &c->eip, rel);
519} 523}
520 524
525static void set_seg_override(struct decode_cache *c, int seg)
526{
527 c->has_seg_override = true;
528 c->seg_override = seg;
529}
530
531static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
532{
533 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
534 return 0;
535
536 return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
537}
538
539static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
540 struct decode_cache *c)
541{
542 if (!c->has_seg_override)
543 return 0;
544
545 return seg_base(ctxt, c->seg_override);
546}
547
548static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
549{
550 return seg_base(ctxt, VCPU_SREG_ES);
551}
552
553static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
554{
555 return seg_base(ctxt, VCPU_SREG_SS);
556}
557
521static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 558static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
522 struct x86_emulate_ops *ops, 559 struct x86_emulate_ops *ops,
523 unsigned long linear, u8 *dest) 560 unsigned long linear, u8 *dest)
@@ -660,7 +697,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
660{ 697{
661 struct decode_cache *c = &ctxt->decode; 698 struct decode_cache *c = &ctxt->decode;
662 u8 sib; 699 u8 sib;
663 int index_reg = 0, base_reg = 0, scale, rip_relative = 0; 700 int index_reg = 0, base_reg = 0, scale;
664 int rc = 0; 701 int rc = 0;
665 702
666 if (c->rex_prefix) { 703 if (c->rex_prefix) {
@@ -731,47 +768,28 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
731 } 768 }
732 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 769 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
733 (c->modrm_rm == 6 && c->modrm_mod != 0)) 770 (c->modrm_rm == 6 && c->modrm_mod != 0))
734 if (!c->override_base) 771 if (!c->has_seg_override)
735 c->override_base = &ctxt->ss_base; 772 set_seg_override(c, VCPU_SREG_SS);
736 c->modrm_ea = (u16)c->modrm_ea; 773 c->modrm_ea = (u16)c->modrm_ea;
737 } else { 774 } else {
738 /* 32/64-bit ModR/M decode. */ 775 /* 32/64-bit ModR/M decode. */
739 switch (c->modrm_rm) { 776 if ((c->modrm_rm & 7) == 4) {
740 case 4:
741 case 12:
742 sib = insn_fetch(u8, 1, c->eip); 777 sib = insn_fetch(u8, 1, c->eip);
743 index_reg |= (sib >> 3) & 7; 778 index_reg |= (sib >> 3) & 7;
744 base_reg |= sib & 7; 779 base_reg |= sib & 7;
745 scale = sib >> 6; 780 scale = sib >> 6;
746 781
747 switch (base_reg) { 782 if ((base_reg & 7) == 5 && c->modrm_mod == 0)
748 case 5: 783 c->modrm_ea += insn_fetch(s32, 4, c->eip);
749 if (c->modrm_mod != 0) 784 else
750 c->modrm_ea += c->regs[base_reg];
751 else
752 c->modrm_ea +=
753 insn_fetch(s32, 4, c->eip);
754 break;
755 default:
756 c->modrm_ea += c->regs[base_reg]; 785 c->modrm_ea += c->regs[base_reg];
757 } 786 if (index_reg != 4)
758 switch (index_reg) {
759 case 4:
760 break;
761 default:
762 c->modrm_ea += c->regs[index_reg] << scale; 787 c->modrm_ea += c->regs[index_reg] << scale;
763 } 788 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
764 break; 789 if (ctxt->mode == X86EMUL_MODE_PROT64)
765 case 5: 790 c->rip_relative = 1;
766 if (c->modrm_mod != 0) 791 } else
767 c->modrm_ea += c->regs[c->modrm_rm];
768 else if (ctxt->mode == X86EMUL_MODE_PROT64)
769 rip_relative = 1;
770 break;
771 default:
772 c->modrm_ea += c->regs[c->modrm_rm]; 792 c->modrm_ea += c->regs[c->modrm_rm];
773 break;
774 }
775 switch (c->modrm_mod) { 793 switch (c->modrm_mod) {
776 case 0: 794 case 0:
777 if (c->modrm_rm == 5) 795 if (c->modrm_rm == 5)
@@ -785,22 +803,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
785 break; 803 break;
786 } 804 }
787 } 805 }
788 if (rip_relative) {
789 c->modrm_ea += c->eip;
790 switch (c->d & SrcMask) {
791 case SrcImmByte:
792 c->modrm_ea += 1;
793 break;
794 case SrcImm:
795 if (c->d & ByteOp)
796 c->modrm_ea += 1;
797 else
798 if (c->op_bytes == 8)
799 c->modrm_ea += 4;
800 else
801 c->modrm_ea += c->op_bytes;
802 }
803 }
804done: 806done:
805 return rc; 807 return rc;
806} 808}
@@ -838,6 +840,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
838 840
839 memset(c, 0, sizeof(struct decode_cache)); 841 memset(c, 0, sizeof(struct decode_cache));
840 c->eip = ctxt->vcpu->arch.rip; 842 c->eip = ctxt->vcpu->arch.rip;
843 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
841 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 844 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
842 845
843 switch (mode) { 846 switch (mode) {
@@ -876,23 +879,15 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
876 /* switch between 2/4 bytes */ 879 /* switch between 2/4 bytes */
877 c->ad_bytes = def_ad_bytes ^ 6; 880 c->ad_bytes = def_ad_bytes ^ 6;
878 break; 881 break;
882 case 0x26: /* ES override */
879 case 0x2e: /* CS override */ 883 case 0x2e: /* CS override */
880 c->override_base = &ctxt->cs_base; 884 case 0x36: /* SS override */
881 break;
882 case 0x3e: /* DS override */ 885 case 0x3e: /* DS override */
883 c->override_base = &ctxt->ds_base; 886 set_seg_override(c, (c->b >> 3) & 3);
884 break;
885 case 0x26: /* ES override */
886 c->override_base = &ctxt->es_base;
887 break; 887 break;
888 case 0x64: /* FS override */ 888 case 0x64: /* FS override */
889 c->override_base = &ctxt->fs_base;
890 break;
891 case 0x65: /* GS override */ 889 case 0x65: /* GS override */
892 c->override_base = &ctxt->gs_base; 890 set_seg_override(c, c->b & 7);
893 break;
894 case 0x36: /* SS override */
895 c->override_base = &ctxt->ss_base;
896 break; 891 break;
897 case 0x40 ... 0x4f: /* REX */ 892 case 0x40 ... 0x4f: /* REX */
898 if (mode != X86EMUL_MODE_PROT64) 893 if (mode != X86EMUL_MODE_PROT64)
@@ -964,15 +959,11 @@ done_prefixes:
964 if (rc) 959 if (rc)
965 goto done; 960 goto done;
966 961
967 if (!c->override_base) 962 if (!c->has_seg_override)
968 c->override_base = &ctxt->ds_base; 963 set_seg_override(c, VCPU_SREG_DS);
969 if (mode == X86EMUL_MODE_PROT64 &&
970 c->override_base != &ctxt->fs_base &&
971 c->override_base != &ctxt->gs_base)
972 c->override_base = NULL;
973 964
974 if (c->override_base) 965 if (!(!c->twobyte && c->b == 0x8d))
975 c->modrm_ea += *c->override_base; 966 c->modrm_ea += seg_override_base(ctxt, c);
976 967
977 if (c->ad_bytes != 8) 968 if (c->ad_bytes != 8)
978 c->modrm_ea = (u32)c->modrm_ea; 969 c->modrm_ea = (u32)c->modrm_ea;
@@ -1049,6 +1040,7 @@ done_prefixes:
1049 break; 1040 break;
1050 case DstMem: 1041 case DstMem:
1051 if ((c->d & ModRM) && c->modrm_mod == 3) { 1042 if ((c->d & ModRM) && c->modrm_mod == 3) {
1043 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1052 c->dst.type = OP_REG; 1044 c->dst.type = OP_REG;
1053 c->dst.val = c->dst.orig_val = c->modrm_val; 1045 c->dst.val = c->dst.orig_val = c->modrm_val;
1054 c->dst.ptr = c->modrm_ptr; 1046 c->dst.ptr = c->modrm_ptr;
@@ -1058,6 +1050,9 @@ done_prefixes:
1058 break; 1050 break;
1059 } 1051 }
1060 1052
1053 if (c->rip_relative)
1054 c->modrm_ea += c->eip;
1055
1061done: 1056done:
1062 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1057 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1063} 1058}
@@ -1070,7 +1065,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1070 c->dst.bytes = c->op_bytes; 1065 c->dst.bytes = c->op_bytes;
1071 c->dst.val = c->src.val; 1066 c->dst.val = c->src.val;
1072 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1067 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1073 c->dst.ptr = (void *) register_address(c, ctxt->ss_base, 1068 c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
1074 c->regs[VCPU_REGS_RSP]); 1069 c->regs[VCPU_REGS_RSP]);
1075} 1070}
1076 1071
@@ -1080,7 +1075,7 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1080 struct decode_cache *c = &ctxt->decode; 1075 struct decode_cache *c = &ctxt->decode;
1081 int rc; 1076 int rc;
1082 1077
1083 rc = ops->read_std(register_address(c, ctxt->ss_base, 1078 rc = ops->read_std(register_address(c, ss_base(ctxt),
1084 c->regs[VCPU_REGS_RSP]), 1079 c->regs[VCPU_REGS_RSP]),
1085 &c->dst.val, c->dst.bytes, ctxt->vcpu); 1080 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1086 if (rc != 0) 1081 if (rc != 0)
@@ -1402,11 +1397,11 @@ special_insn:
1402 register_address_increment(c, &c->regs[VCPU_REGS_RSP], 1397 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1403 -c->op_bytes); 1398 -c->op_bytes);
1404 c->dst.ptr = (void *) register_address( 1399 c->dst.ptr = (void *) register_address(
1405 c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]); 1400 c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
1406 break; 1401 break;
1407 case 0x58 ... 0x5f: /* pop reg */ 1402 case 0x58 ... 0x5f: /* pop reg */
1408 pop_instruction: 1403 pop_instruction:
1409 if ((rc = ops->read_std(register_address(c, ctxt->ss_base, 1404 if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
1410 c->regs[VCPU_REGS_RSP]), c->dst.ptr, 1405 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1411 c->op_bytes, ctxt->vcpu)) != 0) 1406 c->op_bytes, ctxt->vcpu)) != 0)
1412 goto done; 1407 goto done;
@@ -1420,9 +1415,8 @@ special_insn:
1420 goto cannot_emulate; 1415 goto cannot_emulate;
1421 c->dst.val = (s32) c->src.val; 1416 c->dst.val = (s32) c->src.val;
1422 break; 1417 break;
1418 case 0x68: /* push imm */
1423 case 0x6a: /* push imm8 */ 1419 case 0x6a: /* push imm8 */
1424 c->src.val = 0L;
1425 c->src.val = insn_fetch(s8, 1, c->eip);
1426 emulate_push(ctxt); 1420 emulate_push(ctxt);
1427 break; 1421 break;
1428 case 0x6c: /* insb */ 1422 case 0x6c: /* insb */
@@ -1433,7 +1427,7 @@ special_insn:
1433 c->rep_prefix ? 1427 c->rep_prefix ?
1434 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1428 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1435 (ctxt->eflags & EFLG_DF), 1429 (ctxt->eflags & EFLG_DF),
1436 register_address(c, ctxt->es_base, 1430 register_address(c, es_base(ctxt),
1437 c->regs[VCPU_REGS_RDI]), 1431 c->regs[VCPU_REGS_RDI]),
1438 c->rep_prefix, 1432 c->rep_prefix,
1439 c->regs[VCPU_REGS_RDX]) == 0) { 1433 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1449,9 +1443,8 @@ special_insn:
1449 c->rep_prefix ? 1443 c->rep_prefix ?
1450 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1444 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1451 (ctxt->eflags & EFLG_DF), 1445 (ctxt->eflags & EFLG_DF),
1452 register_address(c, c->override_base ? 1446 register_address(c,
1453 *c->override_base : 1447 seg_override_base(ctxt, c),
1454 ctxt->ds_base,
1455 c->regs[VCPU_REGS_RSI]), 1448 c->regs[VCPU_REGS_RSI]),
1456 c->rep_prefix, 1449 c->rep_prefix,
1457 c->regs[VCPU_REGS_RDX]) == 0) { 1450 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1490,6 +1483,7 @@ special_insn:
1490 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 1483 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1491 break; 1484 break;
1492 case 0x86 ... 0x87: /* xchg */ 1485 case 0x86 ... 0x87: /* xchg */
1486 xchg:
1493 /* Write back the register source. */ 1487 /* Write back the register source. */
1494 switch (c->dst.bytes) { 1488 switch (c->dst.bytes) {
1495 case 1: 1489 case 1:
@@ -1514,14 +1508,60 @@ special_insn:
1514 break; 1508 break;
1515 case 0x88 ... 0x8b: /* mov */ 1509 case 0x88 ... 0x8b: /* mov */
1516 goto mov; 1510 goto mov;
1511 case 0x8c: { /* mov r/m, sreg */
1512 struct kvm_segment segreg;
1513
1514 if (c->modrm_reg <= 5)
1515 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
1516 else {
1517 printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
1518 c->modrm);
1519 goto cannot_emulate;
1520 }
1521 c->dst.val = segreg.selector;
1522 break;
1523 }
1517 case 0x8d: /* lea r16/r32, m */ 1524 case 0x8d: /* lea r16/r32, m */
1518 c->dst.val = c->modrm_ea; 1525 c->dst.val = c->modrm_ea;
1519 break; 1526 break;
1527 case 0x8e: { /* mov seg, r/m16 */
1528 uint16_t sel;
1529 int type_bits;
1530 int err;
1531
1532 sel = c->src.val;
1533 if (c->modrm_reg <= 5) {
1534 type_bits = (c->modrm_reg == 1) ? 9 : 1;
1535 err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
1536 type_bits, c->modrm_reg);
1537 } else {
1538 printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
1539 c->modrm);
1540 goto cannot_emulate;
1541 }
1542
1543 if (err < 0)
1544 goto cannot_emulate;
1545
1546 c->dst.type = OP_NONE; /* Disable writeback. */
1547 break;
1548 }
1520 case 0x8f: /* pop (sole member of Grp1a) */ 1549 case 0x8f: /* pop (sole member of Grp1a) */
1521 rc = emulate_grp1a(ctxt, ops); 1550 rc = emulate_grp1a(ctxt, ops);
1522 if (rc != 0) 1551 if (rc != 0)
1523 goto done; 1552 goto done;
1524 break; 1553 break;
1554 case 0x90: /* nop / xchg r8,rax */
1555 if (!(c->rex_prefix & 1)) { /* nop */
1556 c->dst.type = OP_NONE;
1557 break;
1558 }
1559 case 0x91 ... 0x97: /* xchg reg,rax */
1560 c->src.type = c->dst.type = OP_REG;
1561 c->src.bytes = c->dst.bytes = c->op_bytes;
1562 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
1563 c->src.val = *(c->src.ptr);
1564 goto xchg;
1525 case 0x9c: /* pushf */ 1565 case 0x9c: /* pushf */
1526 c->src.val = (unsigned long) ctxt->eflags; 1566 c->src.val = (unsigned long) ctxt->eflags;
1527 emulate_push(ctxt); 1567 emulate_push(ctxt);
@@ -1540,11 +1580,10 @@ special_insn:
1540 c->dst.type = OP_MEM; 1580 c->dst.type = OP_MEM;
1541 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1581 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1542 c->dst.ptr = (unsigned long *)register_address(c, 1582 c->dst.ptr = (unsigned long *)register_address(c,
1543 ctxt->es_base, 1583 es_base(ctxt),
1544 c->regs[VCPU_REGS_RDI]); 1584 c->regs[VCPU_REGS_RDI]);
1545 if ((rc = ops->read_emulated(register_address(c, 1585 if ((rc = ops->read_emulated(register_address(c,
1546 c->override_base ? *c->override_base : 1586 seg_override_base(ctxt, c),
1547 ctxt->ds_base,
1548 c->regs[VCPU_REGS_RSI]), 1587 c->regs[VCPU_REGS_RSI]),
1549 &c->dst.val, 1588 &c->dst.val,
1550 c->dst.bytes, ctxt->vcpu)) != 0) 1589 c->dst.bytes, ctxt->vcpu)) != 0)
@@ -1560,8 +1599,7 @@ special_insn:
1560 c->src.type = OP_NONE; /* Disable writeback. */ 1599 c->src.type = OP_NONE; /* Disable writeback. */
1561 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1562 c->src.ptr = (unsigned long *)register_address(c, 1601 c->src.ptr = (unsigned long *)register_address(c,
1563 c->override_base ? *c->override_base : 1602 seg_override_base(ctxt, c),
1564 ctxt->ds_base,
1565 c->regs[VCPU_REGS_RSI]); 1603 c->regs[VCPU_REGS_RSI]);
1566 if ((rc = ops->read_emulated((unsigned long)c->src.ptr, 1604 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1567 &c->src.val, 1605 &c->src.val,
@@ -1572,7 +1610,7 @@ special_insn:
1572 c->dst.type = OP_NONE; /* Disable writeback. */ 1610 c->dst.type = OP_NONE; /* Disable writeback. */
1573 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1611 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1574 c->dst.ptr = (unsigned long *)register_address(c, 1612 c->dst.ptr = (unsigned long *)register_address(c,
1575 ctxt->es_base, 1613 es_base(ctxt),
1576 c->regs[VCPU_REGS_RDI]); 1614 c->regs[VCPU_REGS_RDI]);
1577 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1615 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1578 &c->dst.val, 1616 &c->dst.val,
@@ -1596,7 +1634,7 @@ special_insn:
1596 c->dst.type = OP_MEM; 1634 c->dst.type = OP_MEM;
1597 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1635 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1598 c->dst.ptr = (unsigned long *)register_address(c, 1636 c->dst.ptr = (unsigned long *)register_address(c,
1599 ctxt->es_base, 1637 es_base(ctxt),
1600 c->regs[VCPU_REGS_RDI]); 1638 c->regs[VCPU_REGS_RDI]);
1601 c->dst.val = c->regs[VCPU_REGS_RAX]; 1639 c->dst.val = c->regs[VCPU_REGS_RAX];
1602 register_address_increment(c, &c->regs[VCPU_REGS_RDI], 1640 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
@@ -1608,8 +1646,7 @@ special_insn:
1608 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1646 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1609 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 1647 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1610 if ((rc = ops->read_emulated(register_address(c, 1648 if ((rc = ops->read_emulated(register_address(c,
1611 c->override_base ? *c->override_base : 1649 seg_override_base(ctxt, c),
1612 ctxt->ds_base,
1613 c->regs[VCPU_REGS_RSI]), 1650 c->regs[VCPU_REGS_RSI]),
1614 &c->dst.val, 1651 &c->dst.val,
1615 c->dst.bytes, 1652 c->dst.bytes,
@@ -1622,6 +1659,8 @@ special_insn:
1622 case 0xae ... 0xaf: /* scas */ 1659 case 0xae ... 0xaf: /* scas */
1623 DPRINTF("Urk! I don't handle SCAS.\n"); 1660 DPRINTF("Urk! I don't handle SCAS.\n");
1624 goto cannot_emulate; 1661 goto cannot_emulate;
1662 case 0xb8: /* mov r, imm */
1663 goto mov;
1625 case 0xc0 ... 0xc1: 1664 case 0xc0 ... 0xc1:
1626 emulate_grp2(ctxt); 1665 emulate_grp2(ctxt);
1627 break; 1666 break;
@@ -1660,13 +1699,39 @@ special_insn:
1660 break; 1699 break;
1661 } 1700 }
1662 case 0xe9: /* jmp rel */ 1701 case 0xe9: /* jmp rel */
1663 case 0xeb: /* jmp rel short */ 1702 goto jmp;
1703 case 0xea: /* jmp far */ {
1704 uint32_t eip;
1705 uint16_t sel;
1706
1707 switch (c->op_bytes) {
1708 case 2:
1709 eip = insn_fetch(u16, 2, c->eip);
1710 break;
1711 case 4:
1712 eip = insn_fetch(u32, 4, c->eip);
1713 break;
1714 default:
1715 DPRINTF("jmp far: Invalid op_bytes\n");
1716 goto cannot_emulate;
1717 }
1718 sel = insn_fetch(u16, 2, c->eip);
1719 if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
1720 DPRINTF("jmp far: Failed to load CS descriptor\n");
1721 goto cannot_emulate;
1722 }
1723
1724 c->eip = eip;
1725 break;
1726 }
1727 case 0xeb:
1728 jmp: /* jmp rel short */
1664 jmp_rel(c, c->src.val); 1729 jmp_rel(c, c->src.val);
1665 c->dst.type = OP_NONE; /* Disable writeback. */ 1730 c->dst.type = OP_NONE; /* Disable writeback. */
1666 break; 1731 break;
1667 case 0xf4: /* hlt */ 1732 case 0xf4: /* hlt */
1668 ctxt->vcpu->arch.halt_request = 1; 1733 ctxt->vcpu->arch.halt_request = 1;
1669 goto done; 1734 break;
1670 case 0xf5: /* cmc */ 1735 case 0xf5: /* cmc */
1671 /* complement carry flag from eflags reg */ 1736 /* complement carry flag from eflags reg */
1672 ctxt->eflags ^= EFLG_CF; 1737 ctxt->eflags ^= EFLG_CF;
@@ -1882,6 +1947,8 @@ twobyte_insn:
1882 c->src.val &= (c->dst.bytes << 3) - 1; 1947 c->src.val &= (c->dst.bytes << 3) - 1;
1883 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 1948 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1884 break; 1949 break;
1950 case 0xae: /* clflush */
1951 break;
1885 case 0xb0 ... 0xb1: /* cmpxchg */ 1952 case 0xb0 ... 0xb1: /* cmpxchg */
1886 /* 1953 /*
1887 * Save real source value, then compare EAX against 1954 * Save real source value, then compare EAX against
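The x86_emulate.c prefix-decode rewrite above collapses six per-segment cases into two computed ones. This works because the x86 override prefixes encode the segment register number in their low bits in the same order kvm uses for VCPU_SREG_* (ES=0, CS=1, SS=2, DS=3, FS=4, GS=5). A standalone user-space sketch of just that mapping (illustrative only, not kernel code):

/*
 * Demo of the prefix-to-segment mapping: 0x26/0x2e/0x36/0x3e carry the
 * segment in bits 3-4, 0x64/0x65 carry it in the low three bits.
 */
#include <stdio.h>

enum { SREG_ES, SREG_CS, SREG_SS, SREG_DS, SREG_FS, SREG_GS, SREG_NONE };

static int seg_from_prefix(unsigned char b)
{
	switch (b) {
	case 0x26: case 0x2e: case 0x36: case 0x3e:
		return (b >> 3) & 3;	/* ES, CS, SS, DS */
	case 0x64: case 0x65:
		return b & 7;		/* FS, GS */
	default:
		return SREG_NONE;
	}
}

int main(void)
{
	static const unsigned char p[] = { 0x26, 0x2e, 0x36, 0x3e, 0x64, 0x65 };
	size_t i;

	for (i = 0; i < sizeof(p); i++)
		printf("prefix 0x%02x -> sreg %d\n", p[i], seg_from_prefix(p[i]));
	return 0;
}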
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 50dad44fb542..d9249a882aa5 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -991,7 +991,6 @@ __init void lguest_init(void)
991#ifdef CONFIG_X86_LOCAL_APIC 991#ifdef CONFIG_X86_LOCAL_APIC
992 /* apic read/write intercepts */ 992 /* apic read/write intercepts */
993 pv_apic_ops.apic_write = lguest_apic_write; 993 pv_apic_ops.apic_write = lguest_apic_write;
994 pv_apic_ops.apic_write_atomic = lguest_apic_write;
995 pv_apic_ops.apic_read = lguest_apic_read; 994 pv_apic_ops.apic_read = lguest_apic_read;
996#endif 995#endif
997 996
@@ -1015,6 +1014,9 @@ __init void lguest_init(void)
1015 init_pg_tables_start = __pa(pg0); 1014 init_pg_tables_start = __pa(pg0);
1016 init_pg_tables_end = __pa(pg0); 1015 init_pg_tables_end = __pa(pg0);
1017 1016
1017 /* As described in head_32.S, we map the first 128M of memory. */
1018 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1019
1018 /* Load the %fs segment register (the per-cpu segment register) with 1020 /* Load the %fs segment register (the per-cpu segment register) with
1019 * the normal data segment to get through booting. */ 1021 * the normal data segment to get through booting. */
1020 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1022 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
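For scale: with x86's 4 KiB pages (PAGE_SHIFT = 12), the new assignment evaluates to max_pfn_mapped = (128*1024*1024) >> 12 = 32768 page frames, i.e. exactly the 128M that the added comment says head_32.S maps.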
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dfdf428975c0..f118c110af32 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -52,7 +52,7 @@
52 jnz 100b 52 jnz 100b
53102: 53102:
54 .section .fixup,"ax" 54 .section .fixup,"ax"
55103: addl %r8d,%edx /* ecx is zerorest also */ 55103: addl %ecx,%edx /* ecx is zerorest also */
56 jmp copy_user_handle_tail 56 jmp copy_user_handle_tail
57 .previous 57 .previous
58 58
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 40e0e309d27e..cb0c112386fb 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -32,7 +32,7 @@
32 jnz 100b 32 jnz 100b
33102: 33102:
34 .section .fixup,"ax" 34 .section .fixup,"ax"
35103: addl %r8d,%edx /* ecx is zerorest also */ 35103: addl %ecx,%edx /* ecx is zerorest also */
36 jmp copy_user_handle_tail 36 jmp copy_user_handle_tail
37 .previous 37 .previous
38 38
@@ -108,7 +108,6 @@ ENTRY(__copy_user_nocache)
108 jmp 60f 108 jmp 60f
10950: movl %ecx,%edx 10950: movl %ecx,%edx
11060: sfence 11060: sfence
111 movl %r8d,%ecx
112 jmp copy_user_handle_tail 111 jmp copy_user_handle_tail
113 .previous 112 .previous
114 113
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
index d5a2b39f882b..01b868ba82f8 100644
--- a/arch/x86/lib/msr-on-cpu.c
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -30,10 +30,11 @@ static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
30 30
31 rv.msr_no = msr_no; 31 rv.msr_no = msr_no;
32 if (safe) { 32 if (safe) {
33 smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); 33 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu,
34 err = rv.err; 34 &rv, 1);
35 err = err ? err : rv.err;
35 } else { 36 } else {
36 smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); 37 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
37 } 38 }
38 *l = rv.l; 39 *l = rv.l;
39 *h = rv.h; 40 *h = rv.h;
@@ -64,23 +65,24 @@ static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
64 rv.l = l; 65 rv.l = l;
65 rv.h = h; 66 rv.h = h;
66 if (safe) { 67 if (safe) {
67 smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); 68 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu,
68 err = rv.err; 69 &rv, 1);
70 err = err ? err : rv.err;
69 } else { 71 } else {
70 smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); 72 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
71 } 73 }
72 74
73 return err; 75 return err;
74} 76}
75 77
76void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) 78int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
77{ 79{
78 _wrmsr_on_cpu(cpu, msr_no, l, h, 0); 80 return _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
79} 81}
80 82
81void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) 83int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
82{ 84{
83 _rdmsr_on_cpu(cpu, msr_no, l, h, 0); 85 return _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
84} 86}
85 87
86/* These "safe" variants are slower and should be used when the target MSR 88/* These "safe" variants are slower and should be used when the target MSR
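Since _rdmsr_on_cpu()/_wrmsr_on_cpu() now fold in the smp_call_function_single() result, the exported wrappers can return int instead of void, and callers can detect, e.g., a target CPU that has gone offline. A hedged usage sketch (the caller below is hypothetical; MSR_IA32_UCODE_REV is a real MSR index):

/*
 * Sketch only: check the new int return value instead of assuming the
 * cross-CPU MSR read succeeded.
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <asm/msr.h>

static int report_ucode_rev(unsigned int cpu)
{
	u32 lo, hi;
	int err = rdmsr_on_cpu(cpu, MSR_IA32_UCODE_REV, &lo, &hi);

	if (err)	/* e.g. the target CPU is offline */
		return err;

	pr_info("cpu %u ucode rev: %08x%08x\n", cpu, hi, lo);
	return 0;
}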
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 48278fa7d3de..3d317836be9e 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -10,14 +10,6 @@
10#include <asm/e820.h> 10#include <asm/e820.h>
11#include <asm/setup.h> 11#include <asm/setup.h>
12 12
13/*
14 * Any quirks to be performed to initialize timers/irqs/etc?
15 */
16int (*arch_time_init_quirk)(void);
17int (*arch_pre_intr_init_quirk)(void);
18int (*arch_intr_init_quirk)(void);
19int (*arch_trap_init_quirk)(void);
20
21#ifdef CONFIG_HOTPLUG_CPU 13#ifdef CONFIG_HOTPLUG_CPU
22#define DEFAULT_SEND_IPI (1) 14#define DEFAULT_SEND_IPI (1)
23#else 15#else
@@ -37,8 +29,8 @@ int no_broadcast=DEFAULT_SEND_IPI;
37 **/ 29 **/
38void __init pre_intr_init_hook(void) 30void __init pre_intr_init_hook(void)
39{ 31{
40 if (arch_pre_intr_init_quirk) { 32 if (x86_quirks->arch_pre_intr_init) {
41 if (arch_pre_intr_init_quirk()) 33 if (x86_quirks->arch_pre_intr_init())
42 return; 34 return;
43 } 35 }
44 init_ISA_irqs(); 36 init_ISA_irqs();
@@ -64,8 +56,8 @@ static struct irqaction irq2 = {
64 **/ 56 **/
65void __init intr_init_hook(void) 57void __init intr_init_hook(void)
66{ 58{
67 if (arch_intr_init_quirk) { 59 if (x86_quirks->arch_intr_init) {
68 if (arch_intr_init_quirk()) 60 if (x86_quirks->arch_intr_init())
69 return; 61 return;
70 } 62 }
71#ifdef CONFIG_X86_LOCAL_APIC 63#ifdef CONFIG_X86_LOCAL_APIC
@@ -97,8 +89,8 @@ void __init pre_setup_arch_hook(void)
97 **/ 89 **/
98void __init trap_init_hook(void) 90void __init trap_init_hook(void)
99{ 91{
100 if (arch_trap_init_quirk) { 92 if (x86_quirks->arch_trap_init) {
101 if (arch_trap_init_quirk()) 93 if (x86_quirks->arch_trap_init())
102 return; 94 return;
103 } 95 }
104} 96}
@@ -111,6 +103,16 @@ static struct irqaction irq0 = {
111}; 103};
112 104
113/** 105/**
106 * pre_time_init_hook - do any specific initialisations before the system timer is set up.
107 *
108 **/
109void __init pre_time_init_hook(void)
110{
111 if (x86_quirks->arch_pre_time_init)
112 x86_quirks->arch_pre_time_init();
113}
114
115/**
114 * time_init_hook - do any specific initialisations for the system timer. 116 * time_init_hook - do any specific initialisations for the system timer.
115 * 117 *
116 * Description: 118 * Description:
@@ -119,13 +121,13 @@ static struct irqaction irq0 = {
119 **/ 121 **/
120void __init time_init_hook(void) 122void __init time_init_hook(void)
121{ 123{
122 if (arch_time_init_quirk) { 124 if (x86_quirks->arch_time_init) {
123 /* 125 /*
124 * A nonzero return code does not mean failure, it means 126 * A nonzero return code does not mean failure, it means
125 * that the architecture quirk does not want any 127 * that the architecture quirk does not want any
126 * generic (timer) setup to be performed after this: 128 * generic (timer) setup to be performed after this:
127 */ 129 */
128 if (arch_time_init_quirk()) 130 if (x86_quirks->arch_time_init())
129 return; 131 return;
130 } 132 }
131 133
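The four ad-hoc arch_*_quirk function pointers deleted above are replaced by a single x86_quirks table that every hook now consults. A minimal sketch of how a subarchitecture might supply one quirk; the struct layout is inferred only from the fields used in this file, and "myplat" is made up:

/*
 * Sketch under assumptions: struct x86_quirks and the global x86_quirks
 * pointer come from <asm/setup.h>; only the one field exercised by
 * time_init_hook() is filled in.
 */
#include <linux/init.h>
#include <asm/setup.h>

static int __init myplat_time_init(void)
{
	/* program a platform timer here; nonzero skips the generic setup */
	return 1;
}

static struct x86_quirks myplat_quirks = {
	.arch_time_init	= myplat_time_init,
};

static void __init myplat_setup(void)
{
	x86_quirks = &myplat_quirks;	/* consulted by time_init_hook() */
}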
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c
index 4354ce804889..50189af14b85 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/mach-es7000/es7000plat.c
@@ -130,10 +130,10 @@ parse_unisys_oem (char *oemptr)
130 mip_addr = val; 130 mip_addr = val;
131 mip = (struct mip_reg *)val; 131 mip = (struct mip_reg *)val;
132 mip_reg = __va(mip); 132 mip_reg = __va(mip);
133 Dprintk("es7000_mipcfg: host_reg = 0x%lx \n", 133 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
134 (unsigned long)host_reg); 134 (unsigned long)host_reg);
135 Dprintk("es7000_mipcfg: mip_reg = 0x%lx \n", 135 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
136 (unsigned long)mip_reg); 136 (unsigned long)mip_reg);
137 success++; 137 success++;
138 break; 138 break;
139 case MIP_PSAI_REG: 139 case MIP_PSAI_REG:
diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c
index a037041817c7..4f4e50c3ad3b 100644
--- a/arch/x86/mach-rdc321x/platform.c
+++ b/arch/x86/mach-rdc321x/platform.c
@@ -25,7 +25,6 @@
25#include <linux/list.h> 25#include <linux/list.h>
26#include <linux/device.h> 26#include <linux/device.h>
27#include <linux/platform_device.h> 27#include <linux/platform_device.h>
28#include <linux/version.h>
29#include <linux/leds.h> 28#include <linux/leds.h>
30 29
31#include <asm/gpio.h> 30#include <asm/gpio.h>
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 9873716e9f76..dfb932dcf136 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o 2 pat.o pgtable.o gup.o
3 3
4obj-$(CONFIG_X86_32) += pgtable_32.o 4obj-$(CONFIG_X86_32) += pgtable_32.o
5 5
@@ -21,3 +21,4 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64.o
21endif 21endif
22obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 22obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
23 23
24obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 5dfef9fa061a..62fa440678d8 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -42,7 +42,6 @@
42 42
43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
44EXPORT_SYMBOL(node_data); 44EXPORT_SYMBOL(node_data);
45static bootmem_data_t node0_bdata;
46 45
47/* 46/*
48 * numa interface - we expect the numa architecture specific code to have 47 * numa interface - we expect the numa architecture specific code to have
@@ -385,7 +384,7 @@ void __init initmem_init(unsigned long start_pfn,
385 for_each_online_node(nid) 384 for_each_online_node(nid)
386 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 385 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
387 386
388 NODE_DATA(0)->bdata = &node0_bdata; 387 NODE_DATA(0)->bdata = &bootmem_node_data[0];
389 setup_bootmem_allocator(); 388 setup_bootmem_allocator();
390} 389}
391 390
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0bb0caed8971..a20d1fa64b4e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
148 * we have now. "break" is either changing perms, levels or 148 * we have now. "break" is either changing perms, levels or
149 * address space marker. 149 * address space marker.
150 */ 150 */
151 prot = pgprot_val(new_prot) & ~(PTE_MASK); 151 prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
152 cur = pgprot_val(st->current_prot) & ~(PTE_MASK); 152 cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
153 153
154 if (!st->level) { 154 if (!st->level) {
155 /* First entry */ 155 /* First entry */
@@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
221 for (i = 0; i < PTRS_PER_PMD; i++) { 221 for (i = 0; i < PTRS_PER_PMD; i++) {
222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
223 if (!pmd_none(*start)) { 223 if (!pmd_none(*start)) {
224 pgprotval_t prot = pmd_val(*start) & ~PTE_MASK; 224 pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
225 225
226 if (pmd_large(*start) || !pmd_present(*start)) 226 if (pmd_large(*start) || !pmd_present(*start))
227 note_page(m, st, __pgprot(prot), 3); 227 note_page(m, st, __pgprot(prot), 3);
@@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
253 for (i = 0; i < PTRS_PER_PUD; i++) { 253 for (i = 0; i < PTRS_PER_PUD; i++) {
254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
255 if (!pud_none(*start)) { 255 if (!pud_none(*start)) {
256 pgprotval_t prot = pud_val(*start) & ~PTE_MASK; 256 pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
257 257
258 if (pud_large(*start) || !pud_present(*start)) 258 if (pud_large(*start) || !pud_present(*start))
259 note_page(m, st, __pgprot(prot), 2); 259 note_page(m, st, __pgprot(prot), 2);
@@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m)
288 for (i = 0; i < PTRS_PER_PGD; i++) { 288 for (i = 0; i < PTRS_PER_PGD; i++) {
289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT); 289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
290 if (!pgd_none(*start)) { 290 if (!pgd_none(*start)) {
291 pgprotval_t prot = pgd_val(*start) & ~PTE_MASK; 291 pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
292 292
293 if (pgd_large(*start) || !pgd_present(*start)) 293 if (pgd_large(*start) || !pgd_present(*start))
294 note_page(m, &st, __pgprot(prot), 1); 294 note_page(m, &st, __pgprot(prot), 1);
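The dump_pagetables.c hunks replace flag extraction via ~PTE_MASK with the explicit PTE_PFN_MASK/PTE_FLAGS_MASK pair, naming the two halves of a pte directly. A user-space illustration of that split, assuming only that PTE_FLAGS_MASK is the complement of PTE_PFN_MASK (both constants below are made up for the demo):

/*
 * Demo only: decompose a pte value into its page-frame bits and its
 * flag bits; the mask value and pte are illustrative, not the kernel's.
 */
#include <stdint.h>
#include <stdio.h>

#define PTE_PFN_MASK	0x000ffffffffff000ULL	/* pfn bits (illustrative) */
#define PTE_FLAGS_MASK	(~PTE_PFN_MASK)		/* everything else */

int main(void)
{
	uint64_t pte = 0x80000000deadb067ULL;	/* arbitrary example value */

	printf("pfn bits:  %#llx\n", (unsigned long long)(pte & PTE_PFN_MASK));
	printf("flag bits: %#llx\n", (unsigned long long)(pte & PTE_FLAGS_MASK));
	return 0;
}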
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..007bb06c7504
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,298 @@
1/*
2 * Lockless get_user_pages_fast for x86
3 *
4 * Copyright (C) 2008 Nick Piggin
5 * Copyright (C) 2008 Novell Inc.
6 */
7#include <linux/sched.h>
8#include <linux/mm.h>
9#include <linux/vmstat.h>
10#include <linux/highmem.h>
11
12#include <asm/pgtable.h>
13
14static inline pte_t gup_get_pte(pte_t *ptep)
15{
16#ifndef CONFIG_X86_PAE
17 return *ptep;
18#else
19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking
21 * any locks. For this we would like to load the pointers atomically,
22 * but that is not possible (without expensive cmpxchg8b) on PAE. What
23 * we do have is the guarantee that a pte will only either go from not
24 * present to present, or present to not present or both -- it will not
25 * switch to a completely different present page without a TLB flush in
26 * between; something that we are blocking by holding interrupts off.
27 *
28 * Setting ptes from not present to present goes:
29 * ptep->pte_high = h;
30 * smp_wmb();
31 * ptep->pte_low = l;
32 *
33 * And present to not present goes:
34 * ptep->pte_low = 0;
35 * smp_wmb();
36 * ptep->pte_high = 0;
37 *
38 * We must ensure here that the load of pte_low sees l iff pte_high
39 * sees h. We load pte_high *after* loading pte_low, which ensures we
40 * don't see an older value of pte_high. *Then* we recheck pte_low,
41 * which ensures that we haven't picked up a changed pte high. We might
42 * have got rubbish values from pte_low and pte_high, but we are
43 * guaranteed that pte_low will not have the present bit set *unless*
44 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
45 * we're safe.
46 *
47 * gup_get_pte should not be used or copied outside gup.c without being
48 * very careful -- it does not atomically load the pte or anything that
49 * is likely to be useful for you.
50 */
51 pte_t pte;
52
53retry:
54 pte.pte_low = ptep->pte_low;
55 smp_rmb();
56 pte.pte_high = ptep->pte_high;
57 smp_rmb();
58 if (unlikely(pte.pte_low != ptep->pte_low))
59 goto retry;
60
61 return pte;
62#endif
63}
64
65/*
66 * The performance critical leaf functions are made noinline otherwise gcc
67 * inlines everything into a single function which results in too much
68 * register pressure.
69 */
70static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
71 unsigned long end, int write, struct page **pages, int *nr)
72{
73 unsigned long mask;
74 pte_t *ptep;
75
76 mask = _PAGE_PRESENT|_PAGE_USER;
77 if (write)
78 mask |= _PAGE_RW;
79
80 ptep = pte_offset_map(&pmd, addr);
81 do {
82 pte_t pte = gup_get_pte(ptep);
83 struct page *page;
84
85 if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
86 pte_unmap(ptep);
87 return 0;
88 }
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte);
91 get_page(page);
92 pages[*nr] = page;
93 (*nr)++;
94
95 } while (ptep++, addr += PAGE_SIZE, addr != end);
96 pte_unmap(ptep - 1);
97
98 return 1;
99}
100
101static inline void get_head_page_multiple(struct page *page, int nr)
102{
103 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count);
106}
107
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
109 unsigned long end, int write, struct page **pages, int *nr)
110{
111 unsigned long mask;
112 pte_t pte = *(pte_t *)&pmd;
113 struct page *head, *page;
114 int refs;
115
116 mask = _PAGE_PRESENT|_PAGE_USER;
117 if (write)
118 mask |= _PAGE_RW;
119 if ((pte_val(pte) & mask) != mask)
120 return 0;
121 /* hugepages are never "special" */
122 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
123 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
124
125 refs = 0;
126 head = pte_page(pte);
127 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
128 do {
129 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page;
131 (*nr)++;
132 page++;
133 refs++;
134 } while (addr += PAGE_SIZE, addr != end);
135 get_head_page_multiple(head, refs);
136
137 return 1;
138}
139
140static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
141 int write, struct page **pages, int *nr)
142{
143 unsigned long next;
144 pmd_t *pmdp;
145
146 pmdp = pmd_offset(&pud, addr);
147 do {
148 pmd_t pmd = *pmdp;
149
150 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd))
152 return 0;
153 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
155 return 0;
156 } else {
157 if (!gup_pte_range(pmd, addr, next, write, pages, nr))
158 return 0;
159 }
160 } while (pmdp++, addr = next, addr != end);
161
162 return 1;
163}
164
165static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
166 unsigned long end, int write, struct page **pages, int *nr)
167{
168 unsigned long mask;
169 pte_t pte = *(pte_t *)&pud;
170 struct page *head, *page;
171 int refs;
172
173 mask = _PAGE_PRESENT|_PAGE_USER;
174 if (write)
175 mask |= _PAGE_RW;
176 if ((pte_val(pte) & mask) != mask)
177 return 0;
178 /* hugepages are never "special" */
179 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
180 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
181
182 refs = 0;
183 head = pte_page(pte);
184 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
185 do {
186 VM_BUG_ON(compound_head(page) != head);
187 pages[*nr] = page;
188 (*nr)++;
189 page++;
190 refs++;
191 } while (addr += PAGE_SIZE, addr != end);
192 get_head_page_multiple(head, refs);
193
194 return 1;
195}
196
197static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
198 int write, struct page **pages, int *nr)
199{
200 unsigned long next;
201 pud_t *pudp;
202
203 pudp = pud_offset(&pgd, addr);
204 do {
205 pud_t pud = *pudp;
206
207 next = pud_addr_end(addr, end);
208 if (pud_none(pud))
209 return 0;
210 if (unlikely(pud_large(pud))) {
211 if (!gup_huge_pud(pud, addr, next, write, pages, nr))
212 return 0;
213 } else {
214 if (!gup_pmd_range(pud, addr, next, write, pages, nr))
215 return 0;
216 }
217 } while (pudp++, addr = next, addr != end);
218
219 return 1;
220}
221
222int get_user_pages_fast(unsigned long start, int nr_pages, int write,
223 struct page **pages)
224{
225 struct mm_struct *mm = current->mm;
226 unsigned long addr, len, end;
227 unsigned long next;
228 pgd_t *pgdp;
229 int nr = 0;
230
231 start &= PAGE_MASK;
232 addr = start;
233 len = (unsigned long) nr_pages << PAGE_SHIFT;
234 end = start + len;
235 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
236 start, len)))
237 goto slow_irqon;
238
239 /*
240 * XXX: batch / limit 'nr' to avoid large irq-off latency; this
241 * needs some instrumenting to determine the common sizes used by
242 * important workloads (e.g. DB2), and whether limiting the batch size
243 * will decrease performance.
244 *
245 * It seems like we're in the clear for the moment. Direct-IO is
246 * the main user that batches up lots of get_user_pages, and even
247 * that is limited to 64-at-a-time, which is not so many.
248 */
249 /*
250 * This doesn't prevent pagetable teardown, but does prevent
251 * the pagetables and pages from being freed on x86.
252 *
253 * So long as we atomically load page table pointers versus teardown
254 * (which we do on x86, with the above PAE exception), we can follow the
255 * address down to the page and take a ref on it.
256 */
257 local_irq_disable();
258 pgdp = pgd_offset(mm, addr);
259 do {
260 pgd_t pgd = *pgdp;
261
262 next = pgd_addr_end(addr, end);
263 if (pgd_none(pgd))
264 goto slow;
265 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
266 goto slow;
267 } while (pgdp++, addr = next, addr != end);
268 local_irq_enable();
269
270 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
271 return nr;
272
273 {
274 int ret;
275
276slow:
277 local_irq_enable();
278slow_irqon:
279 /* Try to get the remaining pages with get_user_pages */
280 start += nr << PAGE_SHIFT;
281 pages += nr;
282
283 down_read(&mm->mmap_sem);
284 ret = get_user_pages(current, mm, start,
285 (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
286 up_read(&mm->mmap_sem);
287
288 /* Have to be a bit careful with return values */
289 if (nr > 0) {
290 if (ret < 0)
291 ret = nr;
292 else
293 ret += nr;
294 }
295
296 return ret;
297 }
298}
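/*
 * A minimal caller sketch (illustrative only -- pin_user_buffer is a
 * made-up name, not part of this file): pin a user buffer for reading,
 * do I/O against it, then drop the page references.
 */
#if 0
static int pin_user_buffer(unsigned long buf, int nr_pages,
			   struct page **pages)
{
	int i, got;

	/* write == 0: we only need read access to the user pages */
	got = get_user_pages_fast(buf, nr_pages, 0, pages);
	if (got < 0)
		return got;

	/* ... perform I/O against pages[0..got-1] here ... */

	/* drop the references taken by get_user_pages_fast() */
	for (i = 0; i < got; i++)
		put_page(pages[i]);

	return got == nr_pages ? 0 : -EFAULT;
}
#endif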
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 0b3d567e686d..8f307d914c2e 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
124 return 1; 124 return 1;
125} 125}
126 126
127pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) 127pte_t *huge_pte_alloc(struct mm_struct *mm,
128 unsigned long addr, unsigned long sz)
128{ 129{
129 pgd_t *pgd; 130 pgd_t *pgd;
130 pud_t *pud; 131 pud_t *pud;
@@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
133 pgd = pgd_offset(mm, addr); 134 pgd = pgd_offset(mm, addr);
134 pud = pud_alloc(mm, pgd, addr); 135 pud = pud_alloc(mm, pgd, addr);
135 if (pud) { 136 if (pud) {
136 if (pud_none(*pud)) 137 if (sz == PUD_SIZE) {
137 huge_pmd_share(mm, addr, pud); 138 pte = (pte_t *)pud;
138 pte = (pte_t *) pmd_alloc(mm, pud, addr); 139 } else {
140 BUG_ON(sz != PMD_SIZE);
141 if (pud_none(*pud))
142 huge_pmd_share(mm, addr, pud);
143 pte = (pte_t *) pmd_alloc(mm, pud, addr);
144 }
139 } 145 }
140 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 146 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
141 147
@@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
151 pgd = pgd_offset(mm, addr); 157 pgd = pgd_offset(mm, addr);
152 if (pgd_present(*pgd)) { 158 if (pgd_present(*pgd)) {
153 pud = pud_offset(pgd, addr); 159 pud = pud_offset(pgd, addr);
154 if (pud_present(*pud)) 160 if (pud_present(*pud)) {
161 if (pud_large(*pud))
162 return (pte_t *)pud;
155 pmd = pmd_offset(pud, addr); 163 pmd = pmd_offset(pud, addr);
164 }
156 } 165 }
157 return (pte_t *) pmd; 166 return (pte_t *) pmd;
158} 167}
@@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd)
188 return 0; 197 return 0;
189} 198}
190 199
200int pud_huge(pud_t pud)
201{
202 return 0;
203}
204
191struct page * 205struct page *
192follow_huge_pmd(struct mm_struct *mm, unsigned long address, 206follow_huge_pmd(struct mm_struct *mm, unsigned long address,
193 pmd_t *pmd, int write) 207 pmd_t *pmd, int write)
@@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd)
208 return !!(pmd_val(pmd) & _PAGE_PSE); 222 return !!(pmd_val(pmd) & _PAGE_PSE);
209} 223}
210 224
225int pud_huge(pud_t pud)
226{
227 return !!(pud_val(pud) & _PAGE_PSE);
228}
229
211struct page * 230struct page *
212follow_huge_pmd(struct mm_struct *mm, unsigned long address, 231follow_huge_pmd(struct mm_struct *mm, unsigned long address,
213 pmd_t *pmd, int write) 232 pmd_t *pmd, int write)
@@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
216 235
217 page = pte_page(*(pte_t *)pmd); 236 page = pte_page(*(pte_t *)pmd);
218 if (page) 237 if (page)
219 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); 238 page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
239 return page;
240}
241
242struct page *
243follow_huge_pud(struct mm_struct *mm, unsigned long address,
244 pud_t *pud, int write)
245{
246 struct page *page;
247
248 page = pte_page(*(pte_t *)pud);
249 if (page)
250 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
220 return page; 251 return page;
221} 252}
253
222#endif 254#endif
223 255
224/* x86_64 also uses this file */ 256/* x86_64 also uses this file */
@@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
228 unsigned long addr, unsigned long len, 260 unsigned long addr, unsigned long len,
229 unsigned long pgoff, unsigned long flags) 261 unsigned long pgoff, unsigned long flags)
230{ 262{
263 struct hstate *h = hstate_file(file);
231 struct mm_struct *mm = current->mm; 264 struct mm_struct *mm = current->mm;
232 struct vm_area_struct *vma; 265 struct vm_area_struct *vma;
233 unsigned long start_addr; 266 unsigned long start_addr;
@@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
240 } 273 }
241 274
242full_search: 275full_search:
243 addr = ALIGN(start_addr, HPAGE_SIZE); 276 addr = ALIGN(start_addr, huge_page_size(h));
244 277
245 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 278 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
246 /* At this point: (!vma || addr < vma->vm_end). */ 279 /* At this point: (!vma || addr < vma->vm_end). */
@@ -262,7 +295,7 @@ full_search:
262 } 295 }
263 if (addr + mm->cached_hole_size < vma->vm_start) 296 if (addr + mm->cached_hole_size < vma->vm_start)
264 mm->cached_hole_size = vma->vm_start - addr; 297 mm->cached_hole_size = vma->vm_start - addr;
265 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 298 addr = ALIGN(vma->vm_end, huge_page_size(h));
266 } 299 }
267} 300}
268 301
@@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
270 unsigned long addr0, unsigned long len, 303 unsigned long addr0, unsigned long len,
271 unsigned long pgoff, unsigned long flags) 304 unsigned long pgoff, unsigned long flags)
272{ 305{
306 struct hstate *h = hstate_file(file);
273 struct mm_struct *mm = current->mm; 307 struct mm_struct *mm = current->mm;
274 struct vm_area_struct *vma, *prev_vma; 308 struct vm_area_struct *vma, *prev_vma;
275 unsigned long base = mm->mmap_base, addr = addr0; 309 unsigned long base = mm->mmap_base, addr = addr0;
@@ -290,7 +324,7 @@ try_again:
290 goto fail; 324 goto fail;
291 325
292 /* either no address requested or can't fit in requested address hole */ 326 /* either no address requested or can't fit in requested address hole */
293 addr = (mm->free_area_cache - len) & HPAGE_MASK; 327 addr = (mm->free_area_cache - len) & huge_page_mask(h);
294 do { 328 do {
295 /* 329 /*
296 * Lookup failure means no vma is above this address, 330 * Lookup failure means no vma is above this address,
@@ -321,7 +355,7 @@ try_again:
321 largest_hole = vma->vm_start - addr; 355 largest_hole = vma->vm_start - addr;
322 356
323 /* try just below the current vma->vm_start */ 357 /* try just below the current vma->vm_start */
324 addr = (vma->vm_start - len) & HPAGE_MASK; 358 addr = (vma->vm_start - len) & huge_page_mask(h);
325 } while (len <= vma->vm_start); 359 } while (len <= vma->vm_start);
326 360
327fail: 361fail:
@@ -359,22 +393,23 @@ unsigned long
359hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 393hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
360 unsigned long len, unsigned long pgoff, unsigned long flags) 394 unsigned long len, unsigned long pgoff, unsigned long flags)
361{ 395{
396 struct hstate *h = hstate_file(file);
362 struct mm_struct *mm = current->mm; 397 struct mm_struct *mm = current->mm;
363 struct vm_area_struct *vma; 398 struct vm_area_struct *vma;
364 399
365 if (len & ~HPAGE_MASK) 400 if (len & ~huge_page_mask(h))
366 return -EINVAL; 401 return -EINVAL;
367 if (len > TASK_SIZE) 402 if (len > TASK_SIZE)
368 return -ENOMEM; 403 return -ENOMEM;
369 404
370 if (flags & MAP_FIXED) { 405 if (flags & MAP_FIXED) {
371 if (prepare_hugepage_range(addr, len)) 406 if (prepare_hugepage_range(file, addr, len))
372 return -EINVAL; 407 return -EINVAL;
373 return addr; 408 return addr;
374 } 409 }
375 410
376 if (addr) { 411 if (addr) {
377 addr = ALIGN(addr, HPAGE_SIZE); 412 addr = ALIGN(addr, huge_page_size(h));
378 vma = find_vma(mm, addr); 413 vma = find_vma(mm, addr);
379 if (TASK_SIZE - len >= addr && 414 if (TASK_SIZE - len >= addr &&
380 (!vma || addr + len <= vma->vm_start)) 415 (!vma || addr + len <= vma->vm_start))
@@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
390 425
391#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ 426#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
392 427
428#ifdef CONFIG_X86_64
429static __init int setup_hugepagesz(char *opt)
430{
431 unsigned long ps = memparse(opt, &opt);
432 if (ps == PMD_SIZE) {
433 hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
434 } else if (ps == PUD_SIZE && cpu_has_gbpages) {
435 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
436 } else {
437 printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
438 ps >> 20);
439 return 0;
440 }
441 return 1;
442}
443__setup("hugepagesz=", setup_hugepagesz);
444#endif
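/*
 * Example of what the parser above accepts, assuming the usual x86_64
 * sizes (PMD_SIZE = 2 MB, PUD_SIZE = 1 GB):
 *
 *   hugepagesz=2M  ->  hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT), 2 MB pool
 *   hugepagesz=1G  ->  1 GB pool, but only when cpu_has_gbpages is set
 *   hugepagesz=4M  ->  rejected: "hugepagesz: Unsupported page size 4 M"
 *
 * memparse() understands the K/M/G suffixes, so "hugepagesz=2097152"
 * and "hugepagesz=2M" are equivalent.
 */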
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9689a5138e64..60ec1d08ff24 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -458,11 +458,7 @@ static void __init pagetable_init(void)
458{ 458{
459 pgd_t *pgd_base = swapper_pg_dir; 459 pgd_t *pgd_base = swapper_pg_dir;
460 460
461 paravirt_pagetable_setup_start(pgd_base);
462
463 permanent_kmaps_init(pgd_base); 461 permanent_kmaps_init(pgd_base);
464
465 paravirt_pagetable_setup_done(pgd_base);
466} 462}
467 463
468#ifdef CONFIG_ACPI_SLEEP 464#ifdef CONFIG_ACPI_SLEEP
@@ -844,6 +840,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
844 reserve_early(table_start << PAGE_SHIFT, 840 reserve_early(table_start << PAGE_SHIFT,
845 table_end << PAGE_SHIFT, "PGTABLE"); 841 table_end << PAGE_SHIFT, "PGTABLE");
846 842
843 if (!after_init_bootmem)
844 early_memtest(start, end);
845
847 return end >> PAGE_SHIFT; 846 return end >> PAGE_SHIFT;
848} 847}
849 848
@@ -868,8 +867,6 @@ void __init paging_init(void)
868 */ 867 */
869 sparse_init(); 868 sparse_init();
870 zone_sizes_init(); 869 zone_sizes_init();
871
872 paravirt_post_allocator_init();
873} 870}
874 871
875/* 872/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 306049edd553..d3746efb060d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -60,7 +60,7 @@ static unsigned long dma_reserve __initdata;
60 60
61DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 61DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
62 62
63int direct_gbpages __meminitdata 63int direct_gbpages
64#ifdef CONFIG_DIRECT_GBPAGES 64#ifdef CONFIG_DIRECT_GBPAGES
65 = 1 65 = 1
66#endif 66#endif
@@ -86,46 +86,13 @@ early_param("gbpages", parse_direct_gbpages_on);
86 * around without checking the pgd every time. 86 * around without checking the pgd every time.
87 */ 87 */
88 88
89void show_mem(void)
90{
91 long i, total = 0, reserved = 0;
92 long shared = 0, cached = 0;
93 struct page *page;
94 pg_data_t *pgdat;
95
96 printk(KERN_INFO "Mem-info:\n");
97 show_free_areas();
98 for_each_online_pgdat(pgdat) {
99 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
100 /*
101 * This loop can take a while with 256 GB and
102 * 4k pages so defer the NMI watchdog:
103 */
104 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
105 touch_nmi_watchdog();
106
107 if (!pfn_valid(pgdat->node_start_pfn + i))
108 continue;
109
110 page = pfn_to_page(pgdat->node_start_pfn + i);
111 total++;
112 if (PageReserved(page))
113 reserved++;
114 else if (PageSwapCache(page))
115 cached++;
116 else if (page_count(page))
117 shared += page_count(page) - 1;
118 }
119 }
120 printk(KERN_INFO "%lu pages of RAM\n", total);
121 printk(KERN_INFO "%lu reserved pages\n", reserved);
122 printk(KERN_INFO "%lu pages shared\n", shared);
123 printk(KERN_INFO "%lu pages swap cached\n", cached);
124}
125
126int after_bootmem; 89int after_bootmem;
127 90
128static __init void *spp_getpage(void) 91/*
92 * NOTE: This function is marked __ref because it calls __init function
93 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
94 */
95static __ref void *spp_getpage(void)
129{ 96{
130 void *ptr; 97 void *ptr;
131 98
@@ -274,7 +241,7 @@ static unsigned long __initdata table_start;
274static unsigned long __meminitdata table_end; 241static unsigned long __meminitdata table_end;
275static unsigned long __meminitdata table_top; 242static unsigned long __meminitdata table_top;
276 243
277static __meminit void *alloc_low_page(unsigned long *phys) 244static __ref void *alloc_low_page(unsigned long *phys)
278{ 245{
279 unsigned long pfn = table_end++; 246 unsigned long pfn = table_end++;
280 void *adr; 247 void *adr;
@@ -295,7 +262,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
295 return adr; 262 return adr;
296} 263}
297 264
298static __meminit void unmap_low_page(void *adr) 265static __ref void unmap_low_page(void *adr)
299{ 266{
300 if (after_bootmem) 267 if (after_bootmem)
301 return; 268 return;
@@ -351,6 +318,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
351{ 318{
352 unsigned long pages = 0; 319 unsigned long pages = 0;
353 unsigned long last_map_addr = end; 320 unsigned long last_map_addr = end;
321 unsigned long start = address;
354 322
355 int i = pmd_index(address); 323 int i = pmd_index(address);
356 324
@@ -368,16 +336,24 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
368 } 336 }
369 337
370 if (pmd_val(*pmd)) { 338 if (pmd_val(*pmd)) {
371 if (!pmd_large(*pmd)) 339 if (!pmd_large(*pmd)) {
340 spin_lock(&init_mm.page_table_lock);
372 last_map_addr = phys_pte_update(pmd, address, 341 last_map_addr = phys_pte_update(pmd, address,
373 end); 342 end);
343 spin_unlock(&init_mm.page_table_lock);
344 }
345 /* Count entries we're using from level2_ident_pgt */
346 if (start == 0)
347 pages++;
374 continue; 348 continue;
375 } 349 }
376 350
377 if (page_size_mask & (1<<PG_LEVEL_2M)) { 351 if (page_size_mask & (1<<PG_LEVEL_2M)) {
378 pages++; 352 pages++;
353 spin_lock(&init_mm.page_table_lock);
379 set_pte((pte_t *)pmd, 354 set_pte((pte_t *)pmd,
380 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 355 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
356 spin_unlock(&init_mm.page_table_lock);
381 last_map_addr = (address & PMD_MASK) + PMD_SIZE; 357 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
382 continue; 358 continue;
383 } 359 }
@@ -386,7 +362,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
386 last_map_addr = phys_pte_init(pte, address, end); 362 last_map_addr = phys_pte_init(pte, address, end);
387 unmap_low_page(pte); 363 unmap_low_page(pte);
388 364
365 spin_lock(&init_mm.page_table_lock);
389 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); 366 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
367 spin_unlock(&init_mm.page_table_lock);
390 } 368 }
391 update_page_count(PG_LEVEL_2M, pages); 369 update_page_count(PG_LEVEL_2M, pages);
392 return last_map_addr; 370 return last_map_addr;
@@ -399,9 +377,7 @@ phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
399 pmd_t *pmd = pmd_offset(pud, 0); 377 pmd_t *pmd = pmd_offset(pud, 0);
400 unsigned long last_map_addr; 378 unsigned long last_map_addr;
401 379
402 spin_lock(&init_mm.page_table_lock);
403 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); 380 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
404 spin_unlock(&init_mm.page_table_lock);
405 __flush_tlb_all(); 381 __flush_tlb_all();
406 return last_map_addr; 382 return last_map_addr;
407} 383}
@@ -437,20 +413,21 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
437 413
438 if (page_size_mask & (1<<PG_LEVEL_1G)) { 414 if (page_size_mask & (1<<PG_LEVEL_1G)) {
439 pages++; 415 pages++;
416 spin_lock(&init_mm.page_table_lock);
440 set_pte((pte_t *)pud, 417 set_pte((pte_t *)pud,
441 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 418 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
419 spin_unlock(&init_mm.page_table_lock);
442 last_map_addr = (addr & PUD_MASK) + PUD_SIZE; 420 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
443 continue; 421 continue;
444 } 422 }
445 423
446 pmd = alloc_low_page(&pmd_phys); 424 pmd = alloc_low_page(&pmd_phys);
447
448 spin_lock(&init_mm.page_table_lock);
449 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); 425 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
450 unmap_low_page(pmd); 426 unmap_low_page(pmd);
427
428 spin_lock(&init_mm.page_table_lock);
451 pud_populate(&init_mm, pud, __va(pmd_phys)); 429 pud_populate(&init_mm, pud, __va(pmd_phys));
452 spin_unlock(&init_mm.page_table_lock); 430 spin_unlock(&init_mm.page_table_lock);
453
454 } 431 }
455 __flush_tlb_all(); 432 __flush_tlb_all();
456 update_page_count(PG_LEVEL_1G, pages); 433 update_page_count(PG_LEVEL_1G, pages);
@@ -517,118 +494,6 @@ static void __init init_gbpages(void)
517 direct_gbpages = 0; 494 direct_gbpages = 0;
518} 495}
519 496
520#ifdef CONFIG_MEMTEST
521
522static void __init memtest(unsigned long start_phys, unsigned long size,
523 unsigned pattern)
524{
525 unsigned long i;
526 unsigned long *start;
527 unsigned long start_bad;
528 unsigned long last_bad;
529 unsigned long val;
530 unsigned long start_phys_aligned;
531 unsigned long count;
532 unsigned long incr;
533
534 switch (pattern) {
535 case 0:
536 val = 0UL;
537 break;
538 case 1:
539 val = -1UL;
540 break;
541 case 2:
542 val = 0x5555555555555555UL;
543 break;
544 case 3:
545 val = 0xaaaaaaaaaaaaaaaaUL;
546 break;
547 default:
548 return;
549 }
550
551 incr = sizeof(unsigned long);
552 start_phys_aligned = ALIGN(start_phys, incr);
553 count = (size - (start_phys_aligned - start_phys))/incr;
554 start = __va(start_phys_aligned);
555 start_bad = 0;
556 last_bad = 0;
557
558 for (i = 0; i < count; i++)
559 start[i] = val;
560 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
561 if (*start != val) {
562 if (start_phys_aligned == last_bad + incr) {
563 last_bad += incr;
564 } else {
565 if (start_bad) {
566 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
567 val, start_bad, last_bad + incr);
568 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
569 }
570 start_bad = last_bad = start_phys_aligned;
571 }
572 }
573 }
574 if (start_bad) {
575 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
576 val, start_bad, last_bad + incr);
577 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
578 }
579
580}
581
582/* default is disabled */
583static int memtest_pattern __initdata;
584
585static int __init parse_memtest(char *arg)
586{
587 if (arg)
588 memtest_pattern = simple_strtoul(arg, NULL, 0);
589 return 0;
590}
591
592early_param("memtest", parse_memtest);
593
594static void __init early_memtest(unsigned long start, unsigned long end)
595{
596 u64 t_start, t_size;
597 unsigned pattern;
598
599 if (!memtest_pattern)
600 return;
601
602 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
603 for (pattern = 0; pattern < memtest_pattern; pattern++) {
604 t_start = start;
605 t_size = 0;
606 while (t_start < end) {
607 t_start = find_e820_area_size(t_start, &t_size, 1);
608
609 /* done ? */
610 if (t_start >= end)
611 break;
612 if (t_start + t_size > end)
613 t_size = end - t_start;
614
615 printk(KERN_CONT "\n %016llx - %016llx pattern %d",
616 (unsigned long long)t_start,
617 (unsigned long long)t_start + t_size, pattern);
618
619 memtest(t_start, t_size, pattern);
620
621 t_start += t_size;
622 }
623 }
624 printk(KERN_CONT "\n");
625}
626#else
627static void __init early_memtest(unsigned long start, unsigned long end)
628{
629}
630#endif
631
632static unsigned long __init kernel_physical_mapping_init(unsigned long start, 497static unsigned long __init kernel_physical_mapping_init(unsigned long start,
633 unsigned long end, 498 unsigned long end,
634 unsigned long page_size_mask) 499 unsigned long page_size_mask)
@@ -654,16 +519,14 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
654 continue; 519 continue;
655 } 520 }
656 521
657 if (after_bootmem) 522 pud = alloc_low_page(&pud_phys);
658 pud = pud_offset(pgd, start & PGDIR_MASK);
659 else
660 pud = alloc_low_page(&pud_phys);
661
662 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), 523 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
663 page_size_mask); 524 page_size_mask);
664 unmap_low_page(pud); 525 unmap_low_page(pud);
665 pgd_populate(&init_mm, pgd_offset_k(start), 526
666 __va(pud_phys)); 527 spin_lock(&init_mm.page_table_lock);
528 pgd_populate(&init_mm, pgd, __va(pud_phys));
529 spin_unlock(&init_mm.page_table_lock);
667 } 530 }
668 531
669 return last_map_addr; 532 return last_map_addr;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 24c1d3c30186..d4b6e6a29ae3 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -170,7 +170,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
170 phys_addr &= PAGE_MASK; 170 phys_addr &= PAGE_MASK;
171 size = PAGE_ALIGN(last_addr+1) - phys_addr; 171 size = PAGE_ALIGN(last_addr+1) - phys_addr;
172 172
173 retval = reserve_memtype(phys_addr, phys_addr + size, 173 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
174 prot_val, &new_prot_val); 174 prot_val, &new_prot_val);
175 if (retval) { 175 if (retval) {
176 pr_debug("Warning: reserve_memtype returned %d\n", retval); 176 pr_debug("Warning: reserve_memtype returned %d\n", retval);
@@ -330,6 +330,14 @@ static void __iomem *ioremap_default(resource_size_t phys_addr,
330 return (void __iomem *)ret; 330 return (void __iomem *)ret;
331} 331}
332 332
333void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
334 unsigned long prot_val)
335{
336 return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
337 __builtin_return_address(0));
338}
339EXPORT_SYMBOL(ioremap_prot);
340
333/** 341/**
334 * iounmap - Free a IO remapping 342 * iounmap - Free a IO remapping
335 * @addr: virtual address from ioremap_* 343 * @addr: virtual address from ioremap_*
@@ -545,13 +553,11 @@ static int __init check_early_ioremap_leak(void)
545{ 553{
546 if (!early_ioremap_nested) 554 if (!early_ioremap_nested)
547 return 0; 555 return 0;
548 556 WARN(1, KERN_WARNING
549 printk(KERN_WARNING
550 "Debug warning: early ioremap leak of %d areas detected.\n", 557 "Debug warning: early ioremap leak of %d areas detected.\n",
551 early_ioremap_nested); 558 early_ioremap_nested);
552 printk(KERN_WARNING 559 printk(KERN_WARNING
553 "please boot with early_ioremap_debug and report the dmesg.\n"); 560 "please boot with early_ioremap_debug and report the dmesg.\n");
554 WARN_ON(1);
555 561
556 return 1; 562 return 1;
557} 563}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
new file mode 100644
index 000000000000..672e17f8262a
--- /dev/null
+++ b/arch/x86/mm/memtest.c
@@ -0,0 +1,123 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9
10#include <asm/e820.h>
11
12static void __init memtest(unsigned long start_phys, unsigned long size,
13 unsigned pattern)
14{
15 unsigned long i;
16 unsigned long *start;
17 unsigned long start_bad;
18 unsigned long last_bad;
19 unsigned long val;
20 unsigned long start_phys_aligned;
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48
49 incr = sizeof(unsigned long);
50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned);
53 start_bad = 0;
54 last_bad = 0;
55
56 for (i = 0; i < count; i++)
57 start[i] = val;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) {
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 } else {
63 if (start_bad) {
64 printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 }
71 }
72 if (start_bad) {
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
74 val, start_bad, last_bad + incr);
75 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
76 }
77
78}
79
80/* default is disabled */
81static int memtest_pattern __initdata;
82
83static int __init parse_memtest(char *arg)
84{
85 if (arg)
86 memtest_pattern = simple_strtoul(arg, NULL, 0);
87 return 0;
88}
89
90early_param("memtest", parse_memtest);
91
92void __init early_memtest(unsigned long start, unsigned long end)
93{
94 u64 t_start, t_size;
95 unsigned pattern;
96
97 if (!memtest_pattern)
98 return;
99
100 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
101 for (pattern = 0; pattern < memtest_pattern; pattern++) {
102 t_start = start;
103 t_size = 0;
104 while (t_start < end) {
105 t_start = find_e820_area_size(t_start, &t_size, 1);
106
107 /* done ? */
108 if (t_start >= end)
109 break;
110 if (t_start + t_size > end)
111 t_size = end - t_start;
112
113 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
114 (unsigned long long)t_start,
115 (unsigned long long)t_start + t_size, pattern);
116
117 memtest(t_start, t_size, pattern);
118
119 t_start += t_size;
120 }
121 }
122 printk(KERN_CONT "\n");
123}
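/*
 * Usage sketch: booting with "memtest=N" makes early_memtest() run N
 * passes over every free e820 range; pass 0 writes zeroes, pass 1
 * all-ones, passes 2 and 3 the 0x55.../0xaa... patterns, and any pass
 * beyond 3 falls through the default: case and does nothing. Words that
 * read back wrong are reserved early as "BAD RAM" so the allocator
 * never hands them out. For example, "memtest=4" exercises all four
 * patterns.
 */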
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index e7397e108beb..635b50e85581 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -430,7 +430,9 @@ static void enter_uniprocessor(void)
430 "may miss events.\n"); 430 "may miss events.\n");
431} 431}
432 432
433static void leave_uniprocessor(void) 433/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
434 but this whole function is ifdefed CONFIG_HOTPLUG_CPU */
435static void __ref leave_uniprocessor(void)
434{ 436{
435 int cpu; 437 int cpu;
436 int err; 438 int err;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index b432d5781773..a4dd793d6003 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,15 +20,9 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifndef Dprintk
24#define Dprintk(x...)
25#endif
26
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
29 25
30static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31
32struct memnode memnode; 26struct memnode memnode;
33 27
34s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
@@ -202,7 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
202 nodedata_phys + pgdat_size - 1); 196 nodedata_phys + pgdat_size - 1);
203 197
204 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 198 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
205 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 199 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
206 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 200 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
207 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; 201 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
208 202
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 0dcd42eb94e6..d4aa503caaa2 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -221,8 +221,7 @@ static int pageattr_test(void)
221 failed += print_split(&sc); 221 failed += print_split(&sc);
222 222
223 if (failed) { 223 if (failed) {
224 printk(KERN_ERR "NOT PASSED. Please report.\n"); 224 WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
225 WARN_ON(1);
226 return -EINVAL; 225 return -EINVAL;
227 } else { 226 } else {
228 if (print) 227 if (print)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 65c6e46bf059..43e2f8483e4f 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -55,13 +55,19 @@ static void split_page_count(int level)
55 55
56int arch_report_meminfo(char *page) 56int arch_report_meminfo(char *page)
57{ 57{
58 int n = sprintf(page, "DirectMap4k: %8lu\n" 58 int n = sprintf(page, "DirectMap4k: %8lu kB\n",
59 "DirectMap2M: %8lu\n", 59 direct_pages_count[PG_LEVEL_4K] << 2);
60 direct_pages_count[PG_LEVEL_4K], 60#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
61 direct_pages_count[PG_LEVEL_2M]); 61 n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
62 direct_pages_count[PG_LEVEL_2M] << 11);
63#else
64 n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
65 direct_pages_count[PG_LEVEL_2M] << 12);
66#endif
62#ifdef CONFIG_X86_64 67#ifdef CONFIG_X86_64
63 n += sprintf(page + n, "DirectMap1G: %8lu\n", 68 if (direct_gbpages)
64 direct_pages_count[PG_LEVEL_1G]); 69 n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
70 direct_pages_count[PG_LEVEL_1G] << 20);
65#endif 71#endif
66 return n; 72 return n;
67} 73}
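/*
 * The shifts above scale page counts into kB for the meminfo output:
 *   4 kB pages: n << 2   (n * 4 kB)
 *   2 MB pages: n << 11  (n * 2048 kB)
 *   4 MB pages: n << 12  (n * 4096 kB, 32-bit non-PAE)
 *   1 GB pages: n << 20  (n * 1048576 kB)
 */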
@@ -592,10 +598,9 @@ repeat:
592 if (!pte_val(old_pte)) { 598 if (!pte_val(old_pte)) {
593 if (!primary) 599 if (!primary)
594 return 0; 600 return 0;
595 printk(KERN_WARNING "CPA: called for zero pte. " 601 WARN(1, KERN_WARNING "CPA: called for zero pte. "
596 "vaddr = %lx cpa->vaddr = %lx\n", address, 602 "vaddr = %lx cpa->vaddr = %lx\n", address,
597 cpa->vaddr); 603 cpa->vaddr);
598 WARN_ON(1);
599 return -EINVAL; 604 return -EINVAL;
600 } 605 }
601 606
@@ -844,7 +849,7 @@ int set_memory_uc(unsigned long addr, int numpages)
844 /* 849 /*
845 * for now UC MINUS. see comments in ioremap_nocache() 850 * for now UC MINUS. see comments in ioremap_nocache()
846 */ 851 */
847 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 852 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
848 _PAGE_CACHE_UC_MINUS, NULL)) 853 _PAGE_CACHE_UC_MINUS, NULL))
849 return -EINVAL; 854 return -EINVAL;
850 855
@@ -863,7 +868,7 @@ int set_memory_wc(unsigned long addr, int numpages)
863 if (!pat_enabled) 868 if (!pat_enabled)
864 return set_memory_uc(addr, numpages); 869 return set_memory_uc(addr, numpages);
865 870
866 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 871 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
867 _PAGE_CACHE_WC, NULL)) 872 _PAGE_CACHE_WC, NULL))
868 return -EINVAL; 873 return -EINVAL;
869 874
@@ -879,7 +884,7 @@ int _set_memory_wb(unsigned long addr, int numpages)
879 884
880int set_memory_wb(unsigned long addr, int numpages) 885int set_memory_wb(unsigned long addr, int numpages)
881{ 886{
882 free_memtype(addr, addr + numpages * PAGE_SIZE); 887 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
883 888
884 return _set_memory_wb(addr, numpages); 889 return _set_memory_wb(addr, numpages);
885} 890}
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index d4585077977a..2a50e0fa64a5 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,6 +12,8 @@
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
15 17
16#include <asm/msr.h> 18#include <asm/msr.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
@@ -205,6 +207,9 @@ static int chk_conflict(struct memtype *new, struct memtype *entry,
205 return -EBUSY; 207 return -EBUSY;
206} 208}
207 209
210static struct memtype *cached_entry;
211static u64 cached_start;
212
208/* 213/*
209 * req_type typically has one of the: 214 * req_type typically has one of the:
210 * - _PAGE_CACHE_WB 215 * - _PAGE_CACHE_WB
@@ -278,11 +283,17 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
278 283
279 spin_lock(&memtype_lock); 284 spin_lock(&memtype_lock);
280 285
286 if (cached_entry && start >= cached_start)
287 entry = cached_entry;
288 else
289 entry = list_entry(&memtype_list, struct memtype, nd);
290
281 /* Search for existing mapping that overlaps the current range */ 291 /* Search for existing mapping that overlaps the current range */
282 where = NULL; 292 where = NULL;
283 list_for_each_entry(entry, &memtype_list, nd) { 293 list_for_each_entry_continue(entry, &memtype_list, nd) {
284 if (end <= entry->start) { 294 if (end <= entry->start) {
285 where = entry->nd.prev; 295 where = entry->nd.prev;
296 cached_entry = list_entry(where, struct memtype, nd);
286 break; 297 break;
287 } else if (start <= entry->start) { /* end > entry->start */ 298 } else if (start <= entry->start) { /* end > entry->start */
288 err = chk_conflict(new, entry, new_type); 299 err = chk_conflict(new, entry, new_type);
@@ -290,6 +301,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
290 dprintk("Overlap at 0x%Lx-0x%Lx\n", 301 dprintk("Overlap at 0x%Lx-0x%Lx\n",
291 entry->start, entry->end); 302 entry->start, entry->end);
292 where = entry->nd.prev; 303 where = entry->nd.prev;
304 cached_entry = list_entry(where,
305 struct memtype, nd);
293 } 306 }
294 break; 307 break;
295 } else if (start < entry->end) { /* start > entry->start */ 308 } else if (start < entry->end) { /* start > entry->start */
@@ -297,7 +310,20 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
297 if (!err) { 310 if (!err) {
298 dprintk("Overlap at 0x%Lx-0x%Lx\n", 311 dprintk("Overlap at 0x%Lx-0x%Lx\n",
299 entry->start, entry->end); 312 entry->start, entry->end);
300 where = &entry->nd; 313 cached_entry = list_entry(entry->nd.prev,
314 struct memtype, nd);
315
316 /*
317 * Move to the right position in the linked
318 * list to add this new entry
319 */
320 list_for_each_entry_continue(entry,
321 &memtype_list, nd) {
322 if (start <= entry->start) {
323 where = entry->nd.prev;
324 break;
325 }
326 }
301 } 327 }
302 break; 328 break;
303 } 329 }
@@ -312,6 +338,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
312 return err; 338 return err;
313 } 339 }
314 340
341 cached_start = start;
342
315 if (where) 343 if (where)
316 list_add(&new->nd, where); 344 list_add(&new->nd, where);
317 else 345 else
@@ -341,6 +369,9 @@ int free_memtype(u64 start, u64 end)
341 spin_lock(&memtype_lock); 369 spin_lock(&memtype_lock);
342 list_for_each_entry(entry, &memtype_list, nd) { 370 list_for_each_entry(entry, &memtype_list, nd) {
343 if (entry->start == start && entry->end == end) { 371 if (entry->start == start && entry->end == end) {
372 if (cached_entry == entry || cached_start == start)
373 cached_entry = NULL;
374
344 list_del(&entry->nd); 375 list_del(&entry->nd);
345 kfree(entry); 376 kfree(entry);
346 err = 0; 377 err = 0;
@@ -359,22 +390,14 @@ int free_memtype(u64 start, u64 end)
359} 390}
360 391
361 392
362/*
363 * /dev/mem mmap interface. The memtype used for mapping varies:
364 * - Use UC for mappings with O_SYNC flag
365 * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
366 * inherit the memtype from existing mapping.
367 * - Else use UC_MINUS memtype (for backward compatibility with existing
368 * X drivers.
369 */
370pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 393pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
371 unsigned long size, pgprot_t vma_prot) 394 unsigned long size, pgprot_t vma_prot)
372{ 395{
373 return vma_prot; 396 return vma_prot;
374} 397}
375 398
376#ifdef CONFIG_NONPROMISC_DEVMEM 399#ifdef CONFIG_STRICT_DEVMEM
377/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ 400/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
378static inline int range_is_allowed(unsigned long pfn, unsigned long size) 401static inline int range_is_allowed(unsigned long pfn, unsigned long size)
379{ 402{
380 return 1; 403 return 1;
@@ -398,20 +421,20 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
398 } 421 }
399 return 1; 422 return 1;
400} 423}
401#endif /* CONFIG_NONPROMISC_DEVMEM */ 424#endif /* CONFIG_STRICT_DEVMEM */
402 425
403int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 426int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
404 unsigned long size, pgprot_t *vma_prot) 427 unsigned long size, pgprot_t *vma_prot)
405{ 428{
406 u64 offset = ((u64) pfn) << PAGE_SHIFT; 429 u64 offset = ((u64) pfn) << PAGE_SHIFT;
407 unsigned long flags = _PAGE_CACHE_UC_MINUS; 430 unsigned long flags = -1;
408 int retval; 431 int retval;
409 432
410 if (!range_is_allowed(pfn, size)) 433 if (!range_is_allowed(pfn, size))
411 return 0; 434 return 0;
412 435
413 if (file->f_flags & O_SYNC) { 436 if (file->f_flags & O_SYNC) {
414 flags = _PAGE_CACHE_UC; 437 flags = _PAGE_CACHE_UC_MINUS;
415 } 438 }
416 439
417#ifdef CONFIG_X86_32 440#ifdef CONFIG_X86_32
@@ -434,13 +457,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
434#endif 457#endif
435 458
436 /* 459 /*
437 * With O_SYNC, we can only take UC mapping. Fail if we cannot. 460 * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
461 *
438 * Without O_SYNC, we want to get 462 * Without O_SYNC, we want to get
439 * - WB for WB-able memory and no other conflicting mappings 463 * - WB for WB-able memory and no other conflicting mappings
440 * - UC_MINUS for non-WB-able memory with no other conflicting mappings 464 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
441 * - Inherit from conflicting mappings otherwise 465 * - Inherit from conflicting mappings otherwise
442 */ 466 */
443 if (flags != _PAGE_CACHE_UC_MINUS) { 467 if (flags != -1) {
444 retval = reserve_memtype(offset, offset + size, flags, NULL); 468 retval = reserve_memtype(offset, offset + size, flags, NULL);
445 } else { 469 } else {
446 retval = reserve_memtype(offset, offset + size, -1, &flags); 470 retval = reserve_memtype(offset, offset + size, -1, &flags);
@@ -489,3 +513,89 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
489 513
490 free_memtype(addr, addr + size); 514 free_memtype(addr, addr + size);
491} 515}
516
517#if defined(CONFIG_DEBUG_FS)
518
519/* get Nth element of the linked list */
520static struct memtype *memtype_get_idx(loff_t pos)
521{
522 struct memtype *list_node, *print_entry;
523 int i = 1;
524
525 print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
526 if (!print_entry)
527 return NULL;
528
529 spin_lock(&memtype_lock);
530 list_for_each_entry(list_node, &memtype_list, nd) {
531 if (pos == i) {
532 *print_entry = *list_node;
533 spin_unlock(&memtype_lock);
534 return print_entry;
535 }
536 ++i;
537 }
538 spin_unlock(&memtype_lock);
539 kfree(print_entry);
540 return NULL;
541}
542
543static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
544{
545 if (*pos == 0) {
546 ++*pos;
547 seq_printf(seq, "PAT memtype list:\n");
548 }
549
550 return memtype_get_idx(*pos);
551}
552
553static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
554{
555 ++*pos;
556 return memtype_get_idx(*pos);
557}
558
559static void memtype_seq_stop(struct seq_file *seq, void *v)
560{
561}
562
563static int memtype_seq_show(struct seq_file *seq, void *v)
564{
565 struct memtype *print_entry = (struct memtype *)v;
566
567 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
568 print_entry->start, print_entry->end);
569 kfree(print_entry);
570 return 0;
571}
572
573static struct seq_operations memtype_seq_ops = {
574 .start = memtype_seq_start,
575 .next = memtype_seq_next,
576 .stop = memtype_seq_stop,
577 .show = memtype_seq_show,
578};
579
580static int memtype_seq_open(struct inode *inode, struct file *file)
581{
582 return seq_open(file, &memtype_seq_ops);
583}
584
585static const struct file_operations memtype_fops = {
586 .open = memtype_seq_open,
587 .read = seq_read,
588 .llseek = seq_lseek,
589 .release = seq_release,
590};
591
592static int __init pat_memtype_list_init(void)
593{
594 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
595 NULL, &memtype_fops);
596 return 0;
597}
598
599late_initcall(pat_memtype_list_init);
600
601#endif /* CONFIG_DEBUG_FS */
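/*
 * With CONFIG_DEBUG_FS the file created above dumps the live PAT
 * reservations; illustrative output (the addresses are made up):
 *
 *   # cat /sys/kernel/debug/x86/pat_memtype_list
 *   PAT memtype list:
 *   uncached-minus @ 0xd0000000-0xd0100000
 *   write-back @ 0xfed00000-0xfed01000
 *
 * memtype_get_idx() copies each entry under memtype_lock, so the lock
 * is never held across the seq_file output.
 */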
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 557b2abceef8..d50302774fe2 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -207,6 +207,9 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
207 unsigned long addr; 207 unsigned long addr;
208 int i; 208 int i;
209 209
210 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
211 return;
212
210 pud = pud_offset(pgd, 0); 213 pud = pud_offset(pgd, 0);
211 214
212 for (addr = i = 0; i < PREALLOCATED_PMDS; 215 for (addr = i = 0; i < PREALLOCATED_PMDS;
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b4becbf8c570..cab0abbd1ebe 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,53 +20,6 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23void show_mem(void)
24{
25 int total = 0, reserved = 0;
26 int shared = 0, cached = 0;
27 int highmem = 0;
28 struct page *page;
29 pg_data_t *pgdat;
30 unsigned long i;
31 unsigned long flags;
32
33 printk(KERN_INFO "Mem-info:\n");
34 show_free_areas();
35 for_each_online_pgdat(pgdat) {
36 pgdat_resize_lock(pgdat, &flags);
37 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
38 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
39 touch_nmi_watchdog();
40 page = pgdat_page_nr(pgdat, i);
41 total++;
42 if (PageHighMem(page))
43 highmem++;
44 if (PageReserved(page))
45 reserved++;
46 else if (PageSwapCache(page))
47 cached++;
48 else if (page_count(page))
49 shared += page_count(page) - 1;
50 }
51 pgdat_resize_unlock(pgdat, &flags);
52 }
53 printk(KERN_INFO "%d pages of RAM\n", total);
54 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
55 printk(KERN_INFO "%d reserved pages\n", reserved);
56 printk(KERN_INFO "%d pages shared\n", shared);
57 printk(KERN_INFO "%d pages swap cached\n", cached);
58
59 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
60 printk(KERN_INFO "%lu pages writeback\n",
61 global_page_state(NR_WRITEBACK));
62 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
63 printk(KERN_INFO "%lu pages slab\n",
64 global_page_state(NR_SLAB_RECLAIMABLE) +
65 global_page_state(NR_SLAB_UNRECLAIMABLE));
66 printk(KERN_INFO "%lu pages pagetables\n",
67 global_page_state(NR_PAGETABLE));
68}
69
70/* 23/*
71 * Associate a virtual page frame with a given physical page frame 24 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 25 * and protection flags for that frame.
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 1eb2973a301c..16ae70fc57e7 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -178,7 +178,7 @@ void acpi_numa_arch_fixup(void)
178 * start of the node, and that the current "end" address is after 178 * start of the node, and that the current "end" address is after
179 * the previous one. 179 * the previous one.
180 */ 180 */
181static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) 181static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
182{ 182{
183 /* 183 /*
184 * Only add present memory as told by the e820. 184 * Only add present memory as told by the e820.
@@ -189,10 +189,10 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
189 if (memory_chunk->start_pfn >= max_pfn) { 189 if (memory_chunk->start_pfn >= max_pfn) {
190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", 190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
191 memory_chunk->start_pfn, memory_chunk->end_pfn); 191 memory_chunk->start_pfn, memory_chunk->end_pfn);
192 return; 192 return -1;
193 } 193 }
194 if (memory_chunk->nid != nid) 194 if (memory_chunk->nid != nid)
195 return; 195 return -1;
196 196
197 if (!node_has_online_mem(nid)) 197 if (!node_has_online_mem(nid))
198 node_start_pfn[nid] = memory_chunk->start_pfn; 198 node_start_pfn[nid] = memory_chunk->start_pfn;
@@ -202,6 +202,8 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
202 202
203 if (node_end_pfn[nid] < memory_chunk->end_pfn) 203 if (node_end_pfn[nid] < memory_chunk->end_pfn)
204 node_end_pfn[nid] = memory_chunk->end_pfn; 204 node_end_pfn[nid] = memory_chunk->end_pfn;
205
206 return 0;
205} 207}
206 208
207int __init get_memcfg_from_srat(void) 209int __init get_memcfg_from_srat(void)
@@ -259,7 +261,9 @@ int __init get_memcfg_from_srat(void)
259 printk(KERN_DEBUG 261 printk(KERN_DEBUG
260 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", 262 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
261 j, chunk->nid, chunk->start_pfn, chunk->end_pfn); 263 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
262 node_read_chunk(chunk->nid, chunk); 264 if (node_read_chunk(chunk->nid, chunk))
265 continue;
266
263 e820_register_active_regions(chunk->nid, chunk->start_pfn, 267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
264 min(chunk->end_pfn, max_pfn)); 268 min(chunk->end_pfn, max_pfn));
265 } 269 }
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 7f3329b55d2e..8a5f1614a3d5 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/moduleparam.h> 16#include <linux/moduleparam.h>
17#include <linux/kdebug.h> 17#include <linux/kdebug.h>
18#include <linux/cpu.h>
18#include <asm/nmi.h> 19#include <asm/nmi.h>
19#include <asm/msr.h> 20#include <asm/msr.h>
20#include <asm/apic.h> 21#include <asm/apic.h>
@@ -28,23 +29,48 @@ static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
28 29
29static int nmi_start(void); 30static int nmi_start(void);
30static void nmi_stop(void); 31static void nmi_stop(void);
32static void nmi_cpu_start(void *dummy);
33static void nmi_cpu_stop(void *dummy);
31 34
32/* 0 == registered but off, 1 == registered and on */ 35/* 0 == registered but off, 1 == registered and on */
33static int nmi_enabled = 0; 36static int nmi_enabled = 0;
34 37
38#ifdef CONFIG_SMP
39static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
40 void *data)
41{
42 int cpu = (unsigned long)data;
43 switch (action) {
44 case CPU_DOWN_FAILED:
45 case CPU_ONLINE:
46 smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
47 break;
48 case CPU_DOWN_PREPARE:
49 smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
50 break;
51 }
52 return NOTIFY_DONE;
53}
54
55static struct notifier_block oprofile_cpu_nb = {
56 .notifier_call = oprofile_cpu_notifier
57};
58#endif
59
35#ifdef CONFIG_PM 60#ifdef CONFIG_PM
36 61
37static int nmi_suspend(struct sys_device *dev, pm_message_t state) 62static int nmi_suspend(struct sys_device *dev, pm_message_t state)
38{ 63{
64 /* Only one CPU left, just stop that one */
39 if (nmi_enabled == 1) 65 if (nmi_enabled == 1)
40 nmi_stop(); 66 nmi_cpu_stop(NULL);
41 return 0; 67 return 0;
42} 68}
43 69
44static int nmi_resume(struct sys_device *dev) 70static int nmi_resume(struct sys_device *dev)
45{ 71{
46 if (nmi_enabled == 1) 72 if (nmi_enabled == 1)
47 nmi_start(); 73 nmi_cpu_start(NULL);
48 return 0; 74 return 0;
49} 75}
50 76
@@ -269,10 +295,12 @@ static void nmi_cpu_shutdown(void *dummy)
269 295
270static void nmi_shutdown(void) 296static void nmi_shutdown(void)
271{ 297{
272 struct op_msrs *msrs = &get_cpu_var(cpu_msrs); 298 struct op_msrs *msrs;
299
273 nmi_enabled = 0; 300 nmi_enabled = 0;
274 on_each_cpu(nmi_cpu_shutdown, NULL, 1); 301 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
275 unregister_die_notifier(&profile_exceptions_nb); 302 unregister_die_notifier(&profile_exceptions_nb);
303 msrs = &get_cpu_var(cpu_msrs);
276 model->shutdown(msrs); 304 model->shutdown(msrs);
277 free_msrs(); 305 free_msrs();
278 put_cpu_var(cpu_msrs); 306 put_cpu_var(cpu_msrs);
@@ -369,20 +397,34 @@ static int __init ppro_init(char **cpu_type)
369{ 397{
370 __u8 cpu_model = boot_cpu_data.x86_model; 398 __u8 cpu_model = boot_cpu_data.x86_model;
371 399
372 if (cpu_model == 14) 400 switch (cpu_model) {
401 case 0 ... 2:
402 *cpu_type = "i386/ppro";
403 break;
404 case 3 ... 5:
405 *cpu_type = "i386/pii";
406 break;
407 case 6 ... 8:
408 *cpu_type = "i386/piii";
409 break;
410 case 9:
411 *cpu_type = "i386/p6_mobile";
412 break;
413 case 10 ... 13:
414 *cpu_type = "i386/p6";
415 break;
416 case 14:
373 *cpu_type = "i386/core"; 417 *cpu_type = "i386/core";
374 else if (cpu_model == 15 || cpu_model == 23) 418 break;
419 case 15: case 23:
375 *cpu_type = "i386/core_2"; 420 *cpu_type = "i386/core_2";
376 else if (cpu_model > 0xd) 421 break;
422 case 26:
423 *cpu_type = "i386/core_2";
424 break;
425 default:
426 /* Unknown */
377 return 0; 427 return 0;
378 else if (cpu_model == 9) {
379 *cpu_type = "i386/p6_mobile";
380 } else if (cpu_model > 5) {
381 *cpu_type = "i386/piii";
382 } else if (cpu_model > 2) {
383 *cpu_type = "i386/pii";
384 } else {
385 *cpu_type = "i386/ppro";
386 } 428 }
387 429
388 model = &op_ppro_spec; 430 model = &op_ppro_spec;
@@ -449,6 +491,9 @@ int __init op_nmi_init(struct oprofile_operations *ops)
449 } 491 }
450 492
451 init_sysfs(); 493 init_sysfs();
494#ifdef CONFIG_SMP
495 register_cpu_notifier(&oprofile_cpu_nb);
496#endif
452 using_nmi = 1; 497 using_nmi = 1;
453 ops->create_files = nmi_create_files; 498 ops->create_files = nmi_create_files;
454 ops->setup = nmi_setup; 499 ops->setup = nmi_setup;
@@ -462,6 +507,10 @@ int __init op_nmi_init(struct oprofile_operations *ops)
462 507
463void op_nmi_exit(void) 508void op_nmi_exit(void)
464{ 509{
465 if (using_nmi) 510 if (using_nmi) {
466 exit_sysfs(); 511 exit_sysfs();
512#ifdef CONFIG_SMP
513 unregister_cpu_notifier(&oprofile_cpu_nb);
514#endif
515 }
467} 516}
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index e515e8db842a..d49202e740ea 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -5,13 +5,13 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o 5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o 6obj-$(CONFIG_PCI_OLPC) += olpc.o
7 7
8pci-y := fixup.o 8obj-y += fixup.o
9pci-$(CONFIG_ACPI) += acpi.o 9obj-$(CONFIG_ACPI) += acpi.o
10pci-y += legacy.o irq.o 10obj-y += legacy.o irq.o
11 11
12pci-$(CONFIG_X86_VISWS) += visws.o 12obj-$(CONFIG_X86_VISWS) += visws.o
13 13
14pci-$(CONFIG_X86_NUMAQ) += numa.o 14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15 15
16obj-y += $(pci-y) common.o early.o 16obj-y += common.o early.o
17obj-y += amd_bus.o 17obj-y += amd_bus.o
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index dbf532369711..6a0fca78c362 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -1,6 +1,7 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/topology.h> 3#include <linux/topology.h>
4#include <linux/cpu.h>
4#include "pci.h" 5#include "pci.h"
5 6
6#ifdef CONFIG_X86_64 7#ifdef CONFIG_X86_64
@@ -555,15 +556,17 @@ static int __init early_fill_mp_bus_info(void)
555 return 0; 556 return 0;
556} 557}
557 558
558postcore_initcall(early_fill_mp_bus_info); 559#else /* !CONFIG_X86_64 */
559 560
560#endif 561static int __init early_fill_mp_bus_info(void) { return 0; }
562
563#endif /* !CONFIG_X86_64 */
561 564
562/* common 32/64 bit code */ 565/* common 32/64 bit code */
563 566
564#define ENABLE_CF8_EXT_CFG (1ULL << 46) 567#define ENABLE_CF8_EXT_CFG (1ULL << 46)
565 568
566static void enable_pci_io_ecs_per_cpu(void *unused) 569static void enable_pci_io_ecs(void *unused)
567{ 570{
568 u64 reg; 571 u64 reg;
569 rdmsrl(MSR_AMD64_NB_CFG, reg); 572 rdmsrl(MSR_AMD64_NB_CFG, reg);
@@ -573,14 +576,51 @@ static void enable_pci_io_ecs_per_cpu(void *unused)
573 } 576 }
574} 577}
575 578
576static int __init enable_pci_io_ecs(void) 579static int __cpuinit amd_cpu_notify(struct notifier_block *self,
580 unsigned long action, void *hcpu)
577{ 581{
582 int cpu = (long)hcpu;
583 switch(action) {
584 case CPU_ONLINE:
585 case CPU_ONLINE_FROZEN:
586 smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0);
587 break;
588 default:
589 break;
590 }
591 return NOTIFY_OK;
592}
593
594static struct notifier_block __cpuinitdata amd_cpu_notifier = {
595 .notifier_call = amd_cpu_notify,
596};
597
598static int __init pci_io_ecs_init(void)
599{
600 int cpu;
601
578 /* assume all cpus from fam10h have IO ECS */ 602 /* assume all cpus from fam10h have IO ECS */
579 if (boot_cpu_data.x86 < 0x10) 603 if (boot_cpu_data.x86 < 0x10)
580 return 0; 604 return 0;
581 on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1); 605
606 register_cpu_notifier(&amd_cpu_notifier);
607 for_each_online_cpu(cpu)
608 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
609 (void *)(long)cpu);
582 pci_probe |= PCI_HAS_IO_ECS; 610 pci_probe |= PCI_HAS_IO_ECS;
611
612 return 0;
613}
614
615static int __init amd_postcore_init(void)
616{
617 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
618 return 0;
619
620 early_fill_mp_bus_info();
621 pci_io_ecs_init();
622
583 return 0; 623 return 0;
584} 624}
585 625
586postcore_initcall(enable_pci_io_ecs); 626postcore_initcall(amd_postcore_init);
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index 858dbe3399f9..86631ccbc25a 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -7,15 +7,13 @@
7/* Direct PCI access. This is used for PCI accesses in early boot before 7/* Direct PCI access. This is used for PCI accesses in early boot before
8 the PCI subsystem works. */ 8 the PCI subsystem works. */
9 9
10#define PDprintk(x...)
11
12u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) 10u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
13{ 11{
14 u32 v; 12 u32 v;
15 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
16 v = inl(0xcfc); 14 v = inl(0xcfc);
17 if (v != 0xffffffff) 15 if (v != 0xffffffff)
18 PDprintk("%x reading 4 from %x: %x\n", slot, offset, v); 16 pr_debug("%x reading 4 from %x: %x\n", slot, offset, v);
19 return v; 17 return v;
20} 18}
21 19
@@ -24,7 +22,7 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
24 u8 v; 22 u8 v;
25 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 23 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
26 v = inb(0xcfc + (offset&3)); 24 v = inb(0xcfc + (offset&3));
27 PDprintk("%x reading 1 from %x: %x\n", slot, offset, v); 25 pr_debug("%x reading 1 from %x: %x\n", slot, offset, v);
28 return v; 26 return v;
29} 27}
30 28
@@ -33,28 +31,28 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
33 u16 v; 31 u16 v;
34 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 32 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
35 v = inw(0xcfc + (offset&2)); 33 v = inw(0xcfc + (offset&2));
36 PDprintk("%x reading 2 from %x: %x\n", slot, offset, v); 34 pr_debug("%x reading 2 from %x: %x\n", slot, offset, v);
37 return v; 35 return v;
38} 36}
39 37
40void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, 38void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
41 u32 val) 39 u32 val)
42{ 40{
43 PDprintk("%x writing to %x: %x\n", slot, offset, val); 41 pr_debug("%x writing to %x: %x\n", slot, offset, val);
44 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 42 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
45 outl(val, 0xcfc); 43 outl(val, 0xcfc);
46} 44}
47 45
48void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) 46void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
49{ 47{
50 PDprintk("%x writing to %x: %x\n", slot, offset, val); 48 pr_debug("%x writing to %x: %x\n", slot, offset, val);
51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
52 outb(val, 0xcfc + (offset&3)); 50 outb(val, 0xcfc + (offset&3));
53} 51}
54 52
55void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) 53void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
56{ 54{
57 PDprintk("%x writing to %x: %x\n", slot, offset, val); 55 pr_debug("%x writing to %x: %x\n", slot, offset, val);
58 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 56 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
59 outw(val, 0xcfc + (offset&2)); 57 outw(val, 0xcfc + (offset&2));
60} 58}
@@ -71,7 +69,7 @@ void early_dump_pci_device(u8 bus, u8 slot, u8 func)
71 int j; 69 int j;
72 u32 val; 70 u32 val;
73 71
74 printk("PCI: %02x:%02x:%02x", bus, slot, func); 72 printk(KERN_INFO "PCI: %02x:%02x:%02x", bus, slot, func);
75 73
76 for (i = 0; i < 256; i += 4) { 74 for (i = 0; i < 256; i += 4) {
77 if (!(i & 0x0f)) 75 if (!(i & 0x0f))
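These helpers all use the legacy type-1 ("mechanism #1") configuration access: a 32-bit address written to port 0xcf8 selects bus/device/function/register, and port 0xcfc carries the data (byte and word accesses add offset&3 or offset&2 to the data port). A sketch of the address encoding; the offset mask is added here for clarity, while the helpers above pass the raw offset and assume alignment:

#include <stdint.h>

static inline uint32_t pci_conf1_addr(uint8_t bus, uint8_t slot,
				      uint8_t func, uint8_t off)
{
	return 0x80000000u		/* bit 31: enable config cycle */
	     | ((uint32_t)bus  << 16)	/* bus      0..255 */
	     | ((uint32_t)slot << 11)	/* device   0..31  */
	     | ((uint32_t)func << 8)	/* function 0..7   */
	     | (off & 0xfc);		/* dword-aligned register */
}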
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index ff3a6a336342..4bdaa590375d 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -23,7 +23,8 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
23 pci_read_config_byte(d, reg++, &busno); 23 pci_read_config_byte(d, reg++, &busno);
24 pci_read_config_byte(d, reg++, &suba); 24 pci_read_config_byte(d, reg++, &suba);
25 pci_read_config_byte(d, reg++, &subb); 25 pci_read_config_byte(d, reg++, &subb);
26 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); 26 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno,
27 suba, subb);
27 if (busno) 28 if (busno)
28 pci_scan_bus_with_sysdata(busno); /* Bus A */ 29 pci_scan_bus_with_sysdata(busno); /* Bus A */
29 if (suba < subb) 30 if (suba < subb)
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 2aafb67dc5f1..8791fc55e715 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -128,10 +128,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
128 pr = pci_find_parent_resource(dev, r); 128 pr = pci_find_parent_resource(dev, r);
129 if (!r->start || !pr || 129 if (!r->start || !pr ||
130 request_resource(pr, r) < 0) { 130 request_resource(pr, r) < 0) {
131 printk(KERN_ERR "PCI: Cannot allocate " 131 dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
132 "resource region %d "
133 "of bridge %s\n",
134 idx, pci_name(dev));
135 /* 132 /*
136 * Something is wrong with the region. 133 * Something is wrong with the region.
137 * Invalidate the resource to prevent 134 * Invalidate the resource to prevent
@@ -166,15 +163,13 @@ static void __init pcibios_allocate_resources(int pass)
166 else 163 else
167 disabled = !(command & PCI_COMMAND_MEMORY); 164 disabled = !(command & PCI_COMMAND_MEMORY);
168 if (pass == disabled) { 165 if (pass == disabled) {
169 DBG("PCI: Resource %08lx-%08lx " 166 dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n",
170 "(f=%lx, d=%d, p=%d)\n", 167 (unsigned long long) r->start,
171 r->start, r->end, r->flags, disabled, pass); 168 (unsigned long long) r->end,
169 r->flags, disabled, pass);
172 pr = pci_find_parent_resource(dev, r); 170 pr = pci_find_parent_resource(dev, r);
173 if (!pr || request_resource(pr, r) < 0) { 171 if (!pr || request_resource(pr, r) < 0) {
174 printk(KERN_ERR "PCI: Cannot allocate " 172 dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
175 "resource region %d "
176 "of device %s\n",
177 idx, pci_name(dev));
178 /* We'll assign a new address later */ 173 /* We'll assign a new address later */
179 r->end -= r->start; 174 r->end -= r->start;
180 r->start = 0; 175 r->start = 0;
@@ -187,8 +182,7 @@ static void __init pcibios_allocate_resources(int pass)
187 /* Turn the ROM off, leave the resource region, 182 /* Turn the ROM off, leave the resource region,
188 * but keep it unregistered. */ 183 * but keep it unregistered. */
189 u32 reg; 184 u32 reg;
190 DBG("PCI: Switching off ROM of %s\n", 185 dev_dbg(&dev->dev, "disabling ROM\n");
191 pci_name(dev));
192 r->flags &= ~IORESOURCE_ROM_ENABLE; 186 r->flags &= ~IORESOURCE_ROM_ENABLE;
193 pci_read_config_dword(dev, 187 pci_read_config_dword(dev,
194 dev->rom_base_reg, &reg); 188 dev->rom_base_reg, &reg);
@@ -257,8 +251,7 @@ void pcibios_set_master(struct pci_dev *dev)
257 lat = pcibios_max_latency; 251 lat = pcibios_max_latency;
258 else 252 else
259 return; 253 return;
260 printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n", 254 dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
261 pci_name(dev), lat);
262 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); 255 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
263} 256}
264 257
@@ -280,6 +273,7 @@ static void pci_track_mmap_page_range(struct vm_area_struct *vma)
280static struct vm_operations_struct pci_mmap_ops = { 273static struct vm_operations_struct pci_mmap_ops = {
281 .open = pci_track_mmap_page_range, 274 .open = pci_track_mmap_page_range,
282 .close = pci_unmap_page_range, 275 .close = pci_unmap_page_range,
276 .access = generic_access_phys,
283}; 277};
284 278
285int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, 279int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
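The conversions in i386.c (and in irq.c below) follow one rule: any message about a specific device moves from printk() plus pci_name() to the dev_* helpers from <linux/device.h>, which prepend a consistent driver/device prefix. A before/after sketch; the message text is illustrative:

#include <linux/device.h>
#include <linux/pci.h>

static void report(struct pci_dev *dev, int idx, int lat)
{
	/* before: printk(KERN_ERR "PCI: ... region %d of device %s\n",
	 *                idx, pci_name(dev)); */
	dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);

	/* dev_printk() covers levels without a dedicated wrapper */
	dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
}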
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 6a06a2eb0597..8e077185e185 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -436,7 +436,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
436{ 436{
437 WARN_ON_ONCE(pirq >= 9); 437 WARN_ON_ONCE(pirq >= 9);
438 if (pirq > 8) { 438 if (pirq > 8) {
439 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 439 dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
440 return 0; 440 return 0;
441 } 441 }
442 return read_config_nybble(router, 0x74, pirq-1); 442 return read_config_nybble(router, 0x74, pirq-1);
@@ -446,7 +446,7 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
446{ 446{
447 WARN_ON_ONCE(pirq >= 9); 447 WARN_ON_ONCE(pirq >= 9);
448 if (pirq > 8) { 448 if (pirq > 8) {
449 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 449 dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
450 return 0; 450 return 0;
451 } 451 }
452 write_config_nybble(router, 0x74, pirq-1, irq); 452 write_config_nybble(router, 0x74, pirq-1, irq);
@@ -492,15 +492,17 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
492 irq = 0; 492 irq = 0;
493 if (pirq <= 4) 493 if (pirq <= 4)
494 irq = read_config_nybble(router, 0x56, pirq - 1); 494 irq = read_config_nybble(router, 0x56, pirq - 1);
495 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", 495 dev_info(&dev->dev,
496 dev->vendor, dev->device, pirq, irq); 496 "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
497 dev->vendor, dev->device, pirq, irq);
497 return irq; 498 return irq;
498} 499}
499 500
500static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 501static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
501{ 502{
502 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", 503 dev_info(&dev->dev,
503 dev->vendor, dev->device, pirq, irq); 504 "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
505 dev->vendor, dev->device, pirq, irq);
504 if (pirq <= 4) 506 if (pirq <= 4)
505 write_config_nybble(router, 0x56, pirq - 1, irq); 507 write_config_nybble(router, 0x56, pirq - 1, irq);
506 return 1; 508 return 1;
@@ -588,6 +590,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
588 case PCI_DEVICE_ID_INTEL_ICH10_1: 590 case PCI_DEVICE_ID_INTEL_ICH10_1:
589 case PCI_DEVICE_ID_INTEL_ICH10_2: 591 case PCI_DEVICE_ID_INTEL_ICH10_2:
590 case PCI_DEVICE_ID_INTEL_ICH10_3: 592 case PCI_DEVICE_ID_INTEL_ICH10_3:
593 case PCI_DEVICE_ID_INTEL_PCH_0:
594 case PCI_DEVICE_ID_INTEL_PCH_1:
591 r->name = "PIIX/ICH"; 595 r->name = "PIIX/ICH";
592 r->get = pirq_piix_get; 596 r->get = pirq_piix_get;
593 r->set = pirq_piix_set; 597 r->set = pirq_piix_set;
@@ -730,7 +734,6 @@ static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router,
730 switch (device) { 734 switch (device) {
731 case PCI_DEVICE_ID_AL_M1533: 735 case PCI_DEVICE_ID_AL_M1533:
732 case PCI_DEVICE_ID_AL_M1563: 736 case PCI_DEVICE_ID_AL_M1563:
733 printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
734 r->name = "ALI"; 737 r->name = "ALI";
735 r->get = pirq_ali_get; 738 r->get = pirq_ali_get;
736 r->set = pirq_ali_set; 739 r->set = pirq_ali_set;
@@ -840,11 +843,9 @@ static void __init pirq_find_router(struct irq_router *r)
840 h->probe(r, pirq_router_dev, pirq_router_dev->device)) 843 h->probe(r, pirq_router_dev, pirq_router_dev->device))
841 break; 844 break;
842 } 845 }
843 printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", 846 dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
844 pirq_router.name, 847 pirq_router.name,
845 pirq_router_dev->vendor, 848 pirq_router_dev->vendor, pirq_router_dev->device);
846 pirq_router_dev->device,
847 pci_name(pirq_router_dev));
848 849
849 /* The device remains referenced for the kernel lifetime */ 850 /* The device remains referenced for the kernel lifetime */
850} 851}
@@ -877,7 +878,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
877 /* Find IRQ pin */ 878 /* Find IRQ pin */
878 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 879 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
879 if (!pin) { 880 if (!pin) {
880 DBG(KERN_DEBUG " -> no interrupt pin\n"); 881 dev_dbg(&dev->dev, "no interrupt pin\n");
881 return 0; 882 return 0;
882 } 883 }
883 pin = pin - 1; 884 pin = pin - 1;
@@ -887,20 +888,20 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
887 if (!pirq_table) 888 if (!pirq_table)
888 return 0; 889 return 0;
889 890
890 DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
891 info = pirq_get_info(dev); 891 info = pirq_get_info(dev);
892 if (!info) { 892 if (!info) {
893 DBG(" -> not found in routing table\n" KERN_DEBUG); 893 dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
894 'A' + pin);
894 return 0; 895 return 0;
895 } 896 }
896 pirq = info->irq[pin].link; 897 pirq = info->irq[pin].link;
897 mask = info->irq[pin].bitmap; 898 mask = info->irq[pin].bitmap;
898 if (!pirq) { 899 if (!pirq) {
899 DBG(" -> not routed\n" KERN_DEBUG); 900 dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
900 return 0; 901 return 0;
901 } 902 }
902 DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, 903 dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
903 pirq_table->exclusive_irqs); 904 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
904 mask &= pcibios_irq_mask; 905 mask &= pcibios_irq_mask;
905 906
906 /* Work around broken HP Pavilion Notebooks which assign USB to 907 /* Work around broken HP Pavilion Notebooks which assign USB to
@@ -930,10 +931,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
930 if (pci_probe & PCI_USE_PIRQ_MASK) 931 if (pci_probe & PCI_USE_PIRQ_MASK)
931 newirq = 0; 932 newirq = 0;
932 else 933 else
933 printk("\n" KERN_WARNING 934 dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
934 "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n" 935 "%#x; try pci=usepirqmask\n", newirq, mask);
935 KERN_DEBUG, newirq,
936 pci_name(dev));
937 } 936 }
938 if (!newirq && assign) { 937 if (!newirq && assign) {
939 for (i = 0; i < 16; i++) { 938 for (i = 0; i < 16; i++) {
@@ -944,39 +943,35 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
944 newirq = i; 943 newirq = i;
945 } 944 }
946 } 945 }
947 DBG(" -> newirq=%d", newirq); 946 dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
948 947
949 /* Check if it is hardcoded */ 948 /* Check if it is hardcoded */
950 if ((pirq & 0xf0) == 0xf0) { 949 if ((pirq & 0xf0) == 0xf0) {
951 irq = pirq & 0xf; 950 irq = pirq & 0xf;
952 DBG(" -> hardcoded IRQ %d\n", irq); 951 msg = "hardcoded";
953 msg = "Hardcoded";
954 } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ 952 } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
955 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { 953 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
956 DBG(" -> got IRQ %d\n", irq); 954 msg = "found";
957 msg = "Found";
958 eisa_set_level_irq(irq); 955 eisa_set_level_irq(irq);
959 } else if (newirq && r->set && 956 } else if (newirq && r->set &&
960 (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { 957 (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
961 DBG(" -> assigning IRQ %d", newirq);
962 if (r->set(pirq_router_dev, dev, pirq, newirq)) { 958 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
963 eisa_set_level_irq(newirq); 959 eisa_set_level_irq(newirq);
964 DBG(" ... OK\n"); 960 msg = "assigned";
965 msg = "Assigned";
966 irq = newirq; 961 irq = newirq;
967 } 962 }
968 } 963 }
969 964
970 if (!irq) { 965 if (!irq) {
971 DBG(" ... failed\n");
972 if (newirq && mask == (1 << newirq)) { 966 if (newirq && mask == (1 << newirq)) {
973 msg = "Guessed"; 967 msg = "guessed";
974 irq = newirq; 968 irq = newirq;
975 } else 969 } else {
970 dev_dbg(&dev->dev, "can't route interrupt\n");
976 return 0; 971 return 0;
972 }
977 } 973 }
978 printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, 974 dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
979 pci_name(dev));
980 975
981 /* Update IRQ for all devices with the same pirq value */ 976 /* Update IRQ for all devices with the same pirq value */
982 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { 977 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
@@ -996,17 +991,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
996 (!(pci_probe & PCI_USE_PIRQ_MASK) || \ 991 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
997 ((1 << dev2->irq) & mask))) { 992 ((1 << dev2->irq) & mask))) {
998#ifndef CONFIG_PCI_MSI 993#ifndef CONFIG_PCI_MSI
999 printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", 994 dev_info(&dev2->dev, "IRQ routing conflict: "
1000 pci_name(dev2), dev2->irq, irq); 995 "have IRQ %d, want IRQ %d\n",
996 dev2->irq, irq);
1001#endif 997#endif
1002 continue; 998 continue;
1003 } 999 }
1004 dev2->irq = irq; 1000 dev2->irq = irq;
1005 pirq_penalty[irq]++; 1001 pirq_penalty[irq]++;
1006 if (dev != dev2) 1002 if (dev != dev2)
1007 printk(KERN_INFO 1003 dev_info(&dev->dev, "sharing IRQ %d with %s\n",
1008 "PCI: Sharing IRQ %d with %s\n", 1004 irq, pci_name(dev2));
1009 irq, pci_name(dev2));
1010 } 1005 }
1011 } 1006 }
1012 return 1; 1007 return 1;
@@ -1025,8 +1020,7 @@ static void __init pcibios_fixup_irqs(void)
1025 * already in use. 1020 * already in use.
1026 */ 1021 */
1027 if (dev->irq >= 16) { 1022 if (dev->irq >= 16) {
1028 DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", 1023 dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
1029 pci_name(dev), dev->irq);
1030 dev->irq = 0; 1024 dev->irq = 0;
1031 } 1025 }
1032 /* 1026 /*
@@ -1070,12 +1064,12 @@ static void __init pcibios_fixup_irqs(void)
1070 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1064 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1071 PCI_SLOT(bridge->devfn), pin); 1065 PCI_SLOT(bridge->devfn), pin);
1072 if (irq >= 0) 1066 if (irq >= 0)
1073 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", 1067 dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
1074 pci_name(bridge), 'A' + pin, irq); 1068 pci_name(bridge),
1069 'A' + pin, irq);
1075 } 1070 }
1076 if (irq >= 0) { 1071 if (irq >= 0) {
1077 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1072 dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
1078 pci_name(dev), 'A' + pin, irq);
1079 dev->irq = irq; 1073 dev->irq = irq;
1080 } 1074 }
1081 } 1075 }
@@ -1231,25 +1225,24 @@ static int pirq_enable_irq(struct pci_dev *dev)
1231 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1225 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1232 PCI_SLOT(bridge->devfn), pin); 1226 PCI_SLOT(bridge->devfn), pin);
1233 if (irq >= 0) 1227 if (irq >= 0)
1234 printk(KERN_WARNING 1228 dev_warn(&dev->dev, "using bridge %s "
1235 "PCI: using PPB %s[%c] to get irq %d\n", 1229 "INT %c to get IRQ %d\n",
1236 pci_name(bridge), 1230 pci_name(bridge), 'A' + pin,
1237 'A' + pin, irq); 1231 irq);
1238 dev = bridge; 1232 dev = bridge;
1239 } 1233 }
1240 dev = temp_dev; 1234 dev = temp_dev;
1241 if (irq >= 0) { 1235 if (irq >= 0) {
1242 printk(KERN_INFO 1236 dev_info(&dev->dev, "PCI->APIC IRQ transform: "
1243 "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1237 "INT %c -> IRQ %d\n", 'A' + pin, irq);
1244 pci_name(dev), 'A' + pin, irq);
1245 dev->irq = irq; 1238 dev->irq = irq;
1246 return 0; 1239 return 0;
1247 } else 1240 } else
1248 msg = " Probably buggy MP table."; 1241 msg = "; probably buggy MP table";
1249 } else if (pci_probe & PCI_BIOS_IRQ_SCAN) 1242 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
1250 msg = ""; 1243 msg = "";
1251 else 1244 else
1252 msg = " Please try using pci=biosirq."; 1245 msg = "; please try using pci=biosirq";
1253 1246
1254 /* 1247 /*
1255 * With IDE legacy devices the IRQ lookup failure is not 1248 * With IDE legacy devices the IRQ lookup failure is not
@@ -1259,9 +1252,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
1259 !(dev->class & 0x5)) 1252 !(dev->class & 0x5))
1260 return 0; 1253 return 0;
1261 1254
1262 printk(KERN_WARNING 1255 dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
1263 "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", 1256 'A' + pin, msg);
1264 'A' + pin, pci_name(dev), msg);
1265 } 1257 }
1266 return 0; 1258 return 0;
1267} 1259}
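The router get/set hooks in this file pack one IRQ per 4-bit nybble of a chipset config register (the VLSI router at 0x74, AMD756 at 0x56). read_config_nybble()/write_config_nybble() are defined elsewhere in irq.c; a sketch consistent with how these callers use them:

#include <linux/pci.h>

static unsigned int nybble_get(struct pci_dev *router, unsigned offset,
			       unsigned nr)
{
	u8 x;

	pci_read_config_byte(router, offset + (nr >> 1), &x);
	return (nr & 1) ? (x >> 4) : (x & 0xf);	/* odd index: high nybble */
}

static void nybble_set(struct pci_dev *router, unsigned offset,
		       unsigned nr, unsigned val)
{
	u8 x;

	pci_read_config_byte(router, offset + (nr >> 1), &x);
	x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | (val & 0xf));
	pci_write_config_byte(router, offset + (nr >> 1), x);
}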
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 132876cc6fca..b722dd481b39 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -14,7 +14,7 @@ static void __devinit pcibios_fixup_peer_bridges(void)
14 int n, devfn; 14 int n, devfn;
15 long node; 15 long node;
16 16
17 if (pcibios_last_bus <= 0 || pcibios_last_bus >= 0xff) 17 if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff)
18 return; 18 return;
19 DBG("PCI: Peer bridge fixup\n"); 19 DBG("PCI: Peer bridge fixup\n");
20 20
@@ -57,14 +57,17 @@ static int __init pci_legacy_init(void)
57 57
58int __init pci_subsys_init(void) 58int __init pci_subsys_init(void)
59{ 59{
60#ifdef CONFIG_X86_NUMAQ
61 pci_numaq_init();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 pci_acpi_init(); 64 pci_acpi_init();
62#endif 65#endif
66#ifdef CONFIG_X86_VISWS
67 pci_visws_init();
68#endif
63 pci_legacy_init(); 69 pci_legacy_init();
64 pcibios_irq_init(); 70 pcibios_irq_init();
65#ifdef CONFIG_X86_NUMAQ
66 pci_numa_init();
67#endif
68 pcibios_init(); 71 pcibios_init();
69 72
70 return 0; 73 return 0;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 23faaa890ffc..d9635764ce3d 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -293,7 +293,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl,
293 return AE_OK; 293 return AE_OK;
294} 294}
295 295
296static int __init is_acpi_reserved(unsigned long start, unsigned long end) 296static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
297{ 297{
298 struct resource mcfg_res; 298 struct resource mcfg_res;
299 299
@@ -310,6 +310,41 @@ static int __init is_acpi_reserved(unsigned long start, unsigned long end)
310 return mcfg_res.flags; 310 return mcfg_res.flags;
311} 311}
312 312
313typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
314
315static int __init is_mmconf_reserved(check_reserved_t is_reserved,
316 u64 addr, u64 size, int i,
317 typeof(pci_mmcfg_config[0]) *cfg, int with_e820)
318{
319 u64 old_size = size;
320 int valid = 0;
321
322 while (!is_reserved(addr, addr + size - 1, E820_RESERVED)) {
323 size >>= 1;
324 if (size < (16UL<<20))
325 break;
326 }
327
328 if (size >= (16UL<<20) || size == old_size) {
329 printk(KERN_NOTICE
330 "PCI: MCFG area at %Lx reserved in %s\n",
331 addr, with_e820?"E820":"ACPI motherboard resources");
332 valid = 1;
333
334 if (old_size != size) {
335 /* update end_bus_number */
336 cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1);
337 printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx "
338 "segment %hu buses %u - %u\n",
339 i, (unsigned long)cfg->address, cfg->pci_segment,
340 (unsigned int)cfg->start_bus_number,
341 (unsigned int)cfg->end_bus_number);
342 }
343 }
344
345 return valid;
346}
347
313static void __init pci_mmcfg_reject_broken(int early) 348static void __init pci_mmcfg_reject_broken(int early)
314{ 349{
315 typeof(pci_mmcfg_config[0]) *cfg; 350 typeof(pci_mmcfg_config[0]) *cfg;
@@ -324,21 +359,22 @@ static void __init pci_mmcfg_reject_broken(int early)
324 359
325 for (i = 0; i < pci_mmcfg_config_num; i++) { 360 for (i = 0; i < pci_mmcfg_config_num; i++) {
326 int valid = 0; 361 int valid = 0;
327 u32 size = (cfg->end_bus_number + 1) << 20; 362 u64 addr, size;
363
328 cfg = &pci_mmcfg_config[i]; 364 cfg = &pci_mmcfg_config[i];
365 addr = cfg->start_bus_number;
366 addr <<= 20;
367 addr += cfg->address;
368 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
369 size <<= 20;
329 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " 370 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx "
330 "segment %hu buses %u - %u\n", 371 "segment %hu buses %u - %u\n",
331 i, (unsigned long)cfg->address, cfg->pci_segment, 372 i, (unsigned long)cfg->address, cfg->pci_segment,
332 (unsigned int)cfg->start_bus_number, 373 (unsigned int)cfg->start_bus_number,
333 (unsigned int)cfg->end_bus_number); 374 (unsigned int)cfg->end_bus_number);
334 375
335 if (!early && 376 if (!early)
336 is_acpi_reserved(cfg->address, cfg->address + size - 1)) { 377 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0);
337 printk(KERN_NOTICE "PCI: MCFG area at %Lx reserved "
338 "in ACPI motherboard resources\n",
339 cfg->address);
340 valid = 1;
341 }
342 378
343 if (valid) 379 if (valid)
344 continue; 380 continue;
@@ -347,16 +383,11 @@ static void __init pci_mmcfg_reject_broken(int early)
347 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" 383 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not"
348 " reserved in ACPI motherboard resources\n", 384 " reserved in ACPI motherboard resources\n",
349 cfg->address); 385 cfg->address);
386
350 /* Don't try to do this check unless configuration 387 /* Don't try to do this check unless configuration
351 type 1 is available. how about type 2 ?*/ 388 type 1 is available. how about type 2 ?*/
352 if (raw_pci_ops && e820_all_mapped(cfg->address, 389 if (raw_pci_ops)
353 cfg->address + size - 1, 390 valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1);
354 E820_RESERVED)) {
355 printk(KERN_NOTICE
356 "PCI: MCFG area at %Lx reserved in E820\n",
357 cfg->address);
358 valid = 1;
359 }
360 391
361 if (!valid) 392 if (!valid)
362 goto reject; 393 goto reject;
@@ -365,7 +396,7 @@ static void __init pci_mmcfg_reject_broken(int early)
365 return; 396 return;
366 397
367reject: 398reject:
368 printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); 399 printk(KERN_INFO "PCI: Not using MMCONFIG.\n");
369 pci_mmcfg_arch_free(); 400 pci_mmcfg_arch_free();
370 kfree(pci_mmcfg_config); 401 kfree(pci_mmcfg_config);
371 pci_mmcfg_config = NULL; 402 pci_mmcfg_config = NULL;
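is_mmconf_reserved() above encodes a shrink-to-fit policy: if the full MCFG window is not reserved (per ACPI motherboard resources or, late in boot, the E820 map), halve it until the check passes or it drops below 16MB, then trim end_bus_number to the surviving size; one bus costs 1MB of MMCONFIG space, hence the <<20 shifts. The core loop as a standalone sketch:

#include <stdbool.h>
#include <stdint.h>

typedef bool (*reserved_fn)(uint64_t start, uint64_t end);

static uint64_t mmconf_usable_size(reserved_fn is_reserved,
				   uint64_t addr, uint64_t size)
{
	while (!is_reserved(addr, addr + size - 1)) {
		size >>= 1;			/* drop half the buses */
		if (size < (16ULL << 20))	/* give up below 16MB */
			return 0;
	}
	return size;	/* caller sets end_bus_number from size >> 20 */
}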
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numaq_32.c
index 8b5ca1966731..1177845d3186 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numaq_32.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * numa.c - Low-level PCI access for NUMA-Q machines 2 * numaq_32.c - Low-level PCI access for NUMA-Q machines
3 */ 3 */
4 4
5#include <linux/pci.h> 5#include <linux/pci.h>
@@ -131,13 +131,14 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
131 u8 busno, suba, subb; 131 u8 busno, suba, subb;
132 int quad = BUS2QUAD(d->bus->number); 132 int quad = BUS2QUAD(d->bus->number);
133 133
134 printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); 134 dev_info(&d->dev, "searching for i450NX host bridges\n");
135 reg = 0xd0; 135 reg = 0xd0;
136 for(pxb=0; pxb<2; pxb++) { 136 for(pxb=0; pxb<2; pxb++) {
137 pci_read_config_byte(d, reg++, &busno); 137 pci_read_config_byte(d, reg++, &busno);
138 pci_read_config_byte(d, reg++, &suba); 138 pci_read_config_byte(d, reg++, &suba);
139 pci_read_config_byte(d, reg++, &subb); 139 pci_read_config_byte(d, reg++, &subb);
140 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); 140 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n",
141 pxb, busno, suba, subb);
141 if (busno) { 142 if (busno) {
142 /* Bus A */ 143 /* Bus A */
143 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno)); 144 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno));
@@ -151,7 +152,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
151} 152}
152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); 153DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
153 154
154int __init pci_numa_init(void) 155int __init pci_numaq_init(void)
155{ 156{
156 int quad; 157 int quad;
157 158
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index 3e25deb821ac..15b9cf6be729 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -108,7 +108,8 @@ extern void __init dmi_check_skip_isa_align(void);
108/* some common used subsys_initcalls */ 108/* some common used subsys_initcalls */
109extern int __init pci_acpi_init(void); 109extern int __init pci_acpi_init(void);
110extern int __init pcibios_irq_init(void); 110extern int __init pcibios_irq_init(void);
111extern int __init pci_numa_init(void); 111extern int __init pci_visws_init(void);
112extern int __init pci_numaq_init(void);
112extern int __init pcibios_init(void); 113extern int __init pcibios_init(void);
113 114
114/* pci-mmconfig.c */ 115/* pci-mmconfig.c */
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 1a7bed492bb1..42f4cb19faca 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -86,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); 86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
87} 87}
88 88
89static int __init pci_visws_init(void) 89int __init pci_visws_init(void)
90{ 90{
91 if (!is_visws_box())
92 return -1;
93
94 pcibios_enable_irq = &pci_visws_enable_irq;
95 pcibios_disable_irq = &pci_visws_disable_irq;
96
91 /* The VISWS supports configuration access type 1 only */ 97 /* The VISWS supports configuration access type 1 only */
92 pci_probe = (pci_probe | PCI_PROBE_CONF1) & 98 pci_probe = (pci_probe | PCI_PROBE_CONF1) &
93 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2); 99 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
@@ -105,18 +111,3 @@ static int __init pci_visws_init(void)
105 pcibios_resource_survey(); 111 pcibios_resource_survey();
106 return 0; 112 return 0;
107} 113}
108
109static __init int pci_subsys_init(void)
110{
111 if (!is_visws_box())
112 return -1;
113
114 pcibios_enable_irq = &pci_visws_enable_irq;
115 pcibios_disable_irq = &pci_visws_disable_irq;
116
117 pci_visws_init();
118 pcibios_init();
119
120 return 0;
121}
122subsys_initcall(pci_subsys_init);
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index 7dc5d5cf50a2..d3e083dea720 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -45,7 +45,7 @@ static void __save_processor_state(struct saved_context *ctxt)
45 ctxt->cr0 = read_cr0(); 45 ctxt->cr0 = read_cr0();
46 ctxt->cr2 = read_cr2(); 46 ctxt->cr2 = read_cr2();
47 ctxt->cr3 = read_cr3(); 47 ctxt->cr3 = read_cr3();
48 ctxt->cr4 = read_cr4(); 48 ctxt->cr4 = read_cr4_safe();
49} 49}
50 50
51/* Needed by apm.c */ 51/* Needed by apm.c */
@@ -98,7 +98,9 @@ static void __restore_processor_state(struct saved_context *ctxt)
98 /* 98 /*
99 * control registers 99 * control registers
100 */ 100 */
101 write_cr4(ctxt->cr4); 101 /* cr4 was introduced in the Pentium CPU */
102 if (ctxt->cr4)
103 write_cr4(ctxt->cr4);
102 write_cr3(ctxt->cr3); 104 write_cr3(ctxt->cr3);
103 write_cr2(ctxt->cr2); 105 write_cr2(ctxt->cr2);
104 write_cr0(ctxt->cr0); 106 write_cr0(ctxt->cr0);
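The fix here pairs two guards: read_cr4_safe() reads CR4 as 0 on CPUs that predate it (the register appeared with the Pentium), and the restore path skips write_cr4() when the saved value is 0, since writing CR4 on such a CPU would fault. A sketch of the pairing, using the same kernel helpers the hunk does:

static unsigned long saved_cr4;

static void save_ctx(void)
{
	saved_cr4 = read_cr4_safe();	/* reads as 0 when CR4 is absent */
}

static void restore_ctx(void)
{
	if (saved_cr4)			/* pre-Pentium CPU: leave CR4 alone */
		write_cr4(saved_cr4);
}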
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index b95aa6cfe3cb..4fc7e872c85e 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -28,9 +28,9 @@ ENTRY(swsusp_arch_suspend)
28 ret 28 ret
29 29
30ENTRY(restore_image) 30ENTRY(restore_image)
31 movl resume_pg_dir, %ecx 31 movl resume_pg_dir, %eax
32 subl $__PAGE_OFFSET, %ecx 32 subl $__PAGE_OFFSET, %eax
33 movl %ecx, %cr3 33 movl %eax, %cr3
34 34
35 movl restore_pblist, %edx 35 movl restore_pblist, %edx
36 .p2align 4,,7 36 .p2align 4,,7
@@ -52,17 +52,21 @@ copy_loop:
52 52
53done: 53done:
54 /* go back to the original page tables */ 54 /* go back to the original page tables */
55 movl $swapper_pg_dir, %ecx 55 movl $swapper_pg_dir, %eax
56 subl $__PAGE_OFFSET, %ecx 56 subl $__PAGE_OFFSET, %eax
57 movl %ecx, %cr3 57 movl %eax, %cr3
58 /* Flush TLB, including "global" things (vmalloc) */ 58 /* Flush TLB, including "global" things (vmalloc) */
59 movl mmu_cr4_features, %eax 59 movl mmu_cr4_features, %ecx
60 movl %eax, %edx 60 jecxz 1f # cr4 Pentium and higher, skip if zero
61 movl %ecx, %edx
61 andl $~(1<<7), %edx; # PGE 62 andl $~(1<<7), %edx; # PGE
62 movl %edx, %cr4; # turn off PGE 63 movl %edx, %cr4; # turn off PGE
63 movl %cr3, %ecx; # flush TLB 641:
64 movl %ecx, %cr3 65 movl %cr3, %eax; # flush TLB
65 movl %eax, %cr4; # turn PGE back on 66 movl %eax, %cr3
67 jecxz 1f # cr4 Pentium and higher, skip if zero
68 movl %ecx, %cr4; # turn PGE back on
691:
66 70
67 movl saved_context_esp, %esp 71 movl saved_context_esp, %esp
68 movl saved_context_ebp, %ebp 72 movl saved_context_ebp, %ebp
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b7ad9f89d21f..4d6ef0a336d6 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
62# Build multiple 32-bit vDSO images to choose from at boot time. 62# Build multiple 32-bit vDSO images to choose from at boot time.
63# 63#
64obj-$(VDSO32-y) += vdso32-syms.lds 64obj-$(VDSO32-y) += vdso32-syms.lds
65vdso32.so-$(CONFIG_X86_32) += int80 65vdso32.so-$(VDSO32-y) += int80
66vdso32.so-$(CONFIG_COMPAT) += syscall 66vdso32.so-$(CONFIG_COMPAT) += syscall
67vdso32.so-$(VDSO32-y) += sysenter 67vdso32.so-$(VDSO32-y) += sysenter
68 68
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 0bce5429a515..513f330c5832 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
193 } 193 }
194} 194}
195 195
196/*
197 * These symbols are defined by vdso32.S to mark the bounds
198 * of the ELF DSO images included therein.
199 */
200extern const char vdso32_default_start, vdso32_default_end;
201extern const char vdso32_sysenter_start, vdso32_sysenter_end;
202static struct page *vdso32_pages[1]; 196static struct page *vdso32_pages[1];
203 197
204#ifdef CONFIG_X86_64 198#ifdef CONFIG_X86_64
205 199
206#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) 200#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
201#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
207 202
208/* May not be __init: called during resume */ 203/* May not be __init: called during resume */
209void syscall32_cpu_init(void) 204void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
226#else /* CONFIG_X86_32 */ 221#else /* CONFIG_X86_32 */
227 222
228#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) 223#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
224#define vdso32_syscall() (0)
229 225
230void enable_sep_cpu(void) 226void enable_sep_cpu(void)
231{ 227{
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
296 gate_vma_init(); 292 gate_vma_init();
297#endif 293#endif
298 294
299 if (!vdso32_sysenter()) { 295 if (vdso32_syscall()) {
300 vsyscall = &vdso32_default_start; 296 vsyscall = &vdso32_syscall_start;
301 vsyscall_len = &vdso32_default_end - &vdso32_default_start; 297 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
302 } else { 298 } else if (vdso32_sysenter()){
303 vsyscall = &vdso32_sysenter_start; 299 vsyscall = &vdso32_sysenter_start;
304 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; 300 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
301 } else {
302 vsyscall = &vdso32_int80_start;
303 vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
305 } 304 }
306 305
307 memcpy(syscall_page, vsyscall, vsyscall_len); 306 memcpy(syscall_page, vsyscall, vsyscall_len);
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
index 1e36f72cab86..2ce5f82c333b 100644
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -2,14 +2,17 @@
2 2
3__INITDATA 3__INITDATA
4 4
5 .globl vdso32_default_start, vdso32_default_end 5 .globl vdso32_int80_start, vdso32_int80_end
6vdso32_default_start: 6vdso32_int80_start:
7#ifdef CONFIG_X86_32
8 .incbin "arch/x86/vdso/vdso32-int80.so" 7 .incbin "arch/x86/vdso/vdso32-int80.so"
9#else 8vdso32_int80_end:
9
10 .globl vdso32_syscall_start, vdso32_syscall_end
11vdso32_syscall_start:
12#ifdef CONFIG_COMPAT
10 .incbin "arch/x86/vdso/vdso32-syscall.so" 13 .incbin "arch/x86/vdso/vdso32-syscall.so"
11#endif 14#endif
12vdso32_default_end: 15vdso32_syscall_end:
13 16
14 .globl vdso32_sysenter_start, vdso32_sysenter_end 17 .globl vdso32_sysenter_start, vdso32_sysenter_end
15vdso32_sysenter_start: 18vdso32_sysenter_start:
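Taken together, the vdso32.S and vdso32-setup.c changes make all three 32-bit vDSO images independently addressable and establish the selection order SYSCALL (64-bit kernels with SYSCALL32) over SYSENTER over INT 0x80. The resulting fallback logic, sketched with the marker symbols the assembly exports:

extern const char vdso32_int80_start[], vdso32_int80_end[];
extern const char vdso32_syscall_start[], vdso32_syscall_end[];
extern const char vdso32_sysenter_start[], vdso32_sysenter_end[];

static void pick_vdso(int has_syscall, int has_sysenter,
		      const void **img, unsigned long *len)
{
	if (has_syscall) {		/* AMD SYSCALL32, 64-bit kernel only */
		*img = vdso32_syscall_start;
		*len = vdso32_syscall_end - vdso32_syscall_start;
	} else if (has_sysenter) {	/* Intel SYSENTER */
		*img = vdso32_sysenter_start;
		*len = vdso32_sysenter_end - vdso32_sysenter_start;
	} else {			/* universal fallback */
		*img = vdso32_int80_start;
		*len = vdso32_int80_end - vdso32_int80_start;
	}
}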
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 19a6cfaf5db9..257ba4a10abf 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -21,7 +21,8 @@ unsigned int __read_mostly vdso_enabled = 1;
21extern char vdso_start[], vdso_end[]; 21extern char vdso_start[], vdso_end[];
22extern unsigned short vdso_sync_cpuid; 22extern unsigned short vdso_sync_cpuid;
23 23
24struct page **vdso_pages; 24static struct page **vdso_pages;
25static unsigned vdso_size;
25 26
26static inline void *var_ref(void *p, char *name) 27static inline void *var_ref(void *p, char *name)
27{ 28{
@@ -38,6 +39,7 @@ static int __init init_vdso_vars(void)
38 int i; 39 int i;
39 char *vbase; 40 char *vbase;
40 41
42 vdso_size = npages << PAGE_SHIFT;
41 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 43 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
42 if (!vdso_pages) 44 if (!vdso_pages)
43 goto oom; 45 goto oom;
@@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
101 struct mm_struct *mm = current->mm; 103 struct mm_struct *mm = current->mm;
102 unsigned long addr; 104 unsigned long addr;
103 int ret; 105 int ret;
104 unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
105 106
106 if (!vdso_enabled) 107 if (!vdso_enabled)
107 return 0; 108 return 0;
108 109
109 down_write(&mm->mmap_sem); 110 down_write(&mm->mmap_sem);
110 addr = vdso_addr(mm->start_stack, len); 111 addr = vdso_addr(mm->start_stack, vdso_size);
111 addr = get_unmapped_area(NULL, addr, len, 0, 0); 112 addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
112 if (IS_ERR_VALUE(addr)) { 113 if (IS_ERR_VALUE(addr)) {
113 ret = addr; 114 ret = addr;
114 goto up_fail; 115 goto up_fail;
115 } 116 }
116 117
117 ret = install_special_mapping(mm, addr, len, 118 ret = install_special_mapping(mm, addr, vdso_size,
118 VM_READ|VM_EXEC| 119 VM_READ|VM_EXEC|
119 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 120 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
120 VM_ALWAYSDUMP, 121 VM_ALWAYSDUMP,
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c2cc99580871..3815e425f470 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
15 15
16config XEN_MAX_DOMAIN_MEMORY 16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes" 17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 18 default 8 if X86_32
19 default 32 if X86_64
19 depends on XEN 20 depends on XEN
20 help 21 help
21 The pseudo-physical to machine address array is sized 22 The pseudo-physical to machine address array is sized
22 according to the maximum possible memory size of a Xen 23 according to the maximum possible memory size of a Xen
23 domain. This array uses 1 page per gigabyte, so there's no 24 domain. This array uses 1 page per gigabyte, so there's no
24 need to be too stingy here. \ No newline at end of file 25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on PM
30 default y \ No newline at end of file
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 2ba2d1649131..59c1e539aed2 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1obj-y := enlighten.o setup.o multicalls.o mmu.o \
2 time.o xen-asm.o grant-table.o suspend.o 2 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 3
4obj-$(CONFIG_SMP) += smp.o 4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bb508456ef52..a4e201b47f64 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
33#include <xen/interface/sched.h> 33#include <xen/interface/sched.h>
34#include <xen/features.h> 34#include <xen/features.h>
35#include <xen/page.h> 35#include <xen/page.h>
36#include <xen/hvc-console.h>
36 37
37#include <asm/paravirt.h> 38#include <asm/paravirt.h>
38#include <asm/page.h> 39#include <asm/page.h>
@@ -40,12 +41,12 @@
40#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 42#include <asm/fixmap.h>
42#include <asm/processor.h> 43#include <asm/processor.h>
44#include <asm/msr-index.h>
43#include <asm/setup.h> 45#include <asm/setup.h>
44#include <asm/desc.h> 46#include <asm/desc.h>
45#include <asm/pgtable.h> 47#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 48#include <asm/tlbflush.h>
47#include <asm/reboot.h> 49#include <asm/reboot.h>
48#include <asm/pgalloc.h>
49 50
50#include "xen-ops.h" 51#include "xen-ops.h"
51#include "mmu.h" 52#include "mmu.h"
@@ -57,6 +58,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
57DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
58 59
59/* 60/*
61 * Identity map, in addition to plain kernel map. This needs to be
62 * large enough to allocate page table pages to allocate the rest.
63 * Each page can map 2MB.
64 */
65static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
66
67#ifdef CONFIG_X86_64
68/* l3 pud for userspace vsyscall mapping */
69static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
70#endif /* CONFIG_X86_64 */
71
72/*
60 * Note about cr3 (pagetable base) values: 73 * Note about cr3 (pagetable base) values:
61 * 74 *
62 * xen_cr3 contains the current logical cr3 value; it contains the 75 * xen_cr3 contains the current logical cr3 value; it contains the
@@ -167,10 +180,14 @@ void xen_vcpu_restore(void)
167 180
168static void __init xen_banner(void) 181static void __init xen_banner(void)
169{ 182{
183 unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
184 struct xen_extraversion extra;
185 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
186
170 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 187 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
171 pv_info.name); 188 pv_info.name);
172 printk(KERN_INFO "Hypervisor signature: %s%s\n", 189 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
173 xen_start_info->magic, 190 version >> 16, version & 0xffff, extra.extraversion,
174 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); 191 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
175} 192}
176 193
@@ -363,14 +380,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
363 380
364static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 381static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
365{ 382{
366 xen_mc_batch();
367
368 load_TLS_descriptor(t, cpu, 0);
369 load_TLS_descriptor(t, cpu, 1);
370 load_TLS_descriptor(t, cpu, 2);
371
372 xen_mc_issue(PARAVIRT_LAZY_CPU);
373
374 /* 383 /*
375 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 384 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
376 * it means we're in a context switch, and %gs has just been 385 * it means we're in a context switch, and %gs has just been
@@ -379,10 +388,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
379 * Either way, it has been saved, and the new value will get 388 * Either way, it has been saved, and the new value will get
380 * loaded properly. This will go away as soon as Xen has been 389 * loaded properly. This will go away as soon as Xen has been
381 * modified to not save/restore %gs for normal hypercalls. 390 * modified to not save/restore %gs for normal hypercalls.
391 *
392 * On x86_64, this hack is not used for %gs, because gs points
393 * to KERNEL_GS_BASE (and uses it for PDA references), so we
394 * must not zero %gs on x86_64
395 *
396 * For x86_64, we need to zero %fs, otherwise we may get an
397 * exception between the new %fs descriptor being loaded and
398 * %fs being effectively cleared at __switch_to().
382 */ 399 */
383 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 400 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
401#ifdef CONFIG_X86_32
384 loadsegment(gs, 0); 402 loadsegment(gs, 0);
403#else
404 loadsegment(fs, 0);
405#endif
406 }
407
408 xen_mc_batch();
409
410 load_TLS_descriptor(t, cpu, 0);
411 load_TLS_descriptor(t, cpu, 1);
412 load_TLS_descriptor(t, cpu, 2);
413
414 xen_mc_issue(PARAVIRT_LAZY_CPU);
415}
416
417#ifdef CONFIG_X86_64
418static void xen_load_gs_index(unsigned int idx)
419{
420 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
421 BUG();
385} 422}
423#endif
386 424
387static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 425static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
388 const void *ptr) 426 const void *ptr)
@@ -400,23 +438,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
400 preempt_enable(); 438 preempt_enable();
401} 439}
402 440
403static int cvt_gate_to_trap(int vector, u32 low, u32 high, 441static int cvt_gate_to_trap(int vector, const gate_desc *val,
404 struct trap_info *info) 442 struct trap_info *info)
405{ 443{
406 u8 type, dpl; 444 if (val->type != 0xf && val->type != 0xe)
407
408 type = (high >> 8) & 0x1f;
409 dpl = (high >> 13) & 3;
410
411 if (type != 0xf && type != 0xe)
412 return 0; 445 return 0;
413 446
414 info->vector = vector; 447 info->vector = vector;
415 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 448 info->address = gate_offset(*val);
416 info->cs = low >> 16; 449 info->cs = gate_segment(*val);
417 info->flags = dpl; 450 info->flags = val->dpl;
418 /* interrupt gates clear IF */ 451 /* interrupt gates clear IF */
419 if (type == 0xe) 452 if (val->type == 0xe)
420 info->flags |= 4; 453 info->flags |= 4;
421 454
422 return 1; 455 return 1;
@@ -443,11 +476,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
443 476
444 if (p >= start && (p + 8) <= end) { 477 if (p >= start && (p + 8) <= end) {
445 struct trap_info info[2]; 478 struct trap_info info[2];
446 u32 *desc = (u32 *)g;
447 479
448 info[1].address = 0; 480 info[1].address = 0;
449 481
450 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 482 if (cvt_gate_to_trap(entrynum, g, &info[0]))
451 if (HYPERVISOR_set_trap_table(info)) 483 if (HYPERVISOR_set_trap_table(info))
452 BUG(); 484 BUG();
453 } 485 }
@@ -460,13 +492,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
460{ 492{
461 unsigned in, out, count; 493 unsigned in, out, count;
462 494
463 count = (desc->size+1) / 8; 495 count = (desc->size+1) / sizeof(gate_desc);
464 BUG_ON(count > 256); 496 BUG_ON(count > 256);
465 497
466 for (in = out = 0; in < count; in++) { 498 for (in = out = 0; in < count; in++) {
467 const u32 *entry = (u32 *)(desc->address + in * 8); 499 gate_desc *entry = (gate_desc*)(desc->address) + in;
468 500
469 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 501 if (cvt_gate_to_trap(in, entry, &traps[out]))
470 out++; 502 out++;
471 } 503 }
472 traps[out].address = 0; 504 traps[out].address = 0;
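The two hunks above swap raw (low, high) word pairs for the gate_desc type; gate_offset()/gate_segment() hide the layout difference between 8-byte 32-bit gates and 16-byte 64-bit gates. What the deleted open-coded version decoded, as a standalone 32-bit sketch:

#include <stdint.h>

struct gate32 { uint32_t low, high; };	/* one 8-byte IDT entry */

static uint32_t gate32_offset(struct gate32 g)
{
	return (g.high & 0xffff0000u) | (g.low & 0x0000ffffu);
}

static uint16_t gate32_segment(struct gate32 g)
{
	return g.low >> 16;
}

static unsigned gate32_type(struct gate32 g)
{
	return (g.high >> 8) & 0x1f;	/* 0xe interrupt gate, 0xf trap gate */
}

static unsigned gate32_dpl(struct gate32 g)
{
	return (g.high >> 13) & 3;
}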
@@ -695,33 +727,89 @@ static void set_current_cr3(void *v)
695 x86_write_percpu(xen_current_cr3, (unsigned long)v); 727 x86_write_percpu(xen_current_cr3, (unsigned long)v);
696} 728}
697 729
698static void xen_write_cr3(unsigned long cr3) 730static void __xen_write_cr3(bool kernel, unsigned long cr3)
699{ 731{
700 struct mmuext_op *op; 732 struct mmuext_op *op;
701 struct multicall_space mcs; 733 struct multicall_space mcs;
702 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 734 unsigned long mfn;
703 735
704 BUG_ON(preemptible()); 736 if (cr3)
737 mfn = pfn_to_mfn(PFN_DOWN(cr3));
738 else
739 mfn = 0;
705 740
706 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 741 WARN_ON(mfn == 0 && kernel);
707 742
708 /* Update while interrupts are disabled, so its atomic with 743 mcs = __xen_mc_entry(sizeof(*op));
709 respect to ipis */
710 x86_write_percpu(xen_cr3, cr3);
711 744
712 op = mcs.args; 745 op = mcs.args;
713 op->cmd = MMUEXT_NEW_BASEPTR; 746 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
714 op->arg1.mfn = mfn; 747 op->arg1.mfn = mfn;
715 748
716 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 749 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
717 750
718 /* Update xen_update_cr3 once the batch has actually 751 if (kernel) {
719 been submitted. */ 752 x86_write_percpu(xen_cr3, cr3);
720 xen_mc_callback(set_current_cr3, (void *)cr3); 753
754 /* Update xen_current_cr3 once the batch has actually
755 been submitted. */
756 xen_mc_callback(set_current_cr3, (void *)cr3);
757 }
758}
759
760static void xen_write_cr3(unsigned long cr3)
761{
762 BUG_ON(preemptible());
763
764 xen_mc_batch(); /* disables interrupts */
765
766 /* Update while interrupts are disabled, so its atomic with
767 respect to ipis */
768 x86_write_percpu(xen_cr3, cr3);
769
770 __xen_write_cr3(true, cr3);
771
772#ifdef CONFIG_X86_64
773 {
774 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
775 if (user_pgd)
776 __xen_write_cr3(false, __pa(user_pgd));
777 else
778 __xen_write_cr3(false, 0);
779 }
780#endif
721 781
722 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 782 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
723} 783}
724 784
785static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
786{
787 int ret;
788
789 ret = 0;
790
791 switch(msr) {
792#ifdef CONFIG_X86_64
793 unsigned which;
794 u64 base;
795
796 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
797 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
798 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
799
800 set:
801 base = ((u64)high << 32) | low;
802 if (HYPERVISOR_set_segment_base(which, base) != 0)
803 ret = -EFAULT;
804 break;
805#endif
806 default:
807 ret = native_write_msr_safe(msr, low, high);
808 }
809
810 return ret;
811}
812
725/* Early in boot, while setting up the initial pagetable, assume 813/* Early in boot, while setting up the initial pagetable, assume
726 everything is pinned. */ 814 everything is pinned. */
727static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 815static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
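The new xen_write_msr_safe() above splits MSR writes in two: the segment-base MSRs, which a 64-bit Xen guest cannot write natively, are routed through HYPERVISOR_set_segment_base(), and everything else keeps the native safe path (on 32-bit, everything takes the default branch). The same logic without the goto, as a sketch using only names from the hunk:

static int write_msr_sketch(unsigned int msr, unsigned low, unsigned high)
{
	u64 base = ((u64)high << 32) | low;

	switch (msr) {
	case MSR_FS_BASE:		/* user %fs base */
		return HYPERVISOR_set_segment_base(SEGBASE_FS, base) ? -EFAULT : 0;
	case MSR_KERNEL_GS_BASE:	/* user %gs base, swapgs target */
		return HYPERVISOR_set_segment_base(SEGBASE_GS_USER, base) ? -EFAULT : 0;
	case MSR_GS_BASE:		/* kernel %gs base */
		return HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, base) ? -EFAULT : 0;
	default:
		return native_write_msr_safe(msr, low, high);
	}
}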
@@ -778,6 +866,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
778 xen_alloc_ptpage(mm, pfn, PT_PMD); 866 xen_alloc_ptpage(mm, pfn, PT_PMD);
779} 867}
780 868
869static int xen_pgd_alloc(struct mm_struct *mm)
870{
871 pgd_t *pgd = mm->pgd;
872 int ret = 0;
873
874 BUG_ON(PagePinned(virt_to_page(pgd)));
875
876#ifdef CONFIG_X86_64
877 {
878 struct page *page = virt_to_page(pgd);
879 pgd_t *user_pgd;
880
881 BUG_ON(page->private != 0);
882
883 ret = -ENOMEM;
884
885 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
886 page->private = (unsigned long)user_pgd;
887
888 if (user_pgd != NULL) {
889 user_pgd[pgd_index(VSYSCALL_START)] =
890 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
891 ret = 0;
892 }
893
894 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
895 }
896#endif
897
898 return ret;
899}
900
901static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
902{
903#ifdef CONFIG_X86_64
904 pgd_t *user_pgd = xen_get_user_pgd(pgd);
905
906 if (user_pgd)
907 free_page((unsigned long)user_pgd);
908#endif
909}
910
781/* This should never happen until we're OK to use struct page */ 911/* This should never happen until we're OK to use struct page */
782static void xen_release_ptpage(u32 pfn, unsigned level) 912static void xen_release_ptpage(u32 pfn, unsigned level)
783{ 913{
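xen_get_user_pgd() itself is not shown in this hunk; given that xen_pgd_alloc() stashes the user pgd pointer in page->private of the kernel pgd's page, a plausible minimal sketch is the following (the real helper may additionally translate an offset within the pgd page):

static pgd_t *xen_get_user_pgd_sketch(pgd_t *pgd)
{
	/* assumption: pgd points at the start of the kernel pgd page */
	return (pgd_t *)virt_to_page(pgd)->private;
}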
@@ -803,6 +933,18 @@ static void xen_release_pmd(u32 pfn)
803 xen_release_ptpage(pfn, PT_PMD); 933 xen_release_ptpage(pfn, PT_PMD);
804} 934}
805 935
936#if PAGETABLE_LEVELS == 4
937static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
938{
939 xen_alloc_ptpage(mm, pfn, PT_PUD);
940}
941
942static void xen_release_pud(u32 pfn)
943{
944 xen_release_ptpage(pfn, PT_PUD);
945}
946#endif
947
806#ifdef CONFIG_HIGHPTE 948#ifdef CONFIG_HIGHPTE
807static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 949static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
808{ 950{
@@ -841,68 +983,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
841 983
842static __init void xen_pagetable_setup_start(pgd_t *base) 984static __init void xen_pagetable_setup_start(pgd_t *base)
843{ 985{
844 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
845 int i;
846
847 /* special set_pte for pagetable initialization */
848 pv_mmu_ops.set_pte = xen_set_pte_init;
849
850 init_mm.pgd = base;
851 /*
852 * copy top-level of Xen-supplied pagetable into place. This
853 * is a stand-in while we copy the pmd pages.
854 */
855 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
856
857 /*
858 * For PAE, need to allocate new pmds, rather than
859 * share Xen's, since Xen doesn't like pmd's being
860 * shared between address spaces.
861 */
862 for (i = 0; i < PTRS_PER_PGD; i++) {
863 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
864 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
865
866 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
867 PAGE_SIZE);
868
869 make_lowmem_page_readonly(pmd);
870
871 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
872 } else
873 pgd_clear(&base[i]);
874 }
875
876 /* make sure zero_page is mapped RO so we can use it in pagetables */
877 make_lowmem_page_readonly(empty_zero_page);
878 make_lowmem_page_readonly(base);
879 /*
880 * Switch to new pagetable. This is done before
881 * pagetable_init has done anything so that the new pages
882 * added to the table can be prepared properly for Xen.
883 */
884 xen_write_cr3(__pa(base));
885
886 /* Unpin initial Xen pagetable */
887 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
888 PFN_DOWN(__pa(xen_start_info->pt_base)));
889} 986}
890 987
891void xen_setup_shared_info(void) 988void xen_setup_shared_info(void)
892{ 989{
893 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 990 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
894 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 991 set_fixmap(FIX_PARAVIRT_BOOTMAP,
895 992 xen_start_info->shared_info);
896 /* 993
897 * Create a mapping for the shared info page. 994 HYPERVISOR_shared_info =
898 * Should be set_fixmap(), but shared_info is a machine 995 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
899 * address with no corresponding pseudo-phys address.
900 */
901 set_pte_mfn(addr,
902 PFN_DOWN(xen_start_info->shared_info),
903 PAGE_KERNEL);
904
905 HYPERVISOR_shared_info = (struct shared_info *)addr;
906 } else 996 } else
907 HYPERVISOR_shared_info = 997 HYPERVISOR_shared_info =
908 (struct shared_info *)__va(xen_start_info->shared_info); 998 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -917,26 +1007,32 @@ void xen_setup_shared_info(void)
917 1007
918static __init void xen_pagetable_setup_done(pgd_t *base) 1008static __init void xen_pagetable_setup_done(pgd_t *base)
919{ 1009{
920 /* This will work as long as patching hasn't happened yet
921 (which it hasn't) */
922 pv_mmu_ops.alloc_pte = xen_alloc_pte;
923 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
924 pv_mmu_ops.release_pte = xen_release_pte;
925 pv_mmu_ops.release_pmd = xen_release_pmd;
926 pv_mmu_ops.set_pte = xen_set_pte;
927
928 xen_setup_shared_info(); 1010 xen_setup_shared_info();
929
930 /* Actually pin the pagetable down, but we can't set PG_pinned
931 yet because the page structures don't exist yet. */
932 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
933} 1011}
934 1012
935static __init void xen_post_allocator_init(void) 1013static __init void xen_post_allocator_init(void)
936{ 1014{
1015 pv_mmu_ops.set_pte = xen_set_pte;
937 pv_mmu_ops.set_pmd = xen_set_pmd; 1016 pv_mmu_ops.set_pmd = xen_set_pmd;
938 pv_mmu_ops.set_pud = xen_set_pud; 1017 pv_mmu_ops.set_pud = xen_set_pud;
1018#if PAGETABLE_LEVELS == 4
1019 pv_mmu_ops.set_pgd = xen_set_pgd;
1020#endif
1021
1022 /* This will work as long as patching hasn't happened yet
1023 (which it hasn't) */
1024 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1025 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1026 pv_mmu_ops.release_pte = xen_release_pte;
1027 pv_mmu_ops.release_pmd = xen_release_pmd;
1028#if PAGETABLE_LEVELS == 4
1029 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1030 pv_mmu_ops.release_pud = xen_release_pud;
1031#endif
939 1032
1033#ifdef CONFIG_X86_64
1034 SetPagePinned(virt_to_page(level3_user_vsyscall));
1035#endif
940 xen_mark_init_mm_pinned(); 1036 xen_mark_init_mm_pinned();
941} 1037}
942 1038
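/*
 * Illustrative sketch, not part of the patch: the two-phase op-table
 * pattern xen_post_allocator_init() relies on. Early boot installs
 * *_init variants that work before the allocator exists; once
 * allocation works, the table is repointed at the real hooks. This is
 * only safe before paravirt patching has inlined the old pointers,
 * which is what the comment above guards. Names below are hypothetical.
 */
struct mmu_hooks {
	void (*set_entry)(unsigned long *slot, unsigned long val);
};

static void set_entry_early(unsigned long *slot, unsigned long val)
{
	*slot = val;		/* direct write; nothing is pinned yet */
}

static void set_entry_late(unsigned long *slot, unsigned long val)
{
	*slot = val;		/* would queue a batched hypercall instead */
}

static struct mmu_hooks mmu_hooks = { .set_entry = set_entry_early };

static void post_allocator_init_demo(void)
{
	mmu_hooks.set_entry = set_entry_late;	/* late-bind the real hook */
}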
@@ -950,6 +1046,7 @@ void xen_setup_vcpu_info_placement(void)
950 1046
951 /* xen_vcpu_setup managed to place the vcpu_info within the 1047 /* xen_vcpu_setup managed to place the vcpu_info within the
952 percpu area for all cpus, so make use of it */ 1048 percpu area for all cpus, so make use of it */
1049#ifdef CONFIG_X86_32
953 if (have_vcpu_info_placement) { 1050 if (have_vcpu_info_placement) {
954 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1051 printk(KERN_INFO "Xen: using vcpu_info placement\n");
955 1052
@@ -959,6 +1056,7 @@ void xen_setup_vcpu_info_placement(void)
959 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1056 pv_irq_ops.irq_enable = xen_irq_enable_direct;
960 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1057 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
961 } 1058 }
1059#endif
962} 1060}
963 1061
964static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1062static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -979,10 +1077,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
979 goto patch_site 1077 goto patch_site
980 1078
981 switch (type) { 1079 switch (type) {
1080#ifdef CONFIG_X86_32
982 SITE(pv_irq_ops, irq_enable); 1081 SITE(pv_irq_ops, irq_enable);
983 SITE(pv_irq_ops, irq_disable); 1082 SITE(pv_irq_ops, irq_disable);
984 SITE(pv_irq_ops, save_fl); 1083 SITE(pv_irq_ops, save_fl);
985 SITE(pv_irq_ops, restore_fl); 1084 SITE(pv_irq_ops, restore_fl);
1085#endif /* CONFIG_X86_32 */
986#undef SITE 1086#undef SITE
987 1087
988 patch_site: 1088 patch_site:
@@ -1025,8 +1125,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1025#ifdef CONFIG_X86_F00F_BUG 1125#ifdef CONFIG_X86_F00F_BUG
1026 case FIX_F00F_IDT: 1126 case FIX_F00F_IDT:
1027#endif 1127#endif
1128#ifdef CONFIG_X86_32
1028 case FIX_WP_TEST: 1129 case FIX_WP_TEST:
1029 case FIX_VDSO: 1130 case FIX_VDSO:
1131# ifdef CONFIG_HIGHMEM
1132 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1133# endif
1134#else
1135 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1136#endif
1030#ifdef CONFIG_X86_LOCAL_APIC 1137#ifdef CONFIG_X86_LOCAL_APIC
1031 case FIX_APIC_BASE: /* maps dummy local APIC */ 1138 case FIX_APIC_BASE: /* maps dummy local APIC */
1032#endif 1139#endif
@@ -1039,6 +1146,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1039 } 1146 }
1040 1147
1041 __native_set_fixmap(idx, pte); 1148 __native_set_fixmap(idx, pte);
1149
1150#ifdef CONFIG_X86_64
1151 /* Replicate changes to map the vsyscall page into the user
1152 pagetable vsyscall mapping. */
1153 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1154 unsigned long vaddr = __fix_to_virt(idx);
1155 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1156 }
1157#endif
1042} 1158}
1043 1159
1044static const struct pv_info xen_info __initdata = { 1160static const struct pv_info xen_info __initdata = {
@@ -1084,18 +1200,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1084 .wbinvd = native_wbinvd, 1200 .wbinvd = native_wbinvd,
1085 1201
1086 .read_msr = native_read_msr_safe, 1202 .read_msr = native_read_msr_safe,
1087 .write_msr = native_write_msr_safe, 1203 .write_msr = xen_write_msr_safe,
1088 .read_tsc = native_read_tsc, 1204 .read_tsc = native_read_tsc,
1089 .read_pmc = native_read_pmc, 1205 .read_pmc = native_read_pmc,
1090 1206
1091 .iret = xen_iret, 1207 .iret = xen_iret,
1092 .irq_enable_sysexit = xen_sysexit, 1208 .irq_enable_sysexit = xen_sysexit,
1209#ifdef CONFIG_X86_64
1210 .usergs_sysret32 = xen_sysret32,
1211 .usergs_sysret64 = xen_sysret64,
1212#endif
1093 1213
1094 .load_tr_desc = paravirt_nop, 1214 .load_tr_desc = paravirt_nop,
1095 .set_ldt = xen_set_ldt, 1215 .set_ldt = xen_set_ldt,
1096 .load_gdt = xen_load_gdt, 1216 .load_gdt = xen_load_gdt,
1097 .load_idt = xen_load_idt, 1217 .load_idt = xen_load_idt,
1098 .load_tls = xen_load_tls, 1218 .load_tls = xen_load_tls,
1219#ifdef CONFIG_X86_64
1220 .load_gs_index = xen_load_gs_index,
1221#endif
1099 1222
1100 .store_gdt = native_store_gdt, 1223 .store_gdt = native_store_gdt,
1101 .store_idt = native_store_idt, 1224 .store_idt = native_store_idt,
@@ -1109,14 +1232,34 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1109 .set_iopl_mask = xen_set_iopl_mask, 1232 .set_iopl_mask = xen_set_iopl_mask,
1110 .io_delay = xen_io_delay, 1233 .io_delay = xen_io_delay,
1111 1234
1235 /* Xen takes care of %gs when switching to usermode for us */
1236 .swapgs = paravirt_nop,
1237
1112 .lazy_mode = { 1238 .lazy_mode = {
1113 .enter = paravirt_enter_lazy_cpu, 1239 .enter = paravirt_enter_lazy_cpu,
1114 .leave = xen_leave_lazy, 1240 .leave = xen_leave_lazy,
1115 }, 1241 },
1116}; 1242};
1117 1243
1244static void __init __xen_init_IRQ(void)
1245{
1246#ifdef CONFIG_X86_64
1247 int i;
1248
1249 /* Create identity vector->irq map */
 1250	for (i = 0; i < NR_VECTORS; i++) {
1251 int cpu;
1252
1253 for_each_possible_cpu(cpu)
1254 per_cpu(vector_irq, cpu)[i] = i;
1255 }
1256#endif /* CONFIG_X86_64 */
1257
1258 xen_init_IRQ();
1259}
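/*
 * Illustrative sketch, not part of the patch: what the identity
 * vector->irq map above amounts to. A user-space model with assumed
 * NR_VECTORS/NR_CPUS values standing in for the kernel's per-cpu
 * vector_irq arrays.
 */
#define DEMO_NR_VECTORS	256
#define DEMO_NR_CPUS	4

static int demo_vector_irq[DEMO_NR_CPUS][DEMO_NR_VECTORS];

static void demo_init_identity_map(void)
{
	int cpu, i;

	for (cpu = 0; cpu < DEMO_NR_CPUS; cpu++)
		for (i = 0; i < DEMO_NR_VECTORS; i++)
			demo_vector_irq[cpu][i] = i;	/* vector i -> irq i */
}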
1260
1118static const struct pv_irq_ops xen_irq_ops __initdata = { 1261static const struct pv_irq_ops xen_irq_ops __initdata = {
1119 .init_IRQ = xen_init_IRQ, 1262 .init_IRQ = __xen_init_IRQ,
1120 .save_fl = xen_save_fl, 1263 .save_fl = xen_save_fl,
1121 .restore_fl = xen_restore_fl, 1264 .restore_fl = xen_restore_fl,
1122 .irq_disable = xen_irq_disable, 1265 .irq_disable = xen_irq_disable,
@@ -1124,14 +1267,13 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
1124 .safe_halt = xen_safe_halt, 1267 .safe_halt = xen_safe_halt,
1125 .halt = xen_halt, 1268 .halt = xen_halt,
1126#ifdef CONFIG_X86_64 1269#ifdef CONFIG_X86_64
1127 .adjust_exception_frame = paravirt_nop, 1270 .adjust_exception_frame = xen_adjust_exception_frame,
1128#endif 1271#endif
1129}; 1272};
1130 1273
1131static const struct pv_apic_ops xen_apic_ops __initdata = { 1274static const struct pv_apic_ops xen_apic_ops __initdata = {
1132#ifdef CONFIG_X86_LOCAL_APIC 1275#ifdef CONFIG_X86_LOCAL_APIC
1133 .apic_write = xen_apic_write, 1276 .apic_write = xen_apic_write,
1134 .apic_write_atomic = xen_apic_write,
1135 .apic_read = xen_apic_read, 1277 .apic_read = xen_apic_read,
1136 .setup_boot_clock = paravirt_nop, 1278 .setup_boot_clock = paravirt_nop,
1137 .setup_secondary_clock = paravirt_nop, 1279 .setup_secondary_clock = paravirt_nop,
@@ -1157,8 +1299,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1157 .pte_update = paravirt_nop, 1299 .pte_update = paravirt_nop,
1158 .pte_update_defer = paravirt_nop, 1300 .pte_update_defer = paravirt_nop,
1159 1301
1160 .pgd_alloc = __paravirt_pgd_alloc, 1302 .pgd_alloc = xen_pgd_alloc,
1161 .pgd_free = paravirt_nop, 1303 .pgd_free = xen_pgd_free,
1162 1304
1163 .alloc_pte = xen_alloc_pte_init, 1305 .alloc_pte = xen_alloc_pte_init,
1164 .release_pte = xen_release_pte_init, 1306 .release_pte = xen_release_pte_init,
@@ -1170,7 +1312,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1170 .kmap_atomic_pte = xen_kmap_atomic_pte, 1312 .kmap_atomic_pte = xen_kmap_atomic_pte,
1171#endif 1313#endif
1172 1314
1173 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1315#ifdef CONFIG_X86_64
1316 .set_pte = xen_set_pte,
1317#else
1318 .set_pte = xen_set_pte_init,
1319#endif
1174 .set_pte_at = xen_set_pte_at, 1320 .set_pte_at = xen_set_pte_at,
1175 .set_pmd = xen_set_pmd_hyper, 1321 .set_pmd = xen_set_pmd_hyper,
1176 1322
@@ -1178,21 +1324,32 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1178 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 1324 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1179 1325
1180 .pte_val = xen_pte_val, 1326 .pte_val = xen_pte_val,
1181 .pte_flags = native_pte_val, 1327 .pte_flags = native_pte_flags,
1182 .pgd_val = xen_pgd_val, 1328 .pgd_val = xen_pgd_val,
1183 1329
1184 .make_pte = xen_make_pte, 1330 .make_pte = xen_make_pte,
1185 .make_pgd = xen_make_pgd, 1331 .make_pgd = xen_make_pgd,
1186 1332
1333#ifdef CONFIG_X86_PAE
1187 .set_pte_atomic = xen_set_pte_atomic, 1334 .set_pte_atomic = xen_set_pte_atomic,
1188 .set_pte_present = xen_set_pte_at, 1335 .set_pte_present = xen_set_pte_at,
1189 .set_pud = xen_set_pud_hyper,
1190 .pte_clear = xen_pte_clear, 1336 .pte_clear = xen_pte_clear,
1191 .pmd_clear = xen_pmd_clear, 1337 .pmd_clear = xen_pmd_clear,
1338#endif /* CONFIG_X86_PAE */
1339 .set_pud = xen_set_pud_hyper,
1192 1340
1193 .make_pmd = xen_make_pmd, 1341 .make_pmd = xen_make_pmd,
1194 .pmd_val = xen_pmd_val, 1342 .pmd_val = xen_pmd_val,
1195 1343
1344#if PAGETABLE_LEVELS == 4
1345 .pud_val = xen_pud_val,
1346 .make_pud = xen_make_pud,
1347 .set_pgd = xen_set_pgd_hyper,
1348
1349 .alloc_pud = xen_alloc_pte_init,
1350 .release_pud = xen_release_pte_init,
1351#endif /* PAGETABLE_LEVELS == 4 */
1352
1196 .activate_mm = xen_activate_mm, 1353 .activate_mm = xen_activate_mm,
1197 .dup_mmap = xen_dup_mmap, 1354 .dup_mmap = xen_dup_mmap,
1198 .exit_mmap = xen_exit_mmap, 1355 .exit_mmap = xen_exit_mmap,
@@ -1205,21 +1362,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1205 .set_fixmap = xen_set_fixmap, 1362 .set_fixmap = xen_set_fixmap,
1206}; 1363};
1207 1364
1208#ifdef CONFIG_SMP
1209static const struct smp_ops xen_smp_ops __initdata = {
1210 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1211 .smp_prepare_cpus = xen_smp_prepare_cpus,
1212 .cpu_up = xen_cpu_up,
1213 .smp_cpus_done = xen_smp_cpus_done,
1214
1215 .smp_send_stop = xen_smp_send_stop,
1216 .smp_send_reschedule = xen_smp_send_reschedule,
1217
1218 .send_call_func_ipi = xen_smp_send_call_function_ipi,
1219 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
1220};
1221#endif /* CONFIG_SMP */
1222
1223static void xen_reboot(int reason) 1365static void xen_reboot(int reason)
1224{ 1366{
1225 struct sched_shutdown r = { .reason = reason }; 1367 struct sched_shutdown r = { .reason = reason };
@@ -1264,6 +1406,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
1264 1406
1265static void __init xen_reserve_top(void) 1407static void __init xen_reserve_top(void)
1266{ 1408{
1409#ifdef CONFIG_X86_32
1267 unsigned long top = HYPERVISOR_VIRT_START; 1410 unsigned long top = HYPERVISOR_VIRT_START;
1268 struct xen_platform_parameters pp; 1411 struct xen_platform_parameters pp;
1269 1412
@@ -1271,8 +1414,248 @@ static void __init xen_reserve_top(void)
1271 top = pp.virt_start; 1414 top = pp.virt_start;
1272 1415
1273 reserve_top_address(-top + 2 * PAGE_SIZE); 1416 reserve_top_address(-top + 2 * PAGE_SIZE);
1417#endif /* CONFIG_X86_32 */
1418}
1419
1420/*
 1421 * Like __va(), but returns the address in the kernel mapping (which is
 1422 * all we have until the physical memory mapping has been set up).
1423 */
1424static void *__ka(phys_addr_t paddr)
1425{
1426#ifdef CONFIG_X86_64
1427 return (void *)(paddr + __START_KERNEL_map);
1428#else
1429 return __va(paddr);
1430#endif
1274} 1431}
1275 1432
1433/* Convert a machine address to physical address */
1434static unsigned long m2p(phys_addr_t maddr)
1435{
1436 phys_addr_t paddr;
1437
1438 maddr &= PTE_PFN_MASK;
1439 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1440
1441 return paddr;
1442}
1443
1444/* Convert a machine address to kernel virtual */
1445static void *m2v(phys_addr_t maddr)
1446{
1447 return __ka(m2p(maddr));
1448}
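/*
 * Illustrative sketch, not part of the patch: the mask-and-shift split
 * m2p() performs, compilable in user space. mfn_to_pfn() is stubbed as
 * the identity; in the kernel it consults the M2P table. The mask value
 * is an assumption matching the usual x86-64 PTE_PFN_MASK.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SHIFT		12
#define DEMO_PTE_PFN_MASK	0x000ffffffffff000ULL

static uint64_t demo_mfn_to_pfn(uint64_t mfn)
{
	return mfn;		/* stub: real code looks up the M2P table */
}

static uint64_t demo_m2p(uint64_t maddr)
{
	maddr &= DEMO_PTE_PFN_MASK;	/* strip PTE flag bits */
	return demo_mfn_to_pfn(maddr >> DEMO_PAGE_SHIFT) << DEMO_PAGE_SHIFT;
}

int main(void)
{
	/* flag bits 0x067 are masked away; only the frame survives */
	printf("%#llx\n", (unsigned long long)demo_m2p(0x1234067ULL));
	return 0;
}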
1449
1450#ifdef CONFIG_X86_64
1451static void walk(pgd_t *pgd, unsigned long addr)
1452{
1453 unsigned l4idx = pgd_index(addr);
1454 unsigned l3idx = pud_index(addr);
1455 unsigned l2idx = pmd_index(addr);
1456 unsigned l1idx = pte_index(addr);
1457 pgd_t l4;
1458 pud_t l3;
1459 pmd_t l2;
1460 pte_t l1;
1461
1462 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1463 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1464
1465 l4 = pgd[l4idx];
1466 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1467 xen_raw_printk(" %016lx\n", pgd_val(l4));
1468
1469 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1470 xen_raw_printk(" l3: %016lx\n", l3.pud);
1471 xen_raw_printk(" %016lx\n", pud_val(l3));
1472
1473 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1474 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1475 xen_raw_printk(" %016lx\n", pmd_val(l2));
1476
1477 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1478 xen_raw_printk(" l1: %016lx\n", l1.pte);
1479 xen_raw_printk(" %016lx\n", pte_val(l1));
1480}
1481#endif
1482
1483static void set_page_prot(void *addr, pgprot_t prot)
1484{
1485 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1486 pte_t pte = pfn_pte(pfn, prot);
1487
1488 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1489 addr, pfn, get_phys_to_machine(pfn),
1490 pgprot_val(prot), pte.pte);
1491
1492 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1493 BUG();
1494}
1495
1496static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1497{
1498 unsigned pmdidx, pteidx;
1499 unsigned ident_pte;
1500 unsigned long pfn;
1501
1502 ident_pte = 0;
1503 pfn = 0;
 1504	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1505 pte_t *pte_page;
1506
1507 /* Reuse or allocate a page of ptes */
1508 if (pmd_present(pmd[pmdidx]))
1509 pte_page = m2v(pmd[pmdidx].pmd);
1510 else {
1511 /* Check for free pte pages */
1512 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1513 break;
1514
1515 pte_page = &level1_ident_pgt[ident_pte];
1516 ident_pte += PTRS_PER_PTE;
1517
1518 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1519 }
1520
1521 /* Install mappings */
 1522		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1523 pte_t pte;
1524
1525 if (pfn > max_pfn_mapped)
1526 max_pfn_mapped = pfn;
1527
1528 if (!pte_none(pte_page[pteidx]))
1529 continue;
1530
1531 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1532 pte_page[pteidx] = pte;
1533 }
1534 }
1535
 1536	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1537 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1538
1539 set_page_prot(pmd, PAGE_KERNEL_RO);
1540}
1541
1542#ifdef CONFIG_X86_64
1543static void convert_pfn_mfn(void *v)
1544{
1545 pte_t *pte = v;
1546 int i;
1547
1548 /* All levels are converted the same way, so just treat them
1549 as ptes. */
 1550	for (i = 0; i < PTRS_PER_PTE; i++)
1551 pte[i] = xen_make_pte(pte[i].pte);
1552}
1553
1554/*
 1555 * Set up the initial kernel pagetable.
 1556 *
 1557 * We can construct this by grafting the Xen-provided pagetable into
1558 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1559 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1560 * means that only the kernel has a physical mapping to start with -
1561 * but that's enough to get __va working. We need to fill in the rest
1562 * of the physical mapping once some sort of allocator has been set
1563 * up.
1564 */
1565static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1566{
1567 pud_t *l3;
1568 pmd_t *l2;
1569
1570 /* Zap identity mapping */
1571 init_level4_pgt[0] = __pgd(0);
1572
1573 /* Pre-constructed entries are in pfn, so convert to mfn */
1574 convert_pfn_mfn(init_level4_pgt);
1575 convert_pfn_mfn(level3_ident_pgt);
1576 convert_pfn_mfn(level3_kernel_pgt);
1577
1578 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1579 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1580
1581 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1582 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1583
1584 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1585 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1586 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1587
1588 /* Set up identity map */
1589 xen_map_identity_early(level2_ident_pgt, max_pfn);
1590
1591 /* Make pagetable pieces RO */
1592 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1593 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1594 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1595 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1596 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1597 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1598
1599 /* Pin down new L4 */
1600 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1601 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1602
1603 /* Unpin Xen-provided one */
1604 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1605
1606 /* Switch over */
1607 pgd = init_level4_pgt;
1608
1609 /*
1610 * At this stage there can be no user pgd, and no page
1611 * structure to attach it to, so make sure we just set kernel
1612 * pgd.
1613 */
1614 xen_mc_batch();
1615 __xen_write_cr3(true, __pa(pgd));
1616 xen_mc_issue(PARAVIRT_LAZY_CPU);
1617
1618 reserve_early(__pa(xen_start_info->pt_base),
1619 __pa(xen_start_info->pt_base +
1620 xen_start_info->nr_pt_frames * PAGE_SIZE),
1621 "XEN PAGETABLES");
1622
1623 return pgd;
1624}
1625#else /* !CONFIG_X86_64 */
1626static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1627
1628static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1629{
1630 pmd_t *kernel_pmd;
1631
1632 init_pg_tables_start = __pa(pgd);
1633 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1634 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1635
1636 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1637 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1638
1639 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1640
1641 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1642 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1643 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1644
1645 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1646 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1647 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1648
1649 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1650
1651 xen_write_cr3(__pa(swapper_pg_dir));
1652
1653 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1654
1655 return swapper_pg_dir;
1656}
1657#endif /* CONFIG_X86_64 */
1658
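/*
 * Hedged reconstruction, not shown in this hunk: the pin_pagetable_pfn()
 * helper both xen_setup_kernel_pagetable() variants above call boils
 * down to a single MMUEXT op, assuming the standard mmuext_op layout.
 * The real definition lives earlier in enlighten.c.
 */
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
	struct mmuext_op op;

	op.cmd = cmd;			/* e.g. MMUEXT_PIN_L4_TABLE */
	op.arg1.mfn = pfn_to_mfn(pfn);	/* hypervisor wants machine frames */

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
		BUG();
}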
1276/* First C function to be called on Xen boot */ 1659/* First C function to be called on Xen boot */
1277asmlinkage void __init xen_start_kernel(void) 1660asmlinkage void __init xen_start_kernel(void)
1278{ 1661{
@@ -1301,53 +1684,56 @@ asmlinkage void __init xen_start_kernel(void)
1301 1684
1302 machine_ops = xen_machine_ops; 1685 machine_ops = xen_machine_ops;
1303 1686
1304#ifdef CONFIG_SMP 1687#ifdef CONFIG_X86_64
1305 smp_ops = xen_smp_ops; 1688 /* Disable until direct per-cpu data access. */
1689 have_vcpu_info_placement = 0;
1690 x86_64_init_pda();
1306#endif 1691#endif
1307 1692
1693 xen_smp_init();
1694
1308 /* Get mfn list */ 1695 /* Get mfn list */
1309 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1696 if (!xen_feature(XENFEAT_auto_translated_physmap))
1310 xen_build_dynamic_phys_to_machine(); 1697 xen_build_dynamic_phys_to_machine();
1311 1698
1312 pgd = (pgd_t *)xen_start_info->pt_base; 1699 pgd = (pgd_t *)xen_start_info->pt_base;
1313 1700
1314 init_pg_tables_start = __pa(pgd); 1701 /* Prevent unwanted bits from being set in PTEs. */
1315 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1702 __supported_pte_mask &= ~_PAGE_GLOBAL;
1316 max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; 1703 if (!is_initial_xendomain())
1317 1704 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1318 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1319
1320 /* keep using Xen gdt for now; no urgent need to change it */
1321
1322 x86_write_percpu(xen_cr3, __pa(pgd));
1323 x86_write_percpu(xen_current_cr3, __pa(pgd));
1324 1705
1325 /* Don't do the full vcpu_info placement stuff until we have a 1706 /* Don't do the full vcpu_info placement stuff until we have a
1326 possible map and a non-dummy shared_info. */ 1707 possible map and a non-dummy shared_info. */
1327 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1708 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1328 1709
1710 xen_raw_console_write("mapping kernel into physical memory\n");
1711 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1712
1713 init_mm.pgd = pgd;
1714
1715 /* keep using Xen gdt for now; no urgent need to change it */
1716
1329 pv_info.kernel_rpl = 1; 1717 pv_info.kernel_rpl = 1;
1330 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1718 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1331 pv_info.kernel_rpl = 0; 1719 pv_info.kernel_rpl = 0;
1332 1720
1333 /* Prevent unwanted bits from being set in PTEs. */
1334 __supported_pte_mask &= ~_PAGE_GLOBAL;
1335 if (!is_initial_xendomain())
1336 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1337
1338 /* set the limit of our address space */ 1721 /* set the limit of our address space */
1339 xen_reserve_top(); 1722 xen_reserve_top();
1340 1723
1724#ifdef CONFIG_X86_32
1341 /* set up basic CPUID stuff */ 1725 /* set up basic CPUID stuff */
1342 cpu_detect(&new_cpu_data); 1726 cpu_detect(&new_cpu_data);
1343 new_cpu_data.hard_math = 1; 1727 new_cpu_data.hard_math = 1;
1344 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1728 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1729#endif
1345 1730
1346 /* Poke various useful things into boot_params */ 1731 /* Poke various useful things into boot_params */
1347 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1732 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1348 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1733 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1349 ? __pa(xen_start_info->mod_start) : 0; 1734 ? __pa(xen_start_info->mod_start) : 0;
1350 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1735 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1736 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1351 1737
1352 if (!is_initial_xendomain()) { 1738 if (!is_initial_xendomain()) {
1353 add_preferred_console("xenboot", 0, NULL); 1739 add_preferred_console("xenboot", 0, NULL);
@@ -1355,6 +1741,21 @@ asmlinkage void __init xen_start_kernel(void)
1355 add_preferred_console("hvc", 0, NULL); 1741 add_preferred_console("hvc", 0, NULL);
1356 } 1742 }
1357 1743
1744 xen_raw_console_write("about to get started...\n");
1745
1746#if 0
1747 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1748 &boot_params, __pa_symbol(&boot_params),
1749 __va(__pa_symbol(&boot_params)));
1750
1751 walk(pgd, &boot_params);
1752 walk(pgd, __va(__pa(&boot_params)));
1753#endif
1754
1358 /* Start the world */ 1755 /* Start the world */
1756#ifdef CONFIG_X86_32
1359 i386_start_kernel(); 1757 i386_start_kernel();
1758#else
1759 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1760#endif
1360} 1761}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ff0aa74afaa1..aa37469da696 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
44 44
45#include <asm/pgtable.h> 45#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
47#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 48#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/linkage.h>
49 51
50#include <asm/xen/hypercall.h> 52#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
56#include "multicalls.h" 58#include "multicalls.h"
57#include "mmu.h" 59#include "mmu.h"
58 60
61/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a
63 * redzone above it, so round it up to a PGD boundary.
64 */
65#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
66
67
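/*
 * Illustrative sketch, not part of the patch: the USER_LIMIT rounding
 * in isolation, with assumed 4-level x86-64 constants and an assumed
 * STACK_TOP_MAX value.
 */
#include <stdio.h>

#define DEMO_PGDIR_SHIFT	39
#define DEMO_PGDIR_SIZE		(1UL << DEMO_PGDIR_SHIFT)
#define DEMO_PGDIR_MASK		(~(DEMO_PGDIR_SIZE - 1))

int main(void)
{
	unsigned long stack_top_max = 0x00007ffffffff000UL;	/* assumption */
	unsigned long user_limit =
		(stack_top_max + DEMO_PGDIR_SIZE - 1) & DEMO_PGDIR_MASK;

	/* rounds up to the next 512GB pgd boundary: 0x800000000000 */
	printf("USER_LIMIT = %#lx\n", user_limit);
	return 0;
}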
59#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) 68#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
60#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) 69#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
61 70
62/* Placeholder for holes in the address space */ 71/* Placeholder for holes in the address space */
63static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] 72static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
64 __attribute__((section(".data.page_aligned"))) =
65 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; 73 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
66 74
67 /* Array of pointers to pages containing p2m entries */ 75 /* Array of pointers to pages containing p2m entries */
68static unsigned long *p2m_top[TOP_ENTRIES] 76static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
69 __attribute__((section(".data.page_aligned"))) =
70 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; 77 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
71 78
72/* Arrays of p2m arrays expressed in mfns used for save/restore */ 79/* Arrays of p2m arrays expressed in mfns used for save/restore */
73static unsigned long p2m_top_mfn[TOP_ENTRIES] 80static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
74 __attribute__((section(".bss.page_aligned")));
75 81
76static unsigned long p2m_top_mfn_list[ 82static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
77 PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] 83 __page_aligned_bss;
78 __attribute__((section(".bss.page_aligned")));
79 84
80static inline unsigned p2m_top_index(unsigned long pfn) 85static inline unsigned p2m_top_index(unsigned long pfn)
81{ 86{
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
181 p2m_top[topidx][idx] = mfn; 186 p2m_top[topidx][idx] = mfn;
182} 187}
183 188
184xmaddr_t arbitrary_virt_to_machine(unsigned long address) 189xmaddr_t arbitrary_virt_to_machine(void *vaddr)
185{ 190{
191 unsigned long address = (unsigned long)vaddr;
186 unsigned int level; 192 unsigned int level;
187 pte_t *pte = lookup_address(address, &level); 193 pte_t *pte = lookup_address(address, &level);
188 unsigned offset = address & ~PAGE_MASK; 194 unsigned offset = address & ~PAGE_MASK;
189 195
190 BUG_ON(pte == NULL); 196 BUG_ON(pte == NULL);
191 197
192 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 198 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
193} 199}
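/*
 * Illustrative sketch, not part of the patch: why the new
 * (phys_addr_t) cast above matters. On 32-bit PAE a machine frame can
 * sit above 4GB, and shifting it left by PAGE_SHIFT in unsigned long
 * arithmetic would truncate the result. The 32-bit truncation is
 * simulated with a cast so the demo also shows the effect on a 64-bit
 * host.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t mfn = 0x123456;		/* frame whose address exceeds 4GB */
	uint64_t bad = (uint32_t)(mfn << 12);	/* what 32-bit math yields */
	uint64_t good = mfn << 12;		/* widened before shifting */

	printf("truncated=%#llx correct=%#llx\n",
	       (unsigned long long)bad, (unsigned long long)good);
	return 0;
}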
194 200
195void make_lowmem_page_readonly(void *vaddr) 201void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
256 262
257 xen_mc_batch(); 263 xen_mc_batch();
258 264
259 u.ptr = virt_to_machine(ptr).maddr; 265 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
260 u.val = pmd_val_ma(val); 267 u.val = pmd_val_ma(val);
261 extend_mmu_update(&u); 268 extend_mmu_update(&u);
262 269
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
283 */ 290 */
284void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 291void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
285{ 292{
286 pgd_t *pgd; 293 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
287 pud_t *pud;
288 pmd_t *pmd;
289 pte_t *pte;
290
291 pgd = swapper_pg_dir + pgd_index(vaddr);
292 if (pgd_none(*pgd)) {
293 BUG();
294 return;
295 }
296 pud = pud_offset(pgd, vaddr);
297 if (pud_none(*pud)) {
298 BUG();
299 return;
300 }
301 pmd = pmd_offset(pud, vaddr);
302 if (pmd_none(*pmd)) {
303 BUG();
304 return;
305 }
306 pte = pte_offset_kernel(pmd, vaddr);
307 /* <mfn,flags> stored as-is, to permit clearing entries */
308 xen_set_pte(pte, mfn_pte(mfn, flags));
309
310 /*
311 * It's enough to flush this one mapping.
312 * (PGE mappings get flushed as well)
313 */
314 __flush_tlb_one(vaddr);
315} 294}
316 295
317void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 296void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -364,8 +343,8 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
364static pteval_t pte_mfn_to_pfn(pteval_t val) 343static pteval_t pte_mfn_to_pfn(pteval_t val)
365{ 344{
366 if (val & _PAGE_PRESENT) { 345 if (val & _PAGE_PRESENT) {
367 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; 346 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
368 pteval_t flags = val & ~PTE_MASK; 347 pteval_t flags = val & PTE_FLAGS_MASK;
369 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; 348 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
370 } 349 }
371 350
@@ -375,8 +354,8 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
375static pteval_t pte_pfn_to_mfn(pteval_t val) 354static pteval_t pte_pfn_to_mfn(pteval_t val)
376{ 355{
377 if (val & _PAGE_PRESENT) { 356 if (val & _PAGE_PRESENT) {
378 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; 357 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
379 pteval_t flags = val & ~PTE_MASK; 358 pteval_t flags = val & PTE_FLAGS_MASK;
380 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 359 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
381 } 360 }
382 361
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
418 397
419 xen_mc_batch(); 398 xen_mc_batch();
420 399
421 u.ptr = virt_to_machine(ptr).maddr; 400 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
422 u.val = pud_val_ma(val); 402 u.val = pud_val_ma(val);
423 extend_mmu_update(&u); 403 extend_mmu_update(&u);
424 404
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
441 421
442void xen_set_pte(pte_t *ptep, pte_t pte) 422void xen_set_pte(pte_t *ptep, pte_t pte)
443{ 423{
424#ifdef CONFIG_X86_PAE
444 ptep->pte_high = pte.pte_high; 425 ptep->pte_high = pte.pte_high;
445 smp_wmb(); 426 smp_wmb();
446 ptep->pte_low = pte.pte_low; 427 ptep->pte_low = pte.pte_low;
428#else
429 *ptep = pte;
430#endif
447} 431}
448 432
433#ifdef CONFIG_X86_PAE
449void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 434void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
450{ 435{
451 set_64bit((u64 *)ptep, pte_val_ma(pte)); 436 set_64bit((u64 *)ptep, native_pte_val(pte));
452} 437}
453 438
454void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 439void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
462{ 447{
463 set_pmd(pmdp, __pmd(0)); 448 set_pmd(pmdp, __pmd(0));
464} 449}
450#endif /* CONFIG_X86_PAE */
465 451
466pmd_t xen_make_pmd(pmdval_t pmd) 452pmd_t xen_make_pmd(pmdval_t pmd)
467{ 453{
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
469 return native_make_pmd(pmd); 455 return native_make_pmd(pmd);
470} 456}
471 457
458#if PAGETABLE_LEVELS == 4
459pudval_t xen_pud_val(pud_t pud)
460{
461 return pte_mfn_to_pfn(pud.pud);
462}
463
464pud_t xen_make_pud(pudval_t pud)
465{
466 pud = pte_pfn_to_mfn(pud);
467
468 return native_make_pud(pud);
469}
470
471pgd_t *xen_get_user_pgd(pgd_t *pgd)
472{
473 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
474 unsigned offset = pgd - pgd_page;
475 pgd_t *user_ptr = NULL;
476
477 if (offset < pgd_index(USER_LIMIT)) {
478 struct page *page = virt_to_page(pgd_page);
479 user_ptr = (pgd_t *)page->private;
480 if (user_ptr)
481 user_ptr += offset;
482 }
483
484 return user_ptr;
485}
486
487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
488{
489 struct mmu_update u;
490
491 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u);
494}
495
496/*
497 * Raw hypercall-based set_pgd, intended for in early boot before
498 * there's a page structure. This implies:
499 * 1. The only existing pagetable is the kernel's
500 * 2. It is always pinned
501 * 3. It has no user pagetable attached to it
502 */
503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
504{
505 preempt_disable();
506
507 xen_mc_batch();
508
509 __xen_set_pgd_hyper(ptr, val);
510
511 xen_mc_issue(PARAVIRT_LAZY_MMU);
512
513 preempt_enable();
514}
515
516void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519
520 /* If page is not pinned, we can just update the entry
521 directly */
522 if (!page_pinned(ptr)) {
523 *ptr = val;
524 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr));
526 *user_ptr = val;
527 }
528 return;
529 }
530
531 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */
533 xen_mc_batch();
534
535 __xen_set_pgd_hyper(ptr, val);
536 if (user_ptr)
537 __xen_set_pgd_hyper(user_ptr, val);
538
539 xen_mc_issue(PARAVIRT_LAZY_MMU);
540}
541#endif /* PAGETABLE_LEVELS == 4 */
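/*
 * Hedged sketch, not shown in this hunk: the user pgd that
 * xen_get_user_pgd() above reads back is stashed in page->private by
 * the new pgd_alloc hook. A minimal reconstruction under that
 * assumption; the real xen_pgd_alloc() in this series also checks for
 * an existing private page and only allocates on 64-bit.
 */
static int xen_pgd_alloc_sketch(struct mm_struct *mm)
{
	struct page *page = virt_to_page(mm->pgd);

	/* one shadow page holds the user half of the address space */
	page->private = __get_free_page(GFP_KERNEL | __GFP_ZERO);

	return page->private ? 0 : -ENOMEM;
}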
542
472/* 543/*
473 (Yet another) pagetable walker. This one is intended for pinning a 544 * (Yet another) pagetable walker. This one is intended for pinning a
474 pagetable. This means that it walks a pagetable and calls the 545 * pagetable. This means that it walks a pagetable and calls the
475 callback function on each page it finds making up the page table, 546 * callback function on each page it finds making up the page table,
476 at every level. It walks the entire pagetable, but it only bothers 547 * at every level. It walks the entire pagetable, but it only bothers
477 pinning pte pages which are below pte_limit. In the normal case 548 * pinning pte pages which are below limit. In the normal case this
478 this will be TASK_SIZE, but at boot we need to pin up to 549 * will be STACK_TOP_MAX, but at boot we need to pin up to
479 FIXADDR_TOP. But the important bit is that we don't pin beyond 550 * FIXADDR_TOP.
480 there, because then we start getting into Xen's ptes. 551 *
481*/ 552 * For 32-bit the important bit is that we don't pin beyond there,
482static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 553 * because then we start getting into Xen's ptes.
554 *
555 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole.
557 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
483 unsigned long limit) 559 unsigned long limit)
484{ 560{
485 pgd_t *pgd = pgd_base;
486 int flush = 0; 561 int flush = 0;
487 unsigned long addr = 0; 562 unsigned hole_low, hole_high;
488 unsigned long pgd_next; 563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
564 unsigned pgdidx, pudidx, pmdidx;
489 565
490 BUG_ON(limit > FIXADDR_TOP); 566 /* The limit is the last byte to be touched */
567 limit--;
568 BUG_ON(limit >= FIXADDR_TOP);
491 569
492 if (xen_feature(XENFEAT_auto_translated_physmap)) 570 if (xen_feature(XENFEAT_auto_translated_physmap))
493 return 0; 571 return 0;
494 572
495 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 573 /*
574 * 64-bit has a great big hole in the middle of the address
575 * space, which contains the Xen mappings. On 32-bit these
576 * will end up making a zero-sized hole and so is a no-op.
577 */
578 hole_low = pgd_index(USER_LIMIT);
579 hole_high = pgd_index(PAGE_OFFSET);
580
581 pgdidx_limit = pgd_index(limit);
582#if PTRS_PER_PUD > 1
583 pudidx_limit = pud_index(limit);
584#else
585 pudidx_limit = 0;
586#endif
587#if PTRS_PER_PMD > 1
588 pmdidx_limit = pmd_index(limit);
589#else
590 pmdidx_limit = 0;
591#endif
592
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
496 pud_t *pud; 596 pud_t *pud;
497 unsigned long pud_limit, pud_next;
498 597
499 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 598 if (pgdidx >= hole_low && pgdidx < hole_high)
599 continue;
500 600
501 if (!pgd_val(*pgd)) 601 if (!pgd_val(pgd[pgdidx]))
502 continue; 602 continue;
503 603
504 pud = pud_offset(pgd, 0); 604 pud = pud_offset(&pgd[pgdidx], 0);
505 605
506 if (PTRS_PER_PUD > 1) /* not folded */ 606 if (PTRS_PER_PUD > 1) /* not folded */
507 flush |= (*func)(virt_to_page(pud), PT_PUD); 607 flush |= (*func)(virt_to_page(pud), PT_PUD);
508 608
509 for (; addr != pud_limit; pud++, addr = pud_next) { 609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
510 pmd_t *pmd; 610 pmd_t *pmd;
511 unsigned long pmd_limit;
512 611
513 pud_next = pud_addr_end(addr, pud_limit); 612 if (pgdidx == pgdidx_limit &&
514 613 pudidx > pudidx_limit)
515 if (pud_next < limit) 614 goto out;
516 pmd_limit = pud_next;
517 else
518 pmd_limit = limit;
519 615
520 if (pud_none(*pud)) 616 if (pud_none(pud[pudidx]))
521 continue; 617 continue;
522 618
523 pmd = pmd_offset(pud, 0); 619 pmd = pmd_offset(&pud[pudidx], 0);
524 620
525 if (PTRS_PER_PMD > 1) /* not folded */ 621 if (PTRS_PER_PMD > 1) /* not folded */
526 flush |= (*func)(virt_to_page(pmd), PT_PMD); 622 flush |= (*func)(virt_to_page(pmd), PT_PMD);
527 623
528 for (; addr != pmd_limit; pmd++) { 624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
529 addr += (PAGE_SIZE * PTRS_PER_PTE); 625 struct page *pte;
530 if ((pmd_limit-1) < (addr-1)) { 626
531 addr = pmd_limit; 627 if (pgdidx == pgdidx_limit &&
532 break; 628 pudidx == pudidx_limit &&
533 } 629 pmdidx > pmdidx_limit)
630 goto out;
534 631
535 if (pmd_none(*pmd)) 632 if (pmd_none(pmd[pmdidx]))
536 continue; 633 continue;
537 634
538 flush |= (*func)(pmd_page(*pmd), PT_PTE); 635 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE);
539 } 637 }
540 } 638 }
541 } 639 }
542 640out:
543 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
544 641
545 return flush; 642 return flush;
546} 643}
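/*
 * Illustrative sketch, not part of the patch: where the 64-bit hole
 * lands in pgd_walk()'s index space, using assumed x86-64 constants.
 * Entries in [hole_low, hole_high) cover the canonical-address hole
 * and Xen's own mappings, so the walker skips them.
 */
#include <stdio.h>

#define DEMO_PGDIR_SHIFT	39
#define DEMO_PTRS_PER_PGD	512
#define demo_pgd_index(a)	(((a) >> DEMO_PGDIR_SHIFT) & (DEMO_PTRS_PER_PGD - 1))

int main(void)
{
	unsigned long user_limit  = 0x0000800000000000UL;	/* assumption */
	unsigned long page_offset = 0xffff880000000000UL;	/* assumption */

	printf("hole_low=%lu hole_high=%lu\n",
	       (unsigned long)demo_pgd_index(user_limit),
	       (unsigned long)demo_pgd_index(page_offset));	/* 256, 272 */
	return 0;
}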
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
622{ 719{
623 xen_mc_batch(); 720 xen_mc_batch();
624 721
625 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
626 /* re-enable interrupts for kmap_flush_unused */ 723 /* re-enable interrupts for kmap_flush_unused */
627 xen_mc_issue(0); 724 xen_mc_issue(0);
628 kmap_flush_unused(); 725 kmap_flush_unused();
629 xen_mc_batch(); 726 xen_mc_batch();
630 } 727 }
631 728
729#ifdef CONFIG_X86_64
730 {
731 pgd_t *user_pgd = xen_get_user_pgd(pgd);
732
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734
735 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 }
739 }
740#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
744#endif
632 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */
633 xen_mc_issue(0); 747 xen_mc_issue(0);
634} 748}
635 749
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
656 spin_unlock_irqrestore(&pgd_lock, flags); 770 spin_unlock_irqrestore(&pgd_lock, flags);
657} 771}
658 772
659/* The init_mm pagetable is really pinned as soon as it's created, but 773/*
660 that's before we have page structures to store the bits. So do all 774 * The init_mm pagetable is really pinned as soon as it's created, but
661 the book-keeping now. */ 775 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now.
777 */
662static __init int mark_pinned(struct page *page, enum pt_level level) 778static __init int mark_pinned(struct page *page, enum pt_level level)
663{ 779{
664 SetPagePinned(page); 780 SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
708 824
709 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 825 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
710 826
711 pgd_walk(pgd, unpin_page, TASK_SIZE); 827#ifdef CONFIG_X86_64
828 {
829 pgd_t *user_pgd = xen_get_user_pgd(pgd);
830
831 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD);
834 }
835 }
836#endif
837
838#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */
 840 unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
841#endif
842
843 pgd_walk(pgd, unpin_page, USER_LIMIT);
712 844
713 xen_mc_issue(0); 845 xen_mc_issue(0);
714} 846}
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
727 list_for_each_entry(page, &pgd_list, lru) { 859 list_for_each_entry(page, &pgd_list, lru) {
728 if (PageSavePinned(page)) { 860 if (PageSavePinned(page)) {
729 BUG_ON(!PagePinned(page)); 861 BUG_ON(!PagePinned(page));
730 printk("unpinning pinned %p\n", page_address(page));
731 xen_pgd_unpin((pgd_t *)page_address(page)); 862 xen_pgd_unpin((pgd_t *)page_address(page));
732 ClearPageSavePinned(page); 863 ClearPageSavePinned(page);
733 } 864 }
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
757static void drop_other_mm_ref(void *info) 888static void drop_other_mm_ref(void *info)
758{ 889{
759 struct mm_struct *mm = info; 890 struct mm_struct *mm = info;
891 struct mm_struct *active_mm;
892
893#ifdef CONFIG_X86_64
894 active_mm = read_pda(active_mm);
895#else
896 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
897#endif
760 898
761 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 899 if (active_mm == mm)
762 leave_mm(smp_processor_id()); 900 leave_mm(smp_processor_id());
763 901
764 /* If this cpu still has a stale cr3 reference, then make sure 902 /* If this cpu still has a stale cr3 reference, then make sure
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 297bf9f5b8bc..0f59bd03f9e3 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,18 +10,6 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
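/*
 * Illustrative sketch, not part of the patch: the rotation the removed
 * macros performed. A 32-bit %cr3 cannot hold a pagetable base above
 * 4GB directly, so Xen moves the pfn's top 12 bits into the
 * page-aligned low 12 bits of cr3. Assumes 32-bit unsigned.
 */
#include <stdio.h>

static unsigned demo_pfn_to_cr3(unsigned pfn)
{
	return (pfn << 12) | (pfn >> 20);
}

static unsigned demo_cr3_to_pfn(unsigned cr3)
{
	return (cr3 >> 12) | (cr3 << 20);
}

int main(void)
{
	unsigned pfn = 0x123456;	/* frame number above the 4GB line */
	unsigned cr3 = demo_pfn_to_cr3(pfn);

	/* round-trips back to the original pfn */
	printf("pfn=%#x cr3=%#x back=%#x\n", pfn, cr3, demo_cr3_to_pfn(cr3));
	return 0;
}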
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
44void xen_set_pte(pte_t *ptep, pte_t pteval); 32void xen_set_pte(pte_t *ptep, pte_t pteval);
45void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
46 pte_t *ptep, pte_t pteval); 34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
47void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
48void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); 42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
49void xen_set_pud(pud_t *ptr, pud_t val); 43void xen_set_pud(pud_t *ptr, pud_t val);
50void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); 44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
51void xen_set_pud_hyper(pud_t *ptr, pud_t val); 45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 46
53void xen_pmd_clear(pmd_t *pmdp); 47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
54 55
55pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
56void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 3c63c4da7ed1..9efd1c6c9776 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
76 if (ret) { 76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 78 ret, smp_processor_id());
79 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 80 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 81 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 82 i+1, b->mcidx,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index e0a39595bde3..d67901083888 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -42,7 +42,7 @@ char * __init xen_memory_setup(void)
42 42
43 e820.nr_map = 0; 43 e820.nr_map = 0;
44 44
45 e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM); 45 e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
46 46
47 /* 47 /*
48 * Even though this is normal, usable memory under Xen, reserve 48 * Even though this is normal, usable memory under Xen, reserve
@@ -83,30 +83,72 @@ static void xen_idle(void)
83 83
84/* 84/*
85 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
86 */ 88 */
87static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
88{ 90{
89 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
90 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
91 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
92} 98}
93 99
94void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
95{ 101{
96 int cpu = smp_processor_id(); 102 struct callback_register callback = {
97 extern void xen_sysenter_target(void); 103 .type = type,
98 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
99 static struct callback_register sysenter = {
100 .type = CALLBACKTYPE_sysenter,
101 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
102 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
103 }; 106 };
104 107
105 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
106 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
107 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
108 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
 127	if (ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
109 } 143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
150 }
151#endif /* CONFIG_X86_64 */
110} 152}
111 153
112void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
120 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
121 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
122 164
123 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
124 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
125 168
126 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
127 171
128 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
129 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
143 187
144 pm_idle = xen_idle; 188 pm_idle = xen_idle;
145 189
146#ifdef CONFIG_SMP
147 /* fill cpus_possible with all available cpus */
148 xen_fill_possible_map();
149#endif
150
151 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
152 191
153 fiddle_vdso(); 192 fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 233156f39b7f..d8faf79a0a1d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -15,6 +15,7 @@
15 * This does not handle HOTPLUG_CPU yet. 15 * This does not handle HOTPLUG_CPU yet.
16 */ 16 */
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/kernel_stat.h>
18#include <linux/err.h> 19#include <linux/err.h>
19#include <linux/smp.h> 20#include <linux/smp.h>
20 21
@@ -35,6 +36,8 @@
35#include "xen-ops.h" 36#include "xen-ops.h"
36#include "mmu.h" 37#include "mmu.h"
37 38
39static void __cpuinit xen_init_lock_cpu(int cpu);
40
38cpumask_t xen_cpu_initialized_map; 41cpumask_t xen_cpu_initialized_map;
39 42
40static DEFINE_PER_CPU(int, resched_irq); 43static DEFINE_PER_CPU(int, resched_irq);
@@ -66,13 +69,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
66 int cpu = smp_processor_id(); 69 int cpu = smp_processor_id();
67 70
68 cpu_init(); 71 cpu_init();
72 preempt_disable();
73
69 xen_enable_sysenter(); 74 xen_enable_sysenter();
75 xen_enable_syscall();
70 76
71 preempt_disable(); 77 cpu = smp_processor_id();
72 per_cpu(cpu_state, cpu) = CPU_ONLINE; 78 smp_store_cpu_info(cpu);
79 cpu_data(cpu).x86_max_cores = 1;
80 set_cpu_sibling_map(cpu);
73 81
74 xen_setup_cpu_clockevents(); 82 xen_setup_cpu_clockevents();
75 83
84 cpu_set(cpu, cpu_online_map);
85 x86_write_percpu(cpu_state, CPU_ONLINE);
86 wmb();
87
76 /* We can take interrupts now: we're officially "up". */ 88 /* We can take interrupts now: we're officially "up". */
77 local_irq_enable(); 89 local_irq_enable();
78 90
@@ -141,56 +153,39 @@ static int xen_smp_intr_init(unsigned int cpu)
141 return rc; 153 return rc;
142} 154}
143 155
144void __init xen_fill_possible_map(void) 156static void __init xen_fill_possible_map(void)
145{ 157{
146 int i, rc; 158 int i, rc;
147 159
148 for (i = 0; i < NR_CPUS; i++) { 160 for (i = 0; i < NR_CPUS; i++) {
149 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 161 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
150 if (rc >= 0) 162 if (rc >= 0) {
163 num_processors++;
151 cpu_set(i, cpu_possible_map); 164 cpu_set(i, cpu_possible_map);
165 }
152 } 166 }
153} 167}
154 168
155void __init xen_smp_prepare_boot_cpu(void) 169static void __init xen_smp_prepare_boot_cpu(void)
156{ 170{
157 int cpu;
158
159 BUG_ON(smp_processor_id() != 0); 171 BUG_ON(smp_processor_id() != 0);
160 native_smp_prepare_boot_cpu(); 172 native_smp_prepare_boot_cpu();
161 173
162 /* We've switched to the "real" per-cpu gdt, so make sure the 174 /* We've switched to the "real" per-cpu gdt, so make sure the
163 old memory can be recycled */ 175 old memory can be recycled */
164 make_lowmem_page_readwrite(&per_cpu__gdt_page); 176 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
165
166 for_each_possible_cpu(cpu) {
167 cpus_clear(per_cpu(cpu_sibling_map, cpu));
168 /*
169 * cpu_core_map lives in a per cpu area that is cleared
170 * when the per cpu array is allocated.
171 *
172 * cpus_clear(per_cpu(cpu_core_map, cpu));
173 */
174 }
175 177
176 xen_setup_vcpu_info_placement(); 178 xen_setup_vcpu_info_placement();
177} 179}
178 180
179void __init xen_smp_prepare_cpus(unsigned int max_cpus) 181static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
180{ 182{
181 unsigned cpu; 183 unsigned cpu;
182 184
183 for_each_possible_cpu(cpu) { 185 xen_init_lock_cpu(0);
184 cpus_clear(per_cpu(cpu_sibling_map, cpu));
185 /*
186 * cpu_core_ map will be zeroed when the per
187 * cpu area is allocated.
188 *
189 * cpus_clear(per_cpu(cpu_core_map, cpu));
190 */
191 }
192 186
193 smp_store_cpu_info(0); 187 smp_store_cpu_info(0);
188 cpu_data(0).x86_max_cores = 1;
194 set_cpu_sibling_map(0); 189 set_cpu_sibling_map(0);
195 190
196 if (xen_smp_intr_init(0)) 191 if (xen_smp_intr_init(0))
@@ -225,7 +220,7 @@ static __cpuinit int
225cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 220cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
226{ 221{
227 struct vcpu_guest_context *ctxt; 222 struct vcpu_guest_context *ctxt;
228 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 223 struct desc_struct *gdt;
229 224
230 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 225 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
231 return 0; 226 return 0;
@@ -234,12 +229,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	if (ctxt == NULL)
 		return -ENOMEM;
 
+	gdt = get_cpu_gdt_table(cpu);
+
 	ctxt->flags = VGCF_IN_KERNEL;
 	ctxt->user_regs.ds = __USER_DS;
 	ctxt->user_regs.es = __USER_DS;
-	ctxt->user_regs.fs = __KERNEL_PERCPU;
-	ctxt->user_regs.gs = 0;
 	ctxt->user_regs.ss = __KERNEL_DS;
+#ifdef CONFIG_X86_32
+	ctxt->user_regs.fs = __KERNEL_PERCPU;
+#endif
 	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 
@@ -249,11 +247,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 
 	ctxt->ldt_ents = 0;
 
-	BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
-	make_lowmem_page_readonly(gdt->gdt);
+	BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+	make_lowmem_page_readonly(gdt);
 
-	ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
-	ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
+	ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+	ctxt->gdt_ents = GDT_ENTRIES;
 
 	ctxt->user_regs.cs = __KERNEL_CS;
 	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +259,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->kernel_ss = __KERNEL_DS;
 	ctxt->kernel_sp = idle->thread.sp0;
 
+#ifdef CONFIG_X86_32
 	ctxt->event_callback_cs = __KERNEL_CS;
-	ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
 	ctxt->failsafe_callback_cs = __KERNEL_CS;
+#endif
+	ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
 	ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
 
 	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +276,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	return 0;
 }
 
-int __cpuinit xen_cpu_up(unsigned int cpu)
+static int __cpuinit xen_cpu_up(unsigned int cpu)
 {
 	struct task_struct *idle = idle_task(cpu);
 	int rc;
@@ -287,10 +287,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
 		return rc;
 #endif
 
+#ifdef CONFIG_X86_64
+	/* Allocate node local memory for AP pdas */
+	WARN_ON(cpu == 0);
+	if (cpu > 0) {
+		rc = get_local_pda(cpu);
+		if (rc)
+			return rc;
+	}
+#endif
+
+#ifdef CONFIG_X86_32
 	init_gdt(cpu);
 	per_cpu(current_task, cpu) = idle;
 	irq_ctx_init(cpu);
+#else
+	cpu_pda(cpu)->pcurrent = idle;
+	clear_tsk_thread_flag(idle, TIF_FORK);
+#endif
 	xen_setup_timer(cpu);
+	xen_init_lock_cpu(cpu);
+
+	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 
 	/* make sure interrupts start blocked */
 	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -306,20 +324,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
 	if (rc)
 		return rc;
 
-	smp_store_cpu_info(cpu);
-	set_cpu_sibling_map(cpu);
-	/* This must be done before setting cpu_online_map */
-	wmb();
-
-	cpu_set(cpu, cpu_online_map);
-
 	rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
 	BUG_ON(rc);
 
+	while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+		HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+		barrier();
+	}
+
 	return 0;
 }
 
-void xen_smp_cpus_done(unsigned int max_cpus)
+static void xen_smp_cpus_done(unsigned int max_cpus)
 {
 }
 
@@ -335,12 +351,12 @@ static void stop_self(void *v)
 	BUG();
 }
 
-void xen_smp_send_stop(void)
+static void xen_smp_send_stop(void)
 {
 	smp_call_function(stop_self, NULL, 0);
 }
 
-void xen_smp_send_reschedule(int cpu)
+static void xen_smp_send_reschedule(int cpu)
 {
 	xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
 }
@@ -351,18 +367,18 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
 
 	cpus_and(mask, mask, cpu_online_map);
 
-	for_each_cpu_mask(cpu, mask)
+	for_each_cpu_mask_nr(cpu, mask)
 		xen_send_IPI_one(cpu, vector);
 }
 
-void xen_smp_send_call_function_ipi(cpumask_t mask)
+static void xen_smp_send_call_function_ipi(cpumask_t mask)
 {
 	int cpu;
 
 	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
 
 	/* Make sure other vcpus get a chance to run if they need to. */
-	for_each_cpu_mask(cpu, mask) {
+	for_each_cpu_mask_nr(cpu, mask) {
 		if (xen_vcpu_stolen(cpu)) {
 			HYPERVISOR_sched_op(SCHEDOP_yield, 0);
 			break;
@@ -370,7 +386,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
 	}
 }
 
-void xen_smp_send_call_function_single_ipi(int cpu)
+static void xen_smp_send_call_function_single_ipi(int cpu)
 {
 	xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
 }
@@ -379,7 +395,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
 {
 	irq_enter();
 	generic_smp_call_function_interrupt();
+#ifdef CONFIG_X86_32
 	__get_cpu_var(irq_stat).irq_call_count++;
+#else
+	add_pda(irq_call_count, 1);
+#endif
 	irq_exit();
 
 	return IRQ_HANDLED;
@@ -389,8 +409,196 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
 {
 	irq_enter();
 	generic_smp_call_function_single_interrupt();
+#ifdef CONFIG_X86_32
 	__get_cpu_var(irq_stat).irq_call_count++;
+#else
+	add_pda(irq_call_count, 1);
+#endif
 	irq_exit();
 
 	return IRQ_HANDLED;
 }
+
+struct xen_spinlock {
+	unsigned char lock;	/* 0 -> free; 1 -> locked */
+	unsigned short spinners; /* count of waiting cpus */
+};
+
+static int xen_spin_is_locked(struct raw_spinlock *lock)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+	return xl->lock != 0;
+}
+
+static int xen_spin_is_contended(struct raw_spinlock *lock)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+	/* Not strictly true; this is only the count of contended
+	   lock-takers entering the slow path. */
+	return xl->spinners != 0;
+}
+
+static int xen_spin_trylock(struct raw_spinlock *lock)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+	u8 old = 1;
+
+	asm("xchgb %b0,%1"
+	    : "+q" (old), "+m" (xl->lock) : : "memory");
+
+	return old == 0;
+}
+
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
+
+static inline void spinning_lock(struct xen_spinlock *xl)
+{
+	__get_cpu_var(lock_spinners) = xl;
+	wmb();			/* set lock of interest before count */
+	asm(LOCK_PREFIX " incw %0"
+	    : "+m" (xl->spinners) : : "memory");
+}
+
+static inline void unspinning_lock(struct xen_spinlock *xl)
+{
+	asm(LOCK_PREFIX " decw %0"
+	    : "+m" (xl->spinners) : : "memory");
+	wmb();			/* decrement count before clearing lock */
+	__get_cpu_var(lock_spinners) = NULL;
+}
+
+static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+	int irq = __get_cpu_var(lock_kicker_irq);
+	int ret;
+
+	/* If kicker interrupts not initialized yet, just spin */
+	if (irq == -1)
+		return 0;
+
+	/* announce we're spinning */
+	spinning_lock(xl);
+
+	/* clear pending */
+	xen_clear_irq_pending(irq);
+
+	/* check again make sure it didn't become free while
+	   we weren't looking */
+	ret = xen_spin_trylock(lock);
+	if (ret)
+		goto out;
+
+	/* block until irq becomes pending */
+	xen_poll_irq(irq);
+	kstat_this_cpu.irqs[irq]++;
+
+out:
+	unspinning_lock(xl);
+	return ret;
+}
+
+static void xen_spin_lock(struct raw_spinlock *lock)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+	int timeout;
+	u8 oldval;
+
+	do {
+		timeout = 1 << 10;
+
+		asm("1: xchgb %1,%0\n"
+		    "   testb %1,%1\n"
+		    "   jz 3f\n"
+		    "2: rep;nop\n"
+		    "   cmpb $0,%0\n"
+		    "   je 1b\n"
+		    "   dec %2\n"
+		    "   jnz 2b\n"
+		    "3:\n"
+		    : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
+		    : "1" (1)
+		    : "memory");
+
+	} while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
+}
+
+static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		/* XXX should mix up next cpu selection */
+		if (per_cpu(lock_spinners, cpu) == xl) {
+			xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+			break;
+		}
+	}
+}
+
+static void xen_spin_unlock(struct raw_spinlock *lock)
+{
+	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+	smp_wmb();		/* make sure no writes get moved after unlock */
+	xl->lock = 0;		/* release lock */
+
+	/* make sure unlock happens before kick */
+	barrier();
+
+	if (unlikely(xl->spinners))
+		xen_spin_unlock_slow(xl);
+}
+
+static __cpuinit void xen_init_lock_cpu(int cpu)
+{
+	int irq;
+	const char *name;
+
+	name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+	irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
+				     cpu,
+				     xen_reschedule_interrupt,
+				     IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+				     name,
+				     NULL);
+
+	if (irq >= 0) {
+		disable_irq(irq); /* make sure it's never delivered */
+		per_cpu(lock_kicker_irq, cpu) = irq;
+	}
+
+	printk("cpu %d spinlock event irq %d\n", cpu, irq);
+}
+
+static void __init xen_init_spinlocks(void)
+{
+	pv_lock_ops.spin_is_locked = xen_spin_is_locked;
+	pv_lock_ops.spin_is_contended = xen_spin_is_contended;
+	pv_lock_ops.spin_lock = xen_spin_lock;
+	pv_lock_ops.spin_trylock = xen_spin_trylock;
+	pv_lock_ops.spin_unlock = xen_spin_unlock;
+}
+
+static const struct smp_ops xen_smp_ops __initdata = {
+	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+	.smp_prepare_cpus = xen_smp_prepare_cpus,
+	.cpu_up = xen_cpu_up,
+	.smp_cpus_done = xen_smp_cpus_done,
+
+	.smp_send_stop = xen_smp_send_stop,
+	.smp_send_reschedule = xen_smp_send_reschedule,
+
+	.send_call_func_ipi = xen_smp_send_call_function_ipi,
+	.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
+};
+
+void __init xen_smp_init(void)
+{
+	smp_ops = xen_smp_ops;
+	xen_fill_possible_map();
+	xen_init_spinlocks();
+}
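A note on the spinlock hunk above: xen_spin_lock() grabs the byte lock with a locked xchg, spins for a bounded count, and only then drops into xen_spin_lock_slow(), which records itself in lock_spinners, re-checks the lock, and blocks on a per-cpu event-channel irq that xen_spin_unlock_slow() kicks. Below is a minimal user-space model of that protocol; the Xen-specific blocking (xen_poll_irq) and the kick IPI are replaced with a sched_yield() stand-in, so only the fast path and the spinner bookkeeping carry over.

    #include <sched.h>
    #include <stdatomic.h>

    struct model_spinlock {
    	atomic_uchar lock;      /* 0 -> free; 1 -> locked */
    	atomic_ushort spinners; /* count of waiting threads */
    };

    static int model_trylock(struct model_spinlock *xl)
    {
    	/* like the xchgb: previous value 0 means we took the lock */
    	return atomic_exchange(&xl->lock, 1) == 0;
    }

    static void model_lock(struct model_spinlock *xl)
    {
    	int timeout;

    	for (;;) {
    		/* fast path: bounded spin, as in the inline asm loop */
    		for (timeout = 1 << 10; timeout; timeout--)
    			if (model_trylock(xl))
    				return;

    		/* slow path: announce we're spinning, re-check in case
    		 * the lock was freed while we weren't looking, then
    		 * block; the real code blocks on the kicker irq */
    		atomic_fetch_add(&xl->spinners, 1);
    		if (model_trylock(xl)) {
    			atomic_fetch_sub(&xl->spinners, 1);
    			return;
    		}
    		sched_yield();          /* stands in for xen_poll_irq() */
    		atomic_fetch_sub(&xl->spinners, 1);
    	}
    }

    static void model_unlock(struct model_spinlock *xl)
    {
    	atomic_store(&xl->lock, 0);     /* release: a single store */
    	if (atomic_load(&xl->spinners)) /* kick only if someone waits */
    		sched_yield();          /* stands in for the UNLOCK IPI */
    }

The point of the spinners count is that the uncontended unlock stays a single store; the expensive kick is only attempted when a waiter has declared itself.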
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 251669a932d4..2a234db5949b 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
 		xen_cpu_initialized_map = cpu_online_map;
 #endif
 		xen_vcpu_restore();
-		xen_timer_resume();
 	}
 
 }
 
+void xen_arch_resume(void)
+{
+	/* nothing */
+}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..7f58304fafb3
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
+/*
+   Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+   The inline versions are the same as the direct-use versions, with the
+   pre- and post-amble chopped off.
+
+   This code is encoded for size rather than absolute efficiency,
+   with a view to being able to inline as much as possible.
+
+   We only bother with direct forms (ie, vcpu in pda) of the operations
+   here; the indirect forms are better handled in C, since they're
+   generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x)	.globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI	0x80000000
+
+#if 0
+#include <asm/percpu.h>
+
+/*
+   Enable events. This clears the event mask and tests the pending
+   event status with one and operation. If there are pending
+   events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+	/* Unmask events */
+	movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+
+	/* Preempt here doesn't matter because that will deal with
+	   any pending interrupts. The pending check may end up being
+	   run on the wrong CPU, but that doesn't hurt. */
+
+	/* Test for pending */
+	testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+	jz 1f
+
+2:	call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+	ret
+	ENDPROC(xen_irq_enable_direct)
+	RELOC(xen_irq_enable_direct, 2b+1)
+
+/*
+   Disabling events is simply a matter of making the event mask
+   non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+	movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ENDPATCH(xen_irq_disable_direct)
+	ret
+	ENDPROC(xen_irq_disable_direct)
+	RELOC(xen_irq_disable_direct, 0)
+
+/*
+   (xen_)save_fl is used to get the current interrupt enable status.
+   Callers expect the status to be in X86_EFLAGS_IF, and other bits
+   may be set in the return value. We take advantage of this by
+   making sure that X86_EFLAGS_IF has the right value (and other bits
+   in that byte are 0), but other bits in the return value are
+   undefined. We need to toggle the state of the bit, because
+   Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+	testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+	setz %ah
+	addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+	ret
+	ENDPROC(xen_save_fl_direct)
+	RELOC(xen_save_fl_direct, 0)
+
+/*
+   In principle the caller should be passing us a value return
+   from xen_save_fl_direct, but for robustness sake we test only
+   the X86_EFLAGS_IF flag rather than the whole byte. After
+   setting the interrupt mask state, it checks for unmasked
+   pending events and enters the hypervisor to get them delivered
+   if so.
+ */
+ENTRY(xen_restore_fl_direct)
+	testb $X86_EFLAGS_IF>>8, %ah
+	setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+	/* Preempt here doesn't matter because that will deal with
+	   any pending interrupts. The pending check may end up being
+	   run on the wrong CPU, but that doesn't hurt. */
+
+	/* check for unmasked and pending */
+	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+	jz 1f
+2:	call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+	ret
+	ENDPROC(xen_restore_fl_direct)
+	RELOC(xen_restore_fl_direct, 2b+1)
+
+
+/*
+   Force an event check by making a hypercall,
+   but preserve regs before making the call.
+ */
+check_events:
+	push %rax
+	push %rcx
+	push %rdx
+	push %rsi
+	push %rdi
+	push %r8
+	push %r9
+	push %r10
+	push %r11
+	call force_evtchn_callback
+	pop %r11
+	pop %r10
+	pop %r9
+	pop %r8
+	pop %rdi
+	pop %rsi
+	pop %rdx
+	pop %rcx
+	pop %rax
+	ret
+#endif
+
+ENTRY(xen_adjust_exception_frame)
+	mov 8+0(%rsp),%rcx
+	mov 8+8(%rsp),%r11
+	ret $16
+
+hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
+/*
+	Xen64 iret frame:
+
+	ss
+	rsp
+	rflags
+	cs
+	rip		<-- standard iret frame
+
+	flags
+
+	rcx		}
+	r11		}<-- pushed by hypercall page
+rsp ->	rax		}
+ */
+ENTRY(xen_iret)
+	pushq $0
+1:	jmp hypercall_iret
+ENDPATCH(xen_iret)
+RELOC(xen_iret, 1b+1)
+
+/*
+   sysexit is not used for 64-bit processes, so it's
+   only ever used to return to 32-bit compat userspace.
+ */
+ENTRY(xen_sysexit)
+	pushq $__USER32_DS
+	pushq %rcx
+	pushq $X86_EFLAGS_IF
+	pushq $__USER32_CS
+	pushq %rdx
+
+	pushq $0
+1:	jmp hypercall_iret
+ENDPATCH(xen_sysexit)
+RELOC(xen_sysexit, 1b+1)
+
+ENTRY(xen_sysret64)
+	/* We're already on the usermode stack at this point, but still
+	   with the kernel gs, so we can easily switch back */
+	movq %rsp, %gs:pda_oldrsp
+	movq %gs:pda_kernelstack,%rsp
+
+	pushq $__USER_DS
+	pushq %gs:pda_oldrsp
+	pushq %r11
+	pushq $__USER_CS
+	pushq %rcx
+
+	pushq $VGCF_in_syscall
+1:	jmp hypercall_iret
+ENDPATCH(xen_sysret64)
+RELOC(xen_sysret64, 1b+1)
+
+ENTRY(xen_sysret32)
+	/* We're already on the usermode stack at this point, but still
+	   with the kernel gs, so we can easily switch back */
+	movq %rsp, %gs:pda_oldrsp
+	movq %gs:pda_kernelstack, %rsp
+
+	pushq $__USER32_DS
+	pushq %gs:pda_oldrsp
+	pushq %r11
+	pushq $__USER32_CS
+	pushq %rcx
+
+	pushq $VGCF_in_syscall
+1:	jmp hypercall_iret
+ENDPATCH(xen_sysret32)
+RELOC(xen_sysret32, 1b+1)
+
+/*
+	Xen handles syscall callbacks much like ordinary exceptions,
+	which means we have:
+	 - kernel gs
+	 - kernel rsp
+	 - an iret-like stack frame on the stack (including rcx and r11):
+		ss
+		rsp
+		rflags
+		cs
+		rip
+		r11
+	rsp->	rcx
+
+	In all the entrypoints, we undo all that to make it look
+	like a CPU-generated syscall/sysenter and jump to the normal
+	entrypoint.
+ */
+
+.macro undo_xen_syscall
+	mov 0*8(%rsp),%rcx
+	mov 1*8(%rsp),%r11
+	mov 5*8(%rsp),%rsp
+.endm
+
+/* Normal 64-bit system call target */
+ENTRY(xen_syscall_target)
+	undo_xen_syscall
+	jmp system_call_after_swapgs
+ENDPROC(xen_syscall_target)
+
+#ifdef CONFIG_IA32_EMULATION
+
+/* 32-bit compat syscall target */
+ENTRY(xen_syscall32_target)
+	undo_xen_syscall
+	jmp ia32_cstar_target
+ENDPROC(xen_syscall32_target)
+
+/* 32-bit compat sysenter target */
+ENTRY(xen_sysenter_target)
+	undo_xen_syscall
+	jmp ia32_sysenter_target
+ENDPROC(xen_sysenter_target)
+
+#else /* !CONFIG_IA32_EMULATION */
+
+ENTRY(xen_syscall32_target)
+ENTRY(xen_sysenter_target)
+	lea 16(%rsp), %rsp	/* strip %rcx,%r11 */
+	mov $-ENOSYS, %rax
+	pushq $VGCF_in_syscall
+	jmp hypercall_iret
+ENDPROC(xen_syscall32_target)
+ENDPROC(xen_sysenter_target)
+
+#endif /* CONFIG_IA32_EMULATION */
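The subtlest part of this new file is the flags handling in xen_save_fl_direct/xen_restore_fl_direct: Xen keeps a per-vcpu event mask where non-zero means events are disabled, while callers expect x86 semantics, where a set X86_EFLAGS_IF means interrupts are enabled. The setz/addb pair above manufactures exactly that inverted IF bit in %ah. A small C sketch of the contract (the vcpu_info field names match the asm; the hypercall that check_events makes is stubbed out here, so this is a model, not the kernel code):

    #include <stdint.h>

    #define X86_EFLAGS_IF 0x200UL

    /* per-vcpu state, field names as in struct vcpu_info */
    static volatile uint8_t evtchn_upcall_pending;
    static volatile uint8_t evtchn_upcall_mask; /* non-zero = masked */

    /* stands in for the force_evtchn_callback hypercall wrapper */
    static void model_force_evtchn_callback(void) { /* no-op here */ }

    /* xen_save_fl_direct: report the mask in x86 terms --
     * IF is set exactly when events are unmasked */
    static unsigned long model_save_fl(void)
    {
    	return evtchn_upcall_mask ? 0 : X86_EFLAGS_IF;
    }

    /* xen_restore_fl_direct: invert IF back into the mask, then, if we
     * just unmasked with an event already pending (the cmpw $0x0001
     * check in the asm), enter the hypervisor to have it delivered */
    static void model_restore_fl(unsigned long flags)
    {
    	evtchn_upcall_mask = (flags & X86_EFLAGS_IF) ? 0 : 1;
    	if (evtchn_upcall_mask == 0 && evtchn_upcall_pending)
    		model_force_evtchn_callback();
    }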
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7c0cf6320a0a..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,15 +5,24 @@
 
 #include <linux/elfnote.h>
 #include <linux/init.h>
+
 #include <asm/boot.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
 #include <xen/interface/elfnote.h>
 #include <asm/xen/interface.h>
 
 	__INIT
 ENTRY(startup_xen)
-	movl %esi,xen_start_info
 	cld
-	movl $(init_thread_union+THREAD_SIZE),%esp
+#ifdef CONFIG_X86_32
+	mov %esi,xen_start_info
+	mov $init_thread_union+THREAD_SIZE,%esp
+#else
+	mov %rsi,xen_start_info
+	mov $init_thread_union+THREAD_SIZE,%rsp
+#endif
 	jmp xen_start_kernel
 
 	__FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
 .pushsection .text
 	.align PAGE_SIZE_asm
 ENTRY(hypercall_page)
-	.skip 0x1000
+	.skip PAGE_SIZE_asm
 .popsection
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
 	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
-	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
-	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
-	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
+#ifdef CONFIG_X86_32
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
+#else
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
+#endif
+	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
+	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
 		.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
 	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
-	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START)
+	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
+	ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
 
 #endif /*CONFIG_XEN */
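The .long to _ASM_PTR swap in the hunk above is what lets one xen-head.S serve both guest widths: the payloads of VIRT_BASE, ENTRY, HYPERCALL_PAGE and HV_START_LOW must be guest-pointer-sized words. _ASM_PTR comes from <asm/asm.h>; conceptually it reduces to the following (a paraphrase, not the literal definition, which is built via __ASM_SEL):

    /* paraphrase of the <asm/asm.h> idea */
    #ifdef CONFIG_X86_32
    # define _ASM_PTR	.long	/* 4-byte note payload */
    #else
    # define _ASM_PTR	.quad	/* 8-byte note payload */
    #endif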
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 6f4b1045c1c2..dd3c23152a2e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
 void __init xen_init_IRQ(void);
 void xen_enable_sysenter(void);
+void xen_enable_syscall(void);
 void xen_vcpu_restore(void);
 
 void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
 unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
 unsigned long long xen_sched_clock(void);
-void xen_timer_resume(void);
 
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
 
 void xen_mark_init_mm_pinned(void);
 
-void __init xen_fill_possible_map(void);
-
 void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
 
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-void xen_smp_send_call_function_ipi(cpumask_t mask);
-void xen_smp_send_call_function_single_ipi(int cpu);
+#ifdef CONFIG_SMP
+void xen_smp_init(void);
 
 extern cpumask_t xen_cpu_initialized_map;
+#else
+static inline void xen_smp_init(void) {}
+#endif
 
 
 /* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
 DECL_ASM(unsigned long, xen_save_fl_direct, void);
 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
+/* These are not functions, and cannot be called normally */
 void xen_iret(void);
 void xen_sysexit(void);
+void xen_sysret32(void);
+void xen_sysret64(void);
+void xen_adjust_exception_frame(void);
 
 #endif /* XEN_OPS_H */
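One idiom worth noting in this last hunk: instead of exporting ten SMP entry points (now all static in smp.c), the header exposes a single xen_smp_init() and compiles it down to nothing on UP builds, so the caller needs no #ifdef. Reduced to a self-contained sketch with hypothetical names (not from the tree):

    /* config-dependent API with a no-op stub for the disabled case */
    #ifdef CONFIG_EXAMPLE_SMP
    void example_smp_init(void);	/* real version lives in a .c file */
    #else
    static inline void example_smp_init(void) { /* UP: nothing to do */ }
    #endif

    /* the call site is identical either way: */
    static inline void example_start_kernel(void)
    {
    	example_smp_init();
    }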