author     Ingo Molnar <mingo@elte.hu>  2008-10-13 05:05:51 -0400
committer  Ingo Molnar <mingo@elte.hu>  2008-10-13 05:05:51 -0400
commit     accba5f3965d6a9d1bf7c1e1a7995d17e9d521b6
tree       8fb40782e79472ed882ff2098d4dd295557278ee  /arch/x86
parent     6852fd9b86d05063c6ef49d2e12e061cc7f6a105
parent     4480f15b3306f43bbb0310d461142b4e897ca45b
Merge branch 'linus' into oprofile-v2
Conflicts:
	arch/x86/kernel/apic_32.c
	arch/x86/oprofile/nmi_int.c
	include/linux/pci_ids.h
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 245
-rw-r--r--  arch/x86/Kconfig.cpu | 133
-rw-r--r--  arch/x86/Kconfig.debug | 13
-rw-r--r--  arch/x86/Makefile | 5
-rw-r--r--  arch/x86/Makefile_32.cpu | 5
-rw-r--r--  arch/x86/boot/Makefile | 5
-rw-r--r--  arch/x86/boot/boot.h | 10
-rw-r--r--  arch/x86/boot/compressed/Makefile | 8
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 5
-rw-r--r--  arch/x86/boot/compressed/misc.c | 12
-rw-r--r--  arch/x86/boot/compressed/relocs.c | 2
-rw-r--r--  arch/x86/boot/cpu.c | 20
-rw-r--r--  arch/x86/boot/cpucheck.c | 18
-rw-r--r--  arch/x86/boot/edd.c | 7
-rw-r--r--  arch/x86/boot/header.S | 1
-rw-r--r--  arch/x86/boot/main.c | 5
-rw-r--r--  arch/x86/boot/memory.c | 1
-rw-r--r--  arch/x86/boot/mkcpustr.c | 40
-rw-r--r--  arch/x86/boot/video-vesa.c | 2
-rw-r--r--  arch/x86/configs/i386_defconfig | 317
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 280
-rw-r--r--  arch/x86/crypto/Makefile | 2
-rw-r--r--  arch/x86/crypto/crc32c-intel.c | 197
-rw-r--r--  arch/x86/ia32/ia32_aout.c | 17
-rw-r--r--  arch/x86/ia32/ia32_signal.c | 138
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 9
-rw-r--r--  arch/x86/kernel/Makefile | 15
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 53
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 3
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 4
-rw-r--r--  arch/x86/kernel/alternative.c | 44
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 384
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 220
-rw-r--r--  arch/x86/kernel/aperture_64.c | 6
-rw-r--r--  arch/x86/kernel/apic_32.c | 461
-rw-r--r--  arch/x86/kernel/apic_64.c | 633
-rw-r--r--  arch/x86/kernel/apm_32.c | 4
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 2
-rw-r--r--  arch/x86/kernel/bios_uv.c | 10
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 34
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 105
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 547
-rw-r--r--  arch/x86/kernel/cpu/amd_64.c | 224
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 6
-rw-r--r--  arch/x86/kernel/cpu/centaur.c | 15
-rw-r--r--  arch/x86/kernel/cpu/centaur_64.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cmpxchg.c | 72
-rw-r--r--  arch/x86/kernel/cpu/common.c | 988
-rw-r--r--  arch/x86/kernel/cpu/common_64.c | 670
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 19
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Kconfig | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 23
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/elanfreq.c | 44
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 41
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 18
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 14
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 3
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 73
-rw-r--r--  arch/x86/kernel/cpu/feature_names.c | 83
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 365
-rw-r--r--  arch/x86/kernel/cpu/intel_64.c | 95
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 172
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c | 7
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 18
-rw-r--r--  arch/x86/kernel/cpu/mkcapflags.pl | 32
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 15
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 281
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 94
-rw-r--r--  arch/x86/kernel/cpu/powerflags.c | 20
-rw-r--r--  arch/x86/kernel/cpu/transmeta.c | 32
-rw-r--r--  arch/x86/kernel/cpu/umc.c | 3
-rw-r--r--  arch/x86/kernel/cpuid.c | 16
-rw-r--r--  arch/x86/kernel/crash_dump_64.c | 13
-rw-r--r--  arch/x86/kernel/doublefault_32.c | 2
-rw-r--r--  arch/x86/kernel/ds.c | 954
-rw-r--r--  arch/x86/kernel/e820.c | 30
-rw-r--r--  arch/x86/kernel/early-quirks.c | 66
-rw-r--r--  arch/x86/kernel/early_printk.c | 748
-rw-r--r--  arch/x86/kernel/efi.c | 6
-rw-r--r--  arch/x86/kernel/efi_32.c | 4
-rw-r--r--  arch/x86/kernel/entry_64.S | 4
-rw-r--r--  arch/x86/kernel/es7000_32.c (renamed from arch/x86/mach-es7000/es7000plat.c) | 87
-rw-r--r--  arch/x86/kernel/genapic_64.c | 87
-rw-r--r--  arch/x86/kernel/genapic_flat_64.c | 62
-rw-r--r--  arch/x86/kernel/genx2apic_cluster.c | 159
-rw-r--r--  arch/x86/kernel/genx2apic_phys.c | 154
-rw-r--r--  arch/x86/kernel/genx2apic_uv_x.c | 80
-rw-r--r--  arch/x86/kernel/head64.c | 6
-rw-r--r--  arch/x86/kernel/head_32.S | 42
-rw-r--r--  arch/x86/kernel/head_64.S | 4
-rw-r--r--  arch/x86/kernel/hpet.c | 43
-rw-r--r--  arch/x86/kernel/i387.c | 168
-rw-r--r--  arch/x86/kernel/i8259.c | 24
-rw-r--r--  arch/x86/kernel/io_apic_32.c | 53
-rw-r--r--  arch/x86/kernel/io_apic_64.c | 664
-rw-r--r--  arch/x86/kernel/io_delay.c | 8
-rw-r--r--  arch/x86/kernel/ioport.c | 1
-rw-r--r--  arch/x86/kernel/ipi.c | 3
-rw-r--r--  arch/x86/kernel/irq_32.c | 2
-rw-r--r--  arch/x86/kernel/irq_64.c | 2
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 49
-rw-r--r--  arch/x86/kernel/k8.c | 5
-rw-r--r--  arch/x86/kernel/kdebugfs.c | 1
-rw-r--r--  arch/x86/kernel/kgdb.c | 50
-rw-r--r--  arch/x86/kernel/kvm.c | 2
-rw-r--r--  arch/x86/kernel/ldt.c | 16
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c | 57
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 2
-rw-r--r--  arch/x86/kernel/mfgpt_32.c | 52
-rw-r--r--  arch/x86/kernel/microcode.c | 860
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 435
-rw-r--r--  arch/x86/kernel/microcode_core.c | 508
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 480
-rw-r--r--  arch/x86/kernel/mmconf-fam10h_64.c | 2
-rw-r--r--  arch/x86/kernel/mpparse.c | 19
-rw-r--r--  arch/x86/kernel/msr.c | 40
-rw-r--r--  arch/x86/kernel/nmi.c | 39
-rw-r--r--  arch/x86/kernel/numaq_32.c | 9
-rw-r--r--  arch/x86/kernel/olpc.c | 6
-rw-r--r--  arch/x86/kernel/paravirt-spinlocks.c | 37
-rw-r--r--  arch/x86/kernel/paravirt.c | 30
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c | 2
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 107
-rw-r--r--  arch/x86/kernel/pci-dma.c | 310
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 176
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 18
-rw-r--r--  arch/x86/kernel/pci-swiotlb_64.c | 2
-rw-r--r--  arch/x86/kernel/pcspeaker.c | 13
-rw-r--r--  arch/x86/kernel/process.c | 21
-rw-r--r--  arch/x86/kernel/process_32.c | 102
-rw-r--r--  arch/x86/kernel/process_64.c | 193
-rw-r--r--  arch/x86/kernel/ptrace.c | 522
-rw-r--r--  arch/x86/kernel/reboot.c | 17
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S | 178
-rw-r--r--  arch/x86/kernel/setup.c | 256
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 30
-rw-r--r--  arch/x86/kernel/sigframe.h | 19
-rw-r--r--  arch/x86/kernel/signal_32.c | 273
-rw-r--r--  arch/x86/kernel/signal_64.c | 353
-rw-r--r--  arch/x86/kernel/smp.c | 6
-rw-r--r--  arch/x86/kernel/smpboot.c | 187
-rw-r--r--  arch/x86/kernel/smpcommon.c | 17
-rw-r--r--  arch/x86/kernel/summit_32.c | 2
-rw-r--r--  arch/x86/kernel/sys_i386_32.c | 2
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 44
-rw-r--r--  arch/x86/kernel/syscall_64.c | 4
-rw-r--r--  arch/x86/kernel/time_32.c | 1
-rw-r--r--  arch/x86/kernel/tlb_32.c | 8
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 3
-rw-r--r--  arch/x86/kernel/tls.c | 1
-rw-r--r--  arch/x86/kernel/traps_32.c | 5
-rw-r--r--  arch/x86/kernel/traps_64.c | 81
-rw-r--r--  arch/x86/kernel/tsc.c | 424
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 6
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 22
-rw-r--r--  arch/x86/kernel/vm86_32.c | 1
-rw-r--r--  arch/x86/kernel/vmi_32.c | 19
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S | 17
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S | 9
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 2
-rw-r--r--  arch/x86/kernel/xsave.c | 345
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/mmu.c | 111
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 14
-rw-r--r--  arch/x86/kvm/svm.c | 22
-rw-r--r--  arch/x86/kvm/vmx.c | 21
-rw-r--r--  arch/x86/kvm/vmx.h | 17
-rw-r--r--  arch/x86/kvm/x86.c | 133
-rw-r--r--  arch/x86/lguest/boot.c | 41
-rw-r--r--  arch/x86/lib/Makefile | 3
-rw-r--r--  arch/x86/lib/copy_user_64.S | 2
-rw-r--r--  arch/x86/lib/copy_user_nocache_64.S | 3
-rw-r--r--  arch/x86/lib/msr-on-cpu.c | 76
-rw-r--r--  arch/x86/lib/string_32.c | 42
-rw-r--r--  arch/x86/lib/strstr_32.c | 6
-rw-r--r--  arch/x86/lib/usercopy_32.c | 7
-rw-r--r--  arch/x86/mach-default/setup.c | 19
-rw-r--r--  arch/x86/mach-es7000/Makefile | 5
-rw-r--r--  arch/x86/mach-es7000/es7000.h | 114
-rw-r--r--  arch/x86/mach-generic/Makefile | 1
-rw-r--r--  arch/x86/mach-generic/bigsmp.c | 9
-rw-r--r--  arch/x86/mach-generic/es7000.c | 13
-rw-r--r--  arch/x86/mach-generic/numaq.c | 12
-rw-r--r--  arch/x86/mach-generic/summit.c | 11
-rw-r--r--  arch/x86/mach-rdc321x/platform.c | 1
-rw-r--r--  arch/x86/mach-voyager/voyager_smp.c | 2
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/discontig_32.c | 2
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 4
-rw-r--r--  arch/x86/mm/fault.c | 17
-rw-r--r--  arch/x86/mm/gup.c | 298
-rw-r--r--  arch/x86/mm/init_32.c | 92
-rw-r--r--  arch/x86/mm/init_64.c | 242
-rw-r--r--  arch/x86/mm/ioremap.c | 68
-rw-r--r--  arch/x86/mm/mmio-mod.c | 4
-rw-r--r--  arch/x86/mm/numa_64.c | 10
-rw-r--r--  arch/x86/mm/pageattr-test.c | 12
-rw-r--r--  arch/x86/mm/pageattr.c | 490
-rw-r--r--  arch/x86/mm/pat.c | 182
-rw-r--r--  arch/x86/mm/pgtable.c | 9
-rw-r--r--  arch/x86/mm/pgtable_32.c | 50
-rw-r--r--  arch/x86/mm/srat_32.c | 12
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 43
-rw-r--r--  arch/x86/oprofile/op_model_p4.c | 175
-rw-r--r--  arch/x86/pci/acpi.c | 5
-rw-r--r--  arch/x86/pci/amd_bus.c | 52
-rw-r--r--  arch/x86/pci/fixup.c | 3
-rw-r--r--  arch/x86/pci/i386.c | 26
-rw-r--r--  arch/x86/pci/irq.c | 167
-rw-r--r--  arch/x86/pci/legacy.c | 2
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 79
-rw-r--r--  arch/x86/pci/numaq_32.c | 5
-rw-r--r--  arch/x86/power/cpu_32.c | 13
-rw-r--r--  arch/x86/power/cpu_64.c | 7
-rw-r--r--  arch/x86/power/hibernate_asm_32.S | 40
-rw-r--r--  arch/x86/xen/Kconfig | 12
-rw-r--r--  arch/x86/xen/Makefile | 12
-rw-r--r--  arch/x86/xen/debugfs.c | 123
-rw-r--r--  arch/x86/xen/debugfs.h | 10
-rw-r--r--  arch/x86/xen/enlighten.c | 319
-rw-r--r--  arch/x86/xen/irq.c | 143
-rw-r--r--  arch/x86/xen/mmu.c | 314
-rw-r--r--  arch/x86/xen/mmu.h | 3
-rw-r--r--  arch/x86/xen/multicalls.c | 115
-rw-r--r--  arch/x86/xen/setup.c | 2
-rw-r--r--  arch/x86/xen/smp.c | 245
-rw-r--r--  arch/x86/xen/spinlock.c | 428
-rw-r--r--  arch/x86/xen/time.c | 12
-rw-r--r--  arch/x86/xen/xen-asm_32.S | 2
-rw-r--r--  arch/x86/xen/xen-asm_64.S | 22
-rw-r--r--  arch/x86/xen/xen-ops.h | 8
233 files changed, 14717 insertions, 7304 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e3cba0b45600..fc8351f374fd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,12 +23,14 @@ config X86
23 select HAVE_OPROFILE 23 select HAVE_OPROFILE
24 select HAVE_IOREMAP_PROT 24 select HAVE_IOREMAP_PROT
25 select HAVE_KPROBES 25 select HAVE_KPROBES
26 select ARCH_WANT_OPTIONAL_GPIOLIB if !X86_RDC321X 26 select ARCH_WANT_OPTIONAL_GPIOLIB
27 select HAVE_KRETPROBES 27 select HAVE_KRETPROBES
28 select HAVE_DYNAMIC_FTRACE 28 select HAVE_DYNAMIC_FTRACE
29 select HAVE_FTRACE 29 select HAVE_FTRACE
30 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 30 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
31 select HAVE_ARCH_KGDB if !X86_VOYAGER 31 select HAVE_ARCH_KGDB if !X86_VOYAGER
32 select HAVE_ARCH_TRACEHOOK
33 select HAVE_GENERIC_DMA_COHERENT if X86_32
32 select HAVE_EFFICIENT_UNALIGNED_ACCESS 34 select HAVE_EFFICIENT_UNALIGNED_ACCESS
33 35
34config ARCH_DEFCONFIG 36config ARCH_DEFCONFIG
@@ -332,20 +334,6 @@ config X86_BIGSMP
332 334
333endif 335endif
334 336
335config X86_RDC321X
336 bool "RDC R-321x SoC"
337 depends on X86_32
338 select M486
339 select X86_REBOOTFIXUPS
340 select GENERIC_GPIO
341 select LEDS_CLASS
342 select LEDS_GPIO
343 select NEW_LEDS
344 help
345 This option is needed for RDC R-321x system-on-chip, also known
346 as R-8610-(G).
347 If you don't have one of these chips, you should say N here.
348
349config X86_VSMP 337config X86_VSMP
350 bool "Support for ScaleMP vSMP" 338 bool "Support for ScaleMP vSMP"
351 select PARAVIRT 339 select PARAVIRT
@@ -369,6 +357,16 @@ config X86_VISWS
369 A kernel compiled for the Visual Workstation will run on general 357 A kernel compiled for the Visual Workstation will run on general
370 PCs as well. See <file:Documentation/sgi-visws.txt> for details. 358 PCs as well. See <file:Documentation/sgi-visws.txt> for details.
371 359
360config X86_RDC321X
361 bool "RDC R-321x SoC"
362 depends on X86_32
363 select M486
364 select X86_REBOOTFIXUPS
365 help
366 This option is needed for RDC R-321x system-on-chip, also known
367 as R-8610-(G).
368 If you don't have one of these chips, you should say N here.
369
372config SCHED_NO_NO_OMIT_FRAME_POINTER 370config SCHED_NO_NO_OMIT_FRAME_POINTER
373 def_bool y 371 def_bool y
374 prompt "Single-depth WCHAN output" 372 prompt "Single-depth WCHAN output"
@@ -556,6 +554,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
556config AMD_IOMMU 554config AMD_IOMMU
557 bool "AMD IOMMU support" 555 bool "AMD IOMMU support"
558 select SWIOTLB 556 select SWIOTLB
557 select PCI_MSI
559 depends on X86_64 && PCI && ACPI 558 depends on X86_64 && PCI && ACPI
560 help 559 help
561 With this option you can enable support for AMD IOMMU hardware in 560 With this option you can enable support for AMD IOMMU hardware in
@@ -580,35 +579,29 @@ config SWIOTLB
580 579
581config IOMMU_HELPER 580config IOMMU_HELPER
582 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) 581 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
582
583config MAXSMP 583config MAXSMP
584 bool "Configure Maximum number of SMP Processors and NUMA Nodes" 584 bool "Configure Maximum number of SMP Processors and NUMA Nodes"
585 depends on X86_64 && SMP 585 depends on X86_64 && SMP && BROKEN
586 default n 586 default n
587 help 587 help
588 Configure maximum number of CPUS and NUMA Nodes for this architecture. 588 Configure maximum number of CPUS and NUMA Nodes for this architecture.
589 If unsure, say N. 589 If unsure, say N.
590 590
591if MAXSMP
592config NR_CPUS 591config NR_CPUS
593 int 592 int "Maximum number of CPUs (2-512)" if !MAXSMP
594 default "4096" 593 range 2 512
595endif
596
597if !MAXSMP
598config NR_CPUS
599 int "Maximum number of CPUs (2-4096)"
600 range 2 4096
601 depends on SMP 594 depends on SMP
595 default "4096" if MAXSMP
602 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 596 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
603 default "8" 597 default "8"
604 help 598 help
605 This allows you to specify the maximum number of CPUs which this 599 This allows you to specify the maximum number of CPUs which this
606 kernel will support. The maximum supported value is 4096 and the 600 kernel will support. The maximum supported value is 512 and the
607 minimum value which makes sense is 2. 601 minimum value which makes sense is 2.
608 602
609 This is purely to save memory - each supported CPU adds 603 This is purely to save memory - each supported CPU adds
610 approximately eight kilobytes to the kernel image. 604 approximately eight kilobytes to the kernel image.
611endif
612 605
613config SCHED_SMT 606config SCHED_SMT
614 bool "SMT (Hyperthreading) scheduler support" 607 bool "SMT (Hyperthreading) scheduler support"
@@ -785,23 +778,45 @@ config X86_REBOOTFIXUPS
785 Say N otherwise. 778 Say N otherwise.
786 779
787config MICROCODE 780config MICROCODE
788 tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" 781 tristate "/dev/cpu/microcode - microcode support"
789 select FW_LOADER 782 select FW_LOADER
790 ---help--- 783 ---help---
791 If you say Y here, you will be able to update the microcode on 784 If you say Y here, you will be able to update the microcode on
792 Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, 785 certain Intel and AMD processors. The Intel support is for the
793 Pentium III, Pentium 4, Xeon etc. You will obviously need the 786 IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
794 actual microcode binary data itself which is not shipped with the 787 Pentium 4, Xeon etc. The AMD support is for family 0x10 and
795 Linux kernel. 788 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
789 You will obviously need the actual microcode binary data itself
790 which is not shipped with the Linux kernel.
796 791
797 For latest news and information on obtaining all the required 792 This option selects the general module only, you need to select
798 ingredients for this driver, check: 793 at least one vendor specific module as well.
799 <http://www.urbanmyth.org/microcode/>.
800 794
801 To compile this driver as a module, choose M here: the 795 To compile this driver as a module, choose M here: the
802 module will be called microcode. 796 module will be called microcode.
803 797
804config MICROCODE_OLD_INTERFACE 798config MICROCODE_INTEL
799 bool "Intel microcode patch loading support"
800 depends on MICROCODE
801 default MICROCODE
802 select FW_LOADER
803 --help---
804 This options enables microcode patch loading support for Intel
805 processors.
806
807 For latest news and information on obtaining all the required
808 Intel ingredients for this driver, check:
809 <http://www.urbanmyth.org/microcode/>.
810
811config MICROCODE_AMD
812 bool "AMD microcode patch loading support"
813 depends on MICROCODE
814 select FW_LOADER
815 --help---
816 If you select this option, microcode patch loading support for AMD
817 processors will be enabled.
818
819 config MICROCODE_OLD_INTERFACE
805 def_bool y 820 def_bool y
806 depends on MICROCODE 821 depends on MICROCODE
807 822
@@ -954,9 +969,9 @@ config NUMA
954 local memory controller of the CPU and add some more 969 local memory controller of the CPU and add some more
955 NUMA awareness to the kernel. 970 NUMA awareness to the kernel.
956 971
957 For i386 this is currently highly experimental and should be only 972 For 32-bit this is currently highly experimental and should be only
958 used for kernel development. It might also cause boot failures. 973 used for kernel development. It might also cause boot failures.
959 For x86_64 this is recommended on all multiprocessor Opteron systems. 974 For 64-bit this is recommended on all multiprocessor Opteron systems.
960 If the system is EM64T, you should say N unless your system is 975 If the system is EM64T, you should say N unless your system is
961 EM64T NUMA. 976 EM64T NUMA.
962 977
@@ -999,17 +1014,10 @@ config NUMA_EMU
999 into virtual nodes when booted with "numa=fake=N", where N is the 1014 into virtual nodes when booted with "numa=fake=N", where N is the
1000 number of nodes. This is only useful for debugging. 1015 number of nodes. This is only useful for debugging.
1001 1016
1002if MAXSMP
1003
1004config NODES_SHIFT 1017config NODES_SHIFT
1005 int 1018 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
1006 default "9"
1007endif
1008
1009if !MAXSMP
1010config NODES_SHIFT
1011 int "Maximum NUMA Nodes (as a power of 2)"
1012 range 1 9 if X86_64 1019 range 1 9 if X86_64
1020 default "9" if MAXSMP
1013 default "6" if X86_64 1021 default "6" if X86_64
1014 default "4" if X86_NUMAQ 1022 default "4" if X86_NUMAQ
1015 default "3" 1023 default "3"
@@ -1017,7 +1025,6 @@ config NODES_SHIFT
1017 help 1025 help
1018 Specify the maximum number of NUMA Nodes available on the target 1026 Specify the maximum number of NUMA Nodes available on the target
1019 system. Increases memory reserved to accomodate various tables. 1027 system. Increases memory reserved to accomodate various tables.
1020endif
1021 1028
1022config HAVE_ARCH_BOOTMEM_NODE 1029config HAVE_ARCH_BOOTMEM_NODE
1023 def_bool y 1030 def_bool y
@@ -1037,7 +1044,7 @@ config HAVE_ARCH_ALLOC_REMAP
1037 1044
1038config ARCH_FLATMEM_ENABLE 1045config ARCH_FLATMEM_ENABLE
1039 def_bool y 1046 def_bool y
1040 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA 1047 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
1041 1048
1042config ARCH_DISCONTIGMEM_ENABLE 1049config ARCH_DISCONTIGMEM_ENABLE
1043 def_bool y 1050 def_bool y
@@ -1053,7 +1060,7 @@ config ARCH_SPARSEMEM_DEFAULT
1053 1060
1054config ARCH_SPARSEMEM_ENABLE 1061config ARCH_SPARSEMEM_ENABLE
1055 def_bool y 1062 def_bool y
1056 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) 1063 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
1057 select SPARSEMEM_STATIC if X86_32 1064 select SPARSEMEM_STATIC if X86_32
1058 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 1065 select SPARSEMEM_VMEMMAP_ENABLE if X86_64
1059 1066
@@ -1076,6 +1083,56 @@ config HIGHPTE
1076 low memory. Setting this option will put user-space page table 1083 low memory. Setting this option will put user-space page table
1077 entries in high memory. 1084 entries in high memory.
1078 1085
1086config X86_CHECK_BIOS_CORRUPTION
1087 bool "Check for low memory corruption"
1088 help
1089 Periodically check for memory corruption in low memory, which
1090 is suspected to be caused by BIOS. Even when enabled in the
1091 configuration, it is disabled at runtime. Enable it by
1092 setting "memory_corruption_check=1" on the kernel command
1093 line. By default it scans the low 64k of memory every 60
1094 seconds; see the memory_corruption_check_size and
1095 memory_corruption_check_period parameters in
1096 Documentation/kernel-parameters.txt to adjust this.
1097
1098 When enabled with the default parameters, this option has
1099 almost no overhead, as it reserves a relatively small amount
1100 of memory and scans it infrequently. It both detects corruption
1101 and prevents it from affecting the running system.
1102
1103 It is, however, intended as a diagnostic tool; if repeatable
1104 BIOS-originated corruption always affects the same memory,
1105 you can use memmap= to prevent the kernel from using that
1106 memory.
1107
1108config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
1109 bool "Set the default setting of memory_corruption_check"
1110 depends on X86_CHECK_BIOS_CORRUPTION
1111 default y
1112 help
1113 Set whether the default state of memory_corruption_check is
1114 on or off.
1115
1116config X86_RESERVE_LOW_64K
1117 bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
1118 default y
1119 help
1120 Reserve the first 64K of physical RAM on BIOSes that are known
1121 to potentially corrupt that memory range. A numbers of BIOSes are
1122 known to utilize this area during suspend/resume, so it must not
1123 be used by the kernel.
1124
1125 Set this to N if you are absolutely sure that you trust the BIOS
1126 to get all its memory reservations and usages right.
1127
1128 If you have doubts about the BIOS (e.g. suspend/resume does not
1129 work or there's kernel crashes after certain hardware hotplug
1130 events) and it's not AMI or Phoenix, then you might want to enable
1131 X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
1132 corruption patterns.
1133
1134 Say Y if unsure.
1135
1079config MATH_EMULATION 1136config MATH_EMULATION
1080 bool 1137 bool
1081 prompt "Math emulation" if X86_32 1138 prompt "Math emulation" if X86_32
@@ -1134,10 +1191,10 @@ config MTRR
1134 You can safely say Y even if your machine doesn't have MTRRs, you'll 1191 You can safely say Y even if your machine doesn't have MTRRs, you'll
1135 just add about 9 KB to your kernel. 1192 just add about 9 KB to your kernel.
1136 1193
1137 See <file:Documentation/mtrr.txt> for more information. 1194 See <file:Documentation/x86/mtrr.txt> for more information.
1138 1195
1139config MTRR_SANITIZER 1196config MTRR_SANITIZER
1140 bool 1197 def_bool y
1141 prompt "MTRR cleanup support" 1198 prompt "MTRR cleanup support"
1142 depends on MTRR 1199 depends on MTRR
1143 help 1200 help
@@ -1148,7 +1205,7 @@ config MTRR_SANITIZER
1148 The largest mtrr entry size for a continous block can be set with 1205 The largest mtrr entry size for a continous block can be set with
1149 mtrr_chunk_size. 1206 mtrr_chunk_size.
1150 1207
1151 If unsure, say N. 1208 If unsure, say Y.
1152 1209
1153config MTRR_SANITIZER_ENABLE_DEFAULT 1210config MTRR_SANITIZER_ENABLE_DEFAULT
1154 int "MTRR cleanup enable value (0-1)" 1211 int "MTRR cleanup enable value (0-1)"
@@ -1208,7 +1265,6 @@ config IRQBALANCE
1208config SECCOMP 1265config SECCOMP
1209 def_bool y 1266 def_bool y
1210 prompt "Enable seccomp to safely compute untrusted bytecode" 1267 prompt "Enable seccomp to safely compute untrusted bytecode"
1211 depends on PROC_FS
1212 help 1268 help
1213 This kernel feature is useful for number crunching applications 1269 This kernel feature is useful for number crunching applications
1214 that may need to compute untrusted bytecode during their 1270 that may need to compute untrusted bytecode during their
@@ -1216,7 +1272,7 @@ config SECCOMP
1216 the process as file descriptors supporting the read/write 1272 the process as file descriptors supporting the read/write
1217 syscalls, it's possible to isolate those applications in 1273 syscalls, it's possible to isolate those applications in
1218 their own address space using seccomp. Once seccomp is 1274 their own address space using seccomp. Once seccomp is
1219 enabled via /proc/<pid>/seccomp, it cannot be disabled 1275 enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
1220 and the task is only allowed to execute a few safe syscalls 1276 and the task is only allowed to execute a few safe syscalls
1221 defined by each seccomp mode. 1277 defined by each seccomp mode.
1222 1278
@@ -1266,7 +1322,7 @@ config KEXEC
1266 strongly in flux, so no good recommendation can be made. 1322 strongly in flux, so no good recommendation can be made.
1267 1323
1268config CRASH_DUMP 1324config CRASH_DUMP
1269 bool "kernel crash dumps (EXPERIMENTAL)" 1325 bool "kernel crash dumps"
1270 depends on X86_64 || (X86_32 && HIGHMEM) 1326 depends on X86_64 || (X86_32 && HIGHMEM)
1271 help 1327 help
1272 Generate crash dump after being started by kexec. 1328 Generate crash dump after being started by kexec.
@@ -1279,6 +1335,14 @@ config CRASH_DUMP
1279 (CONFIG_RELOCATABLE=y). 1335 (CONFIG_RELOCATABLE=y).
1280 For more details see Documentation/kdump/kdump.txt 1336 For more details see Documentation/kdump/kdump.txt
1281 1337
1338config KEXEC_JUMP
1339 bool "kexec jump (EXPERIMENTAL)"
1340 depends on EXPERIMENTAL
1341 depends on KEXEC && HIBERNATION && X86_32
1342 help
1343 Jump between original kernel and kexeced kernel and invoke
1344 code in physical address mode via KEXEC
1345
1282config PHYSICAL_START 1346config PHYSICAL_START
1283 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1347 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
1284 default "0x1000000" if X86_NUMAQ 1348 default "0x1000000" if X86_NUMAQ
@@ -1365,14 +1429,14 @@ config PHYSICAL_ALIGN
1365 Don't change this unless you know what you are doing. 1429 Don't change this unless you know what you are doing.
1366 1430
1367config HOTPLUG_CPU 1431config HOTPLUG_CPU
1368 bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)" 1432 bool "Support for hot-pluggable CPUs"
1369 depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER 1433 depends on SMP && HOTPLUG && !X86_VOYAGER
1370 ---help--- 1434 ---help---
1371 Say Y here to experiment with turning CPUs off and on, and to 1435 Say Y here to allow turning CPUs off and on. CPUs can be
1372 enable suspend on SMP systems. CPUs can be controlled through 1436 controlled through /sys/devices/system/cpu.
1373 /sys/devices/system/cpu. 1437 ( Note: power management support will enable this option
1374 Say N if you want to disable CPU hotplug and don't need to 1438 automatically on SMP systems. )
1375 suspend. 1439 Say N if you want to disable CPU hotplug.
1376 1440
1377config COMPAT_VDSO 1441config COMPAT_VDSO
1378 def_bool y 1442 def_bool y
@@ -1387,6 +1451,51 @@ config COMPAT_VDSO
1387 1451
1388 If unsure, say Y. 1452 If unsure, say Y.
1389 1453
1454config CMDLINE_BOOL
1455 bool "Built-in kernel command line"
1456 default n
1457 help
1458 Allow for specifying boot arguments to the kernel at
1459 build time. On some systems (e.g. embedded ones), it is
1460 necessary or convenient to provide some or all of the
1461 kernel boot arguments with the kernel itself (that is,
1462 to not rely on the boot loader to provide them.)
1463
1464 To compile command line arguments into the kernel,
1465 set this option to 'Y', then fill in the
1466 the boot arguments in CONFIG_CMDLINE.
1467
1468 Systems with fully functional boot loaders (i.e. non-embedded)
1469 should leave this option set to 'N'.
1470
1471config CMDLINE
1472 string "Built-in kernel command string"
1473 depends on CMDLINE_BOOL
1474 default ""
1475 help
1476 Enter arguments here that should be compiled into the kernel
1477 image and used at boot time. If the boot loader provides a
1478 command line at boot time, it is appended to this string to
1479 form the full kernel command line, when the system boots.
1480
1481 However, you can use the CONFIG_CMDLINE_OVERRIDE option to
1482 change this behavior.
1483
1484 In most cases, the command line (whether built-in or provided
1485 by the boot loader) should specify the device for the root
1486 file system.
1487
1488config CMDLINE_OVERRIDE
1489 bool "Built-in command line overrides boot loader arguments"
1490 default n
1491 depends on CMDLINE_BOOL
1492 help
1493 Set this option to 'Y' to have the kernel ignore the boot loader
1494 command line, and use ONLY the built-in command line.
1495
1496 This is used to work around broken boot loaders. This should
1497 be set to 'N' under normal conditions.
1498
1390endmenu 1499endmenu
1391 1500
1392config ARCH_ENABLE_MEMORY_HOTPLUG 1501config ARCH_ENABLE_MEMORY_HOTPLUG
@@ -1652,6 +1761,14 @@ config DMAR_FLOPPY_WA
1652 workaround will setup a 1:1 mapping for the first 1761 workaround will setup a 1:1 mapping for the first
1653 16M to make floppy (an ISA device) work. 1762 16M to make floppy (an ISA device) work.
1654 1763
1764config INTR_REMAP
1765 bool "Support for Interrupt Remapping (EXPERIMENTAL)"
1766 depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
1767 help
1768 Supports Interrupt remapping for IO-APIC and MSI devices.
1769 To use x2apic mode in the CPU's which support x2APIC enhancements or
1770 to support platforms with CPU's having > 8 bit APIC ID, say Y.
1771
1655source "drivers/pci/pcie/Kconfig" 1772source "drivers/pci/pcie/Kconfig"
1656 1773
1657source "drivers/pci/Kconfig" 1774source "drivers/pci/Kconfig"
@@ -1782,7 +1899,7 @@ config COMPAT_FOR_U64_ALIGNMENT
1782 1899
1783config SYSVIPC_COMPAT 1900config SYSVIPC_COMPAT
1784 def_bool y 1901 def_bool y
1785 depends on X86_64 && COMPAT && SYSVIPC 1902 depends on COMPAT && SYSVIPC
1786 1903
1787endmenu 1904endmenu
1788 1905
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2c518fbc52ec..c5f101360520 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -382,14 +382,17 @@ config X86_OOSTORE
382# P6_NOPs are a relatively minor optimization that require a family >= 382# P6_NOPs are a relatively minor optimization that require a family >=
383# 6 processor, except that it is broken on certain VIA chips. 383# 6 processor, except that it is broken on certain VIA chips.
384# Furthermore, AMD chips prefer a totally different sequence of NOPs 384# Furthermore, AMD chips prefer a totally different sequence of NOPs
385# (which work on all CPUs). As a result, disallow these if we're 385# (which work on all CPUs). In addition, it looks like Virtual PC
386# compiling X86_GENERIC but not X86_64 (these NOPs do work on all 386# does not understand them.
387# x86-64 capable chips); the list of processors in the right-hand clause 387#
388# are the cores that benefit from this optimization. 388# As a result, disallow these if we're not compiling for X86_64 (these
389# NOPs do work on all x86-64 capable chips); the list of processors in
390# the right-hand clause are the cores that benefit from this optimization.
389# 391#
390config X86_P6_NOP 392config X86_P6_NOP
391 def_bool y 393 def_bool y
392 depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC) 394 depends on X86_64
395 depends on (MCORE2 || MPENTIUM4 || MPSC)
393 396
394config X86_TSC 397config X86_TSC
395 def_bool y 398 def_bool y
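A side note on the NOP sequences discussed in the comment above (illustrative byte encodings only, not part of this patch; the kernel keeps its canonical tables in asm/nops.h):

/*
 * Classic single-byte NOP vs. the "P6" multi-byte NOPs (0x0f 0x1f /0,
 * i.e. nopl), which need a family >= 6 CPU, vs. the prefixed-0x90 form
 * that AMD cores prefer.  Bytes follow Intel's documented multi-byte
 * NOP encodings and are shown purely for illustration.
 */
static const unsigned char nop1[]    = { 0x90 };                    /* nop */
static const unsigned char p6_nop3[] = { 0x0f, 0x1f, 0x00 };        /* nopl (%eax) */
static const unsigned char p6_nop4[] = { 0x0f, 0x1f, 0x40, 0x00 };  /* nopl 0x0(%eax) */
static const unsigned char k8_nop3[] = { 0x66, 0x66, 0x90 };        /* osp osp nop */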
@@ -415,3 +418,123 @@ config X86_MINIMUM_CPU_FAMILY
415config X86_DEBUGCTLMSR 418config X86_DEBUGCTLMSR
416 def_bool y 419 def_bool y
417 depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) 420 depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
421
422menuconfig PROCESSOR_SELECT
423 bool "Supported processor vendors" if EMBEDDED
424 help
425 This lets you choose what x86 vendor support code your kernel
426 will include.
427
428config CPU_SUP_INTEL
429 default y
430 bool "Support Intel processors" if PROCESSOR_SELECT
431 help
432 This enables detection, tunings and quirks for Intel processors
433
434 You need this enabled if you want your kernel to run on an
435 Intel CPU. Disabling this option on other types of CPUs
436 makes the kernel a tiny bit smaller. Disabling it on an Intel
437 CPU might render the kernel unbootable.
438
439 If unsure, say N.
440
441config CPU_SUP_CYRIX_32
442 default y
443 bool "Support Cyrix processors" if PROCESSOR_SELECT
444 depends on !64BIT
445 help
446 This enables detection, tunings and quirks for Cyrix processors
447
448 You need this enabled if you want your kernel to run on a
449 Cyrix CPU. Disabling this option on other types of CPUs
450 makes the kernel a tiny bit smaller. Disabling it on a Cyrix
451 CPU might render the kernel unbootable.
452
453 If unsure, say N.
454
455config CPU_SUP_AMD
456 default y
457 bool "Support AMD processors" if PROCESSOR_SELECT
458 help
459 This enables detection, tunings and quirks for AMD processors
460
461 You need this enabled if you want your kernel to run on an
462 AMD CPU. Disabling this option on other types of CPUs
463 makes the kernel a tiny bit smaller. Disabling it on an AMD
464 CPU might render the kernel unbootable.
465
466 If unsure, say N.
467
468config CPU_SUP_CENTAUR_32
469 default y
470 bool "Support Centaur processors" if PROCESSOR_SELECT
471 depends on !64BIT
472 help
473 This enables detection, tunings and quirks for Centaur processors
474
475 You need this enabled if you want your kernel to run on a
476 Centaur CPU. Disabling this option on other types of CPUs
477 makes the kernel a tiny bit smaller. Disabling it on a Centaur
478 CPU might render the kernel unbootable.
479
480 If unsure, say N.
481
482config CPU_SUP_CENTAUR_64
483 default y
484 bool "Support Centaur processors" if PROCESSOR_SELECT
485 depends on 64BIT
486 help
487 This enables detection, tunings and quirks for Centaur processors
488
489 You need this enabled if you want your kernel to run on a
490 Centaur CPU. Disabling this option on other types of CPUs
491 makes the kernel a tiny bit smaller. Disabling it on a Centaur
492 CPU might render the kernel unbootable.
493
494 If unsure, say N.
495
496config CPU_SUP_TRANSMETA_32
497 default y
498 bool "Support Transmeta processors" if PROCESSOR_SELECT
499 depends on !64BIT
500 help
501 This enables detection, tunings and quirks for Transmeta processors
502
503 You need this enabled if you want your kernel to run on a
504 Transmeta CPU. Disabling this option on other types of CPUs
505 makes the kernel a tiny bit smaller. Disabling it on a Transmeta
506 CPU might render the kernel unbootable.
507
508 If unsure, say N.
509
510config CPU_SUP_UMC_32
511 default y
512 bool "Support UMC processors" if PROCESSOR_SELECT
513 depends on !64BIT
514 help
515 This enables detection, tunings and quirks for UMC processors
516
517 You need this enabled if you want your kernel to run on a
518 UMC CPU. Disabling this option on other types of CPUs
519 makes the kernel a tiny bit smaller. Disabling it on a UMC
520 CPU might render the kernel unbootable.
521
522 If unsure, say N.
523
524config X86_DS
525 bool "Debug Store support"
526 default y
527 help
528 Add support for Debug Store.
529 This allows the kernel to provide a memory buffer to the hardware
530 to store various profiling and tracing events.
531
532config X86_PTRACE_BTS
533 bool "ptrace interface to Branch Trace Store"
534 default y
535 depends on (X86_DS && X86_DEBUGCTLMSR)
536 help
537 Add a ptrace interface to allow collecting an execution trace
538 of the traced task.
539 This collects control flow changes in a (cyclic) buffer and allows
540 debuggers to fill in the gaps and show an execution trace of the debuggee.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 092f019e033a..2a3dfbd5e677 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,19 @@ config EARLY_PRINTK
43 with klogd/syslogd or the X server. You should normally N here, 43 with klogd/syslogd or the X server. You should normally N here,
44 unless you want to debug such a crash. 44 unless you want to debug such a crash.
45 45
46config EARLY_PRINTK_DBGP
47 bool "Early printk via EHCI debug port"
48 default n
49 depends on EARLY_PRINTK && PCI
50 help
51 Write kernel log output directly into the EHCI debug port.
52
53 This is useful for kernel debugging when your machine crashes very
54 early before the console code is initialized. For normal operation
55 it is not recommended because it looks ugly and doesn't cooperate
56 with klogd/syslogd or the X server. You should normally N here,
57 unless you want to debug such a crash. You need usb debug device.
58
46config DEBUG_STACKOVERFLOW 59config DEBUG_STACKOVERFLOW
47 bool "Check for stack overflows" 60 bool "Check for stack overflows"
48 depends on DEBUG_KERNEL 61 depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 919ce21ea654..f5631da585b6 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -118,11 +118,6 @@ mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ 118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
119mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ 119mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
120 120
121# RDC R-321x subarch support
122mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x
123mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/
124core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/
125
126# default subarch .h files 121# default subarch .h files
127mflags-y += -Iinclude/asm-x86/mach-default 122mflags-y += -Iinclude/asm-x86/mach-default
128 123
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index e372b584e919..b72b4f753113 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -45,3 +45,8 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
45# cpu entries 45# cpu entries
46cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) 46cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
47 47
48# Bug fix for binutils: this option is required in order to keep
49# binutils from generating NOPL instructions against our will.
50ifneq ($(CONFIG_X86_P6_NOP),y)
51cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,)
52endif
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 7ee102f9c4f8..cd48c7210016 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
72KBUILD_CFLAGS += $(call cc-option,-m32) 72KBUILD_CFLAGS += $(call cc-option,-m32)
73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
74 74
75$(obj)/zImage: IMAGE_OFFSET := 0x1000
76$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) 75$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK)
77$(obj)/bzImage: IMAGE_OFFSET := 0x100000
78$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ 76$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
79$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ 77$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
80$(obj)/bzImage: BUILDFLAGS := -b 78$(obj)/bzImage: BUILDFLAGS := -b
@@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
117 $(call if_changed,objcopy) 115 $(call if_changed,objcopy)
118 116
119$(obj)/compressed/vmlinux: FORCE 117$(obj)/compressed/vmlinux: FORCE
120 $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ 118 $(Q)$(MAKE) $(build)=$(obj)/compressed $@
121 119
122# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel 120# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
123FDARGS = 121FDARGS =
@@ -181,6 +179,7 @@ isoimage: $(BOOTIMAGE)
181 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ 179 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
182 -no-emul-boot -boot-load-size 4 -boot-info-table \ 180 -no-emul-boot -boot-load-size 4 -boot-info-table \
183 $(obj)/isoimage 181 $(obj)/isoimage
182 isohybrid $(obj)/image.iso 2>/dev/null || true
184 rm -rf $(obj)/isoimage 183 rm -rf $(obj)/isoimage
185 184
186zlilo: $(BOOTIMAGE) 185zlilo: $(BOOTIMAGE)
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index a34b9982c7cb..cc0ef13fba7a 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -24,10 +24,14 @@
24#include <linux/edd.h> 24#include <linux/edd.h>
25#include <asm/boot.h> 25#include <asm/boot.h>
26#include <asm/setup.h> 26#include <asm/setup.h>
27#include "bitops.h"
28#include <asm/cpufeature.h>
27 29
28/* Useful macros */ 30/* Useful macros */
29#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 31#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
30 32
33#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
34
31extern struct setup_header hdr; 35extern struct setup_header hdr;
32extern struct boot_params boot_params; 36extern struct boot_params boot_params;
33 37
@@ -242,6 +246,12 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize);
242int cmdline_find_option_bool(const char *option); 246int cmdline_find_option_bool(const char *option);
243 247
244/* cpu.c, cpucheck.c */ 248/* cpu.c, cpucheck.c */
249struct cpu_features {
250 int level; /* Family, or 64 for x86-64 */
251 int model;
252 u32 flags[NCAPINTS];
253};
254extern struct cpu_features cpu;
245int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); 255int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
246int validate_cpu(void); 256int validate_cpu(void);
247 257
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 92fdd35bd93e..1771c804e02f 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE
27 $(call if_changed,objcopy) 27 $(call if_changed,objcopy)
28 28
29 29
30ifeq ($(CONFIG_X86_32),y) 30targets += vmlinux.bin.all vmlinux.relocs relocs
31targets += vmlinux.bin.all vmlinux.relocs 31hostprogs-$(CONFIG_X86_32) += relocs
32hostprogs-y := relocs
33 32
34quiet_cmd_relocs = RELOCS $@ 33quiet_cmd_relocs = RELOCS $@
35 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< 34 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD $@
43$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE 42$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
44 $(call if_changed,relocbin) 43 $(call if_changed,relocbin)
45 44
45ifeq ($(CONFIG_X86_32),y)
46
46ifdef CONFIG_RELOCATABLE 47ifdef CONFIG_RELOCATABLE
47$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE 48$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
48 $(call if_changed,gzip) 49 $(call if_changed,gzip)
@@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
59LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T 60LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
60endif 61endif
61 62
62
63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
64 $(call if_changed,ld) 64 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index ba7736cf2ec7..29c5fbf08392 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -137,14 +137,15 @@ relocated:
137 */ 137 */
138 movl output_len(%ebx), %eax 138 movl output_len(%ebx), %eax
139 pushl %eax 139 pushl %eax
140 # push arguments for decompress_kernel:
140 pushl %ebp # output address 141 pushl %ebp # output address
141 movl input_len(%ebx), %eax 142 movl input_len(%ebx), %eax
142 pushl %eax # input_len 143 pushl %eax # input_len
143 leal input_data(%ebx), %eax 144 leal input_data(%ebx), %eax
144 pushl %eax # input_data 145 pushl %eax # input_data
145 leal boot_heap(%ebx), %eax 146 leal boot_heap(%ebx), %eax
146 pushl %eax # heap area as third argument 147 pushl %eax # heap area
147 pushl %esi # real mode pointer as second arg 148 pushl %esi # real mode pointer
148 call decompress_kernel 149 call decompress_kernel
149 addl $20, %esp 150 addl $20, %esp
150 popl %ecx 151 popl %ecx
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 9fea73706479..5780d361105b 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -16,7 +16,7 @@
16 */ 16 */
17#undef CONFIG_PARAVIRT 17#undef CONFIG_PARAVIRT
18#ifdef CONFIG_X86_32 18#ifdef CONFIG_X86_32
19#define _ASM_DESC_H_ 1 19#define ASM_X86__DESC_H 1
20#endif 20#endif
21 21
22#ifdef CONFIG_X86_64 22#ifdef CONFIG_X86_64
@@ -27,7 +27,7 @@
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <linux/screen_info.h> 28#include <linux/screen_info.h>
29#include <linux/elf.h> 29#include <linux/elf.h>
30#include <asm/io.h> 30#include <linux/io.h>
31#include <asm/page.h> 31#include <asm/page.h>
32#include <asm/boot.h> 32#include <asm/boot.h>
33#include <asm/bootparam.h> 33#include <asm/bootparam.h>
@@ -251,7 +251,7 @@ static void __putstr(int error, const char *s)
251 y--; 251 y--;
252 } 252 }
253 } else { 253 } else {
254 vidmem [(x + cols * y) * 2] = c; 254 vidmem[(x + cols * y) * 2] = c;
255 if (++x >= cols) { 255 if (++x >= cols) {
256 x = 0; 256 x = 0;
257 if (++y >= lines) { 257 if (++y >= lines) {
@@ -277,7 +277,8 @@ static void *memset(void *s, int c, unsigned n)
277 int i; 277 int i;
278 char *ss = s; 278 char *ss = s;
279 279
280 for (i = 0; i < n; i++) ss[i] = c; 280 for (i = 0; i < n; i++)
281 ss[i] = c;
281 return s; 282 return s;
282} 283}
283 284
@@ -287,7 +288,8 @@ static void *memcpy(void *dest, const void *src, unsigned n)
287 const char *s = src; 288 const char *s = src;
288 char *d = dest; 289 char *d = dest;
289 290
290 for (i = 0; i < n; i++) d[i] = s[i]; 291 for (i = 0; i < n; i++)
292 d[i] = s[i];
291 return dest; 293 return dest;
292} 294}
293 295
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index a1310c52fc0c..857e492c571e 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -492,7 +492,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
492 continue; 492 continue;
493 } 493 }
494 sh_symtab = sec_symtab->symtab; 494 sh_symtab = sec_symtab->symtab;
495 sym_strtab = sec->link->strtab; 495 sym_strtab = sec_symtab->link->strtab;
496 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { 496 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
497 Elf32_Rel *rel; 497 Elf32_Rel *rel;
498 Elf32_Sym *sym; 498 Elf32_Sym *sym;
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 92d6fd73dc7d..6ec6bb6e9957 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -16,9 +16,6 @@
16 */ 16 */
17 17
18#include "boot.h" 18#include "boot.h"
19#include "bitops.h"
20#include <asm/cpufeature.h>
21
22#include "cpustr.h" 19#include "cpustr.h"
23 20
24static char *cpu_name(int level) 21static char *cpu_name(int level)
@@ -62,17 +59,18 @@ int validate_cpu(void)
62 u32 e = err_flags[i]; 59 u32 e = err_flags[i];
63 60
64 for (j = 0; j < 32; j++) { 61 for (j = 0; j < 32; j++) {
65 int n = (i << 5)+j; 62 if (msg_strs[0] < i ||
66 if (*msg_strs < n) { 63 (msg_strs[0] == i && msg_strs[1] < j)) {
67 /* Skip to the next string */ 64 /* Skip to the next string */
68 do { 65 msg_strs += 2;
69 msg_strs++; 66 while (*msg_strs++)
70 } while (*msg_strs); 67 ;
71 msg_strs++;
72 } 68 }
73 if (e & 1) { 69 if (e & 1) {
74 if (*msg_strs == n && msg_strs[1]) 70 if (msg_strs[0] == i &&
75 printf("%s ", msg_strs+1); 71 msg_strs[1] == j &&
72 msg_strs[2])
73 printf("%s ", msg_strs+2);
76 else 74 else
77 printf("%d:%d ", i, j); 75 printf("%d:%d ", i, j);
78 } 76 }
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 7804389ee005..4d3ff037201f 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -22,21 +22,13 @@
22 22
23#ifdef _SETUP 23#ifdef _SETUP
24# include "boot.h" 24# include "boot.h"
25# include "bitops.h"
26#endif 25#endif
27#include <linux/types.h> 26#include <linux/types.h>
28#include <asm/cpufeature.h>
29#include <asm/processor-flags.h> 27#include <asm/processor-flags.h>
30#include <asm/required-features.h> 28#include <asm/required-features.h>
31#include <asm/msr-index.h> 29#include <asm/msr-index.h>
32 30
33struct cpu_features { 31struct cpu_features cpu;
34 int level; /* Family, or 64 for x86-64 */
35 int model;
36 u32 flags[NCAPINTS];
37};
38
39static struct cpu_features cpu;
40static u32 cpu_vendor[3]; 32static u32 cpu_vendor[3];
41static u32 err_flags[NCAPINTS]; 33static u32 err_flags[NCAPINTS];
42 34
@@ -46,12 +38,12 @@ static const u32 req_flags[NCAPINTS] =
46{ 38{
47 REQUIRED_MASK0, 39 REQUIRED_MASK0,
48 REQUIRED_MASK1, 40 REQUIRED_MASK1,
49 REQUIRED_MASK2, 41 0, /* REQUIRED_MASK2 not implemented in this file */
50 REQUIRED_MASK3, 42 0, /* REQUIRED_MASK3 not implemented in this file */
51 REQUIRED_MASK4, 43 REQUIRED_MASK4,
52 REQUIRED_MASK5, 44 0, /* REQUIRED_MASK5 not implemented in this file */
53 REQUIRED_MASK6, 45 REQUIRED_MASK6,
54 REQUIRED_MASK7, 46 0, /* REQUIRED_MASK7 not implemented in this file */
55}; 47};
56 48
57#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) 49#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
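For context, the req_flags[] table above feeds a per-word mask comparison in validate_cpu(); a minimal sketch of that idea (hypothetical helper, not the patch's code):

#include <stdint.h>

/* For each 32-bit CPUID word: any bit that is required but not present
 * in the detected flags is recorded as an error bit. */
static int check_required(const uint32_t *have, const uint32_t *required,
			  uint32_t *err, int nwords)
{
	int i, ok = 1;

	for (i = 0; i < nwords; i++) {
		err[i] = required[i] & ~have[i];
		if (err[i])
			ok = 0;
	}
	return ok;	/* 1 if the CPU has every required feature */
}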
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index d93cbc6464d0..1aae8f3e5ca1 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -41,6 +41,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
41 char *mbrbuf_ptr, *mbrbuf_end; 41 char *mbrbuf_ptr, *mbrbuf_end;
42 u32 buf_base, mbr_base; 42 u32 buf_base, mbr_base;
43 extern char _end[]; 43 extern char _end[];
44 u16 mbr_magic;
44 45
45 sector_size = ei->params.bytes_per_sector; 46 sector_size = ei->params.bytes_per_sector;
46 if (!sector_size) 47 if (!sector_size)
@@ -58,11 +59,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
58 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) 59 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
59 return -1; 60 return -1;
60 61
62 memset(mbrbuf_ptr, 0, sector_size);
61 if (read_mbr(devno, mbrbuf_ptr)) 63 if (read_mbr(devno, mbrbuf_ptr))
62 return -1; 64 return -1;
63 65
64 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; 66 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
65 return 0; 67 mbr_magic = *(u16 *)&mbrbuf_ptr[510];
68
69 /* check for valid MBR magic */
70 return mbr_magic == 0xAA55 ? 0 : -1;
66} 71}
67 72
68static int get_edd_info(u8 devno, struct edd_info *ei) 73static int get_edd_info(u8 devno, struct edd_info *ei)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index af86e431acfa..b993062e9a5f 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -30,7 +30,6 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ 30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
31 /* to be loaded */ 31 /* to be loaded */
32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ 32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
33SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
34 33
35#ifndef SVGA_MODE 34#ifndef SVGA_MODE
36#define SVGA_MODE ASK_VGA 35#define SVGA_MODE ASK_VGA
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 2296164b54d2..197421db1af1 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -73,6 +73,11 @@ static void keyboard_set_repeat(void)
73 */ 73 */
74static void query_ist(void) 74static void query_ist(void)
75{ 75{
76 /* Some older BIOSes apparently crash on this call, so filter
77 it from machines too old to have SpeedStep at all. */
78 if (cpu.level < 6)
79 return;
80
76 asm("int $0x15" 81 asm("int $0x15"
77 : "=a" (boot_params.ist_info.signature), 82 : "=a" (boot_params.ist_info.signature),
78 "=b" (boot_params.ist_info.command), 83 "=b" (boot_params.ist_info.command),
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 53165c97336b..8c3c25f35578 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -13,7 +13,6 @@
13 */ 13 */
14 14
15#include "boot.h" 15#include "boot.h"
16#include <linux/kernel.h>
17 16
18#define SMAP 0x534d4150 /* ASCII "SMAP" */ 17#define SMAP 0x534d4150 /* ASCII "SMAP" */
19 18
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index bbe76953bae9..8ef60f20b371 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -15,33 +15,33 @@
15 15
16#include <stdio.h> 16#include <stdio.h>
17 17
18#include "../kernel/cpu/feature_names.c" 18#include "../kernel/cpu/capflags.c"
19
20#if NCAPFLAGS > 8
21# error "Need to adjust the boot code handling of CPUID strings"
22#endif
23 19
24int main(void) 20int main(void)
25{ 21{
26 int i; 22 int i, j;
27 const char *str; 23 const char *str;
28 24
29 printf("static const char x86_cap_strs[] = \n"); 25 printf("static const char x86_cap_strs[] = \n");
30 26
31 for (i = 0; i < NCAPINTS*32; i++) { 27 for (i = 0; i < NCAPINTS; i++) {
32 str = x86_cap_flags[i]; 28 for (j = 0; j < 32; j++) {
33 29 str = x86_cap_flags[i*32+j];
34 if (i == NCAPINTS*32-1) { 30
35 /* The last entry must be unconditional; this 31 if (i == NCAPINTS-1 && j == 31) {
36 also consumes the compiler-added null character */ 32 /* The last entry must be unconditional; this
37 if (!str) 33 also consumes the compiler-added null
38 str = ""; 34 character */
39 printf("\t\"\\x%02x\"\"%s\"\n", i, str); 35 if (!str)
40 } else if (str) { 36 str = "";
41 printf("#if REQUIRED_MASK%d & (1 << %d)\n" 37 printf("\t\"\\x%02x\\x%02x\"\"%s\"\n",
42 "\t\"\\x%02x\"\"%s\\0\"\n" 38 i, j, str);
43 "#endif\n", 39 } else if (str) {
44 i >> 5, i & 31, i, str); 40 printf("#if REQUIRED_MASK%d & (1 << %d)\n"
41 "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n"
42 "#endif\n",
43 i, j, i, j, str);
44 }
45 } 45 }
46 } 46 }
47 printf("\t;\n"); 47 printf("\t;\n");
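The generator above emits a table of "\xWW\xBB" "name\0" entries, where WW is the CPUID word index and BB the bit number; boot/cpu.c (earlier in this diff) walks that table to print names for missing features. A hypothetical lookup over the same layout, just to illustrate the format:

/* Assumed layout: entries sorted by (word, bit); each entry is two index
 * bytes followed by a NUL-terminated name; the final entry is always
 * emitted, so a valid query never runs off the end of the table. */
static const char *cap_name(const char *strs, int word, int bit)
{
	const char *p = strs;

	for (;;) {
		if (p[0] == word && p[1] == bit)
			return p + 2;			/* found the name */
		if (p[0] > word || (p[0] == word && p[1] > bit))
			return 0;			/* sorted: not present */
		p += 2;					/* skip the index bytes */
		while (*p++)				/* skip the name and its NUL */
			;
	}
}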
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 401ad998ad08..1e6fe0214c85 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -224,7 +224,7 @@ static void vesa_store_pm_info(void)
224static void vesa_store_mode_params_graphics(void) 224static void vesa_store_mode_params_graphics(void)
225{ 225{
226 /* Tell the kernel we're in VESA graphics mode */ 226 /* Tell the kernel we're in VESA graphics mode */
227 boot_params.screen_info.orig_video_isVGA = 0x23; 227 boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
228 228
229 /* Mode parameters */ 229 /* Mode parameters */
230 boot_params.screen_info.vesa_attributes = vminfo.mode_attr; 230 boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 4d73f53287b6..ca226ca31288 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,13 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.26-rc1 3# Linux kernel version: 2.6.27-rc5
4# Sun May 4 19:59:02 2008 4# Wed Sep 3 17:23:09 2008
5# 5#
6# CONFIG_64BIT is not set 6# CONFIG_64BIT is not set
7CONFIG_X86_32=y 7CONFIG_X86_32=y
8# CONFIG_X86_64 is not set 8# CONFIG_X86_64 is not set
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_DEFCONFIG_LIST="arch/x86/configs/i386_defconfig" 10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
11# CONFIG_GENERIC_LOCKBREAK is not set 11# CONFIG_GENERIC_LOCKBREAK is not set
12CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
13CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -53,6 +53,7 @@ CONFIG_X86_HT=y
53CONFIG_X86_BIOS_REBOOT=y 53CONFIG_X86_BIOS_REBOOT=y
54CONFIG_X86_TRAMPOLINE=y 54CONFIG_X86_TRAMPOLINE=y
55CONFIG_KTIME_SCALAR=y 55CONFIG_KTIME_SCALAR=y
56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
56 57
57# 58#
58# General setup 59# General setup
@@ -82,6 +83,7 @@ CONFIG_CGROUPS=y
82CONFIG_CGROUP_NS=y 83CONFIG_CGROUP_NS=y
83# CONFIG_CGROUP_DEVICE is not set 84# CONFIG_CGROUP_DEVICE is not set
84CONFIG_CPUSETS=y 85CONFIG_CPUSETS=y
86CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
85CONFIG_GROUP_SCHED=y 87CONFIG_GROUP_SCHED=y
86CONFIG_FAIR_GROUP_SCHED=y 88CONFIG_FAIR_GROUP_SCHED=y
87# CONFIG_RT_GROUP_SCHED is not set 89# CONFIG_RT_GROUP_SCHED is not set
@@ -105,7 +107,6 @@ CONFIG_SYSCTL=y
105# CONFIG_EMBEDDED is not set 107# CONFIG_EMBEDDED is not set
106CONFIG_UID16=y 108CONFIG_UID16=y
107CONFIG_SYSCTL_SYSCALL=y 109CONFIG_SYSCTL_SYSCALL=y
108CONFIG_SYSCTL_SYSCALL_CHECK=y
109CONFIG_KALLSYMS=y 110CONFIG_KALLSYMS=y
110CONFIG_KALLSYMS_ALL=y 111CONFIG_KALLSYMS_ALL=y
111CONFIG_KALLSYMS_EXTRA_PASS=y 112CONFIG_KALLSYMS_EXTRA_PASS=y
@@ -113,6 +114,7 @@ CONFIG_HOTPLUG=y
113CONFIG_PRINTK=y 114CONFIG_PRINTK=y
114CONFIG_BUG=y 115CONFIG_BUG=y
115CONFIG_ELF_CORE=y 116CONFIG_ELF_CORE=y
117CONFIG_PCSPKR_PLATFORM=y
116# CONFIG_COMPAT_BRK is not set 118# CONFIG_COMPAT_BRK is not set
117CONFIG_BASE_FULL=y 119CONFIG_BASE_FULL=y
118CONFIG_FUTEX=y 120CONFIG_FUTEX=y
@@ -132,27 +134,35 @@ CONFIG_MARKERS=y
132# CONFIG_OPROFILE is not set 134# CONFIG_OPROFILE is not set
133CONFIG_HAVE_OPROFILE=y 135CONFIG_HAVE_OPROFILE=y
134CONFIG_KPROBES=y 136CONFIG_KPROBES=y
137CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
135CONFIG_KRETPROBES=y 138CONFIG_KRETPROBES=y
139CONFIG_HAVE_IOREMAP_PROT=y
136CONFIG_HAVE_KPROBES=y 140CONFIG_HAVE_KPROBES=y
137CONFIG_HAVE_KRETPROBES=y 141CONFIG_HAVE_KRETPROBES=y
142# CONFIG_HAVE_ARCH_TRACEHOOK is not set
138# CONFIG_HAVE_DMA_ATTRS is not set 143# CONFIG_HAVE_DMA_ATTRS is not set
144CONFIG_USE_GENERIC_SMP_HELPERS=y
145# CONFIG_HAVE_CLK is not set
139CONFIG_PROC_PAGE_MONITOR=y 146CONFIG_PROC_PAGE_MONITOR=y
147CONFIG_HAVE_GENERIC_DMA_COHERENT=y
140CONFIG_SLABINFO=y 148CONFIG_SLABINFO=y
141CONFIG_RT_MUTEXES=y 149CONFIG_RT_MUTEXES=y
142# CONFIG_TINY_SHMEM is not set 150# CONFIG_TINY_SHMEM is not set
143CONFIG_BASE_SMALL=0 151CONFIG_BASE_SMALL=0
144CONFIG_MODULES=y 152CONFIG_MODULES=y
153# CONFIG_MODULE_FORCE_LOAD is not set
145CONFIG_MODULE_UNLOAD=y 154CONFIG_MODULE_UNLOAD=y
146CONFIG_MODULE_FORCE_UNLOAD=y 155CONFIG_MODULE_FORCE_UNLOAD=y
147# CONFIG_MODVERSIONS is not set 156# CONFIG_MODVERSIONS is not set
148# CONFIG_MODULE_SRCVERSION_ALL is not set 157# CONFIG_MODULE_SRCVERSION_ALL is not set
149# CONFIG_KMOD is not set 158CONFIG_KMOD=y
150CONFIG_STOP_MACHINE=y 159CONFIG_STOP_MACHINE=y
151CONFIG_BLOCK=y 160CONFIG_BLOCK=y
152# CONFIG_LBD is not set 161# CONFIG_LBD is not set
153CONFIG_BLK_DEV_IO_TRACE=y 162CONFIG_BLK_DEV_IO_TRACE=y
154# CONFIG_LSF is not set 163# CONFIG_LSF is not set
155CONFIG_BLK_DEV_BSG=y 164CONFIG_BLK_DEV_BSG=y
165# CONFIG_BLK_DEV_INTEGRITY is not set
156 166
157# 167#
158# IO Schedulers 168# IO Schedulers
@@ -176,25 +186,23 @@ CONFIG_NO_HZ=y
176CONFIG_HIGH_RES_TIMERS=y 186CONFIG_HIGH_RES_TIMERS=y
177CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 187CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
178CONFIG_SMP=y 188CONFIG_SMP=y
189CONFIG_X86_FIND_SMP_CONFIG=y
190CONFIG_X86_MPPARSE=y
179CONFIG_X86_PC=y 191CONFIG_X86_PC=y
180# CONFIG_X86_ELAN is not set 192# CONFIG_X86_ELAN is not set
181# CONFIG_X86_VOYAGER is not set 193# CONFIG_X86_VOYAGER is not set
182# CONFIG_X86_NUMAQ is not set
183# CONFIG_X86_SUMMIT is not set
184# CONFIG_X86_BIGSMP is not set
185# CONFIG_X86_VISWS is not set
186# CONFIG_X86_GENERICARCH is not set 194# CONFIG_X86_GENERICARCH is not set
187# CONFIG_X86_ES7000 is not set
188# CONFIG_X86_RDC321X is not set
189# CONFIG_X86_VSMP is not set 195# CONFIG_X86_VSMP is not set
196# CONFIG_X86_RDC321X is not set
190CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y 197CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
191# CONFIG_PARAVIRT_GUEST is not set 198# CONFIG_PARAVIRT_GUEST is not set
199# CONFIG_MEMTEST is not set
192# CONFIG_M386 is not set 200# CONFIG_M386 is not set
193# CONFIG_M486 is not set 201# CONFIG_M486 is not set
194# CONFIG_M586 is not set 202# CONFIG_M586 is not set
195# CONFIG_M586TSC is not set 203# CONFIG_M586TSC is not set
196# CONFIG_M586MMX is not set 204# CONFIG_M586MMX is not set
197# CONFIG_M686 is not set 205CONFIG_M686=y
198# CONFIG_MPENTIUMII is not set 206# CONFIG_MPENTIUMII is not set
199# CONFIG_MPENTIUMIII is not set 207# CONFIG_MPENTIUMIII is not set
200# CONFIG_MPENTIUMM is not set 208# CONFIG_MPENTIUMM is not set
@@ -213,30 +221,30 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
213# CONFIG_MVIAC3_2 is not set 221# CONFIG_MVIAC3_2 is not set
214# CONFIG_MVIAC7 is not set 222# CONFIG_MVIAC7 is not set
215# CONFIG_MPSC is not set 223# CONFIG_MPSC is not set
216CONFIG_MCORE2=y 224# CONFIG_MCORE2 is not set
217# CONFIG_GENERIC_CPU is not set 225# CONFIG_GENERIC_CPU is not set
218# CONFIG_X86_GENERIC is not set 226CONFIG_X86_GENERIC=y
219CONFIG_X86_CPU=y 227CONFIG_X86_CPU=y
220CONFIG_X86_CMPXCHG=y 228CONFIG_X86_CMPXCHG=y
221CONFIG_X86_L1_CACHE_SHIFT=6 229CONFIG_X86_L1_CACHE_SHIFT=7
222CONFIG_X86_XADD=y 230CONFIG_X86_XADD=y
231# CONFIG_X86_PPRO_FENCE is not set
223CONFIG_X86_WP_WORKS_OK=y 232CONFIG_X86_WP_WORKS_OK=y
224CONFIG_X86_INVLPG=y 233CONFIG_X86_INVLPG=y
225CONFIG_X86_BSWAP=y 234CONFIG_X86_BSWAP=y
226CONFIG_X86_POPAD_OK=y 235CONFIG_X86_POPAD_OK=y
227CONFIG_X86_GOOD_APIC=y
228CONFIG_X86_INTEL_USERCOPY=y 236CONFIG_X86_INTEL_USERCOPY=y
229CONFIG_X86_USE_PPRO_CHECKSUM=y 237CONFIG_X86_USE_PPRO_CHECKSUM=y
230CONFIG_X86_P6_NOP=y
231CONFIG_X86_TSC=y 238CONFIG_X86_TSC=y
232CONFIG_X86_MINIMUM_CPU_FAMILY=6 239CONFIG_X86_CMOV=y
240CONFIG_X86_MINIMUM_CPU_FAMILY=4
233CONFIG_X86_DEBUGCTLMSR=y 241CONFIG_X86_DEBUGCTLMSR=y
234CONFIG_HPET_TIMER=y 242CONFIG_HPET_TIMER=y
235CONFIG_HPET_EMULATE_RTC=y 243CONFIG_HPET_EMULATE_RTC=y
236CONFIG_DMI=y 244CONFIG_DMI=y
237# CONFIG_IOMMU_HELPER is not set 245# CONFIG_IOMMU_HELPER is not set
238CONFIG_NR_CPUS=4 246CONFIG_NR_CPUS=64
239# CONFIG_SCHED_SMT is not set 247CONFIG_SCHED_SMT=y
240CONFIG_SCHED_MC=y 248CONFIG_SCHED_MC=y
241# CONFIG_PREEMPT_NONE is not set 249# CONFIG_PREEMPT_NONE is not set
242CONFIG_PREEMPT_VOLUNTARY=y 250CONFIG_PREEMPT_VOLUNTARY=y
@@ -247,8 +255,9 @@ CONFIG_X86_IO_APIC=y
247CONFIG_VM86=y 255CONFIG_VM86=y
248# CONFIG_TOSHIBA is not set 256# CONFIG_TOSHIBA is not set
249# CONFIG_I8K is not set 257# CONFIG_I8K is not set
250# CONFIG_X86_REBOOTFIXUPS is not set 258CONFIG_X86_REBOOTFIXUPS=y
251# CONFIG_MICROCODE is not set 259CONFIG_MICROCODE=y
260CONFIG_MICROCODE_OLD_INTERFACE=y
252CONFIG_X86_MSR=y 261CONFIG_X86_MSR=y
253CONFIG_X86_CPUID=y 262CONFIG_X86_CPUID=y
254# CONFIG_NOHIGHMEM is not set 263# CONFIG_NOHIGHMEM is not set
@@ -256,32 +265,28 @@ CONFIG_HIGHMEM4G=y
256# CONFIG_HIGHMEM64G is not set 265# CONFIG_HIGHMEM64G is not set
257CONFIG_PAGE_OFFSET=0xC0000000 266CONFIG_PAGE_OFFSET=0xC0000000
258CONFIG_HIGHMEM=y 267CONFIG_HIGHMEM=y
259CONFIG_NEED_NODE_MEMMAP_SIZE=y
260CONFIG_ARCH_FLATMEM_ENABLE=y 268CONFIG_ARCH_FLATMEM_ENABLE=y
261CONFIG_ARCH_SPARSEMEM_ENABLE=y 269CONFIG_ARCH_SPARSEMEM_ENABLE=y
262CONFIG_ARCH_SELECT_MEMORY_MODEL=y 270CONFIG_ARCH_SELECT_MEMORY_MODEL=y
263CONFIG_SELECT_MEMORY_MODEL=y 271CONFIG_SELECT_MEMORY_MODEL=y
264# CONFIG_FLATMEM_MANUAL is not set 272CONFIG_FLATMEM_MANUAL=y
265# CONFIG_DISCONTIGMEM_MANUAL is not set 273# CONFIG_DISCONTIGMEM_MANUAL is not set
266CONFIG_SPARSEMEM_MANUAL=y 274# CONFIG_SPARSEMEM_MANUAL is not set
267CONFIG_SPARSEMEM=y 275CONFIG_FLATMEM=y
268CONFIG_HAVE_MEMORY_PRESENT=y 276CONFIG_FLAT_NODE_MEM_MAP=y
269CONFIG_SPARSEMEM_STATIC=y 277CONFIG_SPARSEMEM_STATIC=y
270# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set 278# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set
271
272#
273# Memory hotplug is currently incompatible with Software Suspend
274#
275CONFIG_PAGEFLAGS_EXTENDED=y 279CONFIG_PAGEFLAGS_EXTENDED=y
276CONFIG_SPLIT_PTLOCK_CPUS=4 280CONFIG_SPLIT_PTLOCK_CPUS=4
277CONFIG_RESOURCES_64BIT=y 281CONFIG_RESOURCES_64BIT=y
278CONFIG_ZONE_DMA_FLAG=1 282CONFIG_ZONE_DMA_FLAG=1
279CONFIG_BOUNCE=y 283CONFIG_BOUNCE=y
280CONFIG_VIRT_TO_BUS=y 284CONFIG_VIRT_TO_BUS=y
281# CONFIG_HIGHPTE is not set 285CONFIG_HIGHPTE=y
282# CONFIG_MATH_EMULATION is not set 286# CONFIG_MATH_EMULATION is not set
283CONFIG_MTRR=y 287CONFIG_MTRR=y
284# CONFIG_X86_PAT is not set 288# CONFIG_MTRR_SANITIZER is not set
289CONFIG_X86_PAT=y
285CONFIG_EFI=y 290CONFIG_EFI=y
286# CONFIG_IRQBALANCE is not set 291# CONFIG_IRQBALANCE is not set
287CONFIG_SECCOMP=y 292CONFIG_SECCOMP=y
@@ -293,6 +298,7 @@ CONFIG_HZ=1000
293CONFIG_SCHED_HRTICK=y 298CONFIG_SCHED_HRTICK=y
294CONFIG_KEXEC=y 299CONFIG_KEXEC=y
295CONFIG_CRASH_DUMP=y 300CONFIG_CRASH_DUMP=y
301# CONFIG_KEXEC_JUMP is not set
296CONFIG_PHYSICAL_START=0x1000000 302CONFIG_PHYSICAL_START=0x1000000
297CONFIG_RELOCATABLE=y 303CONFIG_RELOCATABLE=y
298CONFIG_PHYSICAL_ALIGN=0x200000 304CONFIG_PHYSICAL_ALIGN=0x200000
@@ -312,6 +318,7 @@ CONFIG_PM_TRACE_RTC=y
312CONFIG_PM_SLEEP_SMP=y 318CONFIG_PM_SLEEP_SMP=y
313CONFIG_PM_SLEEP=y 319CONFIG_PM_SLEEP=y
314CONFIG_SUSPEND=y 320CONFIG_SUSPEND=y
321# CONFIG_PM_TEST_SUSPEND is not set
315CONFIG_SUSPEND_FREEZER=y 322CONFIG_SUSPEND_FREEZER=y
316CONFIG_HIBERNATION=y 323CONFIG_HIBERNATION=y
317CONFIG_PM_STD_PARTITION="" 324CONFIG_PM_STD_PARTITION=""
@@ -337,6 +344,7 @@ CONFIG_ACPI_THERMAL=y
337CONFIG_ACPI_BLACKLIST_YEAR=0 344CONFIG_ACPI_BLACKLIST_YEAR=0
338# CONFIG_ACPI_DEBUG is not set 345# CONFIG_ACPI_DEBUG is not set
339CONFIG_ACPI_EC=y 346CONFIG_ACPI_EC=y
347# CONFIG_ACPI_PCI_SLOT is not set
340CONFIG_ACPI_POWER=y 348CONFIG_ACPI_POWER=y
341CONFIG_ACPI_SYSTEM=y 349CONFIG_ACPI_SYSTEM=y
342CONFIG_X86_PM_TIMER=y 350CONFIG_X86_PM_TIMER=y
@@ -395,8 +403,8 @@ CONFIG_PCI=y
395# CONFIG_PCI_GOBIOS is not set 403# CONFIG_PCI_GOBIOS is not set
396# CONFIG_PCI_GOMMCONFIG is not set 404# CONFIG_PCI_GOMMCONFIG is not set
397# CONFIG_PCI_GODIRECT is not set 405# CONFIG_PCI_GODIRECT is not set
398CONFIG_PCI_GOANY=y
399# CONFIG_PCI_GOOLPC is not set 406# CONFIG_PCI_GOOLPC is not set
407CONFIG_PCI_GOANY=y
400CONFIG_PCI_BIOS=y 408CONFIG_PCI_BIOS=y
401CONFIG_PCI_DIRECT=y 409CONFIG_PCI_DIRECT=y
402CONFIG_PCI_MMCONFIG=y 410CONFIG_PCI_MMCONFIG=y
@@ -448,10 +456,6 @@ CONFIG_HOTPLUG_PCI=y
448CONFIG_BINFMT_ELF=y 456CONFIG_BINFMT_ELF=y
449# CONFIG_BINFMT_AOUT is not set 457# CONFIG_BINFMT_AOUT is not set
450CONFIG_BINFMT_MISC=y 458CONFIG_BINFMT_MISC=y
451
452#
453# Networking
454#
455CONFIG_NET=y 459CONFIG_NET=y
456 460
457# 461#
@@ -475,7 +479,10 @@ CONFIG_IP_FIB_HASH=y
475CONFIG_IP_MULTIPLE_TABLES=y 479CONFIG_IP_MULTIPLE_TABLES=y
476CONFIG_IP_ROUTE_MULTIPATH=y 480CONFIG_IP_ROUTE_MULTIPATH=y
477CONFIG_IP_ROUTE_VERBOSE=y 481CONFIG_IP_ROUTE_VERBOSE=y
478# CONFIG_IP_PNP is not set 482CONFIG_IP_PNP=y
483CONFIG_IP_PNP_DHCP=y
484CONFIG_IP_PNP_BOOTP=y
485CONFIG_IP_PNP_RARP=y
479# CONFIG_NET_IPIP is not set 486# CONFIG_NET_IPIP is not set
480# CONFIG_NET_IPGRE is not set 487# CONFIG_NET_IPGRE is not set
481CONFIG_IP_MROUTE=y 488CONFIG_IP_MROUTE=y
@@ -618,7 +625,6 @@ CONFIG_NET_SCHED=y
618# CONFIG_NET_SCH_HTB is not set 625# CONFIG_NET_SCH_HTB is not set
619# CONFIG_NET_SCH_HFSC is not set 626# CONFIG_NET_SCH_HFSC is not set
620# CONFIG_NET_SCH_PRIO is not set 627# CONFIG_NET_SCH_PRIO is not set
621# CONFIG_NET_SCH_RR is not set
622# CONFIG_NET_SCH_RED is not set 628# CONFIG_NET_SCH_RED is not set
623# CONFIG_NET_SCH_SFQ is not set 629# CONFIG_NET_SCH_SFQ is not set
624# CONFIG_NET_SCH_TEQL is not set 630# CONFIG_NET_SCH_TEQL is not set
@@ -680,28 +686,19 @@ CONFIG_FIB_RULES=y
680CONFIG_CFG80211=y 686CONFIG_CFG80211=y
681CONFIG_NL80211=y 687CONFIG_NL80211=y
682CONFIG_WIRELESS_EXT=y 688CONFIG_WIRELESS_EXT=y
689CONFIG_WIRELESS_EXT_SYSFS=y
683CONFIG_MAC80211=y 690CONFIG_MAC80211=y
684 691
685# 692#
686# Rate control algorithm selection 693# Rate control algorithm selection
687# 694#
695CONFIG_MAC80211_RC_PID=y
688CONFIG_MAC80211_RC_DEFAULT_PID=y 696CONFIG_MAC80211_RC_DEFAULT_PID=y
689# CONFIG_MAC80211_RC_DEFAULT_NONE is not set
690
691#
692# Selecting 'y' for an algorithm will
693#
694
695#
696# build the algorithm into mac80211.
697#
698CONFIG_MAC80211_RC_DEFAULT="pid" 697CONFIG_MAC80211_RC_DEFAULT="pid"
699CONFIG_MAC80211_RC_PID=y
700# CONFIG_MAC80211_MESH is not set 698# CONFIG_MAC80211_MESH is not set
701CONFIG_MAC80211_LEDS=y 699CONFIG_MAC80211_LEDS=y
702# CONFIG_MAC80211_DEBUGFS is not set 700# CONFIG_MAC80211_DEBUGFS is not set
703# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set 701# CONFIG_MAC80211_DEBUG_MENU is not set
704# CONFIG_MAC80211_DEBUG is not set
705# CONFIG_IEEE80211 is not set 702# CONFIG_IEEE80211 is not set
706# CONFIG_RFKILL is not set 703# CONFIG_RFKILL is not set
707# CONFIG_NET_9P is not set 704# CONFIG_NET_9P is not set
@@ -717,6 +714,8 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
717CONFIG_STANDALONE=y 714CONFIG_STANDALONE=y
718CONFIG_PREVENT_FIRMWARE_BUILD=y 715CONFIG_PREVENT_FIRMWARE_BUILD=y
719CONFIG_FW_LOADER=y 716CONFIG_FW_LOADER=y
717CONFIG_FIRMWARE_IN_KERNEL=y
718CONFIG_EXTRA_FIRMWARE=""
720# CONFIG_DEBUG_DRIVER is not set 719# CONFIG_DEBUG_DRIVER is not set
721CONFIG_DEBUG_DEVRES=y 720CONFIG_DEBUG_DEVRES=y
722# CONFIG_SYS_HYPERVISOR is not set 721# CONFIG_SYS_HYPERVISOR is not set
@@ -749,6 +748,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384
749# CONFIG_BLK_DEV_XIP is not set 748# CONFIG_BLK_DEV_XIP is not set
750# CONFIG_CDROM_PKTCDVD is not set 749# CONFIG_CDROM_PKTCDVD is not set
751# CONFIG_ATA_OVER_ETH is not set 750# CONFIG_ATA_OVER_ETH is not set
751# CONFIG_BLK_DEV_HD is not set
752CONFIG_MISC_DEVICES=y 752CONFIG_MISC_DEVICES=y
753# CONFIG_IBM_ASM is not set 753# CONFIG_IBM_ASM is not set
754# CONFIG_PHANTOM is not set 754# CONFIG_PHANTOM is not set
@@ -760,10 +760,12 @@ CONFIG_MISC_DEVICES=y
760# CONFIG_FUJITSU_LAPTOP is not set 760# CONFIG_FUJITSU_LAPTOP is not set
761# CONFIG_TC1100_WMI is not set 761# CONFIG_TC1100_WMI is not set
762# CONFIG_MSI_LAPTOP is not set 762# CONFIG_MSI_LAPTOP is not set
763# CONFIG_COMPAL_LAPTOP is not set
763# CONFIG_SONY_LAPTOP is not set 764# CONFIG_SONY_LAPTOP is not set
764# CONFIG_THINKPAD_ACPI is not set 765# CONFIG_THINKPAD_ACPI is not set
765# CONFIG_INTEL_MENLOW is not set 766# CONFIG_INTEL_MENLOW is not set
766# CONFIG_ENCLOSURE_SERVICES is not set 767# CONFIG_ENCLOSURE_SERVICES is not set
768# CONFIG_HP_ILO is not set
767CONFIG_HAVE_IDE=y 769CONFIG_HAVE_IDE=y
768# CONFIG_IDE is not set 770# CONFIG_IDE is not set
769 771
@@ -802,12 +804,13 @@ CONFIG_SCSI_WAIT_SCAN=m
802# 804#
803CONFIG_SCSI_SPI_ATTRS=y 805CONFIG_SCSI_SPI_ATTRS=y
804# CONFIG_SCSI_FC_ATTRS is not set 806# CONFIG_SCSI_FC_ATTRS is not set
805# CONFIG_SCSI_ISCSI_ATTRS is not set 807CONFIG_SCSI_ISCSI_ATTRS=y
806# CONFIG_SCSI_SAS_ATTRS is not set 808# CONFIG_SCSI_SAS_ATTRS is not set
807# CONFIG_SCSI_SAS_LIBSAS is not set 809# CONFIG_SCSI_SAS_LIBSAS is not set
808# CONFIG_SCSI_SRP_ATTRS is not set 810# CONFIG_SCSI_SRP_ATTRS is not set
809# CONFIG_SCSI_LOWLEVEL is not set 811# CONFIG_SCSI_LOWLEVEL is not set
810# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 812# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
813# CONFIG_SCSI_DH is not set
811CONFIG_ATA=y 814CONFIG_ATA=y
812# CONFIG_ATA_NONSTANDARD is not set 815# CONFIG_ATA_NONSTANDARD is not set
813CONFIG_ATA_ACPI=y 816CONFIG_ATA_ACPI=y
@@ -842,7 +845,7 @@ CONFIG_PATA_AMD=y
842# CONFIG_PATA_CS5536 is not set 845# CONFIG_PATA_CS5536 is not set
843# CONFIG_PATA_CYPRESS is not set 846# CONFIG_PATA_CYPRESS is not set
844# CONFIG_PATA_EFAR is not set 847# CONFIG_PATA_EFAR is not set
845# CONFIG_ATA_GENERIC is not set 848CONFIG_ATA_GENERIC=y
846# CONFIG_PATA_HPT366 is not set 849# CONFIG_PATA_HPT366 is not set
847# CONFIG_PATA_HPT37X is not set 850# CONFIG_PATA_HPT37X is not set
848# CONFIG_PATA_HPT3X2N is not set 851# CONFIG_PATA_HPT3X2N is not set
@@ -852,7 +855,7 @@ CONFIG_PATA_AMD=y
852# CONFIG_PATA_JMICRON is not set 855# CONFIG_PATA_JMICRON is not set
853# CONFIG_PATA_TRIFLEX is not set 856# CONFIG_PATA_TRIFLEX is not set
854# CONFIG_PATA_MARVELL is not set 857# CONFIG_PATA_MARVELL is not set
855# CONFIG_PATA_MPIIX is not set 858CONFIG_PATA_MPIIX=y
856CONFIG_PATA_OLDPIIX=y 859CONFIG_PATA_OLDPIIX=y
857# CONFIG_PATA_NETCELL is not set 860# CONFIG_PATA_NETCELL is not set
858# CONFIG_PATA_NINJA32 is not set 861# CONFIG_PATA_NINJA32 is not set
@@ -871,6 +874,7 @@ CONFIG_PATA_OLDPIIX=y
871# CONFIG_PATA_SIS is not set 874# CONFIG_PATA_SIS is not set
872# CONFIG_PATA_VIA is not set 875# CONFIG_PATA_VIA is not set
873# CONFIG_PATA_WINBOND is not set 876# CONFIG_PATA_WINBOND is not set
877CONFIG_PATA_SCH=y
874CONFIG_MD=y 878CONFIG_MD=y
875CONFIG_BLK_DEV_MD=y 879CONFIG_BLK_DEV_MD=y
876# CONFIG_MD_LINEAR is not set 880# CONFIG_MD_LINEAR is not set
@@ -894,13 +898,16 @@ CONFIG_DM_ZERO=y
894# 898#
895# IEEE 1394 (FireWire) support 899# IEEE 1394 (FireWire) support
896# 900#
901
902#
903# Enable only one of the two stacks, unless you know what you are doing
904#
897# CONFIG_FIREWIRE is not set 905# CONFIG_FIREWIRE is not set
898# CONFIG_IEEE1394 is not set 906# CONFIG_IEEE1394 is not set
899# CONFIG_I2O is not set 907# CONFIG_I2O is not set
900CONFIG_MACINTOSH_DRIVERS=y 908CONFIG_MACINTOSH_DRIVERS=y
901CONFIG_MAC_EMUMOUSEBTN=y 909CONFIG_MAC_EMUMOUSEBTN=y
902CONFIG_NETDEVICES=y 910CONFIG_NETDEVICES=y
903# CONFIG_NETDEVICES_MULTIQUEUE is not set
904# CONFIG_IFB is not set 911# CONFIG_IFB is not set
905# CONFIG_DUMMY is not set 912# CONFIG_DUMMY is not set
906# CONFIG_BONDING is not set 913# CONFIG_BONDING is not set
@@ -910,7 +917,23 @@ CONFIG_NETDEVICES=y
910# CONFIG_VETH is not set 917# CONFIG_VETH is not set
911# CONFIG_NET_SB1000 is not set 918# CONFIG_NET_SB1000 is not set
912# CONFIG_ARCNET is not set 919# CONFIG_ARCNET is not set
913# CONFIG_PHYLIB is not set 920CONFIG_PHYLIB=y
921
922#
923# MII PHY device drivers
924#
925# CONFIG_MARVELL_PHY is not set
926# CONFIG_DAVICOM_PHY is not set
927# CONFIG_QSEMI_PHY is not set
928# CONFIG_LXT_PHY is not set
929# CONFIG_CICADA_PHY is not set
930# CONFIG_VITESSE_PHY is not set
931# CONFIG_SMSC_PHY is not set
932# CONFIG_BROADCOM_PHY is not set
933# CONFIG_ICPLUS_PHY is not set
934# CONFIG_REALTEK_PHY is not set
935# CONFIG_FIXED_PHY is not set
936# CONFIG_MDIO_BITBANG is not set
914CONFIG_NET_ETHERNET=y 937CONFIG_NET_ETHERNET=y
915CONFIG_MII=y 938CONFIG_MII=y
916# CONFIG_HAPPYMEAL is not set 939# CONFIG_HAPPYMEAL is not set
@@ -943,10 +966,10 @@ CONFIG_FORCEDETH=y
943CONFIG_E100=y 966CONFIG_E100=y
944# CONFIG_FEALNX is not set 967# CONFIG_FEALNX is not set
945# CONFIG_NATSEMI is not set 968# CONFIG_NATSEMI is not set
946# CONFIG_NE2K_PCI is not set 969CONFIG_NE2K_PCI=y
947# CONFIG_8139CP is not set 970# CONFIG_8139CP is not set
948CONFIG_8139TOO=y 971CONFIG_8139TOO=y
949CONFIG_8139TOO_PIO=y 972# CONFIG_8139TOO_PIO is not set
950# CONFIG_8139TOO_TUNE_TWISTER is not set 973# CONFIG_8139TOO_TUNE_TWISTER is not set
951# CONFIG_8139TOO_8129 is not set 974# CONFIG_8139TOO_8129 is not set
952# CONFIG_8139_OLD_RX_RESET is not set 975# CONFIG_8139_OLD_RX_RESET is not set
@@ -961,25 +984,24 @@ CONFIG_NETDEV_1000=y
961# CONFIG_ACENIC is not set 984# CONFIG_ACENIC is not set
962# CONFIG_DL2K is not set 985# CONFIG_DL2K is not set
963CONFIG_E1000=y 986CONFIG_E1000=y
964# CONFIG_E1000_NAPI is not set
965# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set 987# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
966# CONFIG_E1000E is not set 988CONFIG_E1000E=y
967# CONFIG_E1000E_ENABLED is not set
968# CONFIG_IP1000 is not set 989# CONFIG_IP1000 is not set
969# CONFIG_IGB is not set 990# CONFIG_IGB is not set
970# CONFIG_NS83820 is not set 991# CONFIG_NS83820 is not set
971# CONFIG_HAMACHI is not set 992# CONFIG_HAMACHI is not set
972# CONFIG_YELLOWFIN is not set 993# CONFIG_YELLOWFIN is not set
973# CONFIG_R8169 is not set 994CONFIG_R8169=y
974# CONFIG_SIS190 is not set 995# CONFIG_SIS190 is not set
975# CONFIG_SKGE is not set 996# CONFIG_SKGE is not set
976CONFIG_SKY2=y 997CONFIG_SKY2=y
977# CONFIG_SKY2_DEBUG is not set 998# CONFIG_SKY2_DEBUG is not set
978# CONFIG_VIA_VELOCITY is not set 999# CONFIG_VIA_VELOCITY is not set
979CONFIG_TIGON3=y 1000CONFIG_TIGON3=y
980# CONFIG_BNX2 is not set 1001CONFIG_BNX2=y
981# CONFIG_QLA3XXX is not set 1002# CONFIG_QLA3XXX is not set
982# CONFIG_ATL1 is not set 1003# CONFIG_ATL1 is not set
1004# CONFIG_ATL1E is not set
983CONFIG_NETDEV_10000=y 1005CONFIG_NETDEV_10000=y
984# CONFIG_CHELSIO_T1 is not set 1006# CONFIG_CHELSIO_T1 is not set
985# CONFIG_CHELSIO_T3 is not set 1007# CONFIG_CHELSIO_T3 is not set
@@ -1019,13 +1041,14 @@ CONFIG_WLAN_80211=y
1019# CONFIG_RTL8180 is not set 1041# CONFIG_RTL8180 is not set
1020# CONFIG_RTL8187 is not set 1042# CONFIG_RTL8187 is not set
1021# CONFIG_ADM8211 is not set 1043# CONFIG_ADM8211 is not set
1044# CONFIG_MAC80211_HWSIM is not set
1022# CONFIG_P54_COMMON is not set 1045# CONFIG_P54_COMMON is not set
1023CONFIG_ATH5K=y 1046CONFIG_ATH5K=y
1024# CONFIG_ATH5K_DEBUG is not set 1047# CONFIG_ATH5K_DEBUG is not set
1025# CONFIG_IWLWIFI is not set 1048# CONFIG_ATH9K is not set
1026# CONFIG_IWLCORE is not set 1049# CONFIG_IWLCORE is not set
1027# CONFIG_IWLWIFI_LEDS is not set 1050# CONFIG_IWLWIFI_LEDS is not set
1028# CONFIG_IWL4965 is not set 1051# CONFIG_IWLAGN is not set
1029# CONFIG_IWL3945 is not set 1052# CONFIG_IWL3945 is not set
1030# CONFIG_HOSTAP is not set 1053# CONFIG_HOSTAP is not set
1031# CONFIG_B43 is not set 1054# CONFIG_B43 is not set
@@ -1105,6 +1128,7 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y
1105# CONFIG_MOUSE_PS2_TOUCHKIT is not set 1128# CONFIG_MOUSE_PS2_TOUCHKIT is not set
1106# CONFIG_MOUSE_SERIAL is not set 1129# CONFIG_MOUSE_SERIAL is not set
1107# CONFIG_MOUSE_APPLETOUCH is not set 1130# CONFIG_MOUSE_APPLETOUCH is not set
1131# CONFIG_MOUSE_BCM5974 is not set
1108# CONFIG_MOUSE_VSXXXAA is not set 1132# CONFIG_MOUSE_VSXXXAA is not set
1109CONFIG_INPUT_JOYSTICK=y 1133CONFIG_INPUT_JOYSTICK=y
1110# CONFIG_JOYSTICK_ANALOG is not set 1134# CONFIG_JOYSTICK_ANALOG is not set
@@ -1139,12 +1163,14 @@ CONFIG_INPUT_TOUCHSCREEN=y
1139# CONFIG_TOUCHSCREEN_GUNZE is not set 1163# CONFIG_TOUCHSCREEN_GUNZE is not set
1140# CONFIG_TOUCHSCREEN_ELO is not set 1164# CONFIG_TOUCHSCREEN_ELO is not set
1141# CONFIG_TOUCHSCREEN_MTOUCH is not set 1165# CONFIG_TOUCHSCREEN_MTOUCH is not set
1166# CONFIG_TOUCHSCREEN_INEXIO is not set
1142# CONFIG_TOUCHSCREEN_MK712 is not set 1167# CONFIG_TOUCHSCREEN_MK712 is not set
1143# CONFIG_TOUCHSCREEN_PENMOUNT is not set 1168# CONFIG_TOUCHSCREEN_PENMOUNT is not set
1144# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set 1169# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
1145# CONFIG_TOUCHSCREEN_TOUCHWIN is not set 1170# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
1146# CONFIG_TOUCHSCREEN_UCB1400 is not set 1171# CONFIG_TOUCHSCREEN_UCB1400 is not set
1147# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set 1172# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
1173# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
1148CONFIG_INPUT_MISC=y 1174CONFIG_INPUT_MISC=y
1149# CONFIG_INPUT_PCSPKR is not set 1175# CONFIG_INPUT_PCSPKR is not set
1150# CONFIG_INPUT_APANEL is not set 1176# CONFIG_INPUT_APANEL is not set
@@ -1173,6 +1199,7 @@ CONFIG_SERIO_LIBPS2=y
1173# Character devices 1199# Character devices
1174# 1200#
1175CONFIG_VT=y 1201CONFIG_VT=y
1202CONFIG_CONSOLE_TRANSLATIONS=y
1176CONFIG_VT_CONSOLE=y 1203CONFIG_VT_CONSOLE=y
1177CONFIG_HW_CONSOLE=y 1204CONFIG_HW_CONSOLE=y
1178CONFIG_VT_HW_CONSOLE_BINDING=y 1205CONFIG_VT_HW_CONSOLE_BINDING=y
@@ -1223,8 +1250,8 @@ CONFIG_UNIX98_PTYS=y
1223# CONFIG_LEGACY_PTYS is not set 1250# CONFIG_LEGACY_PTYS is not set
1224# CONFIG_IPMI_HANDLER is not set 1251# CONFIG_IPMI_HANDLER is not set
1225CONFIG_HW_RANDOM=y 1252CONFIG_HW_RANDOM=y
1226# CONFIG_HW_RANDOM_INTEL is not set 1253CONFIG_HW_RANDOM_INTEL=y
1227# CONFIG_HW_RANDOM_AMD is not set 1254CONFIG_HW_RANDOM_AMD=y
1228CONFIG_HW_RANDOM_GEODE=y 1255CONFIG_HW_RANDOM_GEODE=y
1229CONFIG_HW_RANDOM_VIA=y 1256CONFIG_HW_RANDOM_VIA=y
1230CONFIG_NVRAM=y 1257CONFIG_NVRAM=y
@@ -1245,7 +1272,6 @@ CONFIG_NVRAM=y
1245# CONFIG_CS5535_GPIO is not set 1272# CONFIG_CS5535_GPIO is not set
1246# CONFIG_RAW_DRIVER is not set 1273# CONFIG_RAW_DRIVER is not set
1247CONFIG_HPET=y 1274CONFIG_HPET=y
1248# CONFIG_HPET_RTC_IRQ is not set
1249# CONFIG_HPET_MMAP is not set 1275# CONFIG_HPET_MMAP is not set
1250# CONFIG_HANGCHECK_TIMER is not set 1276# CONFIG_HANGCHECK_TIMER is not set
1251# CONFIG_TCG_TPM is not set 1277# CONFIG_TCG_TPM is not set
@@ -1254,43 +1280,64 @@ CONFIG_DEVPORT=y
1254CONFIG_I2C=y 1280CONFIG_I2C=y
1255CONFIG_I2C_BOARDINFO=y 1281CONFIG_I2C_BOARDINFO=y
1256# CONFIG_I2C_CHARDEV is not set 1282# CONFIG_I2C_CHARDEV is not set
1283CONFIG_I2C_HELPER_AUTO=y
1257 1284
1258# 1285#
1259# I2C Hardware Bus support 1286# I2C Hardware Bus support
1260# 1287#
1288
1289#
1290# PC SMBus host controller drivers
1291#
1261# CONFIG_I2C_ALI1535 is not set 1292# CONFIG_I2C_ALI1535 is not set
1262# CONFIG_I2C_ALI1563 is not set 1293# CONFIG_I2C_ALI1563 is not set
1263# CONFIG_I2C_ALI15X3 is not set 1294# CONFIG_I2C_ALI15X3 is not set
1264# CONFIG_I2C_AMD756 is not set 1295# CONFIG_I2C_AMD756 is not set
1265# CONFIG_I2C_AMD8111 is not set 1296# CONFIG_I2C_AMD8111 is not set
1266CONFIG_I2C_I801=y 1297CONFIG_I2C_I801=y
1267# CONFIG_I2C_I810 is not set 1298# CONFIG_I2C_ISCH is not set
1268# CONFIG_I2C_PIIX4 is not set 1299# CONFIG_I2C_PIIX4 is not set
1269# CONFIG_I2C_NFORCE2 is not set 1300# CONFIG_I2C_NFORCE2 is not set
1270# CONFIG_I2C_OCORES is not set
1271# CONFIG_I2C_PARPORT_LIGHT is not set
1272# CONFIG_I2C_PROSAVAGE is not set
1273# CONFIG_I2C_SAVAGE4 is not set
1274# CONFIG_I2C_SIMTEC is not set
1275# CONFIG_SCx200_ACB is not set
1276# CONFIG_I2C_SIS5595 is not set 1301# CONFIG_I2C_SIS5595 is not set
1277# CONFIG_I2C_SIS630 is not set 1302# CONFIG_I2C_SIS630 is not set
1278# CONFIG_I2C_SIS96X is not set 1303# CONFIG_I2C_SIS96X is not set
1279# CONFIG_I2C_TAOS_EVM is not set
1280# CONFIG_I2C_STUB is not set
1281# CONFIG_I2C_TINY_USB is not set
1282# CONFIG_I2C_VIA is not set 1304# CONFIG_I2C_VIA is not set
1283# CONFIG_I2C_VIAPRO is not set 1305# CONFIG_I2C_VIAPRO is not set
1306
1307#
1308# I2C system bus drivers (mostly embedded / system-on-chip)
1309#
1310# CONFIG_I2C_OCORES is not set
1311# CONFIG_I2C_SIMTEC is not set
1312
1313#
1314# External I2C/SMBus adapter drivers
1315#
1316# CONFIG_I2C_PARPORT_LIGHT is not set
1317# CONFIG_I2C_TAOS_EVM is not set
1318# CONFIG_I2C_TINY_USB is not set
1319
1320#
1321# Graphics adapter I2C/DDC channel drivers
1322#
1284# CONFIG_I2C_VOODOO3 is not set 1323# CONFIG_I2C_VOODOO3 is not set
1324
1325#
1326# Other I2C/SMBus bus drivers
1327#
1285# CONFIG_I2C_PCA_PLATFORM is not set 1328# CONFIG_I2C_PCA_PLATFORM is not set
1329# CONFIG_I2C_STUB is not set
1330# CONFIG_SCx200_ACB is not set
1286 1331
1287# 1332#
1288# Miscellaneous I2C Chip support 1333# Miscellaneous I2C Chip support
1289# 1334#
1290# CONFIG_DS1682 is not set 1335# CONFIG_DS1682 is not set
1336# CONFIG_AT24 is not set
1291# CONFIG_SENSORS_EEPROM is not set 1337# CONFIG_SENSORS_EEPROM is not set
1292# CONFIG_SENSORS_PCF8574 is not set 1338# CONFIG_SENSORS_PCF8574 is not set
1293# CONFIG_PCF8575 is not set 1339# CONFIG_PCF8575 is not set
1340# CONFIG_SENSORS_PCA9539 is not set
1294# CONFIG_SENSORS_PCF8591 is not set 1341# CONFIG_SENSORS_PCF8591 is not set
1295# CONFIG_SENSORS_MAX6875 is not set 1342# CONFIG_SENSORS_MAX6875 is not set
1296# CONFIG_SENSORS_TSL2550 is not set 1343# CONFIG_SENSORS_TSL2550 is not set
@@ -1299,6 +1346,8 @@ CONFIG_I2C_I801=y
1299# CONFIG_I2C_DEBUG_BUS is not set 1346# CONFIG_I2C_DEBUG_BUS is not set
1300# CONFIG_I2C_DEBUG_CHIP is not set 1347# CONFIG_I2C_DEBUG_CHIP is not set
1301# CONFIG_SPI is not set 1348# CONFIG_SPI is not set
1349CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
1350# CONFIG_GPIOLIB is not set
1302# CONFIG_W1 is not set 1351# CONFIG_W1 is not set
1303CONFIG_POWER_SUPPLY=y 1352CONFIG_POWER_SUPPLY=y
1304# CONFIG_POWER_SUPPLY_DEBUG is not set 1353# CONFIG_POWER_SUPPLY_DEBUG is not set
@@ -1360,8 +1409,10 @@ CONFIG_SSB_POSSIBLE=y
1360# 1409#
1361# Multifunction device drivers 1410# Multifunction device drivers
1362# 1411#
1412# CONFIG_MFD_CORE is not set
1363# CONFIG_MFD_SM501 is not set 1413# CONFIG_MFD_SM501 is not set
1364# CONFIG_HTC_PASIC3 is not set 1414# CONFIG_HTC_PASIC3 is not set
1415# CONFIG_MFD_TMIO is not set
1365 1416
1366# 1417#
1367# Multimedia devices 1418# Multimedia devices
@@ -1372,6 +1423,7 @@ CONFIG_SSB_POSSIBLE=y
1372# 1423#
1373# CONFIG_VIDEO_DEV is not set 1424# CONFIG_VIDEO_DEV is not set
1374# CONFIG_DVB_CORE is not set 1425# CONFIG_DVB_CORE is not set
1426# CONFIG_VIDEO_MEDIA is not set
1375 1427
1376# 1428#
1377# Multimedia drivers 1429# Multimedia drivers
@@ -1418,7 +1470,6 @@ CONFIG_FB_CFB_IMAGEBLIT=y
1418# CONFIG_FB_SYS_IMAGEBLIT is not set 1470# CONFIG_FB_SYS_IMAGEBLIT is not set
1419# CONFIG_FB_FOREIGN_ENDIAN is not set 1471# CONFIG_FB_FOREIGN_ENDIAN is not set
1420# CONFIG_FB_SYS_FOPS is not set 1472# CONFIG_FB_SYS_FOPS is not set
1421CONFIG_FB_DEFERRED_IO=y
1422# CONFIG_FB_SVGALIB is not set 1473# CONFIG_FB_SVGALIB is not set
1423# CONFIG_FB_MACMODES is not set 1474# CONFIG_FB_MACMODES is not set
1424# CONFIG_FB_BACKLIGHT is not set 1475# CONFIG_FB_BACKLIGHT is not set
@@ -1463,6 +1514,7 @@ CONFIG_FB_EFI=y
1463# CONFIG_FB_TRIDENT is not set 1514# CONFIG_FB_TRIDENT is not set
1464# CONFIG_FB_ARK is not set 1515# CONFIG_FB_ARK is not set
1465# CONFIG_FB_PM3 is not set 1516# CONFIG_FB_PM3 is not set
1517# CONFIG_FB_CARMINE is not set
1466# CONFIG_FB_GEODE is not set 1518# CONFIG_FB_GEODE is not set
1467# CONFIG_FB_VIRTUAL is not set 1519# CONFIG_FB_VIRTUAL is not set
1468CONFIG_BACKLIGHT_LCD_SUPPORT=y 1520CONFIG_BACKLIGHT_LCD_SUPPORT=y
@@ -1470,6 +1522,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y
1470CONFIG_BACKLIGHT_CLASS_DEVICE=y 1522CONFIG_BACKLIGHT_CLASS_DEVICE=y
1471# CONFIG_BACKLIGHT_CORGI is not set 1523# CONFIG_BACKLIGHT_CORGI is not set
1472# CONFIG_BACKLIGHT_PROGEAR is not set 1524# CONFIG_BACKLIGHT_PROGEAR is not set
1525# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
1473 1526
1474# 1527#
1475# Display device support 1528# Display device support
@@ -1482,22 +1535,13 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
1482CONFIG_VGA_CONSOLE=y 1535CONFIG_VGA_CONSOLE=y
1483CONFIG_VGACON_SOFT_SCROLLBACK=y 1536CONFIG_VGACON_SOFT_SCROLLBACK=y
1484CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 1537CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1485CONFIG_VIDEO_SELECT=y
1486CONFIG_DUMMY_CONSOLE=y 1538CONFIG_DUMMY_CONSOLE=y
1487# CONFIG_FRAMEBUFFER_CONSOLE is not set 1539# CONFIG_FRAMEBUFFER_CONSOLE is not set
1488CONFIG_LOGO=y 1540CONFIG_LOGO=y
1489# CONFIG_LOGO_LINUX_MONO is not set 1541# CONFIG_LOGO_LINUX_MONO is not set
1490# CONFIG_LOGO_LINUX_VGA16 is not set 1542# CONFIG_LOGO_LINUX_VGA16 is not set
1491CONFIG_LOGO_LINUX_CLUT224=y 1543CONFIG_LOGO_LINUX_CLUT224=y
1492
1493#
1494# Sound
1495#
1496CONFIG_SOUND=y 1544CONFIG_SOUND=y
1497
1498#
1499# Advanced Linux Sound Architecture
1500#
1501CONFIG_SND=y 1545CONFIG_SND=y
1502CONFIG_SND_TIMER=y 1546CONFIG_SND_TIMER=y
1503CONFIG_SND_PCM=y 1547CONFIG_SND_PCM=y
@@ -1515,20 +1559,14 @@ CONFIG_SND_VERBOSE_PROCFS=y
1515# CONFIG_SND_VERBOSE_PRINTK is not set 1559# CONFIG_SND_VERBOSE_PRINTK is not set
1516# CONFIG_SND_DEBUG is not set 1560# CONFIG_SND_DEBUG is not set
1517CONFIG_SND_VMASTER=y 1561CONFIG_SND_VMASTER=y
1518 1562CONFIG_SND_DRIVERS=y
1519#
1520# Generic devices
1521#
1522# CONFIG_SND_PCSP is not set 1563# CONFIG_SND_PCSP is not set
1523# CONFIG_SND_DUMMY is not set 1564# CONFIG_SND_DUMMY is not set
1524# CONFIG_SND_VIRMIDI is not set 1565# CONFIG_SND_VIRMIDI is not set
1525# CONFIG_SND_MTPAV is not set 1566# CONFIG_SND_MTPAV is not set
1526# CONFIG_SND_SERIAL_U16550 is not set 1567# CONFIG_SND_SERIAL_U16550 is not set
1527# CONFIG_SND_MPU401 is not set 1568# CONFIG_SND_MPU401 is not set
1528 1569CONFIG_SND_PCI=y
1529#
1530# PCI devices
1531#
1532# CONFIG_SND_AD1889 is not set 1570# CONFIG_SND_AD1889 is not set
1533# CONFIG_SND_ALS300 is not set 1571# CONFIG_SND_ALS300 is not set
1534# CONFIG_SND_ALS4000 is not set 1572# CONFIG_SND_ALS4000 is not set
@@ -1603,36 +1641,14 @@ CONFIG_SND_HDA_GENERIC=y
1603# CONFIG_SND_VIRTUOSO is not set 1641# CONFIG_SND_VIRTUOSO is not set
1604# CONFIG_SND_VX222 is not set 1642# CONFIG_SND_VX222 is not set
1605# CONFIG_SND_YMFPCI is not set 1643# CONFIG_SND_YMFPCI is not set
1606 1644CONFIG_SND_USB=y
1607#
1608# USB devices
1609#
1610# CONFIG_SND_USB_AUDIO is not set 1645# CONFIG_SND_USB_AUDIO is not set
1611# CONFIG_SND_USB_USX2Y is not set 1646# CONFIG_SND_USB_USX2Y is not set
1612# CONFIG_SND_USB_CAIAQ is not set 1647# CONFIG_SND_USB_CAIAQ is not set
1613 1648CONFIG_SND_PCMCIA=y
1614#
1615# PCMCIA devices
1616#
1617# CONFIG_SND_VXPOCKET is not set 1649# CONFIG_SND_VXPOCKET is not set
1618# CONFIG_SND_PDAUDIOCF is not set 1650# CONFIG_SND_PDAUDIOCF is not set
1619
1620#
1621# System on Chip audio support
1622#
1623# CONFIG_SND_SOC is not set 1651# CONFIG_SND_SOC is not set
1624
1625#
1626# ALSA SoC audio for Freescale SOCs
1627#
1628
1629#
1630# SoC Audio for the Texas Instruments OMAP
1631#
1632
1633#
1634# Open Sound System
1635#
1636# CONFIG_SOUND_PRIME is not set 1652# CONFIG_SOUND_PRIME is not set
1637CONFIG_HID_SUPPORT=y 1653CONFIG_HID_SUPPORT=y
1638CONFIG_HID=y 1654CONFIG_HID=y
@@ -1668,6 +1684,7 @@ CONFIG_USB_DEVICEFS=y
1668# CONFIG_USB_DYNAMIC_MINORS is not set 1684# CONFIG_USB_DYNAMIC_MINORS is not set
1669CONFIG_USB_SUSPEND=y 1685CONFIG_USB_SUSPEND=y
1670# CONFIG_USB_OTG is not set 1686# CONFIG_USB_OTG is not set
1687CONFIG_USB_MON=y
1671 1688
1672# 1689#
1673# USB Host Controller Drivers 1690# USB Host Controller Drivers
@@ -1691,6 +1708,7 @@ CONFIG_USB_UHCI_HCD=y
1691# 1708#
1692# CONFIG_USB_ACM is not set 1709# CONFIG_USB_ACM is not set
1693CONFIG_USB_PRINTER=y 1710CONFIG_USB_PRINTER=y
1711# CONFIG_USB_WDM is not set
1694 1712
1695# 1713#
1696# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' 1714# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
@@ -1712,6 +1730,7 @@ CONFIG_USB_STORAGE=y
1712# CONFIG_USB_STORAGE_ALAUDA is not set 1730# CONFIG_USB_STORAGE_ALAUDA is not set
1713# CONFIG_USB_STORAGE_ONETOUCH is not set 1731# CONFIG_USB_STORAGE_ONETOUCH is not set
1714# CONFIG_USB_STORAGE_KARMA is not set 1732# CONFIG_USB_STORAGE_KARMA is not set
1733# CONFIG_USB_STORAGE_SIERRA is not set
1715# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set 1734# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
1716CONFIG_USB_LIBUSUAL=y 1735CONFIG_USB_LIBUSUAL=y
1717 1736
@@ -1720,7 +1739,6 @@ CONFIG_USB_LIBUSUAL=y
1720# 1739#
1721# CONFIG_USB_MDC800 is not set 1740# CONFIG_USB_MDC800 is not set
1722# CONFIG_USB_MICROTEK is not set 1741# CONFIG_USB_MICROTEK is not set
1723CONFIG_USB_MON=y
1724 1742
1725# 1743#
1726# USB port drivers 1744# USB port drivers
@@ -1733,7 +1751,6 @@ CONFIG_USB_MON=y
1733# CONFIG_USB_EMI62 is not set 1751# CONFIG_USB_EMI62 is not set
1734# CONFIG_USB_EMI26 is not set 1752# CONFIG_USB_EMI26 is not set
1735# CONFIG_USB_ADUTUX is not set 1753# CONFIG_USB_ADUTUX is not set
1736# CONFIG_USB_AUERSWALD is not set
1737# CONFIG_USB_RIO500 is not set 1754# CONFIG_USB_RIO500 is not set
1738# CONFIG_USB_LEGOTOWER is not set 1755# CONFIG_USB_LEGOTOWER is not set
1739# CONFIG_USB_LCD is not set 1756# CONFIG_USB_LCD is not set
@@ -1750,6 +1767,7 @@ CONFIG_USB_MON=y
1750# CONFIG_USB_TRANCEVIBRATOR is not set 1767# CONFIG_USB_TRANCEVIBRATOR is not set
1751# CONFIG_USB_IOWARRIOR is not set 1768# CONFIG_USB_IOWARRIOR is not set
1752# CONFIG_USB_TEST is not set 1769# CONFIG_USB_TEST is not set
1770# CONFIG_USB_ISIGHTFW is not set
1753# CONFIG_USB_GADGET is not set 1771# CONFIG_USB_GADGET is not set
1754# CONFIG_MMC is not set 1772# CONFIG_MMC is not set
1755# CONFIG_MEMSTICK is not set 1773# CONFIG_MEMSTICK is not set
@@ -1759,7 +1777,9 @@ CONFIG_LEDS_CLASS=y
1759# 1777#
1760# LED drivers 1778# LED drivers
1761# 1779#
1780# CONFIG_LEDS_PCA9532 is not set
1762# CONFIG_LEDS_CLEVO_MAIL is not set 1781# CONFIG_LEDS_CLEVO_MAIL is not set
1782# CONFIG_LEDS_PCA955X is not set
1763 1783
1764# 1784#
1765# LED Triggers 1785# LED Triggers
@@ -1805,6 +1825,7 @@ CONFIG_RTC_INTF_DEV=y
1805# CONFIG_RTC_DRV_PCF8583 is not set 1825# CONFIG_RTC_DRV_PCF8583 is not set
1806# CONFIG_RTC_DRV_M41T80 is not set 1826# CONFIG_RTC_DRV_M41T80 is not set
1807# CONFIG_RTC_DRV_S35390A is not set 1827# CONFIG_RTC_DRV_S35390A is not set
1828# CONFIG_RTC_DRV_FM3130 is not set
1808 1829
1809# 1830#
1810# SPI RTC drivers 1831# SPI RTC drivers
@@ -1837,11 +1858,13 @@ CONFIG_DMADEVICES=y
1837# Firmware Drivers 1858# Firmware Drivers
1838# 1859#
1839# CONFIG_EDD is not set 1860# CONFIG_EDD is not set
1861CONFIG_FIRMWARE_MEMMAP=y
1840CONFIG_EFI_VARS=y 1862CONFIG_EFI_VARS=y
1841# CONFIG_DELL_RBU is not set 1863# CONFIG_DELL_RBU is not set
1842# CONFIG_DCDBAS is not set 1864# CONFIG_DCDBAS is not set
1843CONFIG_DMIID=y 1865CONFIG_DMIID=y
1844# CONFIG_ISCSI_IBFT_FIND is not set 1866CONFIG_ISCSI_IBFT_FIND=y
1867CONFIG_ISCSI_IBFT=y
1845 1868
1846# 1869#
1847# File systems 1870# File systems
@@ -1920,14 +1943,27 @@ CONFIG_HUGETLB_PAGE=y
1920# CONFIG_CRAMFS is not set 1943# CONFIG_CRAMFS is not set
1921# CONFIG_VXFS_FS is not set 1944# CONFIG_VXFS_FS is not set
1922# CONFIG_MINIX_FS is not set 1945# CONFIG_MINIX_FS is not set
1946# CONFIG_OMFS_FS is not set
1923# CONFIG_HPFS_FS is not set 1947# CONFIG_HPFS_FS is not set
1924# CONFIG_QNX4FS_FS is not set 1948# CONFIG_QNX4FS_FS is not set
1925# CONFIG_ROMFS_FS is not set 1949# CONFIG_ROMFS_FS is not set
1926# CONFIG_SYSV_FS is not set 1950# CONFIG_SYSV_FS is not set
1927# CONFIG_UFS_FS is not set 1951# CONFIG_UFS_FS is not set
1928CONFIG_NETWORK_FILESYSTEMS=y 1952CONFIG_NETWORK_FILESYSTEMS=y
1929# CONFIG_NFS_FS is not set 1953CONFIG_NFS_FS=y
1954CONFIG_NFS_V3=y
1955CONFIG_NFS_V3_ACL=y
1956CONFIG_NFS_V4=y
1957CONFIG_ROOT_NFS=y
1930# CONFIG_NFSD is not set 1958# CONFIG_NFSD is not set
1959CONFIG_LOCKD=y
1960CONFIG_LOCKD_V4=y
1961CONFIG_NFS_ACL_SUPPORT=y
1962CONFIG_NFS_COMMON=y
1963CONFIG_SUNRPC=y
1964CONFIG_SUNRPC_GSS=y
1965CONFIG_RPCSEC_GSS_KRB5=y
1966# CONFIG_RPCSEC_GSS_SPKM3 is not set
1931# CONFIG_SMB_FS is not set 1967# CONFIG_SMB_FS is not set
1932# CONFIG_CIFS is not set 1968# CONFIG_CIFS is not set
1933# CONFIG_NCP_FS is not set 1969# CONFIG_NCP_FS is not set
@@ -2001,9 +2037,9 @@ CONFIG_NLS_UTF8=y
2001# Kernel hacking 2037# Kernel hacking
2002# 2038#
2003CONFIG_TRACE_IRQFLAGS_SUPPORT=y 2039CONFIG_TRACE_IRQFLAGS_SUPPORT=y
2004# CONFIG_PRINTK_TIME is not set 2040CONFIG_PRINTK_TIME=y
2005# CONFIG_ENABLE_WARN_DEPRECATED is not set 2041CONFIG_ENABLE_WARN_DEPRECATED=y
2006# CONFIG_ENABLE_MUST_CHECK is not set 2042CONFIG_ENABLE_MUST_CHECK=y
2007CONFIG_FRAME_WARN=2048 2043CONFIG_FRAME_WARN=2048
2008CONFIG_MAGIC_SYSRQ=y 2044CONFIG_MAGIC_SYSRQ=y
2009# CONFIG_UNUSED_SYMBOLS is not set 2045# CONFIG_UNUSED_SYMBOLS is not set
@@ -2033,6 +2069,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
2033# CONFIG_DEBUG_INFO is not set 2069# CONFIG_DEBUG_INFO is not set
2034# CONFIG_DEBUG_VM is not set 2070# CONFIG_DEBUG_VM is not set
2035# CONFIG_DEBUG_WRITECOUNT is not set 2071# CONFIG_DEBUG_WRITECOUNT is not set
2072CONFIG_DEBUG_MEMORY_INIT=y
2036# CONFIG_DEBUG_LIST is not set 2073# CONFIG_DEBUG_LIST is not set
2037# CONFIG_DEBUG_SG is not set 2074# CONFIG_DEBUG_SG is not set
2038CONFIG_FRAME_POINTER=y 2075CONFIG_FRAME_POINTER=y
@@ -2043,23 +2080,32 @@ CONFIG_FRAME_POINTER=y
2043# CONFIG_LKDTM is not set 2080# CONFIG_LKDTM is not set
2044# CONFIG_FAULT_INJECTION is not set 2081# CONFIG_FAULT_INJECTION is not set
2045# CONFIG_LATENCYTOP is not set 2082# CONFIG_LATENCYTOP is not set
2083CONFIG_SYSCTL_SYSCALL_CHECK=y
2084CONFIG_HAVE_FTRACE=y
2085CONFIG_HAVE_DYNAMIC_FTRACE=y
2086# CONFIG_FTRACE is not set
2087# CONFIG_IRQSOFF_TRACER is not set
2088# CONFIG_SYSPROF_TRACER is not set
2089# CONFIG_SCHED_TRACER is not set
2090# CONFIG_CONTEXT_SWITCH_TRACER is not set
2046CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2091CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2047# CONFIG_SAMPLES is not set 2092# CONFIG_SAMPLES is not set
2048# CONFIG_KGDB is not set
2049CONFIG_HAVE_ARCH_KGDB=y 2093CONFIG_HAVE_ARCH_KGDB=y
2094# CONFIG_KGDB is not set
2050# CONFIG_STRICT_DEVMEM is not set 2095# CONFIG_STRICT_DEVMEM is not set
2096CONFIG_X86_VERBOSE_BOOTUP=y
2051CONFIG_EARLY_PRINTK=y 2097CONFIG_EARLY_PRINTK=y
2052CONFIG_DEBUG_STACKOVERFLOW=y 2098CONFIG_DEBUG_STACKOVERFLOW=y
2053CONFIG_DEBUG_STACK_USAGE=y 2099CONFIG_DEBUG_STACK_USAGE=y
2054# CONFIG_DEBUG_PAGEALLOC is not set 2100# CONFIG_DEBUG_PAGEALLOC is not set
2101# CONFIG_DEBUG_PER_CPU_MAPS is not set
2055# CONFIG_X86_PTDUMP is not set 2102# CONFIG_X86_PTDUMP is not set
2056CONFIG_DEBUG_RODATA=y 2103CONFIG_DEBUG_RODATA=y
2057# CONFIG_DEBUG_RODATA_TEST is not set 2104# CONFIG_DEBUG_RODATA_TEST is not set
2058CONFIG_DEBUG_NX_TEST=m 2105CONFIG_DEBUG_NX_TEST=m
2059# CONFIG_4KSTACKS is not set 2106# CONFIG_4KSTACKS is not set
2060CONFIG_X86_FIND_SMP_CONFIG=y
2061CONFIG_X86_MPPARSE=y
2062CONFIG_DOUBLEFAULT=y 2107CONFIG_DOUBLEFAULT=y
2108# CONFIG_MMIOTRACE is not set
2063CONFIG_IO_DELAY_TYPE_0X80=0 2109CONFIG_IO_DELAY_TYPE_0X80=0
2064CONFIG_IO_DELAY_TYPE_0XED=1 2110CONFIG_IO_DELAY_TYPE_0XED=1
2065CONFIG_IO_DELAY_TYPE_UDELAY=2 2111CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2071,6 +2117,7 @@ CONFIG_IO_DELAY_0X80=y
2071CONFIG_DEFAULT_IO_DELAY_TYPE=0 2117CONFIG_DEFAULT_IO_DELAY_TYPE=0
2072CONFIG_DEBUG_BOOT_PARAMS=y 2118CONFIG_DEBUG_BOOT_PARAMS=y
2073# CONFIG_CPA_DEBUG is not set 2119# CONFIG_CPA_DEBUG is not set
2120CONFIG_OPTIMIZE_INLINING=y
2074 2121
2075# 2122#
2076# Security options 2123# Security options
@@ -2080,7 +2127,6 @@ CONFIG_KEYS_DEBUG_PROC_KEYS=y
2080CONFIG_SECURITY=y 2127CONFIG_SECURITY=y
2081CONFIG_SECURITY_NETWORK=y 2128CONFIG_SECURITY_NETWORK=y
2082# CONFIG_SECURITY_NETWORK_XFRM is not set 2129# CONFIG_SECURITY_NETWORK_XFRM is not set
2083CONFIG_SECURITY_CAPABILITIES=y
2084CONFIG_SECURITY_FILE_CAPABILITIES=y 2130CONFIG_SECURITY_FILE_CAPABILITIES=y
2085# CONFIG_SECURITY_ROOTPLUG is not set 2131# CONFIG_SECURITY_ROOTPLUG is not set
2086CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 2132CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
@@ -2141,6 +2187,10 @@ CONFIG_CRYPTO_HMAC=y
2141# CONFIG_CRYPTO_MD4 is not set 2187# CONFIG_CRYPTO_MD4 is not set
2142CONFIG_CRYPTO_MD5=y 2188CONFIG_CRYPTO_MD5=y
2143# CONFIG_CRYPTO_MICHAEL_MIC is not set 2189# CONFIG_CRYPTO_MICHAEL_MIC is not set
2190# CONFIG_CRYPTO_RMD128 is not set
2191# CONFIG_CRYPTO_RMD160 is not set
2192# CONFIG_CRYPTO_RMD256 is not set
2193# CONFIG_CRYPTO_RMD320 is not set
2144CONFIG_CRYPTO_SHA1=y 2194CONFIG_CRYPTO_SHA1=y
2145# CONFIG_CRYPTO_SHA256 is not set 2195# CONFIG_CRYPTO_SHA256 is not set
2146# CONFIG_CRYPTO_SHA512 is not set 2196# CONFIG_CRYPTO_SHA512 is not set
@@ -2151,7 +2201,7 @@ CONFIG_CRYPTO_SHA1=y
2151# Ciphers 2201# Ciphers
2152# 2202#
2153CONFIG_CRYPTO_AES=y 2203CONFIG_CRYPTO_AES=y
2154# CONFIG_CRYPTO_AES_586 is not set 2204CONFIG_CRYPTO_AES_586=y
2155# CONFIG_CRYPTO_ANUBIS is not set 2205# CONFIG_CRYPTO_ANUBIS is not set
2156CONFIG_CRYPTO_ARC4=y 2206CONFIG_CRYPTO_ARC4=y
2157# CONFIG_CRYPTO_BLOWFISH is not set 2207# CONFIG_CRYPTO_BLOWFISH is not set
@@ -2193,6 +2243,7 @@ CONFIG_GENERIC_FIND_FIRST_BIT=y
2193CONFIG_GENERIC_FIND_NEXT_BIT=y 2243CONFIG_GENERIC_FIND_NEXT_BIT=y
2194# CONFIG_CRC_CCITT is not set 2244# CONFIG_CRC_CCITT is not set
2195# CONFIG_CRC16 is not set 2245# CONFIG_CRC16 is not set
2246CONFIG_CRC_T10DIF=y
2196# CONFIG_CRC_ITU_T is not set 2247# CONFIG_CRC_ITU_T is not set
2197CONFIG_CRC32=y 2248CONFIG_CRC32=y
2198# CONFIG_CRC7 is not set 2249# CONFIG_CRC7 is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index a40452429625..2c4b1c771e28 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,13 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.26-rc1 3# Linux kernel version: 2.6.27-rc5
4# Sun May 4 19:59:57 2008 4# Wed Sep 3 17:13:39 2008
5# 5#
6CONFIG_64BIT=y 6CONFIG_64BIT=y
7# CONFIG_X86_32 is not set 7# CONFIG_X86_32 is not set
8CONFIG_X86_64=y 8CONFIG_X86_64=y
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_DEFCONFIG_LIST="arch/x86/configs/x86_64_defconfig" 10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
11# CONFIG_GENERIC_LOCKBREAK is not set 11# CONFIG_GENERIC_LOCKBREAK is not set
12CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
13CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -53,6 +53,7 @@ CONFIG_X86_HT=y
53CONFIG_X86_BIOS_REBOOT=y 53CONFIG_X86_BIOS_REBOOT=y
54CONFIG_X86_TRAMPOLINE=y 54CONFIG_X86_TRAMPOLINE=y
55# CONFIG_KTIME_SCALAR is not set 55# CONFIG_KTIME_SCALAR is not set
56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
56 57
57# 58#
58# General setup 59# General setup
@@ -82,6 +83,7 @@ CONFIG_CGROUPS=y
82CONFIG_CGROUP_NS=y 83CONFIG_CGROUP_NS=y
83# CONFIG_CGROUP_DEVICE is not set 84# CONFIG_CGROUP_DEVICE is not set
84CONFIG_CPUSETS=y 85CONFIG_CPUSETS=y
86CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
85CONFIG_GROUP_SCHED=y 87CONFIG_GROUP_SCHED=y
86CONFIG_FAIR_GROUP_SCHED=y 88CONFIG_FAIR_GROUP_SCHED=y
87# CONFIG_RT_GROUP_SCHED is not set 89# CONFIG_RT_GROUP_SCHED is not set
@@ -105,7 +107,6 @@ CONFIG_SYSCTL=y
105# CONFIG_EMBEDDED is not set 107# CONFIG_EMBEDDED is not set
106CONFIG_UID16=y 108CONFIG_UID16=y
107CONFIG_SYSCTL_SYSCALL=y 109CONFIG_SYSCTL_SYSCALL=y
108CONFIG_SYSCTL_SYSCALL_CHECK=y
109CONFIG_KALLSYMS=y 110CONFIG_KALLSYMS=y
110CONFIG_KALLSYMS_ALL=y 111CONFIG_KALLSYMS_ALL=y
111CONFIG_KALLSYMS_EXTRA_PASS=y 112CONFIG_KALLSYMS_EXTRA_PASS=y
@@ -113,6 +114,7 @@ CONFIG_HOTPLUG=y
113CONFIG_PRINTK=y 114CONFIG_PRINTK=y
114CONFIG_BUG=y 115CONFIG_BUG=y
115CONFIG_ELF_CORE=y 116CONFIG_ELF_CORE=y
117CONFIG_PCSPKR_PLATFORM=y
116# CONFIG_COMPAT_BRK is not set 118# CONFIG_COMPAT_BRK is not set
117CONFIG_BASE_FULL=y 119CONFIG_BASE_FULL=y
118CONFIG_FUTEX=y 120CONFIG_FUTEX=y
@@ -132,25 +134,33 @@ CONFIG_MARKERS=y
132# CONFIG_OPROFILE is not set 134# CONFIG_OPROFILE is not set
133CONFIG_HAVE_OPROFILE=y 135CONFIG_HAVE_OPROFILE=y
134CONFIG_KPROBES=y 136CONFIG_KPROBES=y
137CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
135CONFIG_KRETPROBES=y 138CONFIG_KRETPROBES=y
139CONFIG_HAVE_IOREMAP_PROT=y
136CONFIG_HAVE_KPROBES=y 140CONFIG_HAVE_KPROBES=y
137CONFIG_HAVE_KRETPROBES=y 141CONFIG_HAVE_KRETPROBES=y
142# CONFIG_HAVE_ARCH_TRACEHOOK is not set
138# CONFIG_HAVE_DMA_ATTRS is not set 143# CONFIG_HAVE_DMA_ATTRS is not set
144CONFIG_USE_GENERIC_SMP_HELPERS=y
145# CONFIG_HAVE_CLK is not set
139CONFIG_PROC_PAGE_MONITOR=y 146CONFIG_PROC_PAGE_MONITOR=y
147# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
140CONFIG_SLABINFO=y 148CONFIG_SLABINFO=y
141CONFIG_RT_MUTEXES=y 149CONFIG_RT_MUTEXES=y
142# CONFIG_TINY_SHMEM is not set 150# CONFIG_TINY_SHMEM is not set
143CONFIG_BASE_SMALL=0 151CONFIG_BASE_SMALL=0
144CONFIG_MODULES=y 152CONFIG_MODULES=y
153# CONFIG_MODULE_FORCE_LOAD is not set
145CONFIG_MODULE_UNLOAD=y 154CONFIG_MODULE_UNLOAD=y
146CONFIG_MODULE_FORCE_UNLOAD=y 155CONFIG_MODULE_FORCE_UNLOAD=y
147# CONFIG_MODVERSIONS is not set 156# CONFIG_MODVERSIONS is not set
148# CONFIG_MODULE_SRCVERSION_ALL is not set 157# CONFIG_MODULE_SRCVERSION_ALL is not set
149# CONFIG_KMOD is not set 158CONFIG_KMOD=y
150CONFIG_STOP_MACHINE=y 159CONFIG_STOP_MACHINE=y
151CONFIG_BLOCK=y 160CONFIG_BLOCK=y
152CONFIG_BLK_DEV_IO_TRACE=y 161CONFIG_BLK_DEV_IO_TRACE=y
153CONFIG_BLK_DEV_BSG=y 162CONFIG_BLK_DEV_BSG=y
163# CONFIG_BLK_DEV_INTEGRITY is not set
154CONFIG_BLOCK_COMPAT=y 164CONFIG_BLOCK_COMPAT=y
155 165
156# 166#
@@ -175,20 +185,15 @@ CONFIG_NO_HZ=y
175CONFIG_HIGH_RES_TIMERS=y 185CONFIG_HIGH_RES_TIMERS=y
176CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 186CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
177CONFIG_SMP=y 187CONFIG_SMP=y
188CONFIG_X86_FIND_SMP_CONFIG=y
189CONFIG_X86_MPPARSE=y
178CONFIG_X86_PC=y 190CONFIG_X86_PC=y
179# CONFIG_X86_ELAN is not set 191# CONFIG_X86_ELAN is not set
180# CONFIG_X86_VOYAGER is not set 192# CONFIG_X86_VOYAGER is not set
181# CONFIG_X86_NUMAQ is not set
182# CONFIG_X86_SUMMIT is not set
183# CONFIG_X86_BIGSMP is not set
184# CONFIG_X86_VISWS is not set
185# CONFIG_X86_GENERICARCH is not set 193# CONFIG_X86_GENERICARCH is not set
186# CONFIG_X86_ES7000 is not set
187# CONFIG_X86_RDC321X is not set
188# CONFIG_X86_VSMP is not set 194# CONFIG_X86_VSMP is not set
189# CONFIG_PARAVIRT_GUEST is not set 195# CONFIG_PARAVIRT_GUEST is not set
190CONFIG_MEMTEST_BOOTPARAM=y 196# CONFIG_MEMTEST is not set
191CONFIG_MEMTEST_BOOTPARAM_VALUE=0
192# CONFIG_M386 is not set 197# CONFIG_M386 is not set
193# CONFIG_M486 is not set 198# CONFIG_M486 is not set
194# CONFIG_M586 is not set 199# CONFIG_M586 is not set
@@ -213,18 +218,16 @@ CONFIG_MEMTEST_BOOTPARAM_VALUE=0
213# CONFIG_MVIAC3_2 is not set 218# CONFIG_MVIAC3_2 is not set
214# CONFIG_MVIAC7 is not set 219# CONFIG_MVIAC7 is not set
215# CONFIG_MPSC is not set 220# CONFIG_MPSC is not set
216CONFIG_MCORE2=y 221# CONFIG_MCORE2 is not set
217# CONFIG_GENERIC_CPU is not set 222CONFIG_GENERIC_CPU=y
218CONFIG_X86_CPU=y 223CONFIG_X86_CPU=y
219CONFIG_X86_L1_CACHE_BYTES=64 224CONFIG_X86_L1_CACHE_BYTES=128
220CONFIG_X86_INTERNODE_CACHE_BYTES=64 225CONFIG_X86_INTERNODE_CACHE_BYTES=128
221CONFIG_X86_CMPXCHG=y 226CONFIG_X86_CMPXCHG=y
222CONFIG_X86_L1_CACHE_SHIFT=6 227CONFIG_X86_L1_CACHE_SHIFT=7
223CONFIG_X86_GOOD_APIC=y 228CONFIG_X86_WP_WORKS_OK=y
224CONFIG_X86_INTEL_USERCOPY=y
225CONFIG_X86_USE_PPRO_CHECKSUM=y
226CONFIG_X86_P6_NOP=y
227CONFIG_X86_TSC=y 229CONFIG_X86_TSC=y
230CONFIG_X86_CMPXCHG64=y
228CONFIG_X86_CMOV=y 231CONFIG_X86_CMOV=y
229CONFIG_X86_MINIMUM_CPU_FAMILY=64 232CONFIG_X86_MINIMUM_CPU_FAMILY=64
230CONFIG_X86_DEBUGCTLMSR=y 233CONFIG_X86_DEBUGCTLMSR=y
@@ -234,10 +237,11 @@ CONFIG_DMI=y
234CONFIG_GART_IOMMU=y 237CONFIG_GART_IOMMU=y
235CONFIG_CALGARY_IOMMU=y 238CONFIG_CALGARY_IOMMU=y
236CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y 239CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
240CONFIG_AMD_IOMMU=y
237CONFIG_SWIOTLB=y 241CONFIG_SWIOTLB=y
238CONFIG_IOMMU_HELPER=y 242CONFIG_IOMMU_HELPER=y
239CONFIG_NR_CPUS=4 243CONFIG_NR_CPUS=64
240# CONFIG_SCHED_SMT is not set 244CONFIG_SCHED_SMT=y
241CONFIG_SCHED_MC=y 245CONFIG_SCHED_MC=y
242# CONFIG_PREEMPT_NONE is not set 246# CONFIG_PREEMPT_NONE is not set
243CONFIG_PREEMPT_VOLUNTARY=y 247CONFIG_PREEMPT_VOLUNTARY=y
@@ -246,7 +250,8 @@ CONFIG_X86_LOCAL_APIC=y
246CONFIG_X86_IO_APIC=y 250CONFIG_X86_IO_APIC=y
247# CONFIG_X86_MCE is not set 251# CONFIG_X86_MCE is not set
248# CONFIG_I8K is not set 252# CONFIG_I8K is not set
249# CONFIG_MICROCODE is not set 253CONFIG_MICROCODE=y
254CONFIG_MICROCODE_OLD_INTERFACE=y
250CONFIG_X86_MSR=y 255CONFIG_X86_MSR=y
251CONFIG_X86_CPUID=y 256CONFIG_X86_CPUID=y
252CONFIG_NUMA=y 257CONFIG_NUMA=y
@@ -281,7 +286,8 @@ CONFIG_ZONE_DMA_FLAG=1
281CONFIG_BOUNCE=y 286CONFIG_BOUNCE=y
282CONFIG_VIRT_TO_BUS=y 287CONFIG_VIRT_TO_BUS=y
283CONFIG_MTRR=y 288CONFIG_MTRR=y
284# CONFIG_X86_PAT is not set 289# CONFIG_MTRR_SANITIZER is not set
290CONFIG_X86_PAT=y
285CONFIG_EFI=y 291CONFIG_EFI=y
286CONFIG_SECCOMP=y 292CONFIG_SECCOMP=y
287# CONFIG_HZ_100 is not set 293# CONFIG_HZ_100 is not set
@@ -313,6 +319,7 @@ CONFIG_PM_TRACE_RTC=y
313CONFIG_PM_SLEEP_SMP=y 319CONFIG_PM_SLEEP_SMP=y
314CONFIG_PM_SLEEP=y 320CONFIG_PM_SLEEP=y
315CONFIG_SUSPEND=y 321CONFIG_SUSPEND=y
322# CONFIG_PM_TEST_SUSPEND is not set
316CONFIG_SUSPEND_FREEZER=y 323CONFIG_SUSPEND_FREEZER=y
317CONFIG_HIBERNATION=y 324CONFIG_HIBERNATION=y
318CONFIG_PM_STD_PARTITION="" 325CONFIG_PM_STD_PARTITION=""
@@ -339,6 +346,7 @@ CONFIG_ACPI_NUMA=y
339CONFIG_ACPI_BLACKLIST_YEAR=0 346CONFIG_ACPI_BLACKLIST_YEAR=0
340# CONFIG_ACPI_DEBUG is not set 347# CONFIG_ACPI_DEBUG is not set
341CONFIG_ACPI_EC=y 348CONFIG_ACPI_EC=y
349# CONFIG_ACPI_PCI_SLOT is not set
342CONFIG_ACPI_POWER=y 350CONFIG_ACPI_POWER=y
343CONFIG_ACPI_SYSTEM=y 351CONFIG_ACPI_SYSTEM=y
344CONFIG_X86_PM_TIMER=y 352CONFIG_X86_PM_TIMER=y
@@ -437,10 +445,6 @@ CONFIG_IA32_EMULATION=y
437CONFIG_COMPAT=y 445CONFIG_COMPAT=y
438CONFIG_COMPAT_FOR_U64_ALIGNMENT=y 446CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
439CONFIG_SYSVIPC_COMPAT=y 447CONFIG_SYSVIPC_COMPAT=y
440
441#
442# Networking
443#
444CONFIG_NET=y 448CONFIG_NET=y
445 449
446# 450#
@@ -464,7 +468,10 @@ CONFIG_IP_FIB_HASH=y
464CONFIG_IP_MULTIPLE_TABLES=y 468CONFIG_IP_MULTIPLE_TABLES=y
465CONFIG_IP_ROUTE_MULTIPATH=y 469CONFIG_IP_ROUTE_MULTIPATH=y
466CONFIG_IP_ROUTE_VERBOSE=y 470CONFIG_IP_ROUTE_VERBOSE=y
467# CONFIG_IP_PNP is not set 471CONFIG_IP_PNP=y
472CONFIG_IP_PNP_DHCP=y
473CONFIG_IP_PNP_BOOTP=y
474CONFIG_IP_PNP_RARP=y
468# CONFIG_NET_IPIP is not set 475# CONFIG_NET_IPIP is not set
469# CONFIG_NET_IPGRE is not set 476# CONFIG_NET_IPGRE is not set
470CONFIG_IP_MROUTE=y 477CONFIG_IP_MROUTE=y
@@ -607,7 +614,6 @@ CONFIG_NET_SCHED=y
607# CONFIG_NET_SCH_HTB is not set 614# CONFIG_NET_SCH_HTB is not set
608# CONFIG_NET_SCH_HFSC is not set 615# CONFIG_NET_SCH_HFSC is not set
609# CONFIG_NET_SCH_PRIO is not set 616# CONFIG_NET_SCH_PRIO is not set
610# CONFIG_NET_SCH_RR is not set
611# CONFIG_NET_SCH_RED is not set 617# CONFIG_NET_SCH_RED is not set
612# CONFIG_NET_SCH_SFQ is not set 618# CONFIG_NET_SCH_SFQ is not set
613# CONFIG_NET_SCH_TEQL is not set 619# CONFIG_NET_SCH_TEQL is not set
@@ -669,28 +675,19 @@ CONFIG_FIB_RULES=y
669CONFIG_CFG80211=y 675CONFIG_CFG80211=y
670CONFIG_NL80211=y 676CONFIG_NL80211=y
671CONFIG_WIRELESS_EXT=y 677CONFIG_WIRELESS_EXT=y
678CONFIG_WIRELESS_EXT_SYSFS=y
672CONFIG_MAC80211=y 679CONFIG_MAC80211=y
673 680
674# 681#
675# Rate control algorithm selection 682# Rate control algorithm selection
676# 683#
684CONFIG_MAC80211_RC_PID=y
677CONFIG_MAC80211_RC_DEFAULT_PID=y 685CONFIG_MAC80211_RC_DEFAULT_PID=y
678# CONFIG_MAC80211_RC_DEFAULT_NONE is not set
679
680#
681# Selecting 'y' for an algorithm will
682#
683
684#
685# build the algorithm into mac80211.
686#
687CONFIG_MAC80211_RC_DEFAULT="pid" 686CONFIG_MAC80211_RC_DEFAULT="pid"
688CONFIG_MAC80211_RC_PID=y
689# CONFIG_MAC80211_MESH is not set 687# CONFIG_MAC80211_MESH is not set
690CONFIG_MAC80211_LEDS=y 688CONFIG_MAC80211_LEDS=y
691# CONFIG_MAC80211_DEBUGFS is not set 689# CONFIG_MAC80211_DEBUGFS is not set
692# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set 690# CONFIG_MAC80211_DEBUG_MENU is not set
693# CONFIG_MAC80211_DEBUG is not set
694# CONFIG_IEEE80211 is not set 691# CONFIG_IEEE80211 is not set
695# CONFIG_RFKILL is not set 692# CONFIG_RFKILL is not set
696# CONFIG_NET_9P is not set 693# CONFIG_NET_9P is not set
@@ -706,6 +703,8 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
706CONFIG_STANDALONE=y 703CONFIG_STANDALONE=y
707CONFIG_PREVENT_FIRMWARE_BUILD=y 704CONFIG_PREVENT_FIRMWARE_BUILD=y
708CONFIG_FW_LOADER=y 705CONFIG_FW_LOADER=y
706CONFIG_FIRMWARE_IN_KERNEL=y
707CONFIG_EXTRA_FIRMWARE=""
709# CONFIG_DEBUG_DRIVER is not set 708# CONFIG_DEBUG_DRIVER is not set
710CONFIG_DEBUG_DEVRES=y 709CONFIG_DEBUG_DEVRES=y
711# CONFIG_SYS_HYPERVISOR is not set 710# CONFIG_SYS_HYPERVISOR is not set
@@ -738,6 +737,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384
738# CONFIG_BLK_DEV_XIP is not set 737# CONFIG_BLK_DEV_XIP is not set
739# CONFIG_CDROM_PKTCDVD is not set 738# CONFIG_CDROM_PKTCDVD is not set
740# CONFIG_ATA_OVER_ETH is not set 739# CONFIG_ATA_OVER_ETH is not set
740# CONFIG_BLK_DEV_HD is not set
741CONFIG_MISC_DEVICES=y 741CONFIG_MISC_DEVICES=y
742# CONFIG_IBM_ASM is not set 742# CONFIG_IBM_ASM is not set
743# CONFIG_PHANTOM is not set 743# CONFIG_PHANTOM is not set
@@ -748,10 +748,14 @@ CONFIG_MISC_DEVICES=y
748# CONFIG_ASUS_LAPTOP is not set 748# CONFIG_ASUS_LAPTOP is not set
749# CONFIG_FUJITSU_LAPTOP is not set 749# CONFIG_FUJITSU_LAPTOP is not set
750# CONFIG_MSI_LAPTOP is not set 750# CONFIG_MSI_LAPTOP is not set
751# CONFIG_COMPAL_LAPTOP is not set
751# CONFIG_SONY_LAPTOP is not set 752# CONFIG_SONY_LAPTOP is not set
752# CONFIG_THINKPAD_ACPI is not set 753# CONFIG_THINKPAD_ACPI is not set
753# CONFIG_INTEL_MENLOW is not set 754# CONFIG_INTEL_MENLOW is not set
754# CONFIG_ENCLOSURE_SERVICES is not set 755# CONFIG_ENCLOSURE_SERVICES is not set
756# CONFIG_SGI_XP is not set
757# CONFIG_HP_ILO is not set
758# CONFIG_SGI_GRU is not set
755CONFIG_HAVE_IDE=y 759CONFIG_HAVE_IDE=y
756# CONFIG_IDE is not set 760# CONFIG_IDE is not set
757 761
@@ -790,12 +794,13 @@ CONFIG_SCSI_WAIT_SCAN=m
790# 794#
791CONFIG_SCSI_SPI_ATTRS=y 795CONFIG_SCSI_SPI_ATTRS=y
792# CONFIG_SCSI_FC_ATTRS is not set 796# CONFIG_SCSI_FC_ATTRS is not set
793# CONFIG_SCSI_ISCSI_ATTRS is not set 797CONFIG_SCSI_ISCSI_ATTRS=y
794# CONFIG_SCSI_SAS_ATTRS is not set 798# CONFIG_SCSI_SAS_ATTRS is not set
795# CONFIG_SCSI_SAS_LIBSAS is not set 799# CONFIG_SCSI_SAS_LIBSAS is not set
796# CONFIG_SCSI_SRP_ATTRS is not set 800# CONFIG_SCSI_SRP_ATTRS is not set
797# CONFIG_SCSI_LOWLEVEL is not set 801# CONFIG_SCSI_LOWLEVEL is not set
798# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 802# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
803# CONFIG_SCSI_DH is not set
799CONFIG_ATA=y 804CONFIG_ATA=y
800# CONFIG_ATA_NONSTANDARD is not set 805# CONFIG_ATA_NONSTANDARD is not set
801CONFIG_ATA_ACPI=y 806CONFIG_ATA_ACPI=y
@@ -857,6 +862,7 @@ CONFIG_PATA_OLDPIIX=y
857# CONFIG_PATA_SIS is not set 862# CONFIG_PATA_SIS is not set
858# CONFIG_PATA_VIA is not set 863# CONFIG_PATA_VIA is not set
859# CONFIG_PATA_WINBOND is not set 864# CONFIG_PATA_WINBOND is not set
865CONFIG_PATA_SCH=y
860CONFIG_MD=y 866CONFIG_MD=y
861CONFIG_BLK_DEV_MD=y 867CONFIG_BLK_DEV_MD=y
862# CONFIG_MD_LINEAR is not set 868# CONFIG_MD_LINEAR is not set
@@ -880,13 +886,16 @@ CONFIG_DM_ZERO=y
880# 886#
881# IEEE 1394 (FireWire) support 887# IEEE 1394 (FireWire) support
882# 888#
889
890#
891# Enable only one of the two stacks, unless you know what you are doing
892#
883# CONFIG_FIREWIRE is not set 893# CONFIG_FIREWIRE is not set
884# CONFIG_IEEE1394 is not set 894# CONFIG_IEEE1394 is not set
885# CONFIG_I2O is not set 895# CONFIG_I2O is not set
886CONFIG_MACINTOSH_DRIVERS=y 896CONFIG_MACINTOSH_DRIVERS=y
887CONFIG_MAC_EMUMOUSEBTN=y 897CONFIG_MAC_EMUMOUSEBTN=y
888CONFIG_NETDEVICES=y 898CONFIG_NETDEVICES=y
889# CONFIG_NETDEVICES_MULTIQUEUE is not set
890# CONFIG_IFB is not set 899# CONFIG_IFB is not set
891# CONFIG_DUMMY is not set 900# CONFIG_DUMMY is not set
892# CONFIG_BONDING is not set 901# CONFIG_BONDING is not set
@@ -896,7 +905,23 @@ CONFIG_NETDEVICES=y
896# CONFIG_VETH is not set 905# CONFIG_VETH is not set
897# CONFIG_NET_SB1000 is not set 906# CONFIG_NET_SB1000 is not set
898# CONFIG_ARCNET is not set 907# CONFIG_ARCNET is not set
899# CONFIG_PHYLIB is not set 908CONFIG_PHYLIB=y
909
910#
911# MII PHY device drivers
912#
913# CONFIG_MARVELL_PHY is not set
914# CONFIG_DAVICOM_PHY is not set
915# CONFIG_QSEMI_PHY is not set
916# CONFIG_LXT_PHY is not set
917# CONFIG_CICADA_PHY is not set
918# CONFIG_VITESSE_PHY is not set
919# CONFIG_SMSC_PHY is not set
920# CONFIG_BROADCOM_PHY is not set
921# CONFIG_ICPLUS_PHY is not set
922# CONFIG_REALTEK_PHY is not set
923# CONFIG_FIXED_PHY is not set
924# CONFIG_MDIO_BITBANG is not set
900CONFIG_NET_ETHERNET=y 925CONFIG_NET_ETHERNET=y
901CONFIG_MII=y 926CONFIG_MII=y
902# CONFIG_HAPPYMEAL is not set 927# CONFIG_HAPPYMEAL is not set
@@ -940,16 +965,15 @@ CONFIG_8139TOO_PIO=y
940# CONFIG_SIS900 is not set 965# CONFIG_SIS900 is not set
941# CONFIG_EPIC100 is not set 966# CONFIG_EPIC100 is not set
942# CONFIG_SUNDANCE is not set 967# CONFIG_SUNDANCE is not set
968# CONFIG_TLAN is not set
943# CONFIG_VIA_RHINE is not set 969# CONFIG_VIA_RHINE is not set
944# CONFIG_SC92031 is not set 970# CONFIG_SC92031 is not set
945CONFIG_NETDEV_1000=y 971CONFIG_NETDEV_1000=y
946# CONFIG_ACENIC is not set 972# CONFIG_ACENIC is not set
947# CONFIG_DL2K is not set 973# CONFIG_DL2K is not set
948CONFIG_E1000=y 974CONFIG_E1000=y
949# CONFIG_E1000_NAPI is not set
950# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set 975# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
951# CONFIG_E1000E is not set 976# CONFIG_E1000E is not set
952# CONFIG_E1000E_ENABLED is not set
953# CONFIG_IP1000 is not set 977# CONFIG_IP1000 is not set
954# CONFIG_IGB is not set 978# CONFIG_IGB is not set
955# CONFIG_NS83820 is not set 979# CONFIG_NS83820 is not set
@@ -965,6 +989,7 @@ CONFIG_TIGON3=y
965# CONFIG_BNX2 is not set 989# CONFIG_BNX2 is not set
966# CONFIG_QLA3XXX is not set 990# CONFIG_QLA3XXX is not set
967# CONFIG_ATL1 is not set 991# CONFIG_ATL1 is not set
992# CONFIG_ATL1E is not set
968CONFIG_NETDEV_10000=y 993CONFIG_NETDEV_10000=y
969# CONFIG_CHELSIO_T1 is not set 994# CONFIG_CHELSIO_T1 is not set
970# CONFIG_CHELSIO_T3 is not set 995# CONFIG_CHELSIO_T3 is not set
@@ -1003,13 +1028,14 @@ CONFIG_WLAN_80211=y
1003# CONFIG_RTL8180 is not set 1028# CONFIG_RTL8180 is not set
1004# CONFIG_RTL8187 is not set 1029# CONFIG_RTL8187 is not set
1005# CONFIG_ADM8211 is not set 1030# CONFIG_ADM8211 is not set
1031# CONFIG_MAC80211_HWSIM is not set
1006# CONFIG_P54_COMMON is not set 1032# CONFIG_P54_COMMON is not set
1007CONFIG_ATH5K=y 1033CONFIG_ATH5K=y
1008# CONFIG_ATH5K_DEBUG is not set 1034# CONFIG_ATH5K_DEBUG is not set
1009# CONFIG_IWLWIFI is not set 1035# CONFIG_ATH9K is not set
1010# CONFIG_IWLCORE is not set 1036# CONFIG_IWLCORE is not set
1011# CONFIG_IWLWIFI_LEDS is not set 1037# CONFIG_IWLWIFI_LEDS is not set
1012# CONFIG_IWL4965 is not set 1038# CONFIG_IWLAGN is not set
1013# CONFIG_IWL3945 is not set 1039# CONFIG_IWL3945 is not set
1014# CONFIG_HOSTAP is not set 1040# CONFIG_HOSTAP is not set
1015# CONFIG_B43 is not set 1041# CONFIG_B43 is not set
@@ -1088,6 +1114,7 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y
1088# CONFIG_MOUSE_PS2_TOUCHKIT is not set 1114# CONFIG_MOUSE_PS2_TOUCHKIT is not set
1089# CONFIG_MOUSE_SERIAL is not set 1115# CONFIG_MOUSE_SERIAL is not set
1090# CONFIG_MOUSE_APPLETOUCH is not set 1116# CONFIG_MOUSE_APPLETOUCH is not set
1117# CONFIG_MOUSE_BCM5974 is not set
1091# CONFIG_MOUSE_VSXXXAA is not set 1118# CONFIG_MOUSE_VSXXXAA is not set
1092CONFIG_INPUT_JOYSTICK=y 1119CONFIG_INPUT_JOYSTICK=y
1093# CONFIG_JOYSTICK_ANALOG is not set 1120# CONFIG_JOYSTICK_ANALOG is not set
@@ -1122,12 +1149,14 @@ CONFIG_INPUT_TOUCHSCREEN=y
1122# CONFIG_TOUCHSCREEN_GUNZE is not set 1149# CONFIG_TOUCHSCREEN_GUNZE is not set
1123# CONFIG_TOUCHSCREEN_ELO is not set 1150# CONFIG_TOUCHSCREEN_ELO is not set
1124# CONFIG_TOUCHSCREEN_MTOUCH is not set 1151# CONFIG_TOUCHSCREEN_MTOUCH is not set
1152# CONFIG_TOUCHSCREEN_INEXIO is not set
1125# CONFIG_TOUCHSCREEN_MK712 is not set 1153# CONFIG_TOUCHSCREEN_MK712 is not set
1126# CONFIG_TOUCHSCREEN_PENMOUNT is not set 1154# CONFIG_TOUCHSCREEN_PENMOUNT is not set
1127# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set 1155# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
1128# CONFIG_TOUCHSCREEN_TOUCHWIN is not set 1156# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
1129# CONFIG_TOUCHSCREEN_UCB1400 is not set 1157# CONFIG_TOUCHSCREEN_UCB1400 is not set
1130# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set 1158# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
1159# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
1131CONFIG_INPUT_MISC=y 1160CONFIG_INPUT_MISC=y
1132# CONFIG_INPUT_PCSPKR is not set 1161# CONFIG_INPUT_PCSPKR is not set
1133# CONFIG_INPUT_APANEL is not set 1162# CONFIG_INPUT_APANEL is not set
@@ -1155,6 +1184,7 @@ CONFIG_SERIO_LIBPS2=y
1155# Character devices 1184# Character devices
1156# 1185#
1157CONFIG_VT=y 1186CONFIG_VT=y
1187CONFIG_CONSOLE_TRANSLATIONS=y
1158CONFIG_VT_CONSOLE=y 1188CONFIG_VT_CONSOLE=y
1159CONFIG_HW_CONSOLE=y 1189CONFIG_HW_CONSOLE=y
1160CONFIG_VT_HW_CONSOLE_BINDING=y 1190CONFIG_VT_HW_CONSOLE_BINDING=y
@@ -1222,7 +1252,6 @@ CONFIG_NVRAM=y
1222# CONFIG_PC8736x_GPIO is not set 1252# CONFIG_PC8736x_GPIO is not set
1223# CONFIG_RAW_DRIVER is not set 1253# CONFIG_RAW_DRIVER is not set
1224CONFIG_HPET=y 1254CONFIG_HPET=y
1225# CONFIG_HPET_RTC_IRQ is not set
1226# CONFIG_HPET_MMAP is not set 1255# CONFIG_HPET_MMAP is not set
1227# CONFIG_HANGCHECK_TIMER is not set 1256# CONFIG_HANGCHECK_TIMER is not set
1228# CONFIG_TCG_TPM is not set 1257# CONFIG_TCG_TPM is not set
@@ -1231,42 +1260,63 @@ CONFIG_DEVPORT=y
1231CONFIG_I2C=y 1260CONFIG_I2C=y
1232CONFIG_I2C_BOARDINFO=y 1261CONFIG_I2C_BOARDINFO=y
1233# CONFIG_I2C_CHARDEV is not set 1262# CONFIG_I2C_CHARDEV is not set
1263CONFIG_I2C_HELPER_AUTO=y
1234 1264
1235# 1265#
1236# I2C Hardware Bus support 1266# I2C Hardware Bus support
1237# 1267#
1268
1269#
1270# PC SMBus host controller drivers
1271#
1238# CONFIG_I2C_ALI1535 is not set 1272# CONFIG_I2C_ALI1535 is not set
1239# CONFIG_I2C_ALI1563 is not set 1273# CONFIG_I2C_ALI1563 is not set
1240# CONFIG_I2C_ALI15X3 is not set 1274# CONFIG_I2C_ALI15X3 is not set
1241# CONFIG_I2C_AMD756 is not set 1275# CONFIG_I2C_AMD756 is not set
1242# CONFIG_I2C_AMD8111 is not set 1276# CONFIG_I2C_AMD8111 is not set
1243CONFIG_I2C_I801=y 1277CONFIG_I2C_I801=y
1244# CONFIG_I2C_I810 is not set 1278# CONFIG_I2C_ISCH is not set
1245# CONFIG_I2C_PIIX4 is not set 1279# CONFIG_I2C_PIIX4 is not set
1246# CONFIG_I2C_NFORCE2 is not set 1280# CONFIG_I2C_NFORCE2 is not set
1247# CONFIG_I2C_OCORES is not set
1248# CONFIG_I2C_PARPORT_LIGHT is not set
1249# CONFIG_I2C_PROSAVAGE is not set
1250# CONFIG_I2C_SAVAGE4 is not set
1251# CONFIG_I2C_SIMTEC is not set
1252# CONFIG_I2C_SIS5595 is not set 1281# CONFIG_I2C_SIS5595 is not set
1253# CONFIG_I2C_SIS630 is not set 1282# CONFIG_I2C_SIS630 is not set
1254# CONFIG_I2C_SIS96X is not set 1283# CONFIG_I2C_SIS96X is not set
1255# CONFIG_I2C_TAOS_EVM is not set
1256# CONFIG_I2C_STUB is not set
1257# CONFIG_I2C_TINY_USB is not set
1258# CONFIG_I2C_VIA is not set 1284# CONFIG_I2C_VIA is not set
1259# CONFIG_I2C_VIAPRO is not set 1285# CONFIG_I2C_VIAPRO is not set
1286
1287#
1288# I2C system bus drivers (mostly embedded / system-on-chip)
1289#
1290# CONFIG_I2C_OCORES is not set
1291# CONFIG_I2C_SIMTEC is not set
1292
1293#
1294# External I2C/SMBus adapter drivers
1295#
1296# CONFIG_I2C_PARPORT_LIGHT is not set
1297# CONFIG_I2C_TAOS_EVM is not set
1298# CONFIG_I2C_TINY_USB is not set
1299
1300#
1301# Graphics adapter I2C/DDC channel drivers
1302#
1260# CONFIG_I2C_VOODOO3 is not set 1303# CONFIG_I2C_VOODOO3 is not set
1304
1305#
1306# Other I2C/SMBus bus drivers
1307#
1261# CONFIG_I2C_PCA_PLATFORM is not set 1308# CONFIG_I2C_PCA_PLATFORM is not set
1309# CONFIG_I2C_STUB is not set
1262 1310
1263# 1311#
1264# Miscellaneous I2C Chip support 1312# Miscellaneous I2C Chip support
1265# 1313#
1266# CONFIG_DS1682 is not set 1314# CONFIG_DS1682 is not set
1315# CONFIG_AT24 is not set
1267# CONFIG_SENSORS_EEPROM is not set 1316# CONFIG_SENSORS_EEPROM is not set
1268# CONFIG_SENSORS_PCF8574 is not set 1317# CONFIG_SENSORS_PCF8574 is not set
1269# CONFIG_PCF8575 is not set 1318# CONFIG_PCF8575 is not set
1319# CONFIG_SENSORS_PCA9539 is not set
1270# CONFIG_SENSORS_PCF8591 is not set 1320# CONFIG_SENSORS_PCF8591 is not set
1271# CONFIG_SENSORS_MAX6875 is not set 1321# CONFIG_SENSORS_MAX6875 is not set
1272# CONFIG_SENSORS_TSL2550 is not set 1322# CONFIG_SENSORS_TSL2550 is not set
@@ -1275,6 +1325,8 @@ CONFIG_I2C_I801=y
1275# CONFIG_I2C_DEBUG_BUS is not set 1325# CONFIG_I2C_DEBUG_BUS is not set
1276# CONFIG_I2C_DEBUG_CHIP is not set 1326# CONFIG_I2C_DEBUG_CHIP is not set
1277# CONFIG_SPI is not set 1327# CONFIG_SPI is not set
1328CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
1329# CONFIG_GPIOLIB is not set
1278# CONFIG_W1 is not set 1330# CONFIG_W1 is not set
1279CONFIG_POWER_SUPPLY=y 1331CONFIG_POWER_SUPPLY=y
1280# CONFIG_POWER_SUPPLY_DEBUG is not set 1332# CONFIG_POWER_SUPPLY_DEBUG is not set
@@ -1335,8 +1387,10 @@ CONFIG_SSB_POSSIBLE=y
1335# 1387#
1336# Multifunction device drivers 1388# Multifunction device drivers
1337# 1389#
1390# CONFIG_MFD_CORE is not set
1338# CONFIG_MFD_SM501 is not set 1391# CONFIG_MFD_SM501 is not set
1339# CONFIG_HTC_PASIC3 is not set 1392# CONFIG_HTC_PASIC3 is not set
1393# CONFIG_MFD_TMIO is not set
1340 1394
1341# 1395#
1342# Multimedia devices 1396# Multimedia devices
@@ -1347,6 +1401,7 @@ CONFIG_SSB_POSSIBLE=y
1347# 1401#
1348# CONFIG_VIDEO_DEV is not set 1402# CONFIG_VIDEO_DEV is not set
1349# CONFIG_DVB_CORE is not set 1403# CONFIG_DVB_CORE is not set
1404# CONFIG_VIDEO_MEDIA is not set
1350 1405
1351# 1406#
1352# Multimedia drivers 1407# Multimedia drivers
@@ -1387,7 +1442,6 @@ CONFIG_FB_CFB_IMAGEBLIT=y
1387# CONFIG_FB_SYS_IMAGEBLIT is not set 1442# CONFIG_FB_SYS_IMAGEBLIT is not set
1388# CONFIG_FB_FOREIGN_ENDIAN is not set 1443# CONFIG_FB_FOREIGN_ENDIAN is not set
1389# CONFIG_FB_SYS_FOPS is not set 1444# CONFIG_FB_SYS_FOPS is not set
1390CONFIG_FB_DEFERRED_IO=y
1391# CONFIG_FB_SVGALIB is not set 1445# CONFIG_FB_SVGALIB is not set
1392# CONFIG_FB_MACMODES is not set 1446# CONFIG_FB_MACMODES is not set
1393# CONFIG_FB_BACKLIGHT is not set 1447# CONFIG_FB_BACKLIGHT is not set
@@ -1430,6 +1484,7 @@ CONFIG_FB_EFI=y
1430# CONFIG_FB_TRIDENT is not set 1484# CONFIG_FB_TRIDENT is not set
1431# CONFIG_FB_ARK is not set 1485# CONFIG_FB_ARK is not set
1432# CONFIG_FB_PM3 is not set 1486# CONFIG_FB_PM3 is not set
1487# CONFIG_FB_CARMINE is not set
1433# CONFIG_FB_GEODE is not set 1488# CONFIG_FB_GEODE is not set
1434# CONFIG_FB_VIRTUAL is not set 1489# CONFIG_FB_VIRTUAL is not set
1435CONFIG_BACKLIGHT_LCD_SUPPORT=y 1490CONFIG_BACKLIGHT_LCD_SUPPORT=y
@@ -1437,6 +1492,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y
1437CONFIG_BACKLIGHT_CLASS_DEVICE=y 1492CONFIG_BACKLIGHT_CLASS_DEVICE=y
1438# CONFIG_BACKLIGHT_CORGI is not set 1493# CONFIG_BACKLIGHT_CORGI is not set
1439# CONFIG_BACKLIGHT_PROGEAR is not set 1494# CONFIG_BACKLIGHT_PROGEAR is not set
1495# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
1440 1496
1441# 1497#
1442# Display device support 1498# Display device support
@@ -1449,22 +1505,13 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
1449CONFIG_VGA_CONSOLE=y 1505CONFIG_VGA_CONSOLE=y
1450CONFIG_VGACON_SOFT_SCROLLBACK=y 1506CONFIG_VGACON_SOFT_SCROLLBACK=y
1451CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 1507CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1452CONFIG_VIDEO_SELECT=y
1453CONFIG_DUMMY_CONSOLE=y 1508CONFIG_DUMMY_CONSOLE=y
1454# CONFIG_FRAMEBUFFER_CONSOLE is not set 1509# CONFIG_FRAMEBUFFER_CONSOLE is not set
1455CONFIG_LOGO=y 1510CONFIG_LOGO=y
1456# CONFIG_LOGO_LINUX_MONO is not set 1511# CONFIG_LOGO_LINUX_MONO is not set
1457# CONFIG_LOGO_LINUX_VGA16 is not set 1512# CONFIG_LOGO_LINUX_VGA16 is not set
1458CONFIG_LOGO_LINUX_CLUT224=y 1513CONFIG_LOGO_LINUX_CLUT224=y
1459
1460#
1461# Sound
1462#
1463CONFIG_SOUND=y 1514CONFIG_SOUND=y
1464
1465#
1466# Advanced Linux Sound Architecture
1467#
1468CONFIG_SND=y 1515CONFIG_SND=y
1469CONFIG_SND_TIMER=y 1516CONFIG_SND_TIMER=y
1470CONFIG_SND_PCM=y 1517CONFIG_SND_PCM=y
@@ -1482,20 +1529,14 @@ CONFIG_SND_VERBOSE_PROCFS=y
1482# CONFIG_SND_VERBOSE_PRINTK is not set 1529# CONFIG_SND_VERBOSE_PRINTK is not set
1483# CONFIG_SND_DEBUG is not set 1530# CONFIG_SND_DEBUG is not set
1484CONFIG_SND_VMASTER=y 1531CONFIG_SND_VMASTER=y
1485 1532CONFIG_SND_DRIVERS=y
1486#
1487# Generic devices
1488#
1489# CONFIG_SND_PCSP is not set 1533# CONFIG_SND_PCSP is not set
1490# CONFIG_SND_DUMMY is not set 1534# CONFIG_SND_DUMMY is not set
1491# CONFIG_SND_VIRMIDI is not set 1535# CONFIG_SND_VIRMIDI is not set
1492# CONFIG_SND_MTPAV is not set 1536# CONFIG_SND_MTPAV is not set
1493# CONFIG_SND_SERIAL_U16550 is not set 1537# CONFIG_SND_SERIAL_U16550 is not set
1494# CONFIG_SND_MPU401 is not set 1538# CONFIG_SND_MPU401 is not set
1495 1539CONFIG_SND_PCI=y
1496#
1497# PCI devices
1498#
1499# CONFIG_SND_AD1889 is not set 1540# CONFIG_SND_AD1889 is not set
1500# CONFIG_SND_ALS300 is not set 1541# CONFIG_SND_ALS300 is not set
1501# CONFIG_SND_ALS4000 is not set 1542# CONFIG_SND_ALS4000 is not set
@@ -1568,36 +1609,14 @@ CONFIG_SND_HDA_GENERIC=y
1568# CONFIG_SND_VIRTUOSO is not set 1609# CONFIG_SND_VIRTUOSO is not set
1569# CONFIG_SND_VX222 is not set 1610# CONFIG_SND_VX222 is not set
1570# CONFIG_SND_YMFPCI is not set 1611# CONFIG_SND_YMFPCI is not set
1571 1612CONFIG_SND_USB=y
1572#
1573# USB devices
1574#
1575# CONFIG_SND_USB_AUDIO is not set 1613# CONFIG_SND_USB_AUDIO is not set
1576# CONFIG_SND_USB_USX2Y is not set 1614# CONFIG_SND_USB_USX2Y is not set
1577# CONFIG_SND_USB_CAIAQ is not set 1615# CONFIG_SND_USB_CAIAQ is not set
1578 1616CONFIG_SND_PCMCIA=y
1579#
1580# PCMCIA devices
1581#
1582# CONFIG_SND_VXPOCKET is not set 1617# CONFIG_SND_VXPOCKET is not set
1583# CONFIG_SND_PDAUDIOCF is not set 1618# CONFIG_SND_PDAUDIOCF is not set
1584
1585#
1586# System on Chip audio support
1587#
1588# CONFIG_SND_SOC is not set 1619# CONFIG_SND_SOC is not set
1589
1590#
1591# ALSA SoC audio for Freescale SOCs
1592#
1593
1594#
1595# SoC Audio for the Texas Instruments OMAP
1596#
1597
1598#
1599# Open Sound System
1600#
1601# CONFIG_SOUND_PRIME is not set 1620# CONFIG_SOUND_PRIME is not set
1602CONFIG_HID_SUPPORT=y 1621CONFIG_HID_SUPPORT=y
1603CONFIG_HID=y 1622CONFIG_HID=y
@@ -1633,6 +1652,7 @@ CONFIG_USB_DEVICEFS=y
1633# CONFIG_USB_DYNAMIC_MINORS is not set 1652# CONFIG_USB_DYNAMIC_MINORS is not set
1634CONFIG_USB_SUSPEND=y 1653CONFIG_USB_SUSPEND=y
1635# CONFIG_USB_OTG is not set 1654# CONFIG_USB_OTG is not set
1655CONFIG_USB_MON=y
1636 1656
1637# 1657#
1638# USB Host Controller Drivers 1658# USB Host Controller Drivers
@@ -1656,6 +1676,7 @@ CONFIG_USB_UHCI_HCD=y
1656# 1676#
1657# CONFIG_USB_ACM is not set 1677# CONFIG_USB_ACM is not set
1658CONFIG_USB_PRINTER=y 1678CONFIG_USB_PRINTER=y
1679# CONFIG_USB_WDM is not set
1659 1680
1660# 1681#
1661# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' 1682# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
@@ -1677,6 +1698,7 @@ CONFIG_USB_STORAGE=y
1677# CONFIG_USB_STORAGE_ALAUDA is not set 1698# CONFIG_USB_STORAGE_ALAUDA is not set
1678# CONFIG_USB_STORAGE_ONETOUCH is not set 1699# CONFIG_USB_STORAGE_ONETOUCH is not set
1679# CONFIG_USB_STORAGE_KARMA is not set 1700# CONFIG_USB_STORAGE_KARMA is not set
1701# CONFIG_USB_STORAGE_SIERRA is not set
1680# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set 1702# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
1681CONFIG_USB_LIBUSUAL=y 1703CONFIG_USB_LIBUSUAL=y
1682 1704
@@ -1685,7 +1707,6 @@ CONFIG_USB_LIBUSUAL=y
1685# 1707#
1686# CONFIG_USB_MDC800 is not set 1708# CONFIG_USB_MDC800 is not set
1687# CONFIG_USB_MICROTEK is not set 1709# CONFIG_USB_MICROTEK is not set
1688CONFIG_USB_MON=y
1689 1710
1690# 1711#
1691# USB port drivers 1712# USB port drivers
@@ -1698,7 +1719,6 @@ CONFIG_USB_MON=y
1698# CONFIG_USB_EMI62 is not set 1719# CONFIG_USB_EMI62 is not set
1699# CONFIG_USB_EMI26 is not set 1720# CONFIG_USB_EMI26 is not set
1700# CONFIG_USB_ADUTUX is not set 1721# CONFIG_USB_ADUTUX is not set
1701# CONFIG_USB_AUERSWALD is not set
1702# CONFIG_USB_RIO500 is not set 1722# CONFIG_USB_RIO500 is not set
1703# CONFIG_USB_LEGOTOWER is not set 1723# CONFIG_USB_LEGOTOWER is not set
1704# CONFIG_USB_LCD is not set 1724# CONFIG_USB_LCD is not set
@@ -1715,6 +1735,7 @@ CONFIG_USB_MON=y
1715# CONFIG_USB_TRANCEVIBRATOR is not set 1735# CONFIG_USB_TRANCEVIBRATOR is not set
1716# CONFIG_USB_IOWARRIOR is not set 1736# CONFIG_USB_IOWARRIOR is not set
1717# CONFIG_USB_TEST is not set 1737# CONFIG_USB_TEST is not set
1738# CONFIG_USB_ISIGHTFW is not set
1718# CONFIG_USB_GADGET is not set 1739# CONFIG_USB_GADGET is not set
1719# CONFIG_MMC is not set 1740# CONFIG_MMC is not set
1720# CONFIG_MEMSTICK is not set 1741# CONFIG_MEMSTICK is not set
@@ -1724,7 +1745,9 @@ CONFIG_LEDS_CLASS=y
1724# 1745#
1725# LED drivers 1746# LED drivers
1726# 1747#
1748# CONFIG_LEDS_PCA9532 is not set
1727# CONFIG_LEDS_CLEVO_MAIL is not set 1749# CONFIG_LEDS_CLEVO_MAIL is not set
1750# CONFIG_LEDS_PCA955X is not set
1728 1751
1729# 1752#
1730# LED Triggers 1753# LED Triggers
@@ -1770,6 +1793,7 @@ CONFIG_RTC_INTF_DEV=y
1770# CONFIG_RTC_DRV_PCF8583 is not set 1793# CONFIG_RTC_DRV_PCF8583 is not set
1771# CONFIG_RTC_DRV_M41T80 is not set 1794# CONFIG_RTC_DRV_M41T80 is not set
1772# CONFIG_RTC_DRV_S35390A is not set 1795# CONFIG_RTC_DRV_S35390A is not set
1796# CONFIG_RTC_DRV_FM3130 is not set
1773 1797
1774# 1798#
1775# SPI RTC drivers 1799# SPI RTC drivers
@@ -1802,11 +1826,13 @@ CONFIG_DMADEVICES=y
1802# Firmware Drivers 1826# Firmware Drivers
1803# 1827#
1804# CONFIG_EDD is not set 1828# CONFIG_EDD is not set
1829CONFIG_FIRMWARE_MEMMAP=y
1805CONFIG_EFI_VARS=y 1830CONFIG_EFI_VARS=y
1806# CONFIG_DELL_RBU is not set 1831# CONFIG_DELL_RBU is not set
1807# CONFIG_DCDBAS is not set 1832# CONFIG_DCDBAS is not set
1808CONFIG_DMIID=y 1833CONFIG_DMIID=y
1809# CONFIG_ISCSI_IBFT_FIND is not set 1834CONFIG_ISCSI_IBFT_FIND=y
1835CONFIG_ISCSI_IBFT=y
1810 1836
1811# 1837#
1812# File systems 1838# File systems
@@ -1886,14 +1912,27 @@ CONFIG_HUGETLB_PAGE=y
1886# CONFIG_CRAMFS is not set 1912# CONFIG_CRAMFS is not set
1887# CONFIG_VXFS_FS is not set 1913# CONFIG_VXFS_FS is not set
1888# CONFIG_MINIX_FS is not set 1914# CONFIG_MINIX_FS is not set
1915# CONFIG_OMFS_FS is not set
1889# CONFIG_HPFS_FS is not set 1916# CONFIG_HPFS_FS is not set
1890# CONFIG_QNX4FS_FS is not set 1917# CONFIG_QNX4FS_FS is not set
1891# CONFIG_ROMFS_FS is not set 1918# CONFIG_ROMFS_FS is not set
1892# CONFIG_SYSV_FS is not set 1919# CONFIG_SYSV_FS is not set
1893# CONFIG_UFS_FS is not set 1920# CONFIG_UFS_FS is not set
1894CONFIG_NETWORK_FILESYSTEMS=y 1921CONFIG_NETWORK_FILESYSTEMS=y
1895# CONFIG_NFS_FS is not set 1922CONFIG_NFS_FS=y
1923CONFIG_NFS_V3=y
1924CONFIG_NFS_V3_ACL=y
1925CONFIG_NFS_V4=y
1926CONFIG_ROOT_NFS=y
1896# CONFIG_NFSD is not set 1927# CONFIG_NFSD is not set
1928CONFIG_LOCKD=y
1929CONFIG_LOCKD_V4=y
1930CONFIG_NFS_ACL_SUPPORT=y
1931CONFIG_NFS_COMMON=y
1932CONFIG_SUNRPC=y
1933CONFIG_SUNRPC_GSS=y
1934CONFIG_RPCSEC_GSS_KRB5=y
1935# CONFIG_RPCSEC_GSS_SPKM3 is not set
1897# CONFIG_SMB_FS is not set 1936# CONFIG_SMB_FS is not set
1898# CONFIG_CIFS is not set 1937# CONFIG_CIFS is not set
1899# CONFIG_NCP_FS is not set 1938# CONFIG_NCP_FS is not set
@@ -1967,9 +2006,9 @@ CONFIG_NLS_UTF8=y
1967# Kernel hacking 2006# Kernel hacking
1968# 2007#
1969CONFIG_TRACE_IRQFLAGS_SUPPORT=y 2008CONFIG_TRACE_IRQFLAGS_SUPPORT=y
1970# CONFIG_PRINTK_TIME is not set 2009CONFIG_PRINTK_TIME=y
1971# CONFIG_ENABLE_WARN_DEPRECATED is not set 2010CONFIG_ENABLE_WARN_DEPRECATED=y
1972# CONFIG_ENABLE_MUST_CHECK is not set 2011CONFIG_ENABLE_MUST_CHECK=y
1973CONFIG_FRAME_WARN=2048 2012CONFIG_FRAME_WARN=2048
1974CONFIG_MAGIC_SYSRQ=y 2013CONFIG_MAGIC_SYSRQ=y
1975# CONFIG_UNUSED_SYMBOLS is not set 2014# CONFIG_UNUSED_SYMBOLS is not set
@@ -1998,6 +2037,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
1998# CONFIG_DEBUG_INFO is not set 2037# CONFIG_DEBUG_INFO is not set
1999# CONFIG_DEBUG_VM is not set 2038# CONFIG_DEBUG_VM is not set
2000# CONFIG_DEBUG_WRITECOUNT is not set 2039# CONFIG_DEBUG_WRITECOUNT is not set
2040CONFIG_DEBUG_MEMORY_INIT=y
2001# CONFIG_DEBUG_LIST is not set 2041# CONFIG_DEBUG_LIST is not set
2002# CONFIG_DEBUG_SG is not set 2042# CONFIG_DEBUG_SG is not set
2003CONFIG_FRAME_POINTER=y 2043CONFIG_FRAME_POINTER=y
@@ -2008,11 +2048,20 @@ CONFIG_FRAME_POINTER=y
2008# CONFIG_LKDTM is not set 2048# CONFIG_LKDTM is not set
2009# CONFIG_FAULT_INJECTION is not set 2049# CONFIG_FAULT_INJECTION is not set
2010# CONFIG_LATENCYTOP is not set 2050# CONFIG_LATENCYTOP is not set
2051CONFIG_SYSCTL_SYSCALL_CHECK=y
2052CONFIG_HAVE_FTRACE=y
2053CONFIG_HAVE_DYNAMIC_FTRACE=y
2054# CONFIG_FTRACE is not set
2055# CONFIG_IRQSOFF_TRACER is not set
2056# CONFIG_SYSPROF_TRACER is not set
2057# CONFIG_SCHED_TRACER is not set
2058# CONFIG_CONTEXT_SWITCH_TRACER is not set
2011CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2059CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2012# CONFIG_SAMPLES is not set 2060# CONFIG_SAMPLES is not set
2013# CONFIG_KGDB is not set
2014CONFIG_HAVE_ARCH_KGDB=y 2061CONFIG_HAVE_ARCH_KGDB=y
2062# CONFIG_KGDB is not set
2015# CONFIG_STRICT_DEVMEM is not set 2063# CONFIG_STRICT_DEVMEM is not set
2064CONFIG_X86_VERBOSE_BOOTUP=y
2016CONFIG_EARLY_PRINTK=y 2065CONFIG_EARLY_PRINTK=y
2017CONFIG_DEBUG_STACKOVERFLOW=y 2066CONFIG_DEBUG_STACKOVERFLOW=y
2018CONFIG_DEBUG_STACK_USAGE=y 2067CONFIG_DEBUG_STACK_USAGE=y
@@ -2023,8 +2072,8 @@ CONFIG_DEBUG_RODATA=y
2023# CONFIG_DIRECT_GBPAGES is not set 2072# CONFIG_DIRECT_GBPAGES is not set
2024# CONFIG_DEBUG_RODATA_TEST is not set 2073# CONFIG_DEBUG_RODATA_TEST is not set
2025CONFIG_DEBUG_NX_TEST=m 2074CONFIG_DEBUG_NX_TEST=m
2026CONFIG_X86_MPPARSE=y
2027# CONFIG_IOMMU_DEBUG is not set 2075# CONFIG_IOMMU_DEBUG is not set
2076# CONFIG_MMIOTRACE is not set
2028CONFIG_IO_DELAY_TYPE_0X80=0 2077CONFIG_IO_DELAY_TYPE_0X80=0
2029CONFIG_IO_DELAY_TYPE_0XED=1 2078CONFIG_IO_DELAY_TYPE_0XED=1
2030CONFIG_IO_DELAY_TYPE_UDELAY=2 2079CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2036,6 +2085,7 @@ CONFIG_IO_DELAY_0X80=y
2036CONFIG_DEFAULT_IO_DELAY_TYPE=0 2085CONFIG_DEFAULT_IO_DELAY_TYPE=0
2037CONFIG_DEBUG_BOOT_PARAMS=y 2086CONFIG_DEBUG_BOOT_PARAMS=y
2038# CONFIG_CPA_DEBUG is not set 2087# CONFIG_CPA_DEBUG is not set
2088CONFIG_OPTIMIZE_INLINING=y
2039 2089
2040# 2090#
2041# Security options 2091# Security options
@@ -2045,7 +2095,6 @@ CONFIG_KEYS_DEBUG_PROC_KEYS=y
2045CONFIG_SECURITY=y 2095CONFIG_SECURITY=y
2046CONFIG_SECURITY_NETWORK=y 2096CONFIG_SECURITY_NETWORK=y
2047# CONFIG_SECURITY_NETWORK_XFRM is not set 2097# CONFIG_SECURITY_NETWORK_XFRM is not set
2048CONFIG_SECURITY_CAPABILITIES=y
2049CONFIG_SECURITY_FILE_CAPABILITIES=y 2098CONFIG_SECURITY_FILE_CAPABILITIES=y
2050# CONFIG_SECURITY_ROOTPLUG is not set 2099# CONFIG_SECURITY_ROOTPLUG is not set
2051CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 2100CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
@@ -2106,6 +2155,10 @@ CONFIG_CRYPTO_HMAC=y
2106# CONFIG_CRYPTO_MD4 is not set 2155# CONFIG_CRYPTO_MD4 is not set
2107CONFIG_CRYPTO_MD5=y 2156CONFIG_CRYPTO_MD5=y
2108# CONFIG_CRYPTO_MICHAEL_MIC is not set 2157# CONFIG_CRYPTO_MICHAEL_MIC is not set
2158# CONFIG_CRYPTO_RMD128 is not set
2159# CONFIG_CRYPTO_RMD160 is not set
2160# CONFIG_CRYPTO_RMD256 is not set
2161# CONFIG_CRYPTO_RMD320 is not set
2109CONFIG_CRYPTO_SHA1=y 2162CONFIG_CRYPTO_SHA1=y
2110# CONFIG_CRYPTO_SHA256 is not set 2163# CONFIG_CRYPTO_SHA256 is not set
2111# CONFIG_CRYPTO_SHA512 is not set 2164# CONFIG_CRYPTO_SHA512 is not set
@@ -2155,6 +2208,7 @@ CONFIG_GENERIC_FIND_FIRST_BIT=y
2155CONFIG_GENERIC_FIND_NEXT_BIT=y 2208CONFIG_GENERIC_FIND_NEXT_BIT=y
2156# CONFIG_CRC_CCITT is not set 2209# CONFIG_CRC_CCITT is not set
2157# CONFIG_CRC16 is not set 2210# CONFIG_CRC16 is not set
2211CONFIG_CRC_T10DIF=y
2158# CONFIG_CRC_ITU_T is not set 2212# CONFIG_CRC_ITU_T is not set
2159CONFIG_CRC32=y 2213CONFIG_CRC32=y
2160# CONFIG_CRC7 is not set 2214# CONFIG_CRC7 is not set
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3874c2de5403..903de4aa5094 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
12 12
13obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
14
13aes-i586-y := aes-i586-asm_32.o aes_glue.o 15aes-i586-y := aes-i586-asm_32.o aes_glue.o
14twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o 16twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
15salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o 17salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
new file mode 100644
index 000000000000..070afc5b6c94
--- /dev/null
+++ b/arch/x86/crypto/crc32c-intel.c
@@ -0,0 +1,197 @@
1/*
 2 * Using the hardware-provided CRC32 instruction to accelerate CRC32c computation.
 3 * CRC32C polynomial: 0x1EDC6F41 (BE) / 0x82F63B78 (LE)
 4 * CRC32 is a new instruction in Intel SSE4.2; the reference can be found at:
5 * http://www.intel.com/products/processor/manuals/
6 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
7 * Volume 2A: Instruction Set Reference, A-M
8 *
9 * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com>
10 * Copyright (c) 2008 Kent Liu <kent.liu@intel.com>
11 *
12 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 2 of the License, or (at your option)
15 * any later version.
16 *
17 */
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/string.h>
21#include <linux/kernel.h>
22#include <crypto/internal/hash.h>
23
24#include <asm/cpufeature.h>
25
26#define CHKSUM_BLOCK_SIZE 1
27#define CHKSUM_DIGEST_SIZE 4
28
29#define SCALE_F sizeof(unsigned long)
30
31#ifdef CONFIG_X86_64
32#define REX_PRE "0x48, "
33#else
34#define REX_PRE
35#endif
36
37static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
38{
39 while (length--) {
40 __asm__ __volatile__(
41 ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
42 :"=S"(crc)
43 :"0"(crc), "c"(*data)
44 );
45 data++;
46 }
47
48 return crc;
49}
50
51static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
52{
53 unsigned int iquotient = len / SCALE_F;
54 unsigned int iremainder = len % SCALE_F;
55 unsigned long *ptmp = (unsigned long *)p;
56
57 while (iquotient--) {
58 __asm__ __volatile__(
59 ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
60 :"=S"(crc)
61 :"0"(crc), "c"(*ptmp)
62 );
63 ptmp++;
64 }
65
66 if (iremainder)
67 crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
68 iremainder);
69
70 return crc;
71}
72
73/*
 74 * Setting the seed allows arbitrary accumulators and a flexible XOR policy.
75 * If your algorithm starts with ~0, then XOR with ~0 before you set
76 * the seed.
77 */
78static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key,
79 unsigned int keylen)
80{
81 u32 *mctx = crypto_ahash_ctx(hash);
82
83 if (keylen != sizeof(u32)) {
84 crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
85 return -EINVAL;
86 }
87 *mctx = le32_to_cpup((__le32 *)key);
88 return 0;
89}
90
91static int crc32c_intel_init(struct ahash_request *req)
92{
93 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
94 u32 *crcp = ahash_request_ctx(req);
95
96 *crcp = *mctx;
97
98 return 0;
99}
100
101static int crc32c_intel_update(struct ahash_request *req)
102{
103 struct crypto_hash_walk walk;
104 u32 *crcp = ahash_request_ctx(req);
105 u32 crc = *crcp;
106 int nbytes;
107
108 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
109 nbytes = crypto_hash_walk_done(&walk, 0))
110 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
111
112 *crcp = crc;
113 return 0;
114}
115
116static int crc32c_intel_final(struct ahash_request *req)
117{
118 u32 *crcp = ahash_request_ctx(req);
119
120 *(__le32 *)req->result = ~cpu_to_le32p(crcp);
121 return 0;
122}
123
124static int crc32c_intel_digest(struct ahash_request *req)
125{
126 struct crypto_hash_walk walk;
127 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
128 u32 crc = *mctx;
129 int nbytes;
130
131 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
132 nbytes = crypto_hash_walk_done(&walk, 0))
133 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
134
135 *(__le32 *)req->result = ~cpu_to_le32(crc);
136 return 0;
137}
138
139static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
140{
141 u32 *key = crypto_tfm_ctx(tfm);
142
143 *key = ~0;
144
145 tfm->crt_ahash.reqsize = sizeof(u32);
146
147 return 0;
148}
149
150static struct crypto_alg alg = {
151 .cra_name = "crc32c",
152 .cra_driver_name = "crc32c-intel",
153 .cra_priority = 200,
154 .cra_flags = CRYPTO_ALG_TYPE_AHASH,
155 .cra_blocksize = CHKSUM_BLOCK_SIZE,
156 .cra_alignmask = 3,
157 .cra_ctxsize = sizeof(u32),
158 .cra_module = THIS_MODULE,
159 .cra_list = LIST_HEAD_INIT(alg.cra_list),
160 .cra_init = crc32c_intel_cra_init,
161 .cra_type = &crypto_ahash_type,
162 .cra_u = {
163 .ahash = {
164 .digestsize = CHKSUM_DIGEST_SIZE,
165 .setkey = crc32c_intel_setkey,
166 .init = crc32c_intel_init,
167 .update = crc32c_intel_update,
168 .final = crc32c_intel_final,
169 .digest = crc32c_intel_digest,
170 }
171 }
172};
173
174
175static int __init crc32c_intel_mod_init(void)
176{
177 if (cpu_has_xmm4_2)
178 return crypto_register_alg(&alg);
179 else
180 return -ENODEV;
181}
182
183static void __exit crc32c_intel_mod_fini(void)
184{
185 crypto_unregister_alg(&alg);
186}
187
188module_init(crc32c_intel_mod_init);
189module_exit(crc32c_intel_mod_fini);
190
191MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
 192MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel hardware.");
193MODULE_LICENSE("GPL");
194
195MODULE_ALIAS("crc32c");
196MODULE_ALIAS("crc32c-intel");
197
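
For readers without the Intel manual at hand, the sketch below shows the same SSE4.2 CRC32 instruction that crc32c-intel.c emits via raw .byte sequences, but driven from user space through the compiler intrinsics in <nmmintrin.h>. It is purely illustrative and not part of the patch; as with the kernel wrapper, callers seed with ~0 and invert the result to obtain the conventional CRC32c value.

/*
 * Illustrative user-space sketch (not part of this patch).
 * Build with: gcc -O2 -msse4.2 crc32c_demo.c
 */
#include <nmmintrin.h>	/* _mm_crc32_u8 / _mm_crc32_u64 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c_sse42(uint32_t crc, const unsigned char *p, size_t len)
{
	/* word-at-a-time main loop, mirroring crc32c_intel_le_hw() */
	while (len >= sizeof(uint64_t)) {
		uint64_t v;
		memcpy(&v, p, sizeof(v));
		crc = (uint32_t)_mm_crc32_u64(crc, v);	/* 64-bit form: x86_64 only */
		p   += sizeof(v);
		len -= sizeof(v);
	}
	/* byte tail, mirroring crc32c_intel_le_hw_byte() */
	while (len--)
		crc = _mm_crc32_u8(crc, *p++);
	return crc;
}

/* usage: crc = ~crc32c_sse42(~0u, buf, buflen); */
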
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 58cccb6483b0..127ec3f07214 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -85,8 +85,10 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
85 dump->regs.ax = regs->ax; 85 dump->regs.ax = regs->ax;
86 dump->regs.ds = current->thread.ds; 86 dump->regs.ds = current->thread.ds;
87 dump->regs.es = current->thread.es; 87 dump->regs.es = current->thread.es;
88 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; 88 savesegment(fs, fs);
89 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; 89 dump->regs.fs = fs;
90 savesegment(gs, gs);
91 dump->regs.gs = gs;
90 dump->regs.orig_ax = regs->orig_ax; 92 dump->regs.orig_ax = regs->orig_ax;
91 dump->regs.ip = regs->ip; 93 dump->regs.ip = regs->ip;
92 dump->regs.cs = regs->cs; 94 dump->regs.cs = regs->cs;
@@ -430,8 +432,9 @@ beyond_if:
430 current->mm->start_stack = 432 current->mm->start_stack =
431 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); 433 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
432 /* start thread */ 434 /* start thread */
433 asm volatile("movl %0,%%fs" :: "r" (0)); \ 435 loadsegment(fs, 0);
434 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); 436 loadsegment(ds, __USER32_DS);
437 loadsegment(es, __USER32_DS);
435 load_gs_index(0); 438 load_gs_index(0);
436 (regs)->ip = ex.a_entry; 439 (regs)->ip = ex.a_entry;
437 (regs)->sp = current->mm->start_stack; 440 (regs)->sp = current->mm->start_stack;
@@ -441,12 +444,6 @@ beyond_if:
441 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 444 regs->r8 = regs->r9 = regs->r10 = regs->r11 =
442 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; 445 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
443 set_fs(USER_DS); 446 set_fs(USER_DS);
444 if (unlikely(current->ptrace & PT_PTRACED)) {
445 if (current->ptrace & PT_TRACE_EXEC)
446 ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
447 else
448 send_sig(SIGTRAP, current, 0);
449 }
450 return 0; 447 return 0;
451} 448}
452 449
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 20af4c79579a..4bc02b23674b 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -179,9 +179,10 @@ struct sigframe
179 u32 pretcode; 179 u32 pretcode;
180 int sig; 180 int sig;
181 struct sigcontext_ia32 sc; 181 struct sigcontext_ia32 sc;
182 struct _fpstate_ia32 fpstate; 182 struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */
183 unsigned int extramask[_COMPAT_NSIG_WORDS-1]; 183 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
184 char retcode[8]; 184 char retcode[8];
185 /* fp state follows here */
185}; 186};
186 187
187struct rt_sigframe 188struct rt_sigframe
@@ -192,8 +193,8 @@ struct rt_sigframe
192 u32 puc; 193 u32 puc;
193 compat_siginfo_t info; 194 compat_siginfo_t info;
194 struct ucontext_ia32 uc; 195 struct ucontext_ia32 uc;
195 struct _fpstate_ia32 fpstate;
196 char retcode[8]; 196 char retcode[8];
197 /* fp state follows here */
197}; 198};
198 199
199#define COPY(x) { \ 200#define COPY(x) { \
@@ -206,7 +207,7 @@ struct rt_sigframe
206 { unsigned int cur; \ 207 { unsigned int cur; \
207 unsigned short pre; \ 208 unsigned short pre; \
208 err |= __get_user(pre, &sc->seg); \ 209 err |= __get_user(pre, &sc->seg); \
209 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ 210 savesegment(seg, cur); \
210 pre |= mask; \ 211 pre |= mask; \
211 if (pre != cur) loadsegment(seg, pre); } 212 if (pre != cur) loadsegment(seg, pre); }
212 213
@@ -215,7 +216,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
215 unsigned int *peax) 216 unsigned int *peax)
216{ 217{
217 unsigned int tmpflags, gs, oldgs, err = 0; 218 unsigned int tmpflags, gs, oldgs, err = 0;
218 struct _fpstate_ia32 __user *buf; 219 void __user *buf;
219 u32 tmp; 220 u32 tmp;
220 221
221 /* Always make any pending restarted system calls return -EINTR */ 222 /* Always make any pending restarted system calls return -EINTR */
@@ -235,7 +236,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
235 */ 236 */
236 err |= __get_user(gs, &sc->gs); 237 err |= __get_user(gs, &sc->gs);
237 gs |= 3; 238 gs |= 3;
238 asm("movl %%gs,%0" : "=r" (oldgs)); 239 savesegment(gs, oldgs);
239 if (gs != oldgs) 240 if (gs != oldgs)
240 load_gs_index(gs); 241 load_gs_index(gs);
241 242
@@ -259,26 +260,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
259 260
260 err |= __get_user(tmp, &sc->fpstate); 261 err |= __get_user(tmp, &sc->fpstate);
261 buf = compat_ptr(tmp); 262 buf = compat_ptr(tmp);
262 if (buf) { 263 err |= restore_i387_xstate_ia32(buf);
263 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
264 goto badframe;
265 err |= restore_i387_ia32(buf);
266 } else {
267 struct task_struct *me = current;
268
269 if (used_math()) {
270 clear_fpu(me);
271 clear_used_math();
272 }
273 }
274 264
275 err |= __get_user(tmp, &sc->ax); 265 err |= __get_user(tmp, &sc->ax);
276 *peax = tmp; 266 *peax = tmp;
277 267
278 return err; 268 return err;
279
280badframe:
281 return 1;
282} 269}
283 270
284asmlinkage long sys32_sigreturn(struct pt_regs *regs) 271asmlinkage long sys32_sigreturn(struct pt_regs *regs)
@@ -350,46 +337,42 @@ badframe:
350 */ 337 */
351 338
352static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, 339static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
353 struct _fpstate_ia32 __user *fpstate, 340 void __user *fpstate,
354 struct pt_regs *regs, unsigned int mask) 341 struct pt_regs *regs, unsigned int mask)
355{ 342{
356 int tmp, err = 0; 343 int tmp, err = 0;
357 344
358 tmp = 0; 345 savesegment(gs, tmp);
359 __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
360 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 346 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
361 __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); 347 savesegment(fs, tmp);
362 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 348 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
363 __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); 349 savesegment(ds, tmp);
364 err |= __put_user(tmp, (unsigned int __user *)&sc->ds); 350 err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
365 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); 351 savesegment(es, tmp);
366 err |= __put_user(tmp, (unsigned int __user *)&sc->es); 352 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
367 353
368 err |= __put_user((u32)regs->di, &sc->di); 354 err |= __put_user(regs->di, &sc->di);
369 err |= __put_user((u32)regs->si, &sc->si); 355 err |= __put_user(regs->si, &sc->si);
370 err |= __put_user((u32)regs->bp, &sc->bp); 356 err |= __put_user(regs->bp, &sc->bp);
371 err |= __put_user((u32)regs->sp, &sc->sp); 357 err |= __put_user(regs->sp, &sc->sp);
372 err |= __put_user((u32)regs->bx, &sc->bx); 358 err |= __put_user(regs->bx, &sc->bx);
373 err |= __put_user((u32)regs->dx, &sc->dx); 359 err |= __put_user(regs->dx, &sc->dx);
374 err |= __put_user((u32)regs->cx, &sc->cx); 360 err |= __put_user(regs->cx, &sc->cx);
375 err |= __put_user((u32)regs->ax, &sc->ax); 361 err |= __put_user(regs->ax, &sc->ax);
376 err |= __put_user((u32)regs->cs, &sc->cs); 362 err |= __put_user(regs->cs, &sc->cs);
377 err |= __put_user((u32)regs->ss, &sc->ss); 363 err |= __put_user(regs->ss, &sc->ss);
378 err |= __put_user(current->thread.trap_no, &sc->trapno); 364 err |= __put_user(current->thread.trap_no, &sc->trapno);
379 err |= __put_user(current->thread.error_code, &sc->err); 365 err |= __put_user(current->thread.error_code, &sc->err);
380 err |= __put_user((u32)regs->ip, &sc->ip); 366 err |= __put_user(regs->ip, &sc->ip);
381 err |= __put_user((u32)regs->flags, &sc->flags); 367 err |= __put_user(regs->flags, &sc->flags);
382 err |= __put_user((u32)regs->sp, &sc->sp_at_signal); 368 err |= __put_user(regs->sp, &sc->sp_at_signal);
383 369
384 tmp = save_i387_ia32(fpstate); 370 tmp = save_i387_xstate_ia32(fpstate);
385 if (tmp < 0) 371 if (tmp < 0)
386 err = -EFAULT; 372 err = -EFAULT;
387 else { 373 else
388 clear_used_math();
389 stts();
390 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), 374 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
391 &sc->fpstate); 375 &sc->fpstate);
392 }
393 376
394 /* non-iBCS2 extensions.. */ 377 /* non-iBCS2 extensions.. */
395 err |= __put_user(mask, &sc->oldmask); 378 err |= __put_user(mask, &sc->oldmask);
@@ -402,7 +385,8 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
402 * Determine which stack to use.. 385 * Determine which stack to use..
403 */ 386 */
404static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 387static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
405 size_t frame_size) 388 size_t frame_size,
389 void **fpstate)
406{ 390{
407 unsigned long sp; 391 unsigned long sp;
408 392
@@ -421,6 +405,11 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
421 ka->sa.sa_restorer) 405 ka->sa.sa_restorer)
422 sp = (unsigned long) ka->sa.sa_restorer; 406 sp = (unsigned long) ka->sa.sa_restorer;
423 407
408 if (used_math()) {
409 sp = sp - sig_xstate_ia32_size;
410 *fpstate = (struct _fpstate_ia32 *) sp;
411 }
412
424 sp -= frame_size; 413 sp -= frame_size;
425 /* Align the stack pointer according to the i386 ABI, 414 /* Align the stack pointer according to the i386 ABI,
426 * i.e. so that on function entry ((sp + 4) & 15) == 0. */ 415 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
@@ -434,6 +423,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
434 struct sigframe __user *frame; 423 struct sigframe __user *frame;
435 void __user *restorer; 424 void __user *restorer;
436 int err = 0; 425 int err = 0;
426 void __user *fpstate = NULL;
437 427
438 /* copy_to_user optimizes that into a single 8 byte store */ 428 /* copy_to_user optimizes that into a single 8 byte store */
439 static const struct { 429 static const struct {
@@ -448,25 +438,21 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
448 0, 438 0,
449 }; 439 };
450 440
451 frame = get_sigframe(ka, regs, sizeof(*frame)); 441 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
452 442
453 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 443 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
454 goto give_sigsegv; 444 return -EFAULT;
455 445
456 err |= __put_user(sig, &frame->sig); 446 if (__put_user(sig, &frame->sig))
457 if (err) 447 return -EFAULT;
458 goto give_sigsegv;
459 448
460 err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, 449 if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
461 set->sig[0]); 450 return -EFAULT;
462 if (err)
463 goto give_sigsegv;
464 451
465 if (_COMPAT_NSIG_WORDS > 1) { 452 if (_COMPAT_NSIG_WORDS > 1) {
466 err |= __copy_to_user(frame->extramask, &set->sig[1], 453 if (__copy_to_user(frame->extramask, &set->sig[1],
467 sizeof(frame->extramask)); 454 sizeof(frame->extramask)))
468 if (err) 455 return -EFAULT;
469 goto give_sigsegv;
470 } 456 }
471 457
472 if (ka->sa.sa_flags & SA_RESTORER) { 458 if (ka->sa.sa_flags & SA_RESTORER) {
@@ -487,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
487 */ 473 */
488 err |= __copy_to_user(frame->retcode, &code, 8); 474 err |= __copy_to_user(frame->retcode, &code, 8);
489 if (err) 475 if (err)
490 goto give_sigsegv; 476 return -EFAULT;
491 477
492 /* Set up registers for signal handler */ 478 /* Set up registers for signal handler */
493 regs->sp = (unsigned long) frame; 479 regs->sp = (unsigned long) frame;
@@ -498,8 +484,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
498 regs->dx = 0; 484 regs->dx = 0;
499 regs->cx = 0; 485 regs->cx = 0;
500 486
501 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 487 loadsegment(ds, __USER32_DS);
502 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 488 loadsegment(es, __USER32_DS);
503 489
504 regs->cs = __USER32_CS; 490 regs->cs = __USER32_CS;
505 regs->ss = __USER32_DS; 491 regs->ss = __USER32_DS;
@@ -510,10 +496,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
510#endif 496#endif
511 497
512 return 0; 498 return 0;
513
514give_sigsegv:
515 force_sigsegv(sig, current);
516 return -EFAULT;
517} 499}
518 500
519int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 501int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
@@ -522,6 +504,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
522 struct rt_sigframe __user *frame; 504 struct rt_sigframe __user *frame;
523 void __user *restorer; 505 void __user *restorer;
524 int err = 0; 506 int err = 0;
507 void __user *fpstate = NULL;
525 508
526 /* __copy_to_user optimizes that into a single 8 byte store */ 509 /* __copy_to_user optimizes that into a single 8 byte store */
527 static const struct { 510 static const struct {
@@ -537,30 +520,33 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
537 0, 520 0,
538 }; 521 };
539 522
540 frame = get_sigframe(ka, regs, sizeof(*frame)); 523 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
541 524
542 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 525 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
543 goto give_sigsegv; 526 return -EFAULT;
544 527
545 err |= __put_user(sig, &frame->sig); 528 err |= __put_user(sig, &frame->sig);
546 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 529 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
547 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 530 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
548 err |= copy_siginfo_to_user32(&frame->info, info); 531 err |= copy_siginfo_to_user32(&frame->info, info);
549 if (err) 532 if (err)
550 goto give_sigsegv; 533 return -EFAULT;
551 534
552 /* Create the ucontext. */ 535 /* Create the ucontext. */
553 err |= __put_user(0, &frame->uc.uc_flags); 536 if (cpu_has_xsave)
537 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
538 else
539 err |= __put_user(0, &frame->uc.uc_flags);
554 err |= __put_user(0, &frame->uc.uc_link); 540 err |= __put_user(0, &frame->uc.uc_link);
555 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 541 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
556 err |= __put_user(sas_ss_flags(regs->sp), 542 err |= __put_user(sas_ss_flags(regs->sp),
557 &frame->uc.uc_stack.ss_flags); 543 &frame->uc.uc_stack.ss_flags);
558 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 544 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
559 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 545 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
560 regs, set->sig[0]); 546 regs, set->sig[0]);
561 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 547 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
562 if (err) 548 if (err)
563 goto give_sigsegv; 549 return -EFAULT;
564 550
565 if (ka->sa.sa_flags & SA_RESTORER) 551 if (ka->sa.sa_flags & SA_RESTORER)
566 restorer = ka->sa.sa_restorer; 552 restorer = ka->sa.sa_restorer;
@@ -575,7 +561,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
575 */ 561 */
576 err |= __copy_to_user(frame->retcode, &code, 8); 562 err |= __copy_to_user(frame->retcode, &code, 8);
577 if (err) 563 if (err)
578 goto give_sigsegv; 564 return -EFAULT;
579 565
580 /* Set up registers for signal handler */ 566 /* Set up registers for signal handler */
581 regs->sp = (unsigned long) frame; 567 regs->sp = (unsigned long) frame;
@@ -591,8 +577,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
591 regs->dx = (unsigned long) &frame->info; 577 regs->dx = (unsigned long) &frame->info;
592 regs->cx = (unsigned long) &frame->uc; 578 regs->cx = (unsigned long) &frame->uc;
593 579
594 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 580 loadsegment(ds, __USER32_DS);
595 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 581 loadsegment(es, __USER32_DS);
596 582
597 regs->cs = __USER32_CS; 583 regs->cs = __USER32_CS;
598 regs->ss = __USER32_DS; 584 regs->ss = __USER32_DS;
@@ -603,8 +589,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
603#endif 589#endif
604 590
605 return 0; 591 return 0;
606
607give_sigsegv:
608 force_sigsegv(sig, current);
609 return -EFAULT;
610} 592}
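
The segment-register accesses above are converted from open-coded inline assembly to the kernel's savesegment()/loadsegment() helpers. As a rough orientation only (the authoritative definitions live in the x86 system headers, and the real loadsegment() additionally carries an exception-table fixup so that a bad selector is safely reloaded as zero), the helpers boil down to something like:

/* sketch only -- see the arch/x86 headers for the real macros */
#define savesegment_sketch(seg, value)				\
	asm("mov %%" #seg ",%0" : "=r" (value) : : "memory")

#define loadsegment_sketch(seg, value)				\
	asm volatile("mov %0,%%" #seg : : "r" (value) : "memory")

Using the helpers keeps the segment accesses in one place and lets 32-bit and 64-bit builds share the same call sites, which is why the a.out and signal paths above now read identically on both.
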
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index d3c64088b981..beda4232ce69 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -556,15 +556,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
556 return ret; 556 return ret;
557} 557}
558 558
559/* These are here just in case some old ia32 binary calls it. */
560asmlinkage long sys32_pause(void)
561{
562 current->state = TASK_INTERRUPTIBLE;
563 schedule();
564 return -ERESTARTNOHAND;
565}
566
567
568#ifdef CONFIG_SYSCTL_SYSCALL 559#ifdef CONFIG_SYSCTL_SYSCALL
569struct sysctl_ia32 { 560struct sysctl_ia32 {
570 unsigned int name; 561 unsigned int name;
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3db651fc8ec5..5098585f87ce 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE
10# Do not profile debug and lowlevel utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg 11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt.o = -pg 13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
14endif 14endif
15 15
16# 16#
@@ -38,7 +38,7 @@ obj-y += tsc.o io_delay.o rtc.o
38 38
39obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 39obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
40obj-y += process.o 40obj-y += process.o
41obj-y += i387.o 41obj-y += i387.o xsave.o
42obj-y += ptrace.o 42obj-y += ptrace.o
43obj-y += ds.o 43obj-y += ds.o
44obj-$(CONFIG_X86_32) += tls.o 44obj-$(CONFIG_X86_32) += tls.o
@@ -51,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
51obj-$(CONFIG_MCA) += mca_32.o 51obj-$(CONFIG_MCA) += mca_32.o
52obj-$(CONFIG_X86_MSR) += msr.o 52obj-$(CONFIG_X86_MSR) += msr.o
53obj-$(CONFIG_X86_CPUID) += cpuid.o 53obj-$(CONFIG_X86_CPUID) += cpuid.o
54obj-$(CONFIG_MICROCODE) += microcode.o
55obj-$(CONFIG_PCI) += early-quirks.o 54obj-$(CONFIG_PCI) += early-quirks.o
56apm-y := apm_32.o 55apm-y := apm_32.o
57obj-$(CONFIG_APM) += apm.o 56obj-$(CONFIG_APM) += apm.o
@@ -69,6 +68,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
69obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 68obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
70obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 69obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
71obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 70obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
71obj-$(CONFIG_X86_ES7000) += es7000_32.o
72obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o 72obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
73obj-y += vsmp_64.o 73obj-y += vsmp_64.o
74obj-$(CONFIG_KPROBES) += kprobes.o 74obj-$(CONFIG_KPROBES) += kprobes.o
@@ -89,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
89obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o 89obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
90obj-$(CONFIG_KVM_GUEST) += kvm.o 90obj-$(CONFIG_KVM_GUEST) += kvm.o
91obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 91obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
92obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 92obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
93obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 93obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
94 94
95obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 95obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
@@ -99,11 +99,18 @@ scx200-y += scx200_32.o
99 99
100obj-$(CONFIG_OLPC) += olpc.o 100obj-$(CONFIG_OLPC) += olpc.o
101 101
102microcode-y := microcode_core.o
103microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
104microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
105obj-$(CONFIG_MICROCODE) += microcode.o
106
102### 107###
103# 64 bit specific files 108# 64 bit specific files
104ifeq ($(CONFIG_X86_64),y) 109ifeq ($(CONFIG_X86_64),y)
105 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o 110 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
106 obj-y += bios_uv.o 111 obj-y += bios_uv.o
112 obj-y += genx2apic_cluster.o
113 obj-y += genx2apic_phys.o
107 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 114 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
108 obj-$(CONFIG_AUDIT) += audit_64.o 115 obj-$(CONFIG_AUDIT) += audit_64.o
109 116
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index fa88a1d71290..eb875cdc7367 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
58#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
59 59
60#include <asm/proto.h> 60#include <asm/proto.h>
61#include <asm/genapic.h>
62 61
63#else /* X86 */ 62#else /* X86 */
64 63
@@ -158,6 +157,16 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
158struct acpi_mcfg_allocation *pci_mmcfg_config; 157struct acpi_mcfg_allocation *pci_mmcfg_config;
159int pci_mmcfg_config_num; 158int pci_mmcfg_config_num;
160 159
160static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
161
162static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
163{
164 if (!strcmp(mcfg->header.oem_id, "SGI"))
165 acpi_mcfg_64bit_base_addr = TRUE;
166
167 return 0;
168}
169
161int __init acpi_parse_mcfg(struct acpi_table_header *header) 170int __init acpi_parse_mcfg(struct acpi_table_header *header)
162{ 171{
163 struct acpi_table_mcfg *mcfg; 172 struct acpi_table_mcfg *mcfg;
@@ -190,8 +199,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header)
190 } 199 }
191 200
192 memcpy(pci_mmcfg_config, &mcfg[1], config_size); 201 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
202
203 acpi_mcfg_oem_check(mcfg);
204
193 for (i = 0; i < pci_mmcfg_config_num; ++i) { 205 for (i = 0; i < pci_mmcfg_config_num; ++i) {
194 if (pci_mmcfg_config[i].address > 0xFFFFFFFF) { 206 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
207 !acpi_mcfg_64bit_base_addr) {
195 printk(KERN_ERR PREFIX 208 printk(KERN_ERR PREFIX
196 "MMCONFIG not in low 4GB of memory\n"); 209 "MMCONFIG not in low 4GB of memory\n");
197 kfree(pci_mmcfg_config); 210 kfree(pci_mmcfg_config);
@@ -239,10 +252,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
239 return; 252 return;
240 } 253 }
241 254
242#ifdef CONFIG_X86_32
243 if (boot_cpu_physical_apicid != -1U) 255 if (boot_cpu_physical_apicid != -1U)
244 ver = apic_version[boot_cpu_physical_apicid]; 256 ver = apic_version[boot_cpu_physical_apicid];
245#endif
246 257
247 generic_processor_info(id, ver); 258 generic_processor_info(id, ver);
248} 259}
@@ -761,11 +772,9 @@ static void __init acpi_register_lapic_address(unsigned long address)
761 772
762 set_fixmap_nocache(FIX_APIC_BASE, address); 773 set_fixmap_nocache(FIX_APIC_BASE, address);
763 if (boot_cpu_physical_apicid == -1U) { 774 if (boot_cpu_physical_apicid == -1U) {
764 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 775 boot_cpu_physical_apicid = read_apic_id();
765#ifdef CONFIG_X86_32
766 apic_version[boot_cpu_physical_apicid] = 776 apic_version[boot_cpu_physical_apicid] =
767 GET_APIC_VERSION(apic_read(APIC_LVR)); 777 GET_APIC_VERSION(apic_read(APIC_LVR));
768#endif
769 } 778 }
770} 779}
771 780
@@ -1337,7 +1346,9 @@ static void __init acpi_process_madt(void)
1337 acpi_ioapic = 1; 1346 acpi_ioapic = 1;
1338 1347
1339 smp_found_config = 1; 1348 smp_found_config = 1;
1349#ifdef CONFIG_X86_32
1340 setup_apic_routing(); 1350 setup_apic_routing();
1351#endif
1341 } 1352 }
1342 } 1353 }
1343 if (error == -EINVAL) { 1354 if (error == -EINVAL) {
@@ -1407,8 +1418,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
1407 */ 1418 */
1408static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) 1419static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
1409{ 1420{
1410 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); 1421 /*
1411 acpi_skip_timer_override = 1; 1422 * The ati_ixp4x0_rev() early PCI quirk should have set
1423 * the acpi_skip_timer_override flag already:
1424 */
1425 if (!acpi_skip_timer_override) {
1426 WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
1427 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
1428 d->ident);
1429 acpi_skip_timer_override = 1;
1430 }
1412 return 0; 1431 return 0;
1413} 1432}
1414 1433
@@ -1591,6 +1610,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1591 */ 1610 */
1592 { 1611 {
1593 .callback = dmi_ignore_irq0_timer_override, 1612 .callback = dmi_ignore_irq0_timer_override,
1613 .ident = "HP nx6115 laptop",
1614 .matches = {
1615 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1616 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
1617 },
1618 },
1619 {
1620 .callback = dmi_ignore_irq0_timer_override,
1594 .ident = "HP NX6125 laptop", 1621 .ident = "HP NX6125 laptop",
1595 .matches = { 1622 .matches = {
1596 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), 1623 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1605,6 +1632,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1605 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), 1632 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
1606 }, 1633 },
1607 }, 1634 },
1635 {
1636 .callback = dmi_ignore_irq0_timer_override,
1637 .ident = "HP 6715b laptop",
1638 .matches = {
1639 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1640 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
1641 },
1642 },
1608 {} 1643 {}
1609}; 1644};
1610 1645
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 9220cf46aa10..c2502eb9aa83 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -73,7 +73,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
73 struct cpuinfo_x86 *c = &cpu_data(cpu); 73 struct cpuinfo_x86 *c = &cpu_data(cpu);
74 74
75 cpumask_t saved_mask; 75 cpumask_t saved_mask;
76 cpumask_of_cpu_ptr(new_mask, cpu);
77 int retval; 76 int retval;
78 unsigned int eax, ebx, ecx, edx; 77 unsigned int eax, ebx, ecx, edx;
79 unsigned int edx_part; 78 unsigned int edx_part;
@@ -92,7 +91,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
92 91
93 /* Make sure we are running on right CPU */ 92 /* Make sure we are running on right CPU */
94 saved_mask = current->cpus_allowed; 93 saved_mask = current->cpus_allowed;
95 retval = set_cpus_allowed_ptr(current, new_mask); 94 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
96 if (retval) 95 if (retval)
97 return -1; 96 return -1;
98 97
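The change above drops the temporary cpumask pointer and passes &cpumask_of_cpu(cpu) straight to set_cpus_allowed_ptr(). The surrounding save/run/restore pattern, sketched from the visible context (the restore happens further down, outside this hunk), looks roughly like this:

	cpumask_t saved_mask = current->cpus_allowed;

	/* pin the task to the CPU whose MWAIT/C-state support we probe */
	if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
		return -1;

	/* ... run the CPUID/MWAIT probing on the target CPU ... */

	/* restore the original affinity afterwards */
	set_cpus_allowed_ptr(current, &saved_mask);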
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index fa2161d5003b..426e5d91b63a 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -20,7 +20,7 @@ unsigned long acpi_realmode_flags;
20/* address in low memory of the wakeup routine. */ 20/* address in low memory of the wakeup routine. */
21static unsigned long acpi_realmode; 21static unsigned long acpi_realmode;
22 22
23#ifdef CONFIG_64BIT 23#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
24static char temp_stack[10240]; 24static char temp_stack[10240];
25#endif 25#endif
26 26
@@ -86,7 +86,7 @@ int acpi_save_state_mem(void)
86#endif /* !CONFIG_64BIT */ 86#endif /* !CONFIG_64BIT */
87 87
88 header->pmode_cr0 = read_cr0(); 88 header->pmode_cr0 = read_cr0();
89 header->pmode_cr4 = read_cr4(); 89 header->pmode_cr4 = read_cr4_safe();
90 header->realmode_flags = acpi_realmode_flags; 90 header->realmode_flags = acpi_realmode_flags;
91 header->real_magic = 0x12345678; 91 header->real_magic = 0x12345678;
92 92
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2763cb37b553..fb04e49776ba 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
145extern char __vsyscall_0; 145extern char __vsyscall_0;
146const unsigned char *const *find_nop_table(void) 146const unsigned char *const *find_nop_table(void)
147{ 147{
148 return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || 148 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
149 boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; 149 boot_cpu_has(X86_FEATURE_NOPL))
150 return p6_nops;
151 else
152 return k8_nops;
150} 153}
151 154
152#else /* CONFIG_X86_64 */ 155#else /* CONFIG_X86_64 */
153 156
154static const struct nop {
155 int cpuid;
156 const unsigned char *const *noptable;
157} noptypes[] = {
158 { X86_FEATURE_K8, k8_nops },
159 { X86_FEATURE_K7, k7_nops },
160 { X86_FEATURE_P4, p6_nops },
161 { X86_FEATURE_P3, p6_nops },
162 { -1, NULL }
163};
164
165const unsigned char *const *find_nop_table(void) 157const unsigned char *const *find_nop_table(void)
166{ 158{
167 const unsigned char *const *noptable = intel_nops; 159 if (boot_cpu_has(X86_FEATURE_K8))
168 int i; 160 return k8_nops;
169 161 else if (boot_cpu_has(X86_FEATURE_K7))
170 for (i = 0; noptypes[i].cpuid >= 0; i++) { 162 return k7_nops;
171 if (boot_cpu_has(noptypes[i].cpuid)) { 163 else if (boot_cpu_has(X86_FEATURE_NOPL))
172 noptable = noptypes[i].noptable; 164 return p6_nops;
173 break; 165 else
174 } 166 return intel_nops;
175 }
176 return noptable;
177} 167}
178 168
179#endif /* CONFIG_X86_64 */ 169#endif /* CONFIG_X86_64 */
@@ -241,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
241 continue; 231 continue;
242 if (*ptr > text_end) 232 if (*ptr > text_end)
243 continue; 233 continue;
244 text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ 234 /* turn DS segment override prefix into lock prefix */
235 text_poke(*ptr, ((unsigned char []){0xf0}), 1);
245 }; 236 };
246} 237}
247 238
248static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 239static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
249{ 240{
250 u8 **ptr; 241 u8 **ptr;
251 char insn[1];
252 242
253 if (noreplace_smp) 243 if (noreplace_smp)
254 return; 244 return;
255 245
256 add_nops(insn, 1);
257 for (ptr = start; ptr < end; ptr++) { 246 for (ptr = start; ptr < end; ptr++) {
258 if (*ptr < text) 247 if (*ptr < text)
259 continue; 248 continue;
260 if (*ptr > text_end) 249 if (*ptr > text_end)
261 continue; 250 continue;
262 text_poke(*ptr, insn, 1); 251 /* turn lock prefix into DS segment override prefix */
252 text_poke(*ptr, ((unsigned char []){0x3E}), 1);
263 }; 253 };
264} 254}
265 255
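Both SMP-alternative hunks above patch exactly one opcode byte at each recorded site, toggling between the LOCK prefix (0xf0) when running SMP and a harmless DS segment-override prefix (0x3e) on UP, instead of generating a NOP. A hedged sketch of that one-byte toggle, with a helper name of our own choosing:

	/* illustrative only: flip one prefix byte at a recorded patch site */
	static void toggle_smp_lock(u8 *site, bool smp)
	{
		unsigned char byte = smp ? 0xf0 /* lock */ : 0x3e /* ds override */;

		/* text_poke() performs the write into kernel text */
		text_poke(site, &byte, 1);
	}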
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index c25210e6ac88..34e4d112b1ef 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -29,13 +29,14 @@
29 29
30#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 30#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
31 31
32#define to_pages(addr, size) \
33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
34
35#define EXIT_LOOP_COUNT 10000000 32#define EXIT_LOOP_COUNT 10000000
36 33
37static DEFINE_RWLOCK(amd_iommu_devtable_lock); 34static DEFINE_RWLOCK(amd_iommu_devtable_lock);
38 35
36/* A list of preallocated protection domains */
37static LIST_HEAD(iommu_pd_list);
38static DEFINE_SPINLOCK(iommu_pd_list_lock);
39
39/* 40/*
40 * general struct to manage commands send to an IOMMU 41 * general struct to manage commands send to an IOMMU
41 */ 42 */
@@ -54,6 +55,102 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
54 55
55/**************************************************************************** 56/****************************************************************************
56 * 57 *
58 * Interrupt handling functions
59 *
60 ****************************************************************************/
61
62static void iommu_print_event(void *__evt)
63{
64 u32 *event = __evt;
65 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
66 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
67 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
68 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
69 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
70
71 printk(KERN_ERR "AMD IOMMU: Event logged [");
72
73 switch (type) {
74 case EVENT_TYPE_ILL_DEV:
75 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
76 "address=0x%016llx flags=0x%04x]\n",
77 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
78 address, flags);
79 break;
80 case EVENT_TYPE_IO_FAULT:
81 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
82 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
83 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
84 domid, address, flags);
85 break;
86 case EVENT_TYPE_DEV_TAB_ERR:
87 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
88 "address=0x%016llx flags=0x%04x]\n",
89 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
90 address, flags);
91 break;
92 case EVENT_TYPE_PAGE_TAB_ERR:
93 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
94 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
95 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
96 domid, address, flags);
97 break;
98 case EVENT_TYPE_ILL_CMD:
99 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
100 break;
101 case EVENT_TYPE_CMD_HARD_ERR:
102 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
103 "flags=0x%04x]\n", address, flags);
104 break;
105 case EVENT_TYPE_IOTLB_INV_TO:
106 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
107 "address=0x%016llx]\n",
108 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
109 address);
110 break;
111 case EVENT_TYPE_INV_DEV_REQ:
112 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
113 "address=0x%016llx flags=0x%04x]\n",
114 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
115 address, flags);
116 break;
117 default:
118 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
119 }
120}
121
122static void iommu_poll_events(struct amd_iommu *iommu)
123{
124 u32 head, tail;
125 unsigned long flags;
126
127 spin_lock_irqsave(&iommu->lock, flags);
128
129 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
130 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
131
132 while (head != tail) {
133 iommu_print_event(iommu->evt_buf + head);
134 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
135 }
136
137 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
138
139 spin_unlock_irqrestore(&iommu->lock, flags);
140}
141
142irqreturn_t amd_iommu_int_handler(int irq, void *data)
143{
144 struct amd_iommu *iommu;
145
146 list_for_each_entry(iommu, &amd_iommu_list, list)
147 iommu_poll_events(iommu);
148
149 return IRQ_HANDLED;
150}
151
152/****************************************************************************
153 *
57 * IOMMU command queuing functions 154 * IOMMU command queuing functions
58 * 155 *
59 ****************************************************************************/ 156 ****************************************************************************/
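The event log added above is a plain producer/consumer ring: the IOMMU advances the tail as it logs entries, the interrupt handler advances the head as it prints them, both modulo the buffer size, and the consumer writes the new head back so the hardware knows how much space has been freed. That consumption loop in isolation (names are illustrative, the logic mirrors iommu_poll_events() above):

	/* consume all pending fixed-size entries from a ring buffer */
	static u32 consume_ring(u8 *buf, u32 head, u32 tail,
				u32 entry_size, u32 buf_size,
				void (*handle)(void *entry))
	{
		while (head != tail) {
			handle(buf + head);
			head = (head + entry_size) % buf_size;
		}

		return head;	/* caller writes this back to the head register */
	}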
@@ -68,7 +165,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
68 u8 *target; 165 u8 *target;
69 166
70 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 167 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
71 target = (iommu->cmd_buf + tail); 168 target = iommu->cmd_buf + tail;
72 memcpy_toio(target, cmd, sizeof(*cmd)); 169 memcpy_toio(target, cmd, sizeof(*cmd));
73 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; 170 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
74 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 171 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
@@ -104,32 +201,39 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
104 */ 201 */
105static int iommu_completion_wait(struct amd_iommu *iommu) 202static int iommu_completion_wait(struct amd_iommu *iommu)
106{ 203{
107 int ret; 204 int ret = 0, ready = 0;
205 unsigned status = 0;
108 struct iommu_cmd cmd; 206 struct iommu_cmd cmd;
109 volatile u64 ready = 0; 207 unsigned long flags, i = 0;
110 unsigned long ready_phys = virt_to_phys(&ready);
111 unsigned long i = 0;
112 208
113 memset(&cmd, 0, sizeof(cmd)); 209 memset(&cmd, 0, sizeof(cmd));
114 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; 210 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
115 cmd.data[1] = upper_32_bits(ready_phys);
116 cmd.data[2] = 1; /* value written to 'ready' */
117 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); 211 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
118 212
119 iommu->need_sync = 0; 213 iommu->need_sync = 0;
120 214
121 ret = iommu_queue_command(iommu, &cmd); 215 spin_lock_irqsave(&iommu->lock, flags);
216
217 ret = __iommu_queue_command(iommu, &cmd);
122 218
123 if (ret) 219 if (ret)
124 return ret; 220 goto out;
125 221
126 while (!ready && (i < EXIT_LOOP_COUNT)) { 222 while (!ready && (i < EXIT_LOOP_COUNT)) {
127 ++i; 223 ++i;
128 cpu_relax(); 224 /* wait for the bit to become one */
225 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
226 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
129 } 227 }
130 228
229 /* set bit back to zero */
230 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
231 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
232
131 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) 233 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
132 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); 234 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
235out:
236 spin_unlock_irqrestore(&iommu->lock, flags);
133 237
134 return 0; 238 return 0;
135} 239}
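The rewritten completion wait no longer polls a flag in system memory; it queues a COMPLETION_WAIT command with the interrupt bit set and then spins on the MMIO status register until the completion bit appears, bounded by EXIT_LOOP_COUNT. A generic sketch of that bounded register poll, assuming the register is read with readl():

	/* poll an MMIO status register for a bit, with an iteration bound */
	static bool poll_status_bit(void __iomem *reg, u32 mask,
				    unsigned long max_loops)
	{
		unsigned long i;

		for (i = 0; i < max_loops; i++) {
			if (readl(reg) & mask)
				return true;
			cpu_relax();
		}

		return false;	/* timed out */
	}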
@@ -140,6 +244,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
140static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 244static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
141{ 245{
142 struct iommu_cmd cmd; 246 struct iommu_cmd cmd;
247 int ret;
143 248
144 BUG_ON(iommu == NULL); 249 BUG_ON(iommu == NULL);
145 250
@@ -147,9 +252,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
147 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 252 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
148 cmd.data[0] = devid; 253 cmd.data[0] = devid;
149 254
255 ret = iommu_queue_command(iommu, &cmd);
256
150 iommu->need_sync = 1; 257 iommu->need_sync = 1;
151 258
152 return iommu_queue_command(iommu, &cmd); 259 return ret;
153} 260}
154 261
155/* 262/*
@@ -159,21 +266,24 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
159 u64 address, u16 domid, int pde, int s) 266 u64 address, u16 domid, int pde, int s)
160{ 267{
161 struct iommu_cmd cmd; 268 struct iommu_cmd cmd;
269 int ret;
162 270
163 memset(&cmd, 0, sizeof(cmd)); 271 memset(&cmd, 0, sizeof(cmd));
164 address &= PAGE_MASK; 272 address &= PAGE_MASK;
165 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); 273 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
166 cmd.data[1] |= domid; 274 cmd.data[1] |= domid;
167 cmd.data[2] = LOW_U32(address); 275 cmd.data[2] = lower_32_bits(address);
168 cmd.data[3] = upper_32_bits(address); 276 cmd.data[3] = upper_32_bits(address);
169 if (s) /* size bit - we flush more than one 4kb page */ 277 if (s) /* size bit - we flush more than one 4kb page */
170 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 278 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
171 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ 279 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
172 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 280 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
173 281
282 ret = iommu_queue_command(iommu, &cmd);
283
174 iommu->need_sync = 1; 284 iommu->need_sync = 1;
175 285
176 return iommu_queue_command(iommu, &cmd); 286 return ret;
177} 287}
178 288
179/* 289/*
@@ -185,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
185 u64 address, size_t size) 295 u64 address, size_t size)
186{ 296{
187 int s = 0; 297 int s = 0;
188 unsigned pages = to_pages(address, size); 298 unsigned pages = iommu_num_pages(address, size);
189 299
190 address &= PAGE_MASK; 300 address &= PAGE_MASK;
191 301
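iommu_num_pages() is assumed here to compute the same value as the removed to_pages() macro: the number of 4 KiB pages an [addr, addr+size) range touches, including partially covered first and last pages. The computation in isolation:

	/* pages spanned by a byte range, e.g. addr=0x1ffc, size=8 -> 2 pages */
	static unsigned long num_pages(unsigned long addr, unsigned long size)
	{
		unsigned long offset = addr & ~PAGE_MASK;	/* offset into the first page */

		return (offset + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	}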
@@ -203,6 +313,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
203 return 0; 313 return 0;
204} 314}
205 315
316/* Flush the whole IO/TLB for a given protection domain */
317static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
318{
319 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
320
321 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
322}
323
206/**************************************************************************** 324/****************************************************************************
207 * 325 *
208 * The functions below are used to create the page table mappings for 326 * The functions below are used to create the page table mappings for
@@ -362,11 +480,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
362 * efficient allocator. 480 * efficient allocator.
363 * 481 *
364 ****************************************************************************/ 482 ****************************************************************************/
365static unsigned long dma_mask_to_pages(unsigned long mask)
366{
367 return (mask >> PAGE_SHIFT) +
368 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
369}
370 483
371/* 484/*
372 * The address allocator core function. 485 * The address allocator core function.
@@ -375,25 +488,31 @@ static unsigned long dma_mask_to_pages(unsigned long mask)
375 */ 488 */
376static unsigned long dma_ops_alloc_addresses(struct device *dev, 489static unsigned long dma_ops_alloc_addresses(struct device *dev,
377 struct dma_ops_domain *dom, 490 struct dma_ops_domain *dom,
378 unsigned int pages) 491 unsigned int pages,
492 unsigned long align_mask,
493 u64 dma_mask)
379{ 494{
380 unsigned long limit = dma_mask_to_pages(*dev->dma_mask); 495 unsigned long limit;
381 unsigned long address; 496 unsigned long address;
382 unsigned long size = dom->aperture_size >> PAGE_SHIFT;
383 unsigned long boundary_size; 497 unsigned long boundary_size;
384 498
385 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 499 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
386 PAGE_SIZE) >> PAGE_SHIFT; 500 PAGE_SIZE) >> PAGE_SHIFT;
387 limit = limit < size ? limit : size; 501 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
502 dma_mask >> PAGE_SHIFT);
388 503
389 if (dom->next_bit >= limit) 504 if (dom->next_bit >= limit) {
390 dom->next_bit = 0; 505 dom->next_bit = 0;
506 dom->need_flush = true;
507 }
391 508
392 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, 509 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
393 0 , boundary_size, 0); 510 0 , boundary_size, align_mask);
394 if (address == -1) 511 if (address == -1) {
395 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 512 address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
396 0, boundary_size, 0); 513 0, boundary_size, align_mask);
514 dom->need_flush = true;
515 }
397 516
398 if (likely(address != -1)) { 517 if (likely(address != -1)) {
399 dom->next_bit = address + pages; 518 dom->next_bit = address + pages;
@@ -459,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
459 if (start_page + pages > last_page) 578 if (start_page + pages > last_page)
460 pages = last_page - start_page; 579 pages = last_page - start_page;
461 580
462 set_bit_string(dom->bitmap, start_page, pages); 581 iommu_area_reserve(dom->bitmap, start_page, pages);
463} 582}
464 583
465static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) 584static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
@@ -553,12 +672,15 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
553 dma_dom->bitmap[0] = 1; 672 dma_dom->bitmap[0] = 1;
554 dma_dom->next_bit = 0; 673 dma_dom->next_bit = 0;
555 674
675 dma_dom->need_flush = false;
676 dma_dom->target_dev = 0xffff;
677
556 /* Initialize the exclusion range if necessary */ 678 /* Initialize the exclusion range if necessary */
557 if (iommu->exclusion_start && 679 if (iommu->exclusion_start &&
558 iommu->exclusion_start < dma_dom->aperture_size) { 680 iommu->exclusion_start < dma_dom->aperture_size) {
559 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 681 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
560 int pages = to_pages(iommu->exclusion_start, 682 int pages = iommu_num_pages(iommu->exclusion_start,
561 iommu->exclusion_length); 683 iommu->exclusion_length);
562 dma_ops_reserve_addresses(dma_dom, startpage, pages); 684 dma_ops_reserve_addresses(dma_dom, startpage, pages);
563 } 685 }
564 686
@@ -623,12 +745,13 @@ static void set_device_domain(struct amd_iommu *iommu,
623 745
624 u64 pte_root = virt_to_phys(domain->pt_root); 746 u64 pte_root = virt_to_phys(domain->pt_root);
625 747
626 pte_root |= (domain->mode & 0x07) << 9; 748 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
627 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; 749 << DEV_ENTRY_MODE_SHIFT;
750 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
628 751
629 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 752 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
630 amd_iommu_dev_table[devid].data[0] = pte_root; 753 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
631 amd_iommu_dev_table[devid].data[1] = pte_root >> 32; 754 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
632 amd_iommu_dev_table[devid].data[2] = domain->id; 755 amd_iommu_dev_table[devid].data[2] = domain->id;
633 756
634 amd_iommu_pd_table[devid] = domain; 757 amd_iommu_pd_table[devid] = domain;
@@ -646,6 +769,45 @@ static void set_device_domain(struct amd_iommu *iommu,
646 *****************************************************************************/ 769 *****************************************************************************/
647 770
648/* 771/*
772 * This function checks if the driver got a valid device from the caller to
773 * avoid dereferencing invalid pointers.
774 */
775static bool check_device(struct device *dev)
776{
777 if (!dev || !dev->dma_mask)
778 return false;
779
780 return true;
781}
782
783/*
784 * In this function the list of preallocated protection domains is traversed to
785 * find the domain for a specific device
786 */
787static struct dma_ops_domain *find_protection_domain(u16 devid)
788{
789 struct dma_ops_domain *entry, *ret = NULL;
790 unsigned long flags;
791
792 if (list_empty(&iommu_pd_list))
793 return NULL;
794
795 spin_lock_irqsave(&iommu_pd_list_lock, flags);
796
797 list_for_each_entry(entry, &iommu_pd_list, list) {
798 if (entry->target_dev == devid) {
799 ret = entry;
800 list_del(&ret->list);
801 break;
802 }
803 }
804
805 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
806
807 return ret;
808}
809
810/*
649 * In the dma_ops path we only have the struct device. This function 811 * In the dma_ops path we only have the struct device. This function
650 * finds the corresponding IOMMU, the protection domain and the 812 * finds the corresponding IOMMU, the protection domain and the
651 * requestor id for a given device. 813 * requestor id for a given device.
@@ -661,27 +823,30 @@ static int get_device_resources(struct device *dev,
661 struct pci_dev *pcidev; 823 struct pci_dev *pcidev;
662 u16 _bdf; 824 u16 _bdf;
663 825
664 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); 826 *iommu = NULL;
827 *domain = NULL;
828 *bdf = 0xffff;
829
830 if (dev->bus != &pci_bus_type)
831 return 0;
665 832
666 pcidev = to_pci_dev(dev); 833 pcidev = to_pci_dev(dev);
667 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); 834 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
668 835
669 /* device not translated by any IOMMU in the system? */ 836 /* device not translated by any IOMMU in the system? */
670 if (_bdf >= amd_iommu_last_bdf) { 837 if (_bdf > amd_iommu_last_bdf)
671 *iommu = NULL;
672 *domain = NULL;
673 *bdf = 0xffff;
674 return 0; 838 return 0;
675 }
676 839
677 *bdf = amd_iommu_alias_table[_bdf]; 840 *bdf = amd_iommu_alias_table[_bdf];
678 841
679 *iommu = amd_iommu_rlookup_table[*bdf]; 842 *iommu = amd_iommu_rlookup_table[*bdf];
680 if (*iommu == NULL) 843 if (*iommu == NULL)
681 return 0; 844 return 0;
682 dma_dom = (*iommu)->default_dom;
683 *domain = domain_for_device(*bdf); 845 *domain = domain_for_device(*bdf);
684 if (*domain == NULL) { 846 if (*domain == NULL) {
847 dma_dom = find_protection_domain(*bdf);
848 if (!dma_dom)
849 dma_dom = (*iommu)->default_dom;
685 *domain = &dma_dom->domain; 850 *domain = &dma_dom->domain;
686 set_device_domain(*iommu, *domain, *bdf); 851 set_device_domain(*iommu, *domain, *bdf);
687 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 852 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
@@ -760,17 +925,24 @@ static dma_addr_t __map_single(struct device *dev,
760 struct dma_ops_domain *dma_dom, 925 struct dma_ops_domain *dma_dom,
761 phys_addr_t paddr, 926 phys_addr_t paddr,
762 size_t size, 927 size_t size,
763 int dir) 928 int dir,
929 bool align,
930 u64 dma_mask)
764{ 931{
765 dma_addr_t offset = paddr & ~PAGE_MASK; 932 dma_addr_t offset = paddr & ~PAGE_MASK;
766 dma_addr_t address, start; 933 dma_addr_t address, start;
767 unsigned int pages; 934 unsigned int pages;
935 unsigned long align_mask = 0;
768 int i; 936 int i;
769 937
770 pages = to_pages(paddr, size); 938 pages = iommu_num_pages(paddr, size);
771 paddr &= PAGE_MASK; 939 paddr &= PAGE_MASK;
772 940
773 address = dma_ops_alloc_addresses(dev, dma_dom, pages); 941 if (align)
942 align_mask = (1UL << get_order(size)) - 1;
943
944 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
945 dma_mask);
774 if (unlikely(address == bad_dma_address)) 946 if (unlikely(address == bad_dma_address))
775 goto out; 947 goto out;
776 948
@@ -782,6 +954,12 @@ static dma_addr_t __map_single(struct device *dev,
782 } 954 }
783 address += offset; 955 address += offset;
784 956
957 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
958 iommu_flush_tlb(iommu, dma_dom->domain.id);
959 dma_dom->need_flush = false;
960 } else if (unlikely(iommu_has_npcache(iommu)))
961 iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
962
785out: 963out:
786 return address; 964 return address;
787} 965}
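When align is requested (the coherent-allocation path), the allocator is asked for a naturally aligned block: align_mask spans one allocation order of pages. For example, a 16 KiB request has get_order(16384) == 2, so align_mask == 0x3 and the returned page index is a multiple of 4. A standalone sketch of that computation (equivalent to the get_order()-based expression above for non-zero sizes):

	/* natural-alignment mask, in pages, for a mapping of 'size' bytes */
	static unsigned long natural_align_mask(size_t size)
	{
		unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long order = 0;

		while ((1UL << order) < pages)	/* same result as get_order(size) */
			order++;

		return (1UL << order) - 1;	/* e.g. 16 KiB -> order 2 -> mask 0x3 */
	}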
@@ -802,7 +980,7 @@ static void __unmap_single(struct amd_iommu *iommu,
802 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) 980 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
803 return; 981 return;
804 982
805 pages = to_pages(dma_addr, size); 983 pages = iommu_num_pages(dma_addr, size);
806 dma_addr &= PAGE_MASK; 984 dma_addr &= PAGE_MASK;
807 start = dma_addr; 985 start = dma_addr;
808 986
@@ -812,6 +990,9 @@ static void __unmap_single(struct amd_iommu *iommu,
812 } 990 }
813 991
814 dma_ops_free_addresses(dma_dom, dma_addr, pages); 992 dma_ops_free_addresses(dma_dom, dma_addr, pages);
993
994 if (amd_iommu_unmap_flush)
995 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
815} 996}
816 997
817/* 998/*
@@ -825,6 +1006,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
825 struct protection_domain *domain; 1006 struct protection_domain *domain;
826 u16 devid; 1007 u16 devid;
827 dma_addr_t addr; 1008 dma_addr_t addr;
1009 u64 dma_mask;
1010
1011 if (!check_device(dev))
1012 return bad_dma_address;
1013
1014 dma_mask = *dev->dma_mask;
828 1015
829 get_device_resources(dev, &iommu, &domain, &devid); 1016 get_device_resources(dev, &iommu, &domain, &devid);
830 1017
@@ -833,14 +1020,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
833 return (dma_addr_t)paddr; 1020 return (dma_addr_t)paddr;
834 1021
835 spin_lock_irqsave(&domain->lock, flags); 1022 spin_lock_irqsave(&domain->lock, flags);
836 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); 1023 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
1024 dma_mask);
837 if (addr == bad_dma_address) 1025 if (addr == bad_dma_address)
838 goto out; 1026 goto out;
839 1027
840 if (iommu_has_npcache(iommu)) 1028 if (unlikely(iommu->need_sync))
841 iommu_flush_pages(iommu, domain->id, addr, size);
842
843 if (iommu->need_sync)
844 iommu_completion_wait(iommu); 1029 iommu_completion_wait(iommu);
845 1030
846out: 1031out:
@@ -860,7 +1045,8 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
860 struct protection_domain *domain; 1045 struct protection_domain *domain;
861 u16 devid; 1046 u16 devid;
862 1047
863 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1048 if (!check_device(dev) ||
1049 !get_device_resources(dev, &iommu, &domain, &devid))
864 /* device not handled by any AMD IOMMU */ 1050 /* device not handled by any AMD IOMMU */
865 return; 1051 return;
866 1052
@@ -868,9 +1054,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
868 1054
869 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1055 __unmap_single(iommu, domain->priv, dma_addr, size, dir);
870 1056
871 iommu_flush_pages(iommu, domain->id, dma_addr, size); 1057 if (unlikely(iommu->need_sync))
872
873 if (iommu->need_sync)
874 iommu_completion_wait(iommu); 1058 iommu_completion_wait(iommu);
875 1059
876 spin_unlock_irqrestore(&domain->lock, flags); 1060 spin_unlock_irqrestore(&domain->lock, flags);
@@ -909,6 +1093,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
909 struct scatterlist *s; 1093 struct scatterlist *s;
910 phys_addr_t paddr; 1094 phys_addr_t paddr;
911 int mapped_elems = 0; 1095 int mapped_elems = 0;
1096 u64 dma_mask;
1097
1098 if (!check_device(dev))
1099 return 0;
1100
1101 dma_mask = *dev->dma_mask;
912 1102
913 get_device_resources(dev, &iommu, &domain, &devid); 1103 get_device_resources(dev, &iommu, &domain, &devid);
914 1104
@@ -921,19 +1111,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
921 paddr = sg_phys(s); 1111 paddr = sg_phys(s);
922 1112
923 s->dma_address = __map_single(dev, iommu, domain->priv, 1113 s->dma_address = __map_single(dev, iommu, domain->priv,
924 paddr, s->length, dir); 1114 paddr, s->length, dir, false,
1115 dma_mask);
925 1116
926 if (s->dma_address) { 1117 if (s->dma_address) {
927 s->dma_length = s->length; 1118 s->dma_length = s->length;
928 mapped_elems++; 1119 mapped_elems++;
929 } else 1120 } else
930 goto unmap; 1121 goto unmap;
931 if (iommu_has_npcache(iommu))
932 iommu_flush_pages(iommu, domain->id, s->dma_address,
933 s->dma_length);
934 } 1122 }
935 1123
936 if (iommu->need_sync) 1124 if (unlikely(iommu->need_sync))
937 iommu_completion_wait(iommu); 1125 iommu_completion_wait(iommu);
938 1126
939out: 1127out:
@@ -967,7 +1155,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
967 u16 devid; 1155 u16 devid;
968 int i; 1156 int i;
969 1157
970 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1158 if (!check_device(dev) ||
1159 !get_device_resources(dev, &iommu, &domain, &devid))
971 return; 1160 return;
972 1161
973 spin_lock_irqsave(&domain->lock, flags); 1162 spin_lock_irqsave(&domain->lock, flags);
@@ -975,12 +1164,10 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
975 for_each_sg(sglist, s, nelems, i) { 1164 for_each_sg(sglist, s, nelems, i) {
976 __unmap_single(iommu, domain->priv, s->dma_address, 1165 __unmap_single(iommu, domain->priv, s->dma_address,
977 s->dma_length, dir); 1166 s->dma_length, dir);
978 iommu_flush_pages(iommu, domain->id, s->dma_address,
979 s->dma_length);
980 s->dma_address = s->dma_length = 0; 1167 s->dma_address = s->dma_length = 0;
981 } 1168 }
982 1169
983 if (iommu->need_sync) 1170 if (unlikely(iommu->need_sync))
984 iommu_completion_wait(iommu); 1171 iommu_completion_wait(iommu);
985 1172
986 spin_unlock_irqrestore(&domain->lock, flags); 1173 spin_unlock_irqrestore(&domain->lock, flags);
@@ -998,25 +1185,33 @@ static void *alloc_coherent(struct device *dev, size_t size,
998 struct protection_domain *domain; 1185 struct protection_domain *domain;
999 u16 devid; 1186 u16 devid;
1000 phys_addr_t paddr; 1187 phys_addr_t paddr;
1188 u64 dma_mask = dev->coherent_dma_mask;
1189
1190 if (!check_device(dev))
1191 return NULL;
1192
1193 if (!get_device_resources(dev, &iommu, &domain, &devid))
1194 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
1001 1195
1196 flag |= __GFP_ZERO;
1002 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 1197 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1003 if (!virt_addr) 1198 if (!virt_addr)
1004 return 0; 1199 return 0;
1005 1200
1006 memset(virt_addr, 0, size);
1007 paddr = virt_to_phys(virt_addr); 1201 paddr = virt_to_phys(virt_addr);
1008 1202
1009 get_device_resources(dev, &iommu, &domain, &devid);
1010
1011 if (!iommu || !domain) { 1203 if (!iommu || !domain) {
1012 *dma_addr = (dma_addr_t)paddr; 1204 *dma_addr = (dma_addr_t)paddr;
1013 return virt_addr; 1205 return virt_addr;
1014 } 1206 }
1015 1207
1208 if (!dma_mask)
1209 dma_mask = *dev->dma_mask;
1210
1016 spin_lock_irqsave(&domain->lock, flags); 1211 spin_lock_irqsave(&domain->lock, flags);
1017 1212
1018 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1213 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1019 size, DMA_BIDIRECTIONAL); 1214 size, DMA_BIDIRECTIONAL, true, dma_mask);
1020 1215
1021 if (*dma_addr == bad_dma_address) { 1216 if (*dma_addr == bad_dma_address) {
1022 free_pages((unsigned long)virt_addr, get_order(size)); 1217 free_pages((unsigned long)virt_addr, get_order(size));
@@ -1024,10 +1219,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
1024 goto out; 1219 goto out;
1025 } 1220 }
1026 1221
1027 if (iommu_has_npcache(iommu)) 1222 if (unlikely(iommu->need_sync))
1028 iommu_flush_pages(iommu, domain->id, *dma_addr, size);
1029
1030 if (iommu->need_sync)
1031 iommu_completion_wait(iommu); 1223 iommu_completion_wait(iommu);
1032 1224
1033out: 1225out:
@@ -1038,8 +1230,6 @@ out:
1038 1230
1039/* 1231/*
1040 * The exported free_coherent function for dma_ops. 1232 * The exported free_coherent function for dma_ops.
1041 * FIXME: fix the generic x86 DMA layer so that it actually calls that
1042 * function.
1043 */ 1233 */
1044static void free_coherent(struct device *dev, size_t size, 1234static void free_coherent(struct device *dev, size_t size,
1045 void *virt_addr, dma_addr_t dma_addr) 1235 void *virt_addr, dma_addr_t dma_addr)
@@ -1049,6 +1239,9 @@ static void free_coherent(struct device *dev, size_t size,
1049 struct protection_domain *domain; 1239 struct protection_domain *domain;
1050 u16 devid; 1240 u16 devid;
1051 1241
1242 if (!check_device(dev))
1243 return;
1244
1052 get_device_resources(dev, &iommu, &domain, &devid); 1245 get_device_resources(dev, &iommu, &domain, &devid);
1053 1246
1054 if (!iommu || !domain) 1247 if (!iommu || !domain)
@@ -1057,9 +1250,8 @@ static void free_coherent(struct device *dev, size_t size,
1057 spin_lock_irqsave(&domain->lock, flags); 1250 spin_lock_irqsave(&domain->lock, flags);
1058 1251
1059 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 1252 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
1060 iommu_flush_pages(iommu, domain->id, dma_addr, size);
1061 1253
1062 if (iommu->need_sync) 1254 if (unlikely(iommu->need_sync))
1063 iommu_completion_wait(iommu); 1255 iommu_completion_wait(iommu);
1064 1256
1065 spin_unlock_irqrestore(&domain->lock, flags); 1257 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1069,6 +1261,30 @@ free_mem:
1069} 1261}
1070 1262
1071/* 1263/*
1264 * This function is called by the DMA layer to find out if we can handle a
1265 * particular device. It is part of the dma_ops.
1266 */
1267static int amd_iommu_dma_supported(struct device *dev, u64 mask)
1268{
1269 u16 bdf;
1270 struct pci_dev *pcidev;
1271
1272 /* No device or no PCI device */
1273 if (!dev || dev->bus != &pci_bus_type)
1274 return 0;
1275
1276 pcidev = to_pci_dev(dev);
1277
1278 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1279
1280 /* Out of our scope? */
1281 if (bdf > amd_iommu_last_bdf)
1282 return 0;
1283
1284 return 1;
1285}
1286
1287/*
1072 * The function for pre-allocating protection domains. 1288 * The function for pre-allocating protection domains.
1073 * 1289 *
1074 * If the driver core informs the DMA layer if a driver grabs a device 1290 * If the driver core informs the DMA layer if a driver grabs a device
@@ -1085,7 +1301,7 @@ void prealloc_protection_domains(void)
1085 1301
1086 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1302 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1087 devid = (dev->bus->number << 8) | dev->devfn; 1303 devid = (dev->bus->number << 8) | dev->devfn;
1088 if (devid >= amd_iommu_last_bdf) 1304 if (devid > amd_iommu_last_bdf)
1089 continue; 1305 continue;
1090 devid = amd_iommu_alias_table[devid]; 1306 devid = amd_iommu_alias_table[devid];
1091 if (domain_for_device(devid)) 1307 if (domain_for_device(devid))
@@ -1097,10 +1313,9 @@ void prealloc_protection_domains(void)
1097 if (!dma_dom) 1313 if (!dma_dom)
1098 continue; 1314 continue;
1099 init_unity_mappings_for_device(dma_dom, devid); 1315 init_unity_mappings_for_device(dma_dom, devid);
1100 set_device_domain(iommu, &dma_dom->domain, devid); 1316 dma_dom->target_dev = devid;
1101 printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", 1317
1102 dma_dom->domain.id); 1318 list_add_tail(&dma_dom->list, &iommu_pd_list);
1103 print_devid(devid, 1);
1104 } 1319 }
1105} 1320}
1106 1321
@@ -1111,6 +1326,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
1111 .unmap_single = unmap_single, 1326 .unmap_single = unmap_single,
1112 .map_sg = map_sg, 1327 .map_sg = map_sg,
1113 .unmap_sg = unmap_sg, 1328 .unmap_sg = unmap_sg,
1329 .dma_supported = amd_iommu_dma_supported,
1114}; 1330};
1115 1331
1116/* 1332/*
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c9d8ff2eb130..4cd8083c58be 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,6 +22,8 @@
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/list.h> 23#include <linux/list.h>
24#include <linux/sysdev.h> 24#include <linux/sysdev.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
25#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 28#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 29#include <asm/amd_iommu.h>
@@ -30,7 +32,6 @@
30/* 32/*
31 * definitions for the ACPI scanning code 33 * definitions for the ACPI scanning code
32 */ 34 */
33#define PCI_BUS(x) (((x) >> 8) & 0xff)
34#define IVRS_HEADER_LENGTH 48 35#define IVRS_HEADER_LENGTH 48
35 36
36#define ACPI_IVHD_TYPE 0x10 37#define ACPI_IVHD_TYPE 0x10
@@ -121,6 +122,7 @@ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
121 we find in ACPI */ 122 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 123unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */ 124int amd_iommu_isolate; /* if 1, device isolation is enabled */
125bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
124 126
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 127LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */ 128 system */
@@ -234,7 +236,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
234{ 236{
235 u32 ctrl; 237 u32 ctrl;
236 238
237 ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
238 ctrl &= ~(1 << bit); 240 ctrl &= ~(1 << bit);
239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 241 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
240} 242}
@@ -242,13 +244,23 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
242/* Function to enable the hardware */ 244/* Function to enable the hardware */
243void __init iommu_enable(struct amd_iommu *iommu) 245void __init iommu_enable(struct amd_iommu *iommu)
244{ 246{
245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 247 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
246 print_devid(iommu->devid, 0); 248 "at %02x:%02x.%x cap 0x%hx\n",
247 printk(" cap 0x%hx\n", iommu->cap_ptr); 249 iommu->dev->bus->number,
250 PCI_SLOT(iommu->dev->devfn),
251 PCI_FUNC(iommu->dev->devfn),
252 iommu->cap_ptr);
248 253
249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 254 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
250} 255}
251 256
257/* Function to enable IOMMU event logging and event interrupts */
258void __init iommu_enable_event_logging(struct amd_iommu *iommu)
259{
260 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
261 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
262}
263
252/* 264/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in 265 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one. 266 * the system has one.
@@ -286,6 +298,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
286 ****************************************************************************/ 298 ****************************************************************************/
287 299
288/* 300/*
301 * This function calculates the length of a given IVHD entry
302 */
303static inline int ivhd_entry_length(u8 *ivhd)
304{
305 return 0x04 << (*ivhd >> 6);
306}
307
308/*
289 * This function reads the last device id the IOMMU has to handle from the PCI 309 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU 310 * capability header for this IOMMU
291 */ 311 */
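ivhd_entry_length() decodes the size of an IVHD device entry from the top two bits of its type byte: 0x04 << (type >> 6) yields 4, 8, 16 or 32 bytes. For instance, types below 0x40 give 4-byte entries, while an alias entry of type 0x42 gives 0x42 >> 6 == 1 and therefore 8 bytes. As a standalone check of the encoding:

	/* entry length is encoded in bits 7:6 of the IVHD type byte */
	static int ivhd_len(unsigned char type)
	{
		return 0x04 << (type >> 6);	/* 00 -> 4, 01 -> 8, 10 -> 16, 11 -> 32 */
	}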
@@ -329,7 +349,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
329 default: 349 default:
330 break; 350 break;
331 } 351 }
332 p += 0x04 << (*p >> 6); 352 p += ivhd_entry_length(p);
333 } 353 }
334 354
335 WARN_ON(p != end); 355 WARN_ON(p != end);
@@ -414,7 +434,32 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
414 434
415static void __init free_command_buffer(struct amd_iommu *iommu) 435static void __init free_command_buffer(struct amd_iommu *iommu)
416{ 436{
417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); 437 free_pages((unsigned long)iommu->cmd_buf,
438 get_order(iommu->cmd_buf_size));
439}
440
441/* allocates the memory where the IOMMU will log its events to */
442static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
443{
444 u64 entry;
445 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
446 get_order(EVT_BUFFER_SIZE));
447
448 if (iommu->evt_buf == NULL)
449 return NULL;
450
451 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
452 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
453 &entry, sizeof(entry));
454
455 iommu->evt_buf_size = EVT_BUFFER_SIZE;
456
457 return iommu->evt_buf;
458}
459
460static void __init free_event_buffer(struct amd_iommu *iommu)
461{
462 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
418} 463}
419 464
420/* sets a specific bit in the device table entry. */ 465/* sets a specific bit in the device table entry. */
@@ -487,19 +532,21 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
487 */ 532 */
488static void __init init_iommu_from_pci(struct amd_iommu *iommu) 533static void __init init_iommu_from_pci(struct amd_iommu *iommu)
489{ 534{
490 int bus = PCI_BUS(iommu->devid);
491 int dev = PCI_SLOT(iommu->devid);
492 int fn = PCI_FUNC(iommu->devid);
493 int cap_ptr = iommu->cap_ptr; 535 int cap_ptr = iommu->cap_ptr;
494 u32 range; 536 u32 range, misc;
495 537
496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 538 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
539 &iommu->cap);
540 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
541 &range);
542 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
543 &misc);
497 544
498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
499 iommu->first_device = calc_devid(MMIO_GET_BUS(range), 545 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
500 MMIO_GET_FD(range)); 546 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range), 547 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range)); 548 MMIO_GET_LD(range));
549 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
503} 550}
504 551
505/* 552/*
@@ -604,7 +651,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
604 break; 651 break;
605 } 652 }
606 653
607 p += 0x04 << (e->type >> 6); 654 p += ivhd_entry_length(p);
608 } 655 }
609} 656}
610 657
@@ -622,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
622static void __init free_iommu_one(struct amd_iommu *iommu) 669static void __init free_iommu_one(struct amd_iommu *iommu)
623{ 670{
624 free_command_buffer(iommu); 671 free_command_buffer(iommu);
672 free_event_buffer(iommu);
625 iommu_unmap_mmio_space(iommu); 673 iommu_unmap_mmio_space(iommu);
626} 674}
627 675
@@ -649,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
649 /* 697 /*
650 * Copy data from ACPI table entry to the iommu struct 698 * Copy data from ACPI table entry to the iommu struct
651 */ 699 */
652 iommu->devid = h->devid; 700 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
701 if (!iommu->dev)
702 return 1;
703
653 iommu->cap_ptr = h->cap_ptr; 704 iommu->cap_ptr = h->cap_ptr;
705 iommu->pci_seg = h->pci_seg;
654 iommu->mmio_phys = h->mmio_phys; 706 iommu->mmio_phys = h->mmio_phys;
655 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); 707 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
656 if (!iommu->mmio_base) 708 if (!iommu->mmio_base)
@@ -661,11 +713,17 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
661 if (!iommu->cmd_buf) 713 if (!iommu->cmd_buf)
662 return -ENOMEM; 714 return -ENOMEM;
663 715
716 iommu->evt_buf = alloc_event_buffer(iommu);
717 if (!iommu->evt_buf)
718 return -ENOMEM;
719
720 iommu->int_enabled = false;
721
664 init_iommu_from_pci(iommu); 722 init_iommu_from_pci(iommu);
665 init_iommu_from_acpi(iommu, h); 723 init_iommu_from_acpi(iommu, h);
666 init_iommu_devices(iommu); 724 init_iommu_devices(iommu);
667 725
668 return 0; 726 return pci_enable_device(iommu->dev);
669} 727}
670 728
671/* 729/*
@@ -706,6 +764,95 @@ static int __init init_iommu_all(struct acpi_table_header *table)
706 764
707/**************************************************************************** 765/****************************************************************************
708 * 766 *
767 * The following functions initialize the MSI interrupts for all IOMMUs
768 * in the system. It's a bit challenging because there could be multiple
769 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
770 * pci_dev.
771 *
772 ****************************************************************************/
773
774static int __init iommu_setup_msix(struct amd_iommu *iommu)
775{
776 struct amd_iommu *curr;
777 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
778 int nvec = 0, i;
779
780 list_for_each_entry(curr, &amd_iommu_list, list) {
781 if (curr->dev == iommu->dev) {
782 entries[nvec].entry = curr->evt_msi_num;
783 entries[nvec].vector = 0;
784 curr->int_enabled = true;
785 nvec++;
786 }
787 }
788
789 if (pci_enable_msix(iommu->dev, entries, nvec)) {
790 pci_disable_msix(iommu->dev);
791 return 1;
792 }
793
794 for (i = 0; i < nvec; ++i) {
795 int r = request_irq(entries->vector, amd_iommu_int_handler,
796 IRQF_SAMPLE_RANDOM,
797 "AMD IOMMU",
798 NULL);
799 if (r)
800 goto out_free;
801 }
802
803 return 0;
804
805out_free:
806 for (i -= 1; i >= 0; --i)
807 free_irq(entries->vector, NULL);
808
809 pci_disable_msix(iommu->dev);
810
811 return 1;
812}
813
814static int __init iommu_setup_msi(struct amd_iommu *iommu)
815{
816 int r;
817 struct amd_iommu *curr;
818
819 list_for_each_entry(curr, &amd_iommu_list, list) {
820 if (curr->dev == iommu->dev)
821 curr->int_enabled = true;
822 }
823
824
825 if (pci_enable_msi(iommu->dev))
826 return 1;
827
828 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
829 IRQF_SAMPLE_RANDOM,
830 "AMD IOMMU",
831 NULL);
832
833 if (r) {
834 pci_disable_msi(iommu->dev);
835 return 1;
836 }
837
838 return 0;
839}
840
841static int __init iommu_init_msi(struct amd_iommu *iommu)
842{
843 if (iommu->int_enabled)
844 return 0;
845
846 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
847 return iommu_setup_msix(iommu);
848 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
849 return iommu_setup_msi(iommu);
850
851 return 1;
852}
853
854/****************************************************************************
855 *
709 * The next functions belong to the third pass of parsing the ACPI 856 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are 857 * table. In this last pass the memory mapping requirements are
711 * gathered (like exclusion and unity mapping ranges). 858 * gathered (like exclusion and unity mapping ranges).
@@ -732,7 +879,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
732 set_device_exclusion_range(m->devid, m); 879 set_device_exclusion_range(m->devid, m);
733 break; 880 break;
734 case ACPI_IVMD_TYPE_ALL: 881 case ACPI_IVMD_TYPE_ALL:
735 for (i = 0; i < amd_iommu_last_bdf; ++i) 882 for (i = 0; i <= amd_iommu_last_bdf; ++i)
736 set_device_exclusion_range(i, m); 883 set_device_exclusion_range(i, m);
737 break; 884 break;
738 case ACPI_IVMD_TYPE_RANGE: 885 case ACPI_IVMD_TYPE_RANGE:
@@ -801,6 +948,20 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
801} 948}
802 949
803/* 950/*
951 * Init the device table to not allow DMA access for devices and
952 * suppress all page faults
953 */
954static void init_device_table(void)
955{
956 u16 devid;
957
958 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
959 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
960 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
961 }
962}
963
964/*
804 * This function finally enables all IOMMUs found in the system after 965 * This function finally enables all IOMMUs found in the system after
805 * they have been initialized 966 * they have been initialized
806 */ 967 */
@@ -810,6 +971,8 @@ static void __init enable_iommus(void)
810 971
811 list_for_each_entry(iommu, &amd_iommu_list, list) { 972 list_for_each_entry(iommu, &amd_iommu_list, list) {
812 iommu_set_exclusion_range(iommu); 973 iommu_set_exclusion_range(iommu);
974 iommu_init_msi(iommu);
975 iommu_enable_event_logging(iommu);
813 iommu_enable(iommu); 976 iommu_enable(iommu);
814 } 977 }
815} 978}
@@ -931,10 +1094,13 @@ int __init amd_iommu_init(void)
931 if (amd_iommu_pd_alloc_bitmap == NULL) 1094 if (amd_iommu_pd_alloc_bitmap == NULL)
932 goto free; 1095 goto free;
933 1096
1097 /* init the device table */
1098 init_device_table();
1099
934 /* 1100 /*
935 * let all alias entries point to itself 1101 * let all alias entries point to itself
936 */ 1102 */
937 for (i = 0; i < amd_iommu_last_bdf; ++i) 1103 for (i = 0; i <= amd_iommu_last_bdf; ++i)
938 amd_iommu_alias_table[i] = i; 1104 amd_iommu_alias_table[i] = i;
939 1105
940 /* 1106 /*
@@ -954,15 +1120,15 @@ int __init amd_iommu_init(void)
954 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 1120 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
955 goto free; 1121 goto free;
956 1122
957 ret = amd_iommu_init_dma_ops(); 1123 ret = sysdev_class_register(&amd_iommu_sysdev_class);
958 if (ret) 1124 if (ret)
959 goto free; 1125 goto free;
960 1126
961 ret = sysdev_class_register(&amd_iommu_sysdev_class); 1127 ret = sysdev_register(&device_amd_iommu);
962 if (ret) 1128 if (ret)
963 goto free; 1129 goto free;
964 1130
965 ret = sysdev_register(&device_amd_iommu); 1131 ret = amd_iommu_init_dma_ops();
966 if (ret) 1132 if (ret)
967 goto free; 1133 goto free;
968 1134
@@ -977,11 +1143,17 @@ int __init amd_iommu_init(void)
977 else 1143 else
978 printk("disabled\n"); 1144 printk("disabled\n");
979 1145
1146 if (amd_iommu_unmap_flush)
1147 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
1148 else
1149 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
1150
980out: 1151out:
981 return ret; 1152 return ret;
982 1153
983free: 1154free:
984 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); 1155 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1156 get_order(MAX_DOMAIN_ID/8));
985 1157
986 free_pages((unsigned long)amd_iommu_pd_table, 1158 free_pages((unsigned long)amd_iommu_pd_table,
987 get_order(rlookup_table_size)); 1159 get_order(rlookup_table_size));
@@ -1039,8 +1211,10 @@ void __init amd_iommu_detect(void)
1039static int __init parse_amd_iommu_options(char *str) 1211static int __init parse_amd_iommu_options(char *str)
1040{ 1212{
1041 for (; *str; ++str) { 1213 for (; *str; ++str) {
1042 if (strcmp(str, "isolate") == 0) 1214 if (strncmp(str, "isolate", 7) == 0)
1043 amd_iommu_isolate = 1; 1215 amd_iommu_isolate = 1;
1216 if (strncmp(str, "fullflush", 11) == 0)
1217 amd_iommu_unmap_flush = true;
1044 } 1218 }
1045 1219
1046 return 1; 1220 return 1;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 44e21826db11..9a32b37ee2ee 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -455,11 +455,11 @@ out:
455 force_iommu || 455 force_iommu ||
456 valid_agp || 456 valid_agp ||
457 fallback_aper_force) { 457 fallback_aper_force) {
458 printk(KERN_ERR 458 printk(KERN_INFO
459 "Your BIOS doesn't leave a aperture memory hole\n"); 459 "Your BIOS doesn't leave a aperture memory hole\n");
460 printk(KERN_ERR 460 printk(KERN_INFO
461 "Please enable the IOMMU option in the BIOS setup\n"); 461 "Please enable the IOMMU option in the BIOS setup\n");
462 printk(KERN_ERR 462 printk(KERN_INFO
463 "This costs you %d MB of RAM\n", 463 "This costs you %d MB of RAM\n",
464 32 << fallback_aper_order); 464 32 << fallback_aper_order);
465 465
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 0059e7a8a9e6..21c831d96af3 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -60,10 +60,8 @@ unsigned long mp_lapic_addr;
60static int force_enable_local_apic; 60static int force_enable_local_apic;
61int disable_apic; 61int disable_apic;
62 62
63/* Local APIC timer verification ok */
64static int local_apic_timer_verify_ok;
65/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 63/* Disable local APIC timer from the kernel commandline or via dmi quirk */
66static int local_apic_timer_disabled; 64static int disable_apic_timer __cpuinitdata;
67/* Local APIC timer works in C2 */ 65/* Local APIC timer works in C2 */
68int local_apic_timer_c2_ok; 66int local_apic_timer_c2_ok;
69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 67EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -130,7 +128,11 @@ static inline int lapic_get_version(void)
130 */ 128 */
131static inline int lapic_is_integrated(void) 129static inline int lapic_is_integrated(void)
132{ 130{
131#ifdef CONFIG_X86_64
132 return 1;
133#else
133 return APIC_INTEGRATED(lapic_get_version()); 134 return APIC_INTEGRATED(lapic_get_version());
135#endif
134} 136}
135 137
136/* 138/*
@@ -145,13 +147,18 @@ static int modern_apic(void)
145 return lapic_get_version() >= 0x14; 147 return lapic_get_version() >= 0x14;
146} 148}
147 149
148void apic_wait_icr_idle(void) 150/*
151 * Paravirt kernels also might be using these below ops. So we still
152 * use generic apic_read()/apic_write(), which might be pointing to different
153 * ops in PARAVIRT case.
154 */
155void xapic_wait_icr_idle(void)
149{ 156{
150 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 157 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
151 cpu_relax(); 158 cpu_relax();
152} 159}
153 160
154u32 safe_apic_wait_icr_idle(void) 161u32 safe_xapic_wait_icr_idle(void)
155{ 162{
156 u32 send_status; 163 u32 send_status;
157 int timeout; 164 int timeout;
@@ -167,16 +174,48 @@ u32 safe_apic_wait_icr_idle(void)
167 return send_status; 174 return send_status;
168} 175}
169 176
177void xapic_icr_write(u32 low, u32 id)
178{
179 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
180 apic_write(APIC_ICR, low);
181}
182
183u64 xapic_icr_read(void)
184{
185 u32 icr1, icr2;
186
187 icr2 = apic_read(APIC_ICR2);
188 icr1 = apic_read(APIC_ICR);
189
190 return icr1 | ((u64)icr2 << 32);
191}
192
193static struct apic_ops xapic_ops = {
194 .read = native_apic_mem_read,
195 .write = native_apic_mem_write,
196 .icr_read = xapic_icr_read,
197 .icr_write = xapic_icr_write,
198 .wait_icr_idle = xapic_wait_icr_idle,
199 .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
200};
201
202struct apic_ops __read_mostly *apic_ops = &xapic_ops;
203EXPORT_SYMBOL_GPL(apic_ops);
204
170/** 205/**
171 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 206 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
172 */ 207 */
173void __cpuinit enable_NMI_through_LVT0(void) 208void __cpuinit enable_NMI_through_LVT0(void)
174{ 209{
175 unsigned int v = APIC_DM_NMI; 210 unsigned int v;
176 211
177 /* Level triggered for 82489DX */ 212 /* unmask and set to NMI */
213 v = APIC_DM_NMI;
214
215 /* Level triggered for 82489DX (32bit mode) */
178 if (!lapic_is_integrated()) 216 if (!lapic_is_integrated())
179 v |= APIC_LVT_LEVEL_TRIGGER; 217 v |= APIC_LVT_LEVEL_TRIGGER;
218
180 apic_write(APIC_LVT0, v); 219 apic_write(APIC_LVT0, v);
181} 220}
182 221
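The apic_ops structure introduced above turns the low-level APIC accessors into an indirection point, so a paravirt backend can install its own implementations instead of the memory-mapped xAPIC ones. A hedged sketch of how another backend would hook in; the replacement function names are hypothetical and would have to be provided by that backend:

	/* hypothetical alternative backend reusing the same ops layout */
	static struct apic_ops my_pv_apic_ops = {
		.read			= my_pv_apic_read,
		.write			= my_pv_apic_write,
		.icr_read		= my_pv_icr_read,
		.icr_write		= my_pv_icr_write,
		.wait_icr_idle		= my_pv_wait_icr_idle,
		.safe_wait_icr_idle	= my_pv_safe_wait_icr_idle,
	};

	/* installed early, before the APIC is first touched */
	apic_ops = &my_pv_apic_ops;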
@@ -193,9 +232,13 @@ int get_physical_broadcast(void)
193 */ 232 */
194int lapic_get_maxlvt(void) 233int lapic_get_maxlvt(void)
195{ 234{
196 unsigned int v = apic_read(APIC_LVR); 235 unsigned int v;
197 236
198 /* 82489DXs do not report # of LVT entries. */ 237 v = apic_read(APIC_LVR);
238 /*
239 * - we always have APIC integrated on 64bit mode
240 * - 82489DXs do not report # of LVT entries
241 */
199 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; 242 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
200} 243}
201 244
@@ -203,8 +246,12 @@ int lapic_get_maxlvt(void)
203 * Local APIC timer 246 * Local APIC timer
204 */ 247 */
205 248
206/* Clock divisor is set to 16 */ 249/* Clock divisor */
 250#ifdef CONFIG_X86_64
251#define APIC_DIVISOR 1
252#else
207#define APIC_DIVISOR 16 253#define APIC_DIVISOR 16
254#endif
208 255
209/* 256/*
210 * This function sets up the local APIC timer, with a timeout of 257 * This function sets up the local APIC timer, with a timeout of
@@ -212,6 +259,9 @@ int lapic_get_maxlvt(void)
212 * this function twice on the boot CPU, once with a bogus timeout 259 * this function twice on the boot CPU, once with a bogus timeout
213 * value, second time for real. The other (noncalibrating) CPUs 260 * value, second time for real. The other (noncalibrating) CPUs
214 * call this function only once, with the real, calibrated value. 261 * call this function only once, with the real, calibrated value.
262 *
263 * We do reads before writes even if unnecessary, to get around the
264 * P5 APIC double write bug.
215 */ 265 */
216static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 266static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
217{ 267{
@@ -233,14 +283,48 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
233 */ 283 */
234 tmp_value = apic_read(APIC_TDCR); 284 tmp_value = apic_read(APIC_TDCR);
235 apic_write(APIC_TDCR, 285 apic_write(APIC_TDCR,
236 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | 286 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
237 APIC_TDR_DIV_16); 287 APIC_TDR_DIV_16);
238 288
239 if (!oneshot) 289 if (!oneshot)
240 apic_write(APIC_TMICT, clocks / APIC_DIVISOR); 290 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
241} 291}
242 292
243/* 293/*
294 * Setup extended LVT, AMD specific (K8, family 10h)
295 *
296 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
297 * MCE interrupts are supported. Thus MCE offset must be set to 0.
298 *
299 * If mask=1, the LVT entry does not generate interrupts while mask=0
300 * enables the vector. See also the BKDGs.
301 */
302
303#define APIC_EILVT_LVTOFF_MCE 0
304#define APIC_EILVT_LVTOFF_IBS 1
305
306static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
307{
308 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
309 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
310
311 apic_write(reg, v);
312}
313
314u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
315{
316 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
317 return APIC_EILVT_LVTOFF_MCE;
318}
319
320u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
321{
322 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
323 return APIC_EILVT_LVTOFF_IBS;
324}
325EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
326
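As a usage sketch only: a consumer such as the oprofile IBS code would program the fixed IBS offset through the helper exported above, typically selecting NMI delivery and leaving the entry unmasked. The message-type value below is an assumption for illustration; a real caller should use the APIC_EILVT_MSG_* constants from the APIC headers.

/* illustrative caller, not part of this patch */
static void example_enable_ibs_lvt(void)
{
	u8 offset;

	/* msg_type 0x4 is assumed to mean NMI delivery; mask = 0 unmasks the entry */
	offset = setup_APIC_eilvt_ibs(0, 0x4, 0);
	printk(KERN_DEBUG "IBS routed via extended LVT offset %u\n", offset);
}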
327/*
244 * Program the next event, relative to now 328 * Program the next event, relative to now
245 */ 329 */
246static int lapic_next_event(unsigned long delta, 330static int lapic_next_event(unsigned long delta,
@@ -259,8 +343,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
259 unsigned long flags; 343 unsigned long flags;
260 unsigned int v; 344 unsigned int v;
261 345
262 /* Lapic used for broadcast ? */ 346 /* Lapic used as dummy for broadcast ? */
263 if (!local_apic_timer_verify_ok) 347 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
264 return; 348 return;
265 349
266 local_irq_save(flags); 350 local_irq_save(flags);
@@ -473,7 +557,7 @@ static int __init calibrate_APIC_clock(void)
473 return -1; 557 return -1;
474 } 558 }
475 559
476 local_apic_timer_verify_ok = 1; 560 levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
477 561
478 /* We trust the pm timer based calibration */ 562 /* We trust the pm timer based calibration */
479 if (!pm_referenced) { 563 if (!pm_referenced) {
@@ -507,11 +591,11 @@ static int __init calibrate_APIC_clock(void)
507 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) 591 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
508 apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); 592 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
509 else 593 else
510 local_apic_timer_verify_ok = 0; 594 levt->features |= CLOCK_EVT_FEAT_DUMMY;
511 } else 595 } else
512 local_irq_enable(); 596 local_irq_enable();
513 597
514 if (!local_apic_timer_verify_ok) { 598 if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
515 printk(KERN_WARNING 599 printk(KERN_WARNING
516 "APIC timer disabled due to verification failure.\n"); 600 "APIC timer disabled due to verification failure.\n");
517 return -1; 601 return -1;
@@ -533,7 +617,8 @@ void __init setup_boot_APIC_clock(void)
533 * timer as a dummy clock event source on SMP systems, so the 617 * timer as a dummy clock event source on SMP systems, so the
534 * broadcast mechanism is used. On UP systems simply ignore it. 618 * broadcast mechanism is used. On UP systems simply ignore it.
535 */ 619 */
536 if (local_apic_timer_disabled) { 620 if (disable_apic_timer) {
621 printk(KERN_INFO "Disabling APIC timer\n");
537 /* No broadcast on UP ! */ 622 /* No broadcast on UP ! */
538 if (num_possible_cpus() > 1) { 623 if (num_possible_cpus() > 1) {
539 lapic_clockevent.mult = 1; 624 lapic_clockevent.mult = 1;
@@ -602,7 +687,11 @@ static void local_apic_timer_interrupt(void)
602 /* 687 /*
603 * the NMI deadlock-detector uses this. 688 * the NMI deadlock-detector uses this.
604 */ 689 */
690#ifdef CONFIG_X86_64
691 add_pda(apic_timer_irqs, 1);
692#else
605 per_cpu(irq_stat, cpu).apic_timer_irqs++; 693 per_cpu(irq_stat, cpu).apic_timer_irqs++;
694#endif
606 695
607 evt->event_handler(evt); 696 evt->event_handler(evt);
608} 697}
@@ -642,39 +731,6 @@ int setup_profiling_timer(unsigned int multiplier)
642} 731}
643 732
644/* 733/*
645 * Setup extended LVT, AMD specific (K8, family 10h)
646 *
647 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
648 * MCE interrupts are supported. Thus MCE offset must be set to 0.
649 *
650 * If mask=1, the LVT entry does not generate interrupts while mask=0
651 * enables the vector. See also the BKDGs.
652 */
653
654#define APIC_EILVT_LVTOFF_MCE 0
655#define APIC_EILVT_LVTOFF_IBS 1
656
657static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
658{
659 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
660 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
661 apic_write(reg, v);
662}
663
664u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
665{
666 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
667 return APIC_EILVT_LVTOFF_MCE;
668}
669
670u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
671{
672 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
673 return APIC_EILVT_LVTOFF_IBS;
674}
675EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
676
677/*
678 * Local APIC start and shutdown 734 * Local APIC start and shutdown
679 */ 735 */
680 736
@@ -719,7 +775,7 @@ void clear_local_APIC(void)
719 } 775 }
720 776
721 /* lets not touch this if we didn't frob it */ 777 /* lets not touch this if we didn't frob it */
722#ifdef CONFIG_X86_MCE_P4THERMAL 778#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
723 if (maxlvt >= 5) { 779 if (maxlvt >= 5) {
724 v = apic_read(APIC_LVTTHMR); 780 v = apic_read(APIC_LVTTHMR);
725 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 781 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -736,10 +792,6 @@ void clear_local_APIC(void)
736 if (maxlvt >= 4) 792 if (maxlvt >= 4)
737 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 793 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
738 794
739#ifdef CONFIG_X86_MCE_P4THERMAL
740 if (maxlvt >= 5)
741 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
742#endif
743 /* Integrated APIC (!82489DX) ? */ 795 /* Integrated APIC (!82489DX) ? */
744 if (lapic_is_integrated()) { 796 if (lapic_is_integrated()) {
745 if (maxlvt > 3) 797 if (maxlvt > 3)
@@ -754,7 +806,7 @@ void clear_local_APIC(void)
754 */ 806 */
755void disable_local_APIC(void) 807void disable_local_APIC(void)
756{ 808{
757 unsigned long value; 809 unsigned int value;
758 810
759 clear_local_APIC(); 811 clear_local_APIC();
760 812
@@ -766,6 +818,7 @@ void disable_local_APIC(void)
766 value &= ~APIC_SPIV_APIC_ENABLED; 818 value &= ~APIC_SPIV_APIC_ENABLED;
767 apic_write(APIC_SPIV, value); 819 apic_write(APIC_SPIV, value);
768 820
821#ifdef CONFIG_X86_32
769 /* 822 /*
770 * When LAPIC was disabled by the BIOS and enabled by the kernel, 823 * When LAPIC was disabled by the BIOS and enabled by the kernel,
771 * restore the disabled state. 824 * restore the disabled state.
@@ -777,6 +830,7 @@ void disable_local_APIC(void)
777 l &= ~MSR_IA32_APICBASE_ENABLE; 830 l &= ~MSR_IA32_APICBASE_ENABLE;
778 wrmsr(MSR_IA32_APICBASE, l, h); 831 wrmsr(MSR_IA32_APICBASE, l, h);
779 } 832 }
833#endif
780} 834}
781 835
782/* 836/*
@@ -793,11 +847,15 @@ void lapic_shutdown(void)
793 return; 847 return;
794 848
795 local_irq_save(flags); 849 local_irq_save(flags);
796 clear_local_APIC();
797 850
798 if (enabled_via_apicbase) 851#ifdef CONFIG_X86_32
852 if (!enabled_via_apicbase)
853 clear_local_APIC();
854 else
855#endif
799 disable_local_APIC(); 856 disable_local_APIC();
800 857
858
801 local_irq_restore(flags); 859 local_irq_restore(flags);
802} 860}
803 861
@@ -842,6 +900,12 @@ int __init verify_local_APIC(void)
842 */ 900 */
843 reg0 = apic_read(APIC_ID); 901 reg0 = apic_read(APIC_ID);
844 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 902 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
903 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
904 reg1 = apic_read(APIC_ID);
905 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
906 apic_write(APIC_ID, reg0);
907 if (reg1 != (reg0 ^ APIC_ID_MASK))
908 return 0;
845 909
846 /* 910 /*
847 * The next two are just to see if we have sane values. 911 * The next two are just to see if we have sane values.
@@ -867,14 +931,15 @@ void __init sync_Arb_IDs(void)
867 */ 931 */
868 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 932 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
869 return; 933 return;
934
870 /* 935 /*
871 * Wait for idle. 936 * Wait for idle.
872 */ 937 */
873 apic_wait_icr_idle(); 938 apic_wait_icr_idle();
874 939
875 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 940 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
876 apic_write(APIC_ICR, 941 apic_write(APIC_ICR, APIC_DEST_ALLINC |
877 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); 942 APIC_INT_LEVELTRIG | APIC_DM_INIT);
878} 943}
879 944
880/* 945/*
@@ -882,7 +947,7 @@ void __init sync_Arb_IDs(void)
882 */ 947 */
883void __init init_bsp_APIC(void) 948void __init init_bsp_APIC(void)
884{ 949{
885 unsigned long value; 950 unsigned int value;
886 951
887 /* 952 /*
888 * Don't do the setup now if we have a SMP BIOS as the 953 * Don't do the setup now if we have a SMP BIOS as the
@@ -903,11 +968,13 @@ void __init init_bsp_APIC(void)
903 value &= ~APIC_VECTOR_MASK; 968 value &= ~APIC_VECTOR_MASK;
904 value |= APIC_SPIV_APIC_ENABLED; 969 value |= APIC_SPIV_APIC_ENABLED;
905 970
971#ifdef CONFIG_X86_32
906 /* This bit is reserved on P4/Xeon and should be cleared */ 972 /* This bit is reserved on P4/Xeon and should be cleared */
907 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 973 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
908 (boot_cpu_data.x86 == 15)) 974 (boot_cpu_data.x86 == 15))
909 value &= ~APIC_SPIV_FOCUS_DISABLED; 975 value &= ~APIC_SPIV_FOCUS_DISABLED;
910 else 976 else
977#endif
911 value |= APIC_SPIV_FOCUS_DISABLED; 978 value |= APIC_SPIV_FOCUS_DISABLED;
912 value |= SPURIOUS_APIC_VECTOR; 979 value |= SPURIOUS_APIC_VECTOR;
913 apic_write(APIC_SPIV, value); 980 apic_write(APIC_SPIV, value);
@@ -926,6 +993,16 @@ static void __cpuinit lapic_setup_esr(void)
926{ 993{
927 unsigned long oldvalue, value, maxlvt; 994 unsigned long oldvalue, value, maxlvt;
928 if (lapic_is_integrated() && !esr_disable) { 995 if (lapic_is_integrated() && !esr_disable) {
996 if (esr_disable) {
997 /*
998 * Something untraceable is creating bad interrupts on
999 * secondary quads ... for the moment, just leave the
1000 * ESR disabled - we can't do anything useful with the
1001 * errors anyway - mbligh
1002 */
1003 printk(KERN_INFO "Leaving ESR disabled.\n");
1004 return;
1005 }
929 /* !82489DX */ 1006 /* !82489DX */
930 maxlvt = lapic_get_maxlvt(); 1007 maxlvt = lapic_get_maxlvt();
931 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1008 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -946,16 +1023,7 @@ static void __cpuinit lapic_setup_esr(void)
946 "vector: 0x%08lx after: 0x%08lx\n", 1023 "vector: 0x%08lx after: 0x%08lx\n",
947 oldvalue, value); 1024 oldvalue, value);
948 } else { 1025 } else {
949 if (esr_disable) 1026 printk(KERN_INFO "No ESR for 82489DX.\n");
950 /*
951 * Something untraceable is creating bad interrupts on
952 * secondary quads ... for the moment, just leave the
953 * ESR disabled - we can't do anything useful with the
954 * errors anyway - mbligh
955 */
956 printk(KERN_INFO "Leaving ESR disabled.\n");
957 else
958 printk(KERN_INFO "No ESR for 82489DX.\n");
959 } 1027 }
960} 1028}
961 1029
@@ -1093,13 +1161,17 @@ void __cpuinit setup_local_APIC(void)
1093 1161
1094void __cpuinit end_local_APIC_setup(void) 1162void __cpuinit end_local_APIC_setup(void)
1095{ 1163{
1096 unsigned long value;
1097
1098 lapic_setup_esr(); 1164 lapic_setup_esr();
1099 /* Disable the local apic timer */ 1165
1100 value = apic_read(APIC_LVTT); 1166#ifdef CONFIG_X86_32
1101 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1167 {
1102 apic_write(APIC_LVTT, value); 1168 unsigned int value;
1169 /* Disable the local apic timer */
1170 value = apic_read(APIC_LVTT);
1171 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1172 apic_write(APIC_LVTT, value);
1173 }
1174#endif
1103 1175
1104 setup_apic_nmi_watchdog(NULL); 1176 setup_apic_nmi_watchdog(NULL);
1105 apic_pm_activate(); 1177 apic_pm_activate();
@@ -1209,7 +1281,7 @@ void __init init_apic_mappings(void)
1209 * default configuration (or the MP table is broken). 1281 * default configuration (or the MP table is broken).
1210 */ 1282 */
1211 if (boot_cpu_physical_apicid == -1U) 1283 if (boot_cpu_physical_apicid == -1U)
1212 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1284 boot_cpu_physical_apicid = read_apic_id();
1213 1285
1214} 1286}
1215 1287
@@ -1246,7 +1318,7 @@ int __init APIC_init_uniprocessor(void)
1246 * might be zero if read from MP tables. Get it from LAPIC. 1318 * might be zero if read from MP tables. Get it from LAPIC.
1247 */ 1319 */
1248#ifdef CONFIG_CRASH_DUMP 1320#ifdef CONFIG_CRASH_DUMP
1249 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1321 boot_cpu_physical_apicid = read_apic_id();
1250#endif 1322#endif
1251 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 1323 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1252 1324
@@ -1325,59 +1397,12 @@ void smp_error_interrupt(struct pt_regs *regs)
1325 irq_exit(); 1397 irq_exit();
1326} 1398}
1327 1399
1328#ifdef CONFIG_SMP
1329void __init smp_intr_init(void)
1330{
1331 /*
1332 * IRQ0 must be given a fixed assignment and initialized,
1333 * because it's used before the IO-APIC is set up.
1334 */
1335 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
1336
1337 /*
1338 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1339 * IPI, driven by wakeup.
1340 */
1341 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1342
1343 /* IPI for invalidation */
1344 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1345
1346 /* IPI for generic function call */
1347 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1348
1349 /* IPI for single call function */
1350 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
1351 call_function_single_interrupt);
1352}
1353#endif
1354
1355/*
1356 * Initialize APIC interrupts
1357 */
1358void __init apic_intr_init(void)
1359{
1360#ifdef CONFIG_SMP
1361 smp_intr_init();
1362#endif
1363 /* self generated IPI for local APIC timer */
1364 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1365
1366 /* IPI vectors for APIC spurious and error interrupts */
1367 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1368 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1369
1370 /* thermal monitor LVT interrupt */
1371#ifdef CONFIG_X86_MCE_P4THERMAL
1372 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1373#endif
1374}
1375
1376/** 1400/**
1377 * connect_bsp_APIC - attach the APIC to the interrupt system 1401 * connect_bsp_APIC - attach the APIC to the interrupt system
1378 */ 1402 */
1379void __init connect_bsp_APIC(void) 1403void __init connect_bsp_APIC(void)
1380{ 1404{
1405#ifdef CONFIG_X86_32
1381 if (pic_mode) { 1406 if (pic_mode) {
1382 /* 1407 /*
1383 * Do not trust the local APIC being empty at bootup. 1408 * Do not trust the local APIC being empty at bootup.
@@ -1392,6 +1417,7 @@ void __init connect_bsp_APIC(void)
1392 outb(0x70, 0x22); 1417 outb(0x70, 0x22);
1393 outb(0x01, 0x23); 1418 outb(0x01, 0x23);
1394 } 1419 }
1420#endif
1395 enable_apic_mode(); 1421 enable_apic_mode();
1396} 1422}
1397 1423
@@ -1404,6 +1430,9 @@ void __init connect_bsp_APIC(void)
1404 */ 1430 */
1405void disconnect_bsp_APIC(int virt_wire_setup) 1431void disconnect_bsp_APIC(int virt_wire_setup)
1406{ 1432{
1433 unsigned int value;
1434
1435#ifdef CONFIG_X86_32
1407 if (pic_mode) { 1436 if (pic_mode) {
1408 /* 1437 /*
1409 * Put the board back into PIC mode (has an effect only on 1438 * Put the board back into PIC mode (has an effect only on
@@ -1415,56 +1444,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1415 "entering PIC mode.\n"); 1444 "entering PIC mode.\n");
1416 outb(0x70, 0x22); 1445 outb(0x70, 0x22);
1417 outb(0x00, 0x23); 1446 outb(0x00, 0x23);
1418 } else { 1447 return;
1419 /* Go back to Virtual Wire compatibility mode */ 1448 }
1420 unsigned long value; 1449#endif
1421 1450
1422 /* For the spurious interrupt use vector F, and enable it */ 1451 /* Go back to Virtual Wire compatibility mode */
1423 value = apic_read(APIC_SPIV);
1424 value &= ~APIC_VECTOR_MASK;
1425 value |= APIC_SPIV_APIC_ENABLED;
1426 value |= 0xf;
1427 apic_write(APIC_SPIV, value);
1428 1452
1429 if (!virt_wire_setup) { 1453 /* For the spurious interrupt use vector F, and enable it */
1430 /* 1454 value = apic_read(APIC_SPIV);
1431 * For LVT0 make it edge triggered, active high, 1455 value &= ~APIC_VECTOR_MASK;
1432 * external and enabled 1456 value |= APIC_SPIV_APIC_ENABLED;
1433 */ 1457 value |= 0xf;
1434 value = apic_read(APIC_LVT0); 1458 apic_write(APIC_SPIV, value);
1435 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1436 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1437 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1438 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1439 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1440 apic_write(APIC_LVT0, value);
1441 } else {
1442 /* Disable LVT0 */
1443 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1444 }
1445 1459
1460 if (!virt_wire_setup) {
1446 /* 1461 /*
1447 * For LVT1 make it edge triggered, active high, nmi and 1462 * For LVT0 make it edge triggered, active high,
1448 * enabled 1463 * external and enabled
1449 */ 1464 */
1450 value = apic_read(APIC_LVT1); 1465 value = apic_read(APIC_LVT0);
1451 value &= ~( 1466 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1452 APIC_MODE_MASK | APIC_SEND_PENDING |
1453 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1467 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1454 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1468 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1455 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1469 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1456 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1470 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1457 apic_write(APIC_LVT1, value); 1471 apic_write(APIC_LVT0, value);
1472 } else {
1473 /* Disable LVT0 */
1474 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1458 } 1475 }
1459}
1460 1476
1461unsigned int __cpuinitdata maxcpus = NR_CPUS; 1477 /*
1478 * For LVT1 make it edge triggered, active high,
1479 * nmi and enabled
1480 */
1481 value = apic_read(APIC_LVT1);
1482 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1483 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1484 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1485 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1486 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1487 apic_write(APIC_LVT1, value);
1488}
1462 1489
1463void __cpuinit generic_processor_info(int apicid, int version) 1490void __cpuinit generic_processor_info(int apicid, int version)
1464{ 1491{
1465 int cpu; 1492 int cpu;
1466 cpumask_t tmp_map; 1493 cpumask_t tmp_map;
1467 physid_mask_t phys_cpu;
1468 1494
1469 /* 1495 /*
1470 * Validate version 1496 * Validate version
@@ -1477,36 +1503,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1477 } 1503 }
1478 apic_version[apicid] = version; 1504 apic_version[apicid] = version;
1479 1505
1480 phys_cpu = apicid_to_cpu_present(apicid);
1481 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
1482
1483 if (num_processors >= NR_CPUS) { 1506 if (num_processors >= NR_CPUS) {
1484 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1507 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1485 " Processor ignored.\n", NR_CPUS); 1508 " Processor ignored.\n", NR_CPUS);
1486 return; 1509 return;
1487 } 1510 }
1488 1511
1489 if (num_processors >= maxcpus) {
1490 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
1491 " Processor ignored.\n", maxcpus);
1492 return;
1493 }
1494
1495 num_processors++; 1512 num_processors++;
1496 cpus_complement(tmp_map, cpu_present_map); 1513 cpus_complement(tmp_map, cpu_present_map);
1497 cpu = first_cpu(tmp_map); 1514 cpu = first_cpu(tmp_map);
1498 1515
1499 if (apicid == boot_cpu_physical_apicid) 1516 physid_set(apicid, phys_cpu_present_map);
1517 if (apicid == boot_cpu_physical_apicid) {
1500 /* 1518 /*
1501 * x86_bios_cpu_apicid is required to have processors listed 1519 * x86_bios_cpu_apicid is required to have processors listed
1502 * in same order as logical cpu numbers. Hence the first 1520 * in same order as logical cpu numbers. Hence the first
1503 * entry is BSP, and so on. 1521 * entry is BSP, and so on.
1504 */ 1522 */
1505 cpu = 0; 1523 cpu = 0;
1506 1524 }
1507 if (apicid > max_physical_apicid) 1525 if (apicid > max_physical_apicid)
1508 max_physical_apicid = apicid; 1526 max_physical_apicid = apicid;
1509 1527
1528#ifdef CONFIG_X86_32
1510 /* 1529 /*
1511 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1530 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1512 * but we need to work other dependencies like SMP_SUSPEND etc 1531 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1526,7 +1545,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1526 def_to_bigsmp = 1; 1545 def_to_bigsmp = 1;
1527 } 1546 }
1528 } 1547 }
1529#ifdef CONFIG_SMP 1548#endif
1549
1550#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1530 /* are we being called early in kernel startup? */ 1551 /* are we being called early in kernel startup? */
1531 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1552 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1532 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1553 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1539,6 +1560,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
1539 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1560 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1540 } 1561 }
1541#endif 1562#endif
1563
1542 cpu_set(cpu, cpu_possible_map); 1564 cpu_set(cpu, cpu_possible_map);
1543 cpu_set(cpu, cpu_present_map); 1565 cpu_set(cpu, cpu_present_map);
1544} 1566}
@@ -1549,6 +1571,11 @@ void __cpuinit generic_processor_info(int apicid, int version)
1549#ifdef CONFIG_PM 1571#ifdef CONFIG_PM
1550 1572
1551static struct { 1573static struct {
1574 /*
1575 * 'active' is true if the local APIC was enabled by us and
1576 * not the BIOS; this signifies that we are also responsible
1577 * for disabling it before entering apm/acpi suspend
1578 */
1552 int active; 1579 int active;
1553 /* r/w apic fields */ 1580 /* r/w apic fields */
1554 unsigned int apic_id; 1581 unsigned int apic_id;
@@ -1589,7 +1616,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1589 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1616 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1590 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1617 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1591 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1618 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1592#ifdef CONFIG_X86_MCE_P4THERMAL 1619#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1593 if (maxlvt >= 5) 1620 if (maxlvt >= 5)
1594 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1621 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1595#endif 1622#endif
@@ -1613,16 +1640,23 @@ static int lapic_resume(struct sys_device *dev)
1613 1640
1614 local_irq_save(flags); 1641 local_irq_save(flags);
1615 1642
1616 /* 1643#ifdef CONFIG_X86_64
1617 * Make sure the APICBASE points to the right address 1644 if (x2apic)
1618 * 1645 enable_x2apic();
1619 * FIXME! This will be wrong if we ever support suspend on 1646 else
1620 * SMP! We'll need to do this as part of the CPU restore! 1647#endif
1621 */ 1648 {
1622 rdmsr(MSR_IA32_APICBASE, l, h); 1649 /*
1623 l &= ~MSR_IA32_APICBASE_BASE; 1650 * Make sure the APICBASE points to the right address
1624 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1651 *
1625 wrmsr(MSR_IA32_APICBASE, l, h); 1652 * FIXME! This will be wrong if we ever support suspend on
1653 * SMP! We'll need to do this as part of the CPU restore!
1654 */
1655 rdmsr(MSR_IA32_APICBASE, l, h);
1656 l &= ~MSR_IA32_APICBASE_BASE;
1657 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1658 wrmsr(MSR_IA32_APICBASE, l, h);
1659 }
1626 1660
1627 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1661 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1628 apic_write(APIC_ID, apic_pm_state.apic_id); 1662 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1632,7 +1666,7 @@ static int lapic_resume(struct sys_device *dev)
1632 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1666 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1633 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1667 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1634 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1668 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1635#ifdef CONFIG_X86_MCE_P4THERMAL 1669#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1636 if (maxlvt >= 5) 1670 if (maxlvt >= 5)
1637 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1671 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1638#endif 1672#endif
@@ -1646,7 +1680,9 @@ static int lapic_resume(struct sys_device *dev)
1646 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1680 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1647 apic_write(APIC_ESR, 0); 1681 apic_write(APIC_ESR, 0);
1648 apic_read(APIC_ESR); 1682 apic_read(APIC_ESR);
1683
1649 local_irq_restore(flags); 1684 local_irq_restore(flags);
1685
1650 return 0; 1686 return 0;
1651} 1687}
1652 1688
@@ -1702,20 +1738,20 @@ static int __init parse_lapic(char *arg)
1702} 1738}
1703early_param("lapic", parse_lapic); 1739early_param("lapic", parse_lapic);
1704 1740
1705static int __init parse_nolapic(char *arg) 1741static int __init setup_disableapic(char *arg)
1706{ 1742{
1707 disable_apic = 1; 1743 disable_apic = 1;
1708 setup_clear_cpu_cap(X86_FEATURE_APIC); 1744 setup_clear_cpu_cap(X86_FEATURE_APIC);
1709 return 0; 1745 return 0;
1710} 1746}
1711early_param("nolapic", parse_nolapic); 1747early_param("disableapic", setup_disableapic);
1712 1748
1713static int __init parse_disable_lapic_timer(char *arg) 1749/* same as disableapic, for compatibility */
1750static int __init setup_nolapic(char *arg)
1714{ 1751{
1715 local_apic_timer_disabled = 1; 1752 return setup_disableapic(arg);
1716 return 0;
1717} 1753}
1718early_param("nolapic_timer", parse_disable_lapic_timer); 1754early_param("nolapic", setup_nolapic);
1719 1755
1720static int __init parse_lapic_timer_c2_ok(char *arg) 1756static int __init parse_lapic_timer_c2_ok(char *arg)
1721{ 1757{
@@ -1724,15 +1760,44 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1724} 1760}
1725early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1761early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1726 1762
1727static int __init apic_set_verbosity(char *str) 1763static int __init parse_disable_apic_timer(char *arg)
1764{
1765 disable_apic_timer = 1;
1766 return 0;
1767}
1768early_param("noapictimer", parse_disable_apic_timer);
1769
1770static int __init parse_nolapic_timer(char *arg)
1728{ 1771{
1729 if (strcmp("debug", str) == 0) 1772 disable_apic_timer = 1;
1773 return 0;
1774}
1775early_param("nolapic_timer", parse_nolapic_timer);
1776
1777static int __init apic_set_verbosity(char *arg)
1778{
1779 if (!arg) {
1780#ifdef CONFIG_X86_64
1781 skip_ioapic_setup = 0;
1782 ioapic_force = 1;
1783 return 0;
1784#endif
1785 return -EINVAL;
1786 }
1787
1788 if (strcmp("debug", arg) == 0)
1730 apic_verbosity = APIC_DEBUG; 1789 apic_verbosity = APIC_DEBUG;
1731 else if (strcmp("verbose", str) == 0) 1790 else if (strcmp("verbose", arg) == 0)
1732 apic_verbosity = APIC_VERBOSE; 1791 apic_verbosity = APIC_VERBOSE;
1733 return 1; 1792 else {
1793 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1794 " use apic=verbose or apic=debug\n", arg);
1795 return -EINVAL;
1796 }
1797
1798 return 0;
1734} 1799}
1735__setup("apic=", apic_set_verbosity); 1800early_param("apic", apic_set_verbosity);
1736 1801
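For reference, the early parameters handled in this hunk are given on the kernel command line; illustrative invocations (not taken from this patch):

	apic=verbose        more APIC bootup messages
	apic=debug          maximum APIC debugging output
	disableapic         turn the local APIC off (nolapic is kept as an alias)
	noapictimer         keep the APIC but disable its timer (nolapic_timer is equivalent)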
1737static int __init lapic_insert_resource(void) 1802static int __init lapic_insert_resource(void)
1738{ 1803{
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index e571351f2a93..94ddb69ae15e 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -27,6 +27,7 @@
27#include <linux/clockchips.h> 27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h> 28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/dmar.h>
30 31
31#include <asm/atomic.h> 32#include <asm/atomic.h>
32#include <asm/smp.h> 33#include <asm/smp.h>
@@ -39,13 +40,20 @@
39#include <asm/proto.h> 40#include <asm/proto.h>
40#include <asm/timex.h> 41#include <asm/timex.h>
41#include <asm/apic.h> 42#include <asm/apic.h>
43#include <asm/i8259.h>
42 44
43#include <mach_ipi.h> 45#include <mach_ipi.h>
44#include <mach_apic.h> 46#include <mach_apic.h>
45 47
48/* Disable local APIC timer from the kernel commandline or via dmi quirk */
46static int disable_apic_timer __cpuinitdata; 49static int disable_apic_timer __cpuinitdata;
47static int apic_calibrate_pmtmr __initdata; 50static int apic_calibrate_pmtmr __initdata;
48int disable_apic; 51int disable_apic;
52int disable_x2apic;
53int x2apic;
54
55/* x2apic enabled before OS handover */
56int x2apic_preenabled;
49 57
50/* Local APIC timer works in C2 */ 58/* Local APIC timer works in C2 */
51int local_apic_timer_c2_ok; 59int local_apic_timer_c2_ok;
@@ -73,6 +81,9 @@ static void lapic_timer_setup(enum clock_event_mode mode,
73static void lapic_timer_broadcast(cpumask_t mask); 81static void lapic_timer_broadcast(cpumask_t mask);
74static void apic_pm_activate(void); 82static void apic_pm_activate(void);
75 83
84/*
85 * The local apic timer can be used for any function which is CPU local.
86 */
76static struct clock_event_device lapic_clockevent = { 87static struct clock_event_device lapic_clockevent = {
77 .name = "lapic", 88 .name = "lapic",
78 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT 89 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
@@ -90,7 +101,6 @@ static unsigned long apic_phys;
90 101
91unsigned long mp_lapic_addr; 102unsigned long mp_lapic_addr;
92 103
93unsigned int __cpuinitdata maxcpus = NR_CPUS;
94/* 104/*
95 * Get the LAPIC version 105 * Get the LAPIC version
96 */ 106 */
@@ -100,11 +110,15 @@ static inline int lapic_get_version(void)
100} 110}
101 111
102/* 112/*
103 * Check, if the APIC is integrated or a seperate chip 113 * Check, if the APIC is integrated or a separate chip
104 */ 114 */
105static inline int lapic_is_integrated(void) 115static inline int lapic_is_integrated(void)
106{ 116{
117#ifdef CONFIG_X86_64
107 return 1; 118 return 1;
119#else
120 return APIC_INTEGRATED(lapic_get_version());
121#endif
108} 122}
109 123
110/* 124/*
@@ -119,13 +133,18 @@ static int modern_apic(void)
119 return lapic_get_version() >= 0x14; 133 return lapic_get_version() >= 0x14;
120} 134}
121 135
122void apic_wait_icr_idle(void) 136/*
137 * Paravirt kernels also might be using these below ops. So we still
138 * use generic apic_read()/apic_write(), which might be pointing to different
139 * ops in PARAVIRT case.
140 */
141void xapic_wait_icr_idle(void)
123{ 142{
124 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 143 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
125 cpu_relax(); 144 cpu_relax();
126} 145}
127 146
128u32 safe_apic_wait_icr_idle(void) 147u32 safe_xapic_wait_icr_idle(void)
129{ 148{
130 u32 send_status; 149 u32 send_status;
131 int timeout; 150 int timeout;
@@ -141,6 +160,68 @@ u32 safe_apic_wait_icr_idle(void)
141 return send_status; 160 return send_status;
142} 161}
143 162
163void xapic_icr_write(u32 low, u32 id)
164{
165 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
166 apic_write(APIC_ICR, low);
167}
168
169u64 xapic_icr_read(void)
170{
171 u32 icr1, icr2;
172
173 icr2 = apic_read(APIC_ICR2);
174 icr1 = apic_read(APIC_ICR);
175
176 return icr1 | ((u64)icr2 << 32);
177}
178
179static struct apic_ops xapic_ops = {
180 .read = native_apic_mem_read,
181 .write = native_apic_mem_write,
182 .icr_read = xapic_icr_read,
183 .icr_write = xapic_icr_write,
184 .wait_icr_idle = xapic_wait_icr_idle,
185 .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
186};
187
188struct apic_ops __read_mostly *apic_ops = &xapic_ops;
189EXPORT_SYMBOL_GPL(apic_ops);
190
191static void x2apic_wait_icr_idle(void)
192{
193 /* no need to wait for icr idle in x2apic */
194 return;
195}
196
197static u32 safe_x2apic_wait_icr_idle(void)
198{
199 /* no need to wait for icr idle in x2apic */
200 return 0;
201}
202
203void x2apic_icr_write(u32 low, u32 id)
204{
205 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
206}
207
208u64 x2apic_icr_read(void)
209{
210 unsigned long val;
211
212 rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
213 return val;
214}
215
216static struct apic_ops x2apic_ops = {
217 .read = native_apic_msr_read,
218 .write = native_apic_msr_write,
219 .icr_read = x2apic_icr_read,
220 .icr_write = x2apic_icr_write,
221 .wait_icr_idle = x2apic_wait_icr_idle,
222 .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
223};
224
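The x2apic_ops table above relies on the architectural mapping of each xAPIC MMIO register at offset reg to the MSR APIC_BASE_MSR + (reg >> 4) once x2APIC mode is enabled, as the ICR helpers already show. A minimal sketch of a 32-bit register read under that scheme, using the rdmsr() helper seen elsewhere in this file (the function name is made up for illustration):

static u32 example_x2apic_reg_read(u32 reg)
{
	u32 lo, hi;

	/* e.g. reg == APIC_ID (0x20) maps to MSR 0x802 */
	rdmsr(APIC_BASE_MSR + (reg >> 4), lo, hi);
	return lo;
}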
144/** 225/**
145 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 226 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
146 */ 227 */
@@ -150,6 +231,11 @@ void __cpuinit enable_NMI_through_LVT0(void)
150 231
151 /* unmask and set to NMI */ 232 /* unmask and set to NMI */
152 v = APIC_DM_NMI; 233 v = APIC_DM_NMI;
234
235 /* Level triggered for 82489DX (32bit mode) */
236 if (!lapic_is_integrated())
237 v |= APIC_LVT_LEVEL_TRIGGER;
238
153 apic_write(APIC_LVT0, v); 239 apic_write(APIC_LVT0, v);
154} 240}
155 241
@@ -158,14 +244,28 @@ void __cpuinit enable_NMI_through_LVT0(void)
158 */ 244 */
159int lapic_get_maxlvt(void) 245int lapic_get_maxlvt(void)
160{ 246{
161 unsigned int v, maxlvt; 247 unsigned int v;
162 248
163 v = apic_read(APIC_LVR); 249 v = apic_read(APIC_LVR);
164 maxlvt = GET_APIC_MAXLVT(v); 250 /*
165 return maxlvt; 251 * - we always have APIC integrated on 64bit mode
252 * - 82489DXs do not report # of LVT entries
253 */
254 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
166} 255}
167 256
168/* 257/*
258 * Local APIC timer
259 */
260
261/* Clock divisor */
262#ifdef CONFIG_X86_64
263#define APIC_DIVISOR 1
264#else
265#define APIC_DIVISOR 16
266#endif
267
268/*
169 * This function sets up the local APIC timer, with a timeout of 269 * This function sets up the local APIC timer, with a timeout of
170 * 'clocks' APIC bus clock. During calibration we actually call 270 * 'clocks' APIC bus clock. During calibration we actually call
171 * this function twice on the boot CPU, once with a bogus timeout 271 * this function twice on the boot CPU, once with a bogus timeout
@@ -175,7 +275,6 @@ int lapic_get_maxlvt(void)
175 * We do reads before writes even if unnecessary, to get around the 275 * We do reads before writes even if unnecessary, to get around the
176 * P5 APIC double write bug. 276 * P5 APIC double write bug.
177 */ 277 */
178
179static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 278static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
180{ 279{
181 unsigned int lvtt_value, tmp_value; 280 unsigned int lvtt_value, tmp_value;
@@ -183,6 +282,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
183 lvtt_value = LOCAL_TIMER_VECTOR; 282 lvtt_value = LOCAL_TIMER_VECTOR;
184 if (!oneshot) 283 if (!oneshot)
185 lvtt_value |= APIC_LVT_TIMER_PERIODIC; 284 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
285 if (!lapic_is_integrated())
286 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
287
186 if (!irqen) 288 if (!irqen)
187 lvtt_value |= APIC_LVT_MASKED; 289 lvtt_value |= APIC_LVT_MASKED;
188 290
@@ -192,12 +294,12 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
192 * Divide PICLK by 16 294 * Divide PICLK by 16
193 */ 295 */
194 tmp_value = apic_read(APIC_TDCR); 296 tmp_value = apic_read(APIC_TDCR);
195 apic_write(APIC_TDCR, (tmp_value 297 apic_write(APIC_TDCR,
196 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 298 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
197 | APIC_TDR_DIV_16); 299 APIC_TDR_DIV_16);
198 300
199 if (!oneshot) 301 if (!oneshot)
200 apic_write(APIC_TMICT, clocks); 302 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
201} 303}
202 304
203/* 305/*
@@ -371,7 +473,7 @@ static int __init calibrate_APIC_clock(void)
371 lapic_clockevent.min_delta_ns = 473 lapic_clockevent.min_delta_ns =
372 clockevent_delta2ns(0xF, &lapic_clockevent); 474 clockevent_delta2ns(0xF, &lapic_clockevent);
373 475
374 calibration_result = result / HZ; 476 calibration_result = (result * APIC_DIVISOR) / HZ;
375 477
376 /* 478 /*
377 * Do a sanity check on the APIC calibration result 479 * Do a sanity check on the APIC calibration result
@@ -393,10 +495,10 @@ static int __init calibrate_APIC_clock(void)
393void __init setup_boot_APIC_clock(void) 495void __init setup_boot_APIC_clock(void)
394{ 496{
395 /* 497 /*
396 * The local apic timer can be disabled via the kernel commandline. 498 * The local apic timer can be disabled via the kernel
397 * Register the lapic timer as a dummy clock event source on SMP 499 * commandline or from the CPU detection code. Register the lapic
398 * systems, so the broadcast mechanism is used. On UP systems simply 500 * timer as a dummy clock event source on SMP systems, so the
399 * ignore it. 501 * broadcast mechanism is used. On UP systems simply ignore it.
400 */ 502 */
401 if (disable_apic_timer) { 503 if (disable_apic_timer) {
402 printk(KERN_INFO "Disabling APIC timer\n"); 504 printk(KERN_INFO "Disabling APIC timer\n");
@@ -408,7 +510,9 @@ void __init setup_boot_APIC_clock(void)
408 return; 510 return;
409 } 511 }
410 512
411 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 513 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
514 "calibrating APIC timer ...\n");
515
412 if (calibrate_APIC_clock()) { 516 if (calibrate_APIC_clock()) {
413 /* No broadcast on UP ! */ 517 /* No broadcast on UP ! */
414 if (num_possible_cpus() > 1) 518 if (num_possible_cpus() > 1)
@@ -427,6 +531,7 @@ void __init setup_boot_APIC_clock(void)
427 printk(KERN_WARNING "APIC timer registered as dummy," 531 printk(KERN_WARNING "APIC timer registered as dummy,"
428 " due to nmi_watchdog=%d!\n", nmi_watchdog); 532 " due to nmi_watchdog=%d!\n", nmi_watchdog);
429 533
534 /* Setup the lapic or request the broadcast */
430 setup_APIC_timer(); 535 setup_APIC_timer();
431} 536}
432 537
@@ -465,7 +570,11 @@ static void local_apic_timer_interrupt(void)
465 /* 570 /*
466 * the NMI deadlock-detector uses this. 571 * the NMI deadlock-detector uses this.
467 */ 572 */
573#ifdef CONFIG_X86_64
468 add_pda(apic_timer_irqs, 1); 574 add_pda(apic_timer_irqs, 1);
575#else
576 per_cpu(irq_stat, cpu).apic_timer_irqs++;
577#endif
469 578
470 evt->event_handler(evt); 579 evt->event_handler(evt);
471} 580}
@@ -496,6 +605,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
496 irq_enter(); 605 irq_enter();
497 local_apic_timer_interrupt(); 606 local_apic_timer_interrupt();
498 irq_exit(); 607 irq_exit();
608
499 set_irq_regs(old_regs); 609 set_irq_regs(old_regs);
500} 610}
501 611
@@ -549,6 +659,13 @@ void clear_local_APIC(void)
549 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); 659 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
550 } 660 }
551 661
662 /* lets not touch this if we didn't frob it */
663#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
664 if (maxlvt >= 5) {
665 v = apic_read(APIC_LVTTHMR);
666 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
667 }
668#endif
552 /* 669 /*
553 * Clean APIC state for other OSs: 670 * Clean APIC state for other OSs:
554 */ 671 */
@@ -559,8 +676,14 @@ void clear_local_APIC(void)
559 apic_write(APIC_LVTERR, APIC_LVT_MASKED); 676 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
560 if (maxlvt >= 4) 677 if (maxlvt >= 4)
561 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 678 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
562 apic_write(APIC_ESR, 0); 679
563 apic_read(APIC_ESR); 680 /* Integrated APIC (!82489DX) ? */
681 if (lapic_is_integrated()) {
682 if (maxlvt > 3)
683 /* Clear ESR due to Pentium errata 3AP and 11AP */
684 apic_write(APIC_ESR, 0);
685 apic_read(APIC_ESR);
686 }
564} 687}
565 688
566/** 689/**
@@ -579,8 +702,28 @@ void disable_local_APIC(void)
579 value = apic_read(APIC_SPIV); 702 value = apic_read(APIC_SPIV);
580 value &= ~APIC_SPIV_APIC_ENABLED; 703 value &= ~APIC_SPIV_APIC_ENABLED;
581 apic_write(APIC_SPIV, value); 704 apic_write(APIC_SPIV, value);
705
706#ifdef CONFIG_X86_32
707 /*
708 * When LAPIC was disabled by the BIOS and enabled by the kernel,
709 * restore the disabled state.
710 */
711 if (enabled_via_apicbase) {
712 unsigned int l, h;
713
714 rdmsr(MSR_IA32_APICBASE, l, h);
715 l &= ~MSR_IA32_APICBASE_ENABLE;
716 wrmsr(MSR_IA32_APICBASE, l, h);
717 }
718#endif
582} 719}
583 720
721/*
722 * If Linux enabled the LAPIC against the BIOS default disable it down before
723 * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
724 * not power-off. Additionally clear all LVT entries before disable_local_APIC
725 * for the case where Linux didn't enable the LAPIC.
726 */
584void lapic_shutdown(void) 727void lapic_shutdown(void)
585{ 728{
586 unsigned long flags; 729 unsigned long flags;
@@ -590,7 +733,13 @@ void lapic_shutdown(void)
590 733
591 local_irq_save(flags); 734 local_irq_save(flags);
592 735
593 disable_local_APIC(); 736#ifdef CONFIG_X86_32
737 if (!enabled_via_apicbase)
738 clear_local_APIC();
739 else
740#endif
741 disable_local_APIC();
742
594 743
595 local_irq_restore(flags); 744 local_irq_restore(flags);
596} 745}
@@ -634,10 +783,10 @@ int __init verify_local_APIC(void)
634 /* 783 /*
635 * The ID register is read/write in a real APIC. 784 * The ID register is read/write in a real APIC.
636 */ 785 */
637 reg0 = read_apic_id(); 786 reg0 = apic_read(APIC_ID);
638 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 787 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
639 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); 788 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
640 reg1 = read_apic_id(); 789 reg1 = apic_read(APIC_ID);
641 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); 790 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
642 apic_write(APIC_ID, reg0); 791 apic_write(APIC_ID, reg0);
643 if (reg1 != (reg0 ^ APIC_ID_MASK)) 792 if (reg1 != (reg0 ^ APIC_ID_MASK))
@@ -661,8 +810,11 @@ int __init verify_local_APIC(void)
661 */ 810 */
662void __init sync_Arb_IDs(void) 811void __init sync_Arb_IDs(void)
663{ 812{
664 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ 813 /*
665 if (modern_apic()) 814 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
815 * needed on AMD.
816 */
817 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
666 return; 818 return;
667 819
668 /* 820 /*
@@ -671,8 +823,8 @@ void __init sync_Arb_IDs(void)
671 apic_wait_icr_idle(); 823 apic_wait_icr_idle();
672 824
673 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 825 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
674 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 826 apic_write(APIC_ICR, APIC_DEST_ALLINC |
675 | APIC_DM_INIT); 827 APIC_INT_LEVELTRIG | APIC_DM_INIT);
676} 828}
677 829
678/* 830/*
@@ -689,8 +841,6 @@ void __init init_bsp_APIC(void)
689 if (smp_found_config || !cpu_has_apic) 841 if (smp_found_config || !cpu_has_apic)
690 return; 842 return;
691 843
692 value = apic_read(APIC_LVR);
693
694 /* 844 /*
695 * Do not trust the local APIC being empty at bootup. 845 * Do not trust the local APIC being empty at bootup.
696 */ 846 */
@@ -702,7 +852,15 @@ void __init init_bsp_APIC(void)
702 value = apic_read(APIC_SPIV); 852 value = apic_read(APIC_SPIV);
703 value &= ~APIC_VECTOR_MASK; 853 value &= ~APIC_VECTOR_MASK;
704 value |= APIC_SPIV_APIC_ENABLED; 854 value |= APIC_SPIV_APIC_ENABLED;
705 value |= APIC_SPIV_FOCUS_DISABLED; 855
856#ifdef CONFIG_X86_32
857 /* This bit is reserved on P4/Xeon and should be cleared */
858 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
859 (boot_cpu_data.x86 == 15))
860 value &= ~APIC_SPIV_FOCUS_DISABLED;
861 else
862#endif
863 value |= APIC_SPIV_FOCUS_DISABLED;
706 value |= SPURIOUS_APIC_VECTOR; 864 value |= SPURIOUS_APIC_VECTOR;
707 apic_write(APIC_SPIV, value); 865 apic_write(APIC_SPIV, value);
708 866
@@ -711,9 +869,50 @@ void __init init_bsp_APIC(void)
711 */ 869 */
712 apic_write(APIC_LVT0, APIC_DM_EXTINT); 870 apic_write(APIC_LVT0, APIC_DM_EXTINT);
713 value = APIC_DM_NMI; 871 value = APIC_DM_NMI;
872 if (!lapic_is_integrated()) /* 82489DX */
873 value |= APIC_LVT_LEVEL_TRIGGER;
714 apic_write(APIC_LVT1, value); 874 apic_write(APIC_LVT1, value);
715} 875}
716 876
877static void __cpuinit lapic_setup_esr(void)
878{
879 unsigned long oldvalue, value, maxlvt;
880 if (lapic_is_integrated() && !esr_disable) {
881 if (esr_disable) {
882 /*
883 * Something untraceable is creating bad interrupts on
884 * secondary quads ... for the moment, just leave the
885 * ESR disabled - we can't do anything useful with the
886 * errors anyway - mbligh
887 */
888 printk(KERN_INFO "Leaving ESR disabled.\n");
889 return;
890 }
891 /* !82489DX */
892 maxlvt = lapic_get_maxlvt();
893 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
894 apic_write(APIC_ESR, 0);
895 oldvalue = apic_read(APIC_ESR);
896
897 /* enables sending errors */
898 value = ERROR_APIC_VECTOR;
899 apic_write(APIC_LVTERR, value);
900 /*
901 * spec says clear errors after enabling vector.
902 */
903 if (maxlvt > 3)
904 apic_write(APIC_ESR, 0);
905 value = apic_read(APIC_ESR);
906 if (value != oldvalue)
907 apic_printk(APIC_VERBOSE, "ESR value before enabling "
908 "vector: 0x%08lx after: 0x%08lx\n",
909 oldvalue, value);
910 } else {
911 printk(KERN_INFO "No ESR for 82489DX.\n");
912 }
913}
914
915
717/** 916/**
718 * setup_local_APIC - setup the local APIC 917 * setup_local_APIC - setup the local APIC
719 */ 918 */
@@ -819,25 +1018,143 @@ void __cpuinit setup_local_APIC(void)
819 preempt_enable(); 1018 preempt_enable();
820} 1019}
821 1020
822static void __cpuinit lapic_setup_esr(void)
823{
824 unsigned maxlvt = lapic_get_maxlvt();
825
826 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
827 /*
828 * spec says clear errors after enabling vector.
829 */
830 if (maxlvt > 3)
831 apic_write(APIC_ESR, 0);
832}
833
834void __cpuinit end_local_APIC_setup(void) 1021void __cpuinit end_local_APIC_setup(void)
835{ 1022{
836 lapic_setup_esr(); 1023 lapic_setup_esr();
1024
1025#ifdef CONFIG_X86_32
1026 {
1027 unsigned int value;
1028 /* Disable the local apic timer */
1029 value = apic_read(APIC_LVTT);
1030 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1031 apic_write(APIC_LVTT, value);
1032 }
1033#endif
1034
837 setup_apic_nmi_watchdog(NULL); 1035 setup_apic_nmi_watchdog(NULL);
838 apic_pm_activate(); 1036 apic_pm_activate();
839} 1037}
840 1038
1039void check_x2apic(void)
1040{
1041 int msr, msr2;
1042
1043 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1044
1045 if (msr & X2APIC_ENABLE) {
1046 printk("x2apic enabled by BIOS, switching to x2apic ops\n");
1047 x2apic_preenabled = x2apic = 1;
1048 apic_ops = &x2apic_ops;
1049 }
1050}
1051
1052void enable_x2apic(void)
1053{
1054 int msr, msr2;
1055
1056 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1057 if (!(msr & X2APIC_ENABLE)) {
1058 printk("Enabling x2apic\n");
1059 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1060 }
1061}
1062
1063void enable_IR_x2apic(void)
1064{
1065#ifdef CONFIG_INTR_REMAP
1066 int ret;
1067 unsigned long flags;
1068
1069 if (!cpu_has_x2apic)
1070 return;
1071
1072 if (!x2apic_preenabled && disable_x2apic) {
1073 printk(KERN_INFO
1074 "Skipped enabling x2apic and Interrupt-remapping "
1075 "because of nox2apic\n");
1076 return;
1077 }
1078
1079 if (x2apic_preenabled && disable_x2apic)
1080 panic("Bios already enabled x2apic, can't enforce nox2apic");
1081
1082 if (!x2apic_preenabled && skip_ioapic_setup) {
1083 printk(KERN_INFO
1084 "Skipped enabling x2apic and Interrupt-remapping "
1085 "because of skipping io-apic setup\n");
1086 return;
1087 }
1088
1089 ret = dmar_table_init();
1090 if (ret) {
1091 printk(KERN_INFO
1092 "dmar_table_init() failed with %d:\n", ret);
1093
1094 if (x2apic_preenabled)
1095 panic("x2apic enabled by bios. But IR enabling failed");
1096 else
1097 printk(KERN_INFO
1098 "Not enabling x2apic,Intr-remapping\n");
1099 return;
1100 }
1101
1102 local_irq_save(flags);
1103 mask_8259A();
1104 save_mask_IO_APIC_setup();
1105
1106 ret = enable_intr_remapping(1);
1107
1108 if (ret && x2apic_preenabled) {
1109 local_irq_restore(flags);
1110 panic("x2apic enabled by bios. But IR enabling failed");
1111 }
1112
1113 if (ret)
1114 goto end;
1115
1116 if (!x2apic) {
1117 x2apic = 1;
1118 apic_ops = &x2apic_ops;
1119 enable_x2apic();
1120 }
1121end:
1122 if (ret)
1123 /*
1124 * IR enabling failed
1125 */
1126 restore_IO_APIC_setup();
1127 else
1128 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1129
1130 unmask_8259A();
1131 local_irq_restore(flags);
1132
1133 if (!ret) {
1134 if (!x2apic_preenabled)
1135 printk(KERN_INFO
1136 "Enabled x2apic and interrupt-remapping\n");
1137 else
1138 printk(KERN_INFO
1139 "Enabled Interrupt-remapping\n");
1140 } else
1141 printk(KERN_ERR
1142 "Failed to enable Interrupt-remapping and x2apic\n");
1143#else
1144 if (!cpu_has_x2apic)
1145 return;
1146
1147 if (x2apic_preenabled)
1148 panic("x2apic enabled prior OS handover,"
1149 " enable CONFIG_INTR_REMAP");
1150
1151 printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1152 " and x2apic\n");
1153#endif
1154
1155 return;
1156}
1157
841/* 1158/*
842 * Detect and enable local APICs on non-SMP boards. 1159 * Detect and enable local APICs on non-SMP boards.
843 * Original code written by Keir Fraser. 1160 * Original code written by Keir Fraser.
@@ -877,7 +1194,7 @@ void __init early_init_lapic_mapping(void)
877 * Fetch the APIC ID of the BSP in case we have a 1194 * Fetch the APIC ID of the BSP in case we have a
878 * default configuration (or the MP table is broken). 1195 * default configuration (or the MP table is broken).
879 */ 1196 */
880 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1197 boot_cpu_physical_apicid = read_apic_id();
881} 1198}
882 1199
883/** 1200/**
@@ -885,6 +1202,11 @@ void __init early_init_lapic_mapping(void)
885 */ 1202 */
886void __init init_apic_mappings(void) 1203void __init init_apic_mappings(void)
887{ 1204{
1205 if (x2apic) {
1206 boot_cpu_physical_apicid = read_apic_id();
1207 return;
1208 }
1209
888 /* 1210 /*
889 * If no local APIC can be found then set up a fake all 1211 * If no local APIC can be found then set up a fake all
890 * zeroes page to simulate the local APIC and another 1212 * zeroes page to simulate the local APIC and another
@@ -904,13 +1226,15 @@ void __init init_apic_mappings(void)
904 * Fetch the APIC ID of the BSP in case we have a 1226 * Fetch the APIC ID of the BSP in case we have a
905 * default configuration (or the MP table is broken). 1227 * default configuration (or the MP table is broken).
906 */ 1228 */
907 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1229 boot_cpu_physical_apicid = read_apic_id();
908} 1230}
909 1231
910/* 1232/*
911 * This initializes the IO-APIC and APIC hardware if this is 1233 * This initializes the IO-APIC and APIC hardware if this is
912 * a UP kernel. 1234 * a UP kernel.
913 */ 1235 */
1236int apic_version[MAX_APICS];
1237
914int __init APIC_init_uniprocessor(void) 1238int __init APIC_init_uniprocessor(void)
915{ 1239{
916 if (disable_apic) { 1240 if (disable_apic) {
@@ -923,6 +1247,9 @@ int __init APIC_init_uniprocessor(void)
923 return -1; 1247 return -1;
924 } 1248 }
925 1249
1250 enable_IR_x2apic();
1251 setup_apic_routing();
1252
926 verify_local_APIC(); 1253 verify_local_APIC();
927 1254
928 connect_bsp_APIC(); 1255 connect_bsp_APIC();
@@ -1009,17 +1336,57 @@ asmlinkage void smp_error_interrupt(void)
1009} 1336}
1010 1337
1011/** 1338/**
1012 * * connect_bsp_APIC - attach the APIC to the interrupt system 1339 * connect_bsp_APIC - attach the APIC to the interrupt system
1013 * */ 1340 */
1014void __init connect_bsp_APIC(void) 1341void __init connect_bsp_APIC(void)
1015{ 1342{
1343#ifdef CONFIG_X86_32
1344 if (pic_mode) {
1345 /*
1346 * Do not trust the local APIC being empty at bootup.
1347 */
1348 clear_local_APIC();
1349 /*
1350 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1351 * local APIC to INT and NMI lines.
1352 */
1353 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1354 "enabling APIC mode.\n");
1355 outb(0x70, 0x22);
1356 outb(0x01, 0x23);
1357 }
1358#endif
1016 enable_apic_mode(); 1359 enable_apic_mode();
1017} 1360}
1018 1361
1362/**
1363 * disconnect_bsp_APIC - detach the APIC from the interrupt system
1364 * @virt_wire_setup: indicates, whether virtual wire mode is selected
1365 *
1366 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1367 * APIC is disabled.
1368 */
1019void disconnect_bsp_APIC(int virt_wire_setup) 1369void disconnect_bsp_APIC(int virt_wire_setup)
1020{ 1370{
1371 unsigned int value;
1372
1373#ifdef CONFIG_X86_32
1374 if (pic_mode) {
1375 /*
1376 * Put the board back into PIC mode (has an effect only on
1377 * certain older boards). Note that APIC interrupts, including
1378 * IPIs, won't work beyond this point! The only exception are
1379 * INIT IPIs.
1380 */
1381 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1382 "entering PIC mode.\n");
1383 outb(0x70, 0x22);
1384 outb(0x00, 0x23);
1385 return;
1386 }
1387#endif
1388
1021 /* Go back to Virtual Wire compatibility mode */ 1389 /* Go back to Virtual Wire compatibility mode */
1022 unsigned long value;
1023 1390
1024 /* For the spurious interrupt use vector F, and enable it */ 1391 /* For the spurious interrupt use vector F, and enable it */
1025 value = apic_read(APIC_SPIV); 1392 value = apic_read(APIC_SPIV);
@@ -1045,7 +1412,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1045 apic_write(APIC_LVT0, APIC_LVT_MASKED); 1412 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1046 } 1413 }
1047 1414
1048 /* For LVT1 make it edge triggered, active high, nmi and enabled */ 1415 /*
1416 * For LVT1 make it edge triggered, active high,
1417 * nmi and enabled
1418 */
1049 value = apic_read(APIC_LVT1); 1419 value = apic_read(APIC_LVT1);
1050 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | 1420 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1051 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1421 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
@@ -1060,15 +1430,20 @@ void __cpuinit generic_processor_info(int apicid, int version)
1060 int cpu; 1430 int cpu;
1061 cpumask_t tmp_map; 1431 cpumask_t tmp_map;
1062 1432
1063 if (num_processors >= NR_CPUS) { 1433 /*
1064 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1434 * Validate version
1065 " Processor ignored.\n", NR_CPUS); 1435 */
1066 return; 1436 if (version == 0x0) {
1437 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
1438 "fixing up to 0x10. (tell your hw vendor)\n",
1439 version);
1440 version = 0x10;
1067 } 1441 }
1442 apic_version[apicid] = version;
1068 1443
1069 if (num_processors >= maxcpus) { 1444 if (num_processors >= NR_CPUS) {
1070 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." 1445 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1071 " Processor ignored.\n", maxcpus); 1446 " Processor ignored.\n", NR_CPUS);
1072 return; 1447 return;
1073 } 1448 }
1074 1449
@@ -1088,6 +1463,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1088 if (apicid > max_physical_apicid) 1463 if (apicid > max_physical_apicid)
1089 max_physical_apicid = apicid; 1464 max_physical_apicid = apicid;
1090 1465
1466#ifdef CONFIG_X86_32
1467 /*
1468 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1469 * but we need to work other dependencies like SMP_SUSPEND etc
1470 * before this can be done without some confusion.
1471 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
1472 * - Ashok Raj <ashok.raj@intel.com>
1473 */
1474 if (max_physical_apicid >= 8) {
1475 switch (boot_cpu_data.x86_vendor) {
1476 case X86_VENDOR_INTEL:
1477 if (!APIC_XAPIC(version)) {
1478 def_to_bigsmp = 0;
1479 break;
1480 }
1481 /* If P4 and above fall through */
1482 case X86_VENDOR_AMD:
1483 def_to_bigsmp = 1;
1484 }
1485 }
1486#endif
1487
1488#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1091 /* are we being called early in kernel startup? */ 1489 /* are we being called early in kernel startup? */
1092 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1490 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1093 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1491 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1099,20 +1497,28 @@ void __cpuinit generic_processor_info(int apicid, int version)
1099 per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1497 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1100 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1498 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1101 } 1499 }
1500#endif
1102 1501
1103 cpu_set(cpu, cpu_possible_map); 1502 cpu_set(cpu, cpu_possible_map);
1104 cpu_set(cpu, cpu_present_map); 1503 cpu_set(cpu, cpu_present_map);
1105} 1504}
1106 1505
1506int hard_smp_processor_id(void)
1507{
1508 return read_apic_id();
1509}
1510
1107/* 1511/*
1108 * Power management 1512 * Power management
1109 */ 1513 */
1110#ifdef CONFIG_PM 1514#ifdef CONFIG_PM
1111 1515
1112static struct { 1516static struct {
1113 /* 'active' is true if the local APIC was enabled by us and 1517 /*
1114 not the BIOS; this signifies that we are also responsible 1518 * 'active' is true if the local APIC was enabled by us and
1115 for disabling it before entering apm/acpi suspend */ 1519 * not the BIOS; this signifies that we are also responsible
1520 * for disabling it before entering apm/acpi suspend
1521 */
1116 int active; 1522 int active;
1117 /* r/w apic fields */ 1523 /* r/w apic fields */
1118 unsigned int apic_id; 1524 unsigned int apic_id;
@@ -1140,7 +1546,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1140 1546
1141 maxlvt = lapic_get_maxlvt(); 1547 maxlvt = lapic_get_maxlvt();
1142 1548
1143 apic_pm_state.apic_id = read_apic_id(); 1549 apic_pm_state.apic_id = apic_read(APIC_ID);
1144 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); 1550 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1145 apic_pm_state.apic_ldr = apic_read(APIC_LDR); 1551 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1146 apic_pm_state.apic_dfr = apic_read(APIC_DFR); 1552 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
@@ -1153,10 +1559,11 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1153 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1559 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1154 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1560 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1155 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1561 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1156#ifdef CONFIG_X86_MCE_INTEL 1562#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1157 if (maxlvt >= 5) 1563 if (maxlvt >= 5)
1158 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1564 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1159#endif 1565#endif
1566
1160 local_irq_save(flags); 1567 local_irq_save(flags);
1161 disable_local_APIC(); 1568 disable_local_APIC();
1162 local_irq_restore(flags); 1569 local_irq_restore(flags);
@@ -1175,10 +1582,25 @@ static int lapic_resume(struct sys_device *dev)
1175 maxlvt = lapic_get_maxlvt(); 1582 maxlvt = lapic_get_maxlvt();
1176 1583
1177 local_irq_save(flags); 1584 local_irq_save(flags);
1178 rdmsr(MSR_IA32_APICBASE, l, h); 1585
1179 l &= ~MSR_IA32_APICBASE_BASE; 1586#ifdef CONFIG_X86_64
1180 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1587 if (x2apic)
1181 wrmsr(MSR_IA32_APICBASE, l, h); 1588 enable_x2apic();
1589 else
1590#endif
1591 {
1592 /*
1593 * Make sure the APICBASE points to the right address
1594 *
1595 * FIXME! This will be wrong if we ever support suspend on
1596 * SMP! We'll need to do this as part of the CPU restore!
1597 */
1598 rdmsr(MSR_IA32_APICBASE, l, h);
1599 l &= ~MSR_IA32_APICBASE_BASE;
1600 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1601 wrmsr(MSR_IA32_APICBASE, l, h);
1602 }
1603
1182 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1604 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1183 apic_write(APIC_ID, apic_pm_state.apic_id); 1605 apic_write(APIC_ID, apic_pm_state.apic_id);
1184 apic_write(APIC_DFR, apic_pm_state.apic_dfr); 1606 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
@@ -1187,7 +1609,7 @@ static int lapic_resume(struct sys_device *dev)
1187 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1609 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1188 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1610 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1189 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1611 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1190#ifdef CONFIG_X86_MCE_INTEL 1612#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1191 if (maxlvt >= 5) 1613 if (maxlvt >= 5)
1192 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1614 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1193#endif 1615#endif
@@ -1201,10 +1623,17 @@ static int lapic_resume(struct sys_device *dev)
1201 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1623 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1202 apic_write(APIC_ESR, 0); 1624 apic_write(APIC_ESR, 0);
1203 apic_read(APIC_ESR); 1625 apic_read(APIC_ESR);
1626
1204 local_irq_restore(flags); 1627 local_irq_restore(flags);
1628
1205 return 0; 1629 return 0;
1206} 1630}
1207 1631
1632/*
1633 * This device has no shutdown method - fully functioning local APICs
1634 * are needed on every CPU up until machine_halt/restart/poweroff.
1635 */
1636
1208static struct sysdev_class lapic_sysclass = { 1637static struct sysdev_class lapic_sysclass = {
1209 .name = "lapic", 1638 .name = "lapic",
1210 .resume = lapic_resume, 1639 .resume = lapic_resume,
@@ -1318,31 +1747,19 @@ __cpuinit int apic_is_clustered_box(void)
1318 return (clusters > 2); 1747 return (clusters > 2);
1319} 1748}
1320 1749
1321/* 1750static __init int setup_nox2apic(char *str)
1322 * APIC command line parameters
1323 */
1324static int __init apic_set_verbosity(char *str)
1325{ 1751{
1326 if (str == NULL) { 1752 disable_x2apic = 1;
1327 skip_ioapic_setup = 0; 1753 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC);
1328 ioapic_force = 1;
1329 return 0;
1330 }
1331 if (strcmp("debug", str) == 0)
1332 apic_verbosity = APIC_DEBUG;
1333 else if (strcmp("verbose", str) == 0)
1334 apic_verbosity = APIC_VERBOSE;
1335 else {
1336 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1337 " use apic=verbose or apic=debug\n", str);
1338 return -EINVAL;
1339 }
1340
1341 return 0; 1754 return 0;
1342} 1755}
1343early_param("apic", apic_set_verbosity); 1756early_param("nox2apic", setup_nox2apic);
1344 1757
1345static __init int setup_disableapic(char *str) 1758
1759/*
1760 * APIC command line parameters
1761 */
1762static int __init setup_disableapic(char *arg)
1346{ 1763{
1347 disable_apic = 1; 1764 disable_apic = 1;
1348 setup_clear_cpu_cap(X86_FEATURE_APIC); 1765 setup_clear_cpu_cap(X86_FEATURE_APIC);
@@ -1351,9 +1768,9 @@ static __init int setup_disableapic(char *str)
1351early_param("disableapic", setup_disableapic); 1768early_param("disableapic", setup_disableapic);
1352 1769
1353/* same as disableapic, for compatibility */ 1770/* same as disableapic, for compatibility */
1354static __init int setup_nolapic(char *str) 1771static int __init setup_nolapic(char *arg)
1355{ 1772{
1356 return setup_disableapic(str); 1773 return setup_disableapic(arg);
1357} 1774}
1358early_param("nolapic", setup_nolapic); 1775early_param("nolapic", setup_nolapic);
1359 1776
@@ -1364,14 +1781,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1364} 1781}
1365early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1782early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1366 1783
1367static __init int setup_noapictimer(char *str) 1784static int __init parse_disable_apic_timer(char *arg)
1368{ 1785{
1369 if (str[0] != ' ' && str[0] != 0)
1370 return 0;
1371 disable_apic_timer = 1; 1786 disable_apic_timer = 1;
1372 return 1; 1787 return 0;
1788}
1789early_param("noapictimer", parse_disable_apic_timer);
1790
1791static int __init parse_nolapic_timer(char *arg)
1792{
1793 disable_apic_timer = 1;
1794 return 0;
1373} 1795}
1374__setup("noapictimer", setup_noapictimer); 1796early_param("nolapic_timer", parse_nolapic_timer);
1375 1797
1376static __init int setup_apicpmtimer(char *s) 1798static __init int setup_apicpmtimer(char *s)
1377{ 1799{
@@ -1381,6 +1803,31 @@ static __init int setup_apicpmtimer(char *s)
1381} 1803}
1382__setup("apicpmtimer", setup_apicpmtimer); 1804__setup("apicpmtimer", setup_apicpmtimer);
1383 1805
1806static int __init apic_set_verbosity(char *arg)
1807{
1808 if (!arg) {
1809#ifdef CONFIG_X86_64
1810 skip_ioapic_setup = 0;
1811 ioapic_force = 1;
1812 return 0;
1813#endif
1814 return -EINVAL;
1815 }
1816
1817 if (strcmp("debug", arg) == 0)
1818 apic_verbosity = APIC_DEBUG;
1819 else if (strcmp("verbose", arg) == 0)
1820 apic_verbosity = APIC_VERBOSE;
1821 else {
1822 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1823 " use apic=verbose or apic=debug\n", arg);
1824 return -EINVAL;
1825 }
1826
1827 return 0;
1828}
1829early_param("apic", apic_set_verbosity);
1830
1384static int __init lapic_insert_resource(void) 1831static int __init lapic_insert_resource(void)
1385{ 1832{
1386 if (!apic_phys) 1833 if (!apic_phys)
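The hunks above convert several APIC boot options (noapictimer, nolapic_timer,
nox2apic, the apic= verbosity switch) from __setup() handlers to early_param().
A minimal sketch of that handler shape, assuming kernel context (linux/init.h
for the macro) and a made-up option name:

	/* Hypothetical example, not part of this patch: an early_param
	 * handler receives the option's argument string (possibly NULL)
	 * and returns 0 on success, like the converted handlers above. */
	static int example_flag;

	static int __init parse_example_flag(char *arg)
	{
		example_flag = 1;	/* flag-style option, argument ignored */
		return 0;
	}
	early_param("example_flag", parse_example_flag);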
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 9ee24e6bc4b0..5145a6e72bbb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,12 +228,12 @@
228#include <linux/suspend.h> 228#include <linux/suspend.h>
229#include <linux/kthread.h> 229#include <linux/kthread.h>
230#include <linux/jiffies.h> 230#include <linux/jiffies.h>
231#include <linux/smp_lock.h>
232 231
233#include <asm/system.h> 232#include <asm/system.h>
234#include <asm/uaccess.h> 233#include <asm/uaccess.h>
235#include <asm/desc.h> 234#include <asm/desc.h>
236#include <asm/i8253.h> 235#include <asm/i8253.h>
236#include <asm/olpc.h>
237#include <asm/paravirt.h> 237#include <asm/paravirt.h>
238#include <asm/reboot.h> 238#include <asm/reboot.h>
239 239
@@ -2217,7 +2217,7 @@ static int __init apm_init(void)
2217 2217
2218 dmi_check_system(apm_dmi_table); 2218 dmi_check_system(apm_dmi_table);
2219 2219
2220 if (apm_info.bios.version == 0 || paravirt_enabled()) { 2220 if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
2221 printk(KERN_INFO "apm: BIOS not found.\n"); 2221 printk(KERN_INFO "apm: BIOS not found.\n");
2222 return -ENODEV; 2222 return -ENODEV;
2223 } 2223 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index aa89387006fe..505543a75a56 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -22,7 +22,7 @@
22 22
23#define __NO_STUBS 1 23#define __NO_STUBS 1
24#undef __SYSCALL 24#undef __SYSCALL
25#undef _ASM_X86_64_UNISTD_H_ 25#undef ASM_X86__UNISTD_64_H
26#define __SYSCALL(nr, sym) [nr] = 1, 26#define __SYSCALL(nr, sym) [nr] = 1,
27static char syscalls[] = { 27static char syscalls[] = {
28#include <asm/unistd.h> 28#include <asm/unistd.h>
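The asm-offsets_64.c context above shows the __SYSCALL counting trick (the
change itself only renames the unistd include guard): every table entry expands
to "[nr] = 1,", so sizeof the resulting array is one past the highest syscall
number. A hypothetical standalone version, with a made-up macro and entry list:

	/* The array is sized by its largest designated index, so sizeof()
	 * yields the highest entry number plus one (6 here). */
	#define LIST_ENTRY(nr, sym) [nr] = 1,

	static char entries[] = {
		LIST_ENTRY(0, read)
		LIST_ENTRY(1, write)
		LIST_ENTRY(5, open)
	};

	enum { NR_ENTRIES = sizeof(entries) };	/* 6 */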
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index c639bd55391c..fdd585f9c53d 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,11 +25,11 @@ x86_bios_strerror(long status)
25{ 25{
26 const char *str; 26 const char *str;
27 switch (status) { 27 switch (status) {
28 case 0: str = "Call completed without error"; break; 28 case 0: str = "Call completed without error"; break;
29 case -1: str = "Not implemented"; break; 29 case -1: str = "Not implemented"; break;
30 case -2: str = "Invalid argument"; break; 30 case -2: str = "Invalid argument"; break;
31 case -3: str = "Call completed with error"; break; 31 case -3: str = "Call completed with error"; break;
32 default: str = "Unknown BIOS status code"; break; 32 default: str = "Unknown BIOS status code"; break;
33 } 33 }
34 return str; 34 return str;
35} 35}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ee76eaad3001..7f0b45a5d788 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,22 +3,30 @@
3# 3#
4 4
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 5obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o feature_names.o 6obj-y += proc.o capflags.o powerflags.o common.o
7 7
8obj-$(CONFIG_X86_32) += common.o bugs.o 8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_64) += common_64.o bugs_64.o 9obj-$(CONFIG_X86_64) += bugs_64.o
10obj-$(CONFIG_X86_32) += amd.o 10
11obj-$(CONFIG_X86_64) += amd_64.o 11obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
12obj-$(CONFIG_X86_32) += cyrix.o 12obj-$(CONFIG_CPU_SUP_AMD) += amd.o
13obj-$(CONFIG_X86_32) += centaur.o 13obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
14obj-$(CONFIG_X86_64) += centaur_64.o 14obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
15obj-$(CONFIG_X86_32) += transmeta.o 15obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
16obj-$(CONFIG_X86_32) += intel.o 16obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
17obj-$(CONFIG_X86_64) += intel_64.o 17obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
18obj-$(CONFIG_X86_32) += umc.o
19 18
20obj-$(CONFIG_X86_MCE) += mcheck/ 19obj-$(CONFIG_X86_MCE) += mcheck/
21obj-$(CONFIG_MTRR) += mtrr/ 20obj-$(CONFIG_MTRR) += mtrr/
22obj-$(CONFIG_CPU_FREQ) += cpufreq/ 21obj-$(CONFIG_CPU_FREQ) += cpufreq/
23 22
24obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 23obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
24
25quiet_cmd_mkcapflags = MKCAP $@
26 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
27
28cpufeature = $(src)/../../../../include/asm-x86/cpufeature.h
29
30targets += capflags.c
31$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
32 $(call if_changed,mkcapflags)
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 84a8220a6072..0d9c993aa93e 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,6 +7,8 @@
7#include <asm/pat.h> 7#include <asm/pat.h>
8#include <asm/processor.h> 8#include <asm/processor.h>
9 9
10#include <mach_apic.h>
11
10struct cpuid_bit { 12struct cpuid_bit {
11 u16 feature; 13 u16 feature;
12 u8 reg; 14 u8 reg;
@@ -48,6 +50,92 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
48 } 50 }
49} 51}
50 52
53/* leaf 0xb SMT level */
54#define SMT_LEVEL 0
55
56/* leaf 0xb sub-leaf types */
57#define INVALID_TYPE 0
58#define SMT_TYPE 1
59#define CORE_TYPE 2
60
61#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
62#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
63#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
64
65/*
66 * Check for extended topology enumeration cpuid leaf 0xb and if it
67 * exists, use it for populating initial_apicid and cpu topology
68 * detection.
69 */
70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
71{
72#ifdef CONFIG_SMP
73 unsigned int eax, ebx, ecx, edx, sub_index;
74 unsigned int ht_mask_width, core_plus_mask_width;
75 unsigned int core_select_mask, core_level_siblings;
76
77 if (c->cpuid_level < 0xb)
78 return;
79
80 cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
81
82 /*
83 * check if the cpuid leaf 0xb is actually implemented.
84 */
85 if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
86 return;
87
88 set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
89
90 /*
91 * initial apic id, which also represents 32-bit extended x2apic id.
92 */
93 c->initial_apicid = edx;
94
95 /*
96 * Populate HT related information from sub-leaf level 0.
97 */
98 core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
99 core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
100
101 sub_index = 1;
102 do {
103 cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
104
105 /*
106 * Check for the Core type in the implemented sub leaves.
107 */
108 if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
109 core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
110 core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
111 break;
112 }
113
114 sub_index++;
115 } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
116
117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
118
119#ifdef CONFIG_X86_32
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123#else
124 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
125 c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
126#endif
127 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
128
129
130 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
131 c->phys_proc_id);
132 if (c->x86_max_cores > 1)
133 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
134 c->cpu_core_id);
135 return;
136#endif
137}
138
51#ifdef CONFIG_X86_PAT 139#ifdef CONFIG_X86_PAT
52void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) 140void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
53{ 141{
@@ -56,9 +144,22 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
56 144
57 switch (c->x86_vendor) { 145 switch (c->x86_vendor) {
58 case X86_VENDOR_INTEL: 146 case X86_VENDOR_INTEL:
59 if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) 147 /*
148 * There is a known erratum on Pentium III and Core Solo
149 * and Core Duo CPUs.
150 * " Page with PAT set to WC while associated MTRR is UC
151 * may consolidate to UC "
152 * Because of this erratum, it is better to stick with
153 * setting WC in MTRR rather than using PAT on these CPUs.
154 *
155 * Enable PAT WC only on P4, Core 2 or later CPUs.
156 */
157 if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
60 return; 158 return;
61 break; 159
160 pat_disable("PAT WC disabled due to known CPU erratum.");
161 return;
162
62 case X86_VENDOR_AMD: 163 case X86_VENDOR_AMD:
63 case X86_VENDOR_CENTAUR: 164 case X86_VENDOR_CENTAUR:
64 case X86_VENDOR_TRANSMETA: 165 case X86_VENDOR_TRANSMETA:
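detect_extended_topology() above derives the SMT, core and package IDs from the
32-bit x2APIC ID using the per-level shift widths reported by CPUID leaf 0xb. A
hedged standalone sketch of the same mask arithmetic, with the widths passed in
as plain parameters instead of being read via cpuid_count():

	#include <stdio.h>

	/* Hypothetical illustration of the leaf-0xb arithmetic above; uses
	 * ~0u instead of the patch's (-1 <<) form, same resulting masks. */
	static void decode_x2apic_id(unsigned int apicid,
				     unsigned int ht_mask_width,
				     unsigned int core_plus_mask_width)
	{
		unsigned int core_select_mask =
			(~(~0u << core_plus_mask_width)) >> ht_mask_width;
		unsigned int smt_id  = apicid & ~(~0u << ht_mask_width);
		unsigned int core_id = (apicid >> ht_mask_width) & core_select_mask;
		unsigned int pkg_id  = apicid >> core_plus_mask_width;

		printf("apicid %#x -> smt %u core %u pkg %u\n",
		       apicid, smt_id, core_id, pkg_id);
	}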
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index cae9cabc3031..32e73520adf7 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/bitops.h> 2#include <linux/bitops.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4
4#include <asm/io.h> 5#include <asm/io.h>
5#include <asm/processor.h> 6#include <asm/processor.h>
6#include <asm/apic.h> 7#include <asm/apic.h>
7 8
9#ifdef CONFIG_X86_64
10# include <asm/numa_64.h>
11# include <asm/mmconfig.h>
12# include <asm/cacheflush.h>
13#endif
14
8#include <mach_apic.h> 15#include <mach_apic.h>
16
9#include "cpu.h" 17#include "cpu.h"
10 18
19#ifdef CONFIG_X86_32
11/* 20/*
12 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 21 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
13 * misexecution of code under Linux. Owners of such processors should 22 * misexecution of code under Linux. Owners of such processors should
@@ -24,21 +33,273 @@
24extern void vide(void); 33extern void vide(void);
25__asm__(".align 4\nvide: ret"); 34__asm__(".align 4\nvide: ret");
26 35
27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 36static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
28{ 37{
29 if (cpuid_eax(0x80000000) >= 0x80000007) { 38/*
30 c->x86_power = cpuid_edx(0x80000007); 39 * General Systems BIOSen alias the cpu frequency registers
31 if (c->x86_power & (1<<8)) 40 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
32 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 41 * drivers subsequently pokes it, and changes the CPU speed.
42 * Workaround : Remove the unneeded alias.
43 */
44#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
45#define CBAR_ENB (0x80000000)
46#define CBAR_KEY (0X000000CB)
47 if (c->x86_model == 9 || c->x86_model == 10) {
48 if (inl (CBAR) & CBAR_ENB)
49 outl (0 | CBAR_KEY, CBAR);
33 } 50 }
34} 51}
35 52
36static void __cpuinit init_amd(struct cpuinfo_x86 *c) 53
54static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
37{ 55{
38 u32 l, h; 56 u32 l, h;
39 int mbytes = num_physpages >> (20-PAGE_SHIFT); 57 int mbytes = num_physpages >> (20-PAGE_SHIFT);
40 int r;
41 58
59 if (c->x86_model < 6) {
60 /* Based on AMD doc 20734R - June 2000 */
61 if (c->x86_model == 0) {
62 clear_cpu_cap(c, X86_FEATURE_APIC);
63 set_cpu_cap(c, X86_FEATURE_PGE);
64 }
65 return;
66 }
67
68 if (c->x86_model == 6 && c->x86_mask == 1) {
69 const int K6_BUG_LOOP = 1000000;
70 int n;
71 void (*f_vide)(void);
72 unsigned long d, d2;
73
74 printk(KERN_INFO "AMD K6 stepping B detected - ");
75
76 /*
77 * It looks like AMD fixed the 2.6.2 bug and improved indirect
78 * calls at the same time.
79 */
80
81 n = K6_BUG_LOOP;
82 f_vide = vide;
83 rdtscl(d);
84 while (n--)
85 f_vide();
86 rdtscl(d2);
87 d = d2-d;
88
89 if (d > 20*K6_BUG_LOOP)
90 printk("system stability may be impaired when more than 32 MB are used.\n");
91 else
92 printk("probably OK (after B9730xxxx).\n");
93 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
94 }
95
96 /* K6 with old style WHCR */
97 if (c->x86_model < 8 ||
98 (c->x86_model == 8 && c->x86_mask < 8)) {
99 /* We can only write allocate on the low 508Mb */
100 if (mbytes > 508)
101 mbytes = 508;
102
103 rdmsr(MSR_K6_WHCR, l, h);
104 if ((l&0x0000FFFF) == 0) {
105 unsigned long flags;
106 l = (1<<0)|((mbytes/4)<<1);
107 local_irq_save(flags);
108 wbinvd();
109 wrmsr(MSR_K6_WHCR, l, h);
110 local_irq_restore(flags);
111 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
112 mbytes);
113 }
114 return;
115 }
116
117 if ((c->x86_model == 8 && c->x86_mask > 7) ||
118 c->x86_model == 9 || c->x86_model == 13) {
119 /* The more serious chips .. */
120
121 if (mbytes > 4092)
122 mbytes = 4092;
123
124 rdmsr(MSR_K6_WHCR, l, h);
125 if ((l&0xFFFF0000) == 0) {
126 unsigned long flags;
127 l = ((mbytes>>2)<<22)|(1<<16);
128 local_irq_save(flags);
129 wbinvd();
130 wrmsr(MSR_K6_WHCR, l, h);
131 local_irq_restore(flags);
132 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
133 mbytes);
134 }
135
136 return;
137 }
138
139 if (c->x86_model == 10) {
140 /* AMD Geode LX is model 10 */
141 /* placeholder for any needed mods */
142 return;
143 }
144}
145
146static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
147{
148 u32 l, h;
149
150 /*
151 * Bit 15 of Athlon specific MSR 15, needs to be 0
152 * to enable SSE on Palomino/Morgan/Barton CPU's.
153 * If the BIOS didn't enable it already, enable it here.
154 */
155 if (c->x86_model >= 6 && c->x86_model <= 10) {
156 if (!cpu_has(c, X86_FEATURE_XMM)) {
157 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
158 rdmsr(MSR_K7_HWCR, l, h);
159 l &= ~0x00008000;
160 wrmsr(MSR_K7_HWCR, l, h);
161 set_cpu_cap(c, X86_FEATURE_XMM);
162 }
163 }
164
165 /*
166 * It's been determined by AMD that Athlons since model 8 stepping 1
167 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
168 * As per AMD technical note 27212 0.2
169 */
170 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
171 rdmsr(MSR_K7_CLK_CTL, l, h);
172 if ((l & 0xfff00000) != 0x20000000) {
173 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
174 ((l & 0x000fffff)|0x20000000));
175 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
176 }
177 }
178
179 set_cpu_cap(c, X86_FEATURE_K7);
180}
181#endif
182
183#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
184static int __cpuinit nearby_node(int apicid)
185{
186 int i, node;
187
188 for (i = apicid - 1; i >= 0; i--) {
189 node = apicid_to_node[i];
190 if (node != NUMA_NO_NODE && node_online(node))
191 return node;
192 }
193 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
194 node = apicid_to_node[i];
195 if (node != NUMA_NO_NODE && node_online(node))
196 return node;
197 }
198 return first_node(node_online_map); /* Shouldn't happen */
199}
200#endif
201
202/*
203 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
204 * Assumes number of cores is a power of two.
205 */
206static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
207{
208#ifdef CONFIG_X86_HT
209 unsigned bits;
210
211 bits = c->x86_coreid_bits;
212
213 /* Low order bits define the core id (index of core in socket) */
214 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
215 /* Convert the initial APIC ID into the socket ID */
216 c->phys_proc_id = c->initial_apicid >> bits;
217#endif
218}
219
220static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
221{
222#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
223 int cpu = smp_processor_id();
224 int node;
225 unsigned apicid = hard_smp_processor_id();
226
227 node = c->phys_proc_id;
228 if (apicid_to_node[apicid] != NUMA_NO_NODE)
229 node = apicid_to_node[apicid];
230 if (!node_online(node)) {
231 /* Two possibilities here:
232 - The CPU is missing memory and no node was created.
233 In that case try picking one from a nearby CPU
234 - The APIC IDs differ from the HyperTransport node IDs
235 which the K8 northbridge parsing fills in.
236 Assume they are all increased by a constant offset,
237 but in the same order as the HT nodeids.
238 If that doesn't result in a usable node fall back to the
239 path for the previous case. */
240
241 int ht_nodeid = c->initial_apicid;
242
243 if (ht_nodeid >= 0 &&
244 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
245 node = apicid_to_node[ht_nodeid];
246 /* Pick a nearby node */
247 if (!node_online(node))
248 node = nearby_node(apicid);
249 }
250 numa_set_node(cpu, node);
251
252 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
253#endif
254}
255
256static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
257{
258#ifdef CONFIG_X86_HT
259 unsigned bits, ecx;
260
261 /* Multi core CPU? */
262 if (c->extended_cpuid_level < 0x80000008)
263 return;
264
265 ecx = cpuid_ecx(0x80000008);
266
267 c->x86_max_cores = (ecx & 0xff) + 1;
268
269 /* CPU telling us the core id bits shift? */
270 bits = (ecx >> 12) & 0xF;
271
272 /* Otherwise recompute */
273 if (bits == 0) {
274 while ((1 << bits) < c->x86_max_cores)
275 bits++;
276 }
277
278 c->x86_coreid_bits = bits;
279#endif
280}
281
282static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{
284 early_init_amd_mc(c);
285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
287 if (c->x86_power & (1<<8))
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
289
290#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
292#else
293 /* Set MTRR capability flag if appropriate */
294 if (c->x86 == 5)
295 if (c->x86_model == 13 || c->x86_model == 9 ||
296 (c->x86_model == 8 && c->x86_mask >= 8))
297 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
298#endif
299}
300
301static void __cpuinit init_amd(struct cpuinfo_x86 *c)
302{
42#ifdef CONFIG_SMP 303#ifdef CONFIG_SMP
43 unsigned long long value; 304 unsigned long long value;
44 305
@@ -49,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
49 * Errata 63 for SH-B3 steppings 310 * Errata 63 for SH-B3 steppings
50 * Errata 122 for all steppings (F+ have it disabled by default) 311 * Errata 122 for all steppings (F+ have it disabled by default)
51 */ 312 */
52 if (c->x86 == 15) { 313 if (c->x86 == 0xf) {
53 rdmsrl(MSR_K7_HWCR, value); 314 rdmsrl(MSR_K7_HWCR, value);
54 value |= 1 << 6; 315 value |= 1 << 6;
55 wrmsrl(MSR_K7_HWCR, value); 316 wrmsrl(MSR_K7_HWCR, value);
@@ -59,213 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
59 early_init_amd(c); 320 early_init_amd(c);
60 321
61 /* 322 /*
62 * FIXME: We should handle the K5 here. Set up the write
63 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
64 * no bus pipeline)
65 */
66
67 /*
68 * Bit 31 in normal CPUID used for nonstandard 3DNow ID; 323 * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
69 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 324 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
70 */ 325 */
71 clear_cpu_cap(c, 0*32+31); 326 clear_cpu_cap(c, 0*32+31);
72 327
73 r = get_model_name(c); 328#ifdef CONFIG_X86_64
329 /* On C+ stepping K8 rep microcode works well for copy/memset */
330 if (c->x86 == 0xf) {
331 u32 level;
74 332
75 switch (c->x86) { 333 level = cpuid_eax(1);
76 case 4: 334 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
77 /* 335 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
78 * General Systems BIOSen alias the cpu frequency registers
79 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
80 * drivers subsequently pokes it, and changes the CPU speed.
81 * Workaround : Remove the unneeded alias.
82 */
83#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
84#define CBAR_ENB (0x80000000)
85#define CBAR_KEY (0X000000CB)
86 if (c->x86_model == 9 || c->x86_model == 10) {
87 if (inl (CBAR) & CBAR_ENB)
88 outl (0 | CBAR_KEY, CBAR);
89 }
90 break;
91 case 5:
92 if (c->x86_model < 6) {
93 /* Based on AMD doc 20734R - June 2000 */
94 if (c->x86_model == 0) {
95 clear_cpu_cap(c, X86_FEATURE_APIC);
96 set_cpu_cap(c, X86_FEATURE_PGE);
97 }
98 break;
99 }
100
101 if (c->x86_model == 6 && c->x86_mask == 1) {
102 const int K6_BUG_LOOP = 1000000;
103 int n;
104 void (*f_vide)(void);
105 unsigned long d, d2;
106
107 printk(KERN_INFO "AMD K6 stepping B detected - ");
108
109 /*
110 * It looks like AMD fixed the 2.6.2 bug and improved indirect
111 * calls at the same time.
112 */
113
114 n = K6_BUG_LOOP;
115 f_vide = vide;
116 rdtscl(d);
117 while (n--)
118 f_vide();
119 rdtscl(d2);
120 d = d2-d;
121
122 if (d > 20*K6_BUG_LOOP)
123 printk("system stability may be impaired when more than 32 MB are used.\n");
124 else
125 printk("probably OK (after B9730xxxx).\n");
126 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
127 }
128
129 /* K6 with old style WHCR */
130 if (c->x86_model < 8 ||
131 (c->x86_model == 8 && c->x86_mask < 8)) {
132 /* We can only write allocate on the low 508Mb */
133 if (mbytes > 508)
134 mbytes = 508;
135
136 rdmsr(MSR_K6_WHCR, l, h);
137 if ((l&0x0000FFFF) == 0) {
138 unsigned long flags;
139 l = (1<<0)|((mbytes/4)<<1);
140 local_irq_save(flags);
141 wbinvd();
142 wrmsr(MSR_K6_WHCR, l, h);
143 local_irq_restore(flags);
144 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
145 mbytes);
146 }
147 break;
148 }
149
150 if ((c->x86_model == 8 && c->x86_mask > 7) ||
151 c->x86_model == 9 || c->x86_model == 13) {
152 /* The more serious chips .. */
153
154 if (mbytes > 4092)
155 mbytes = 4092;
156
157 rdmsr(MSR_K6_WHCR, l, h);
158 if ((l&0xFFFF0000) == 0) {
159 unsigned long flags;
160 l = ((mbytes>>2)<<22)|(1<<16);
161 local_irq_save(flags);
162 wbinvd();
163 wrmsr(MSR_K6_WHCR, l, h);
164 local_irq_restore(flags);
165 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
166 mbytes);
167 }
168
169 /* Set MTRR capability flag if appropriate */
170 if (c->x86_model == 13 || c->x86_model == 9 ||
171 (c->x86_model == 8 && c->x86_mask >= 8))
172 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
173 break;
174 }
175
176 if (c->x86_model == 10) {
177 /* AMD Geode LX is model 10 */
178 /* placeholder for any needed mods */
179 break;
180 }
181 break;
182 case 6: /* An Athlon/Duron */
183
184 /*
185 * Bit 15 of Athlon specific MSR 15, needs to be 0
186 * to enable SSE on Palomino/Morgan/Barton CPU's.
187 * If the BIOS didn't enable it already, enable it here.
188 */
189 if (c->x86_model >= 6 && c->x86_model <= 10) {
190 if (!cpu_has(c, X86_FEATURE_XMM)) {
191 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
192 rdmsr(MSR_K7_HWCR, l, h);
193 l &= ~0x00008000;
194 wrmsr(MSR_K7_HWCR, l, h);
195 set_cpu_cap(c, X86_FEATURE_XMM);
196 }
197 }
198
199 /*
200 * It's been determined by AMD that Athlons since model 8 stepping 1
201 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
202 * As per AMD technical note 27212 0.2
203 */
204 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
205 rdmsr(MSR_K7_CLK_CTL, l, h);
206 if ((l & 0xfff00000) != 0x20000000) {
207 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
208 ((l & 0x000fffff)|0x20000000));
209 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
210 }
211 }
212 break;
213 } 336 }
337 if (c->x86 == 0x10 || c->x86 == 0x11)
338 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
339#else
340
341 /*
342 * FIXME: We should handle the K5 here. Set up the write
343 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
344 * no bus pipeline)
345 */
214 346
215 switch (c->x86) { 347 switch (c->x86) {
216 case 15: 348 case 4:
217 /* Use K8 tuning for Fam10h and Fam11h */ 349 init_amd_k5(c);
218 case 0x10:
219 case 0x11:
220 set_cpu_cap(c, X86_FEATURE_K8);
221 break; 350 break;
222 case 6: 351 case 5:
223 set_cpu_cap(c, X86_FEATURE_K7); 352 init_amd_k6(c);
353 break;
354 case 6: /* An Athlon/Duron */
355 init_amd_k7(c);
224 break; 356 break;
225 } 357 }
358
359 /* K6s reports MCEs but don't actually have all the MSRs */
360 if (c->x86 < 6)
361 clear_cpu_cap(c, X86_FEATURE_MCE);
362#endif
363
364 /* Enable workaround for FXSAVE leak */
226 if (c->x86 >= 6) 365 if (c->x86 >= 6)
227 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); 366 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
228 367
229 display_cacheinfo(c); 368 if (!c->x86_model_id[0]) {
230 369 switch (c->x86) {
231 if (cpuid_eax(0x80000000) >= 0x80000008) 370 case 0xf:
232 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 371 /* Should distinguish Models here, but this is only
372 a fallback anyways. */
373 strcpy(c->x86_model_id, "Hammer");
374 break;
375 }
376 }
233 377
234#ifdef CONFIG_X86_HT 378 display_cacheinfo(c);
235 /*
236 * On a AMD multi core setup the lower bits of the APIC id
237 * distinguish the cores.
238 */
239 if (c->x86_max_cores > 1) {
240 int cpu = smp_processor_id();
241 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
242 379
243 if (bits == 0) { 380 /* Multi core CPU? */
244 while ((1 << bits) < c->x86_max_cores) 381 if (c->extended_cpuid_level >= 0x80000008) {
245 bits++; 382 amd_detect_cmp(c);
246 } 383 srat_detect_node(c);
247 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
248 c->phys_proc_id >>= bits;
249 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
250 cpu, c->x86_max_cores, c->cpu_core_id);
251 } 384 }
385
386#ifdef CONFIG_X86_32
387 detect_ht(c);
252#endif 388#endif
253 389
254 if (cpuid_eax(0x80000000) >= 0x80000006) { 390 if (c->extended_cpuid_level >= 0x80000006) {
255 if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) 391 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
256 num_cache_leaves = 4; 392 num_cache_leaves = 4;
257 else 393 else
258 num_cache_leaves = 3; 394 num_cache_leaves = 3;
259 } 395 }
260 396
261 /* K6s reports MCEs but don't actually have all the MSRs */ 397 if (c->x86 >= 0xf && c->x86 <= 0x11)
262 if (c->x86 < 6) 398 set_cpu_cap(c, X86_FEATURE_K8);
263 clear_cpu_cap(c, X86_FEATURE_MCE);
264 399
265 if (cpu_has_xmm2) 400 if (cpu_has_xmm2) {
401 /* MFENCE stops RDTSC speculation */
266 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); 402 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
403 }
404
405#ifdef CONFIG_X86_64
406 if (c->x86 == 0x10) {
407 /* do this for boot cpu */
408 if (c == &boot_cpu_data)
409 check_enable_amd_mmconf_dmi();
410
411 fam10h_check_enable_mmcfg();
412 }
413
414 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
415 unsigned long long tseg;
416
417 /*
418 * Split up direct mapping around the TSEG SMM area.
419 * Don't do it for gbpages because there seems very little
420 * benefit in doing so.
421 */
422 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
423 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
424 if ((tseg>>PMD_SHIFT) <
425 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
426 ((tseg>>PMD_SHIFT) <
427 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
428 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
429 set_memory_4k((unsigned long)__va(tseg), 1);
430 }
431 }
432#endif
267} 433}
268 434
435#ifdef CONFIG_X86_32
269static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) 436static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
270{ 437{
271 /* AMD errata T13 (order #21922) */ 438 /* AMD errata T13 (order #21922) */
@@ -278,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
278 } 445 }
279 return size; 446 return size;
280} 447}
448#endif
281 449
282static struct cpu_dev amd_cpu_dev __cpuinitdata = { 450static struct cpu_dev amd_cpu_dev __cpuinitdata = {
283 .c_vendor = "AMD", 451 .c_vendor = "AMD",
284 .c_ident = { "AuthenticAMD" }, 452 .c_ident = { "AuthenticAMD" },
453#ifdef CONFIG_X86_32
285 .c_models = { 454 .c_models = {
286 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = 455 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
287 { 456 {
@@ -294,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
294 } 463 }
295 }, 464 },
296 }, 465 },
466 .c_size_cache = amd_size_cache,
467#endif
297 .c_early_init = early_init_amd, 468 .c_early_init = early_init_amd,
298 .c_init = init_amd, 469 .c_init = init_amd,
299 .c_size_cache = amd_size_cache, 470 .c_x86_vendor = X86_VENDOR_AMD,
300}; 471};
301 472
302cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); 473cpu_dev_register(amd_cpu_dev);
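In the rewritten amd.c above, amd_detect_cmp() splits the initial APIC ID using
c->x86_coreid_bits, which early_init_amd_mc() takes from CPUID 0x80000008
ECX[15:12] or recomputes from the core count. A minimal sketch of that split,
with the bit width passed in directly:

	/* Hypothetical helper mirroring amd_detect_cmp(): the low 'bits'
	 * bits of the initial APIC ID index the core within the socket,
	 * the remaining bits identify the socket (physical package). */
	static void split_amd_apicid(unsigned int initial_apicid, unsigned int bits,
				     unsigned int *core_id, unsigned int *socket_id)
	{
		*core_id   = initial_apicid & ((1u << bits) - 1);
		*socket_id = initial_apicid >> bits;
	}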
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
deleted file mode 100644
index d1692b2a41ff..000000000000
--- a/arch/x86/kernel/cpu/amd_64.c
+++ /dev/null
@@ -1,224 +0,0 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3
4#include <asm/numa_64.h>
5#include <asm/mmconfig.h>
6#include <asm/cacheflush.h>
7
8#include <mach_apic.h>
9
10#include "cpu.h"
11
12int force_mwait __cpuinitdata;
13
14#ifdef CONFIG_NUMA
15static int __cpuinit nearby_node(int apicid)
16{
17 int i, node;
18
19 for (i = apicid - 1; i >= 0; i--) {
20 node = apicid_to_node[i];
21 if (node != NUMA_NO_NODE && node_online(node))
22 return node;
23 }
24 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
25 node = apicid_to_node[i];
26 if (node != NUMA_NO_NODE && node_online(node))
27 return node;
28 }
29 return first_node(node_online_map); /* Shouldn't happen */
30}
31#endif
32
33/*
34 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
35 * Assumes number of cores is a power of two.
36 */
37static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
38{
39#ifdef CONFIG_SMP
40 unsigned bits;
41#ifdef CONFIG_NUMA
42 int cpu = smp_processor_id();
43 int node = 0;
44 unsigned apicid = hard_smp_processor_id();
45#endif
46 bits = c->x86_coreid_bits;
47
48 /* Low order bits define the core id (index of core in socket) */
49 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
50 /* Convert the initial APIC ID into the socket ID */
51 c->phys_proc_id = c->initial_apicid >> bits;
52
53#ifdef CONFIG_NUMA
54 node = c->phys_proc_id;
55 if (apicid_to_node[apicid] != NUMA_NO_NODE)
56 node = apicid_to_node[apicid];
57 if (!node_online(node)) {
58 /* Two possibilities here:
59 - The CPU is missing memory and no node was created.
60 In that case try picking one from a nearby CPU
61 - The APIC IDs differ from the HyperTransport node IDs
62 which the K8 northbridge parsing fills in.
63 Assume they are all increased by a constant offset,
64 but in the same order as the HT nodeids.
65 If that doesn't result in a usable node fall back to the
66 path for the previous case. */
67
68 int ht_nodeid = c->initial_apicid;
69
70 if (ht_nodeid >= 0 &&
71 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
72 node = apicid_to_node[ht_nodeid];
73 /* Pick a nearby node */
74 if (!node_online(node))
75 node = nearby_node(apicid);
76 }
77 numa_set_node(cpu, node);
78
79 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
80#endif
81#endif
82}
83
84static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
85{
86#ifdef CONFIG_SMP
87 unsigned bits, ecx;
88
89 /* Multi core CPU? */
90 if (c->extended_cpuid_level < 0x80000008)
91 return;
92
93 ecx = cpuid_ecx(0x80000008);
94
95 c->x86_max_cores = (ecx & 0xff) + 1;
96
97 /* CPU telling us the core id bits shift? */
98 bits = (ecx >> 12) & 0xF;
99
100 /* Otherwise recompute */
101 if (bits == 0) {
102 while ((1 << bits) < c->x86_max_cores)
103 bits++;
104 }
105
106 c->x86_coreid_bits = bits;
107
108#endif
109}
110
111static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
112{
113 early_init_amd_mc(c);
114
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
120}
121
122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
123{
124 unsigned level;
125
126#ifdef CONFIG_SMP
127 unsigned long value;
128
129 /*
130 * Disable TLB flush filter by setting HWCR.FFDIS on K8
131 * bit 6 of msr C001_0015
132 *
133 * Errata 63 for SH-B3 steppings
134 * Errata 122 for all steppings (F+ have it disabled by default)
135 */
136 if (c->x86 == 0xf) {
137 rdmsrl(MSR_K8_HWCR, value);
138 value |= 1 << 6;
139 wrmsrl(MSR_K8_HWCR, value);
140 }
141#endif
142
143 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
144 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
145 clear_cpu_cap(c, 0*32+31);
146
147 /* On C+ stepping K8 rep microcode works well for copy/memset */
148 if (c->x86 == 0xf) {
149 level = cpuid_eax(1);
150 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
151 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
152 }
153 if (c->x86 == 0x10 || c->x86 == 0x11)
154 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
155
156 /* Enable workaround for FXSAVE leak */
157 if (c->x86 >= 6)
158 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
159
160 level = get_model_name(c);
161 if (!level) {
162 switch (c->x86) {
163 case 0xf:
164 /* Should distinguish Models here, but this is only
165 a fallback anyways. */
166 strcpy(c->x86_model_id, "Hammer");
167 break;
168 }
169 }
170 display_cacheinfo(c);
171
172 /* Multi core CPU? */
173 if (c->extended_cpuid_level >= 0x80000008)
174 amd_detect_cmp(c);
175
176 if (c->extended_cpuid_level >= 0x80000006 &&
177 (cpuid_edx(0x80000006) & 0xf000))
178 num_cache_leaves = 4;
179 else
180 num_cache_leaves = 3;
181
182 if (c->x86 >= 0xf && c->x86 <= 0x11)
183 set_cpu_cap(c, X86_FEATURE_K8);
184
185 /* MFENCE stops RDTSC speculation */
186 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
187
188 if (c->x86 == 0x10) {
189 /* do this for boot cpu */
190 if (c == &boot_cpu_data)
191 check_enable_amd_mmconf_dmi();
192
193 fam10h_check_enable_mmcfg();
194 }
195
196 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
197 unsigned long long tseg;
198
199 /*
200 * Split up direct mapping around the TSEG SMM area.
201 * Don't do it for gbpages because there seems very little
202 * benefit in doing so.
203 */
204 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
205 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
206 if ((tseg>>PMD_SHIFT) <
207 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
208 ((tseg>>PMD_SHIFT) <
209 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
210 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
211 set_memory_4k((unsigned long)__va(tseg), 1);
212 }
213 }
214}
215
216static struct cpu_dev amd_cpu_dev __cpuinitdata = {
217 .c_vendor = "AMD",
218 .c_ident = { "AuthenticAMD" },
219 .c_early_init = early_init_amd,
220 .c_init = init_amd,
221};
222
223cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
224
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c9b58a806e85..c8e315f1aa83 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -50,6 +50,8 @@ static double __initdata y = 3145727.0;
50 */ 50 */
51static void __init check_fpu(void) 51static void __init check_fpu(void)
52{ 52{
53 s32 fdiv_bug;
54
53 if (!boot_cpu_data.hard_math) { 55 if (!boot_cpu_data.hard_math) {
54#ifndef CONFIG_MATH_EMULATION 56#ifndef CONFIG_MATH_EMULATION
55 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); 57 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
@@ -74,8 +76,10 @@ static void __init check_fpu(void)
74 "fistpl %0\n\t" 76 "fistpl %0\n\t"
75 "fwait\n\t" 77 "fwait\n\t"
76 "fninit" 78 "fninit"
77 : "=m" (*&boot_cpu_data.fdiv_bug) 79 : "=m" (*&fdiv_bug)
78 : "m" (*&x), "m" (*&y)); 80 : "m" (*&x), "m" (*&y));
81
82 boot_cpu_data.fdiv_bug = fdiv_bug;
79 if (boot_cpu_data.fdiv_bug) 83 if (boot_cpu_data.fdiv_bug)
80 printk("Hmm, FPU with FDIV bug.\n"); 84 printk("Hmm, FPU with FDIV bug.\n");
81} 85}
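The bugs.c change above only routes the FDIV-bug result through a local s32
before storing it in boot_cpu_data.fdiv_bug; the probe itself computes
x - (x/y)*y on the x87 and stores it as an integer (fistpl), which is 0 on a
correct FPU. A rough user-space analogue, assuming the two classic test
operands are passed in (only y = 3145727.0 is visible in the hunk above):

	/* Hypothetical sketch, not the kernel's asm sequence: the integer
	 * cast plays the role of fistpl, so tiny rounding residue on a
	 * correct FPU still reads as 0, while a buggy FDIV unit leaves a
	 * large nonzero remainder. */
	static int has_fdiv_bug(double x, double y)
	{
		volatile double q = x / y;	/* volatile: force a real divide */

		return (long)(x - q * y) != 0;
	}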
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e0f45edd6a55..89bfdd9cacc6 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291 291
292 get_model_name(c);
293 display_cacheinfo(c); 292 display_cacheinfo(c);
294} 293}
295 294
@@ -314,6 +313,16 @@ enum {
314 EAMD3D = 1<<20, 313 EAMD3D = 1<<20,
315}; 314};
316 315
316static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
317{
318 switch (c->x86) {
319 case 5:
320 /* Emulate MTRRs using Centaur's MCR. */
321 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
322 break;
323 }
324}
325
317static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 326static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
318{ 327{
319 328
@@ -462,8 +471,10 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
462static struct cpu_dev centaur_cpu_dev __cpuinitdata = { 471static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
463 .c_vendor = "Centaur", 472 .c_vendor = "Centaur",
464 .c_ident = { "CentaurHauls" }, 473 .c_ident = { "CentaurHauls" },
474 .c_early_init = early_init_centaur,
465 .c_init = init_centaur, 475 .c_init = init_centaur,
466 .c_size_cache = centaur_size_cache, 476 .c_size_cache = centaur_size_cache,
477 .c_x86_vendor = X86_VENDOR_CENTAUR,
467}; 478};
468 479
469cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 480cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 1d181c40e2e1..a1625f5a1e78 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
16 16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{ 18{
19 early_init_centaur(c);
20
19 if (c->x86 == 0x6 && c->x86_model >= 0xf) { 21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
20 c->x86_cache_alignment = c->x86_clflush_size * 2; 22 c->x86_cache_alignment = c->x86_clflush_size * 2;
21 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
22 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
23 } 24 }
24 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -29,7 +30,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
29 .c_ident = { "CentaurHauls" }, 30 .c_ident = { "CentaurHauls" },
30 .c_early_init = early_init_centaur, 31 .c_early_init = early_init_centaur,
31 .c_init = init_centaur, 32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
32}; 34};
33 35
34cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 36cpu_dev_register(centaur_cpu_dev);
35 37
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
new file mode 100644
index 000000000000..2056ccf572cc
--- /dev/null
+++ b/arch/x86/kernel/cpu/cmpxchg.c
@@ -0,0 +1,72 @@
1/*
2 * cmpxchg*() fallbacks for CPU not supporting these instructions
3 */
4
5#include <linux/kernel.h>
6#include <linux/smp.h>
7#include <linux/module.h>
8
9#ifndef CONFIG_X86_CMPXCHG
10unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
11{
12 u8 prev;
13 unsigned long flags;
14
15 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
16 local_irq_save(flags);
17 prev = *(u8 *)ptr;
18 if (prev == old)
19 *(u8 *)ptr = new;
20 local_irq_restore(flags);
21 return prev;
22}
23EXPORT_SYMBOL(cmpxchg_386_u8);
24
25unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
26{
27 u16 prev;
28 unsigned long flags;
29
30 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
31 local_irq_save(flags);
32 prev = *(u16 *)ptr;
33 if (prev == old)
34 *(u16 *)ptr = new;
35 local_irq_restore(flags);
36 return prev;
37}
38EXPORT_SYMBOL(cmpxchg_386_u16);
39
40unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
41{
42 u32 prev;
43 unsigned long flags;
44
45 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
46 local_irq_save(flags);
47 prev = *(u32 *)ptr;
48 if (prev == old)
49 *(u32 *)ptr = new;
50 local_irq_restore(flags);
51 return prev;
52}
53EXPORT_SYMBOL(cmpxchg_386_u32);
54#endif
55
56#ifndef CONFIG_X86_CMPXCHG64
57unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
58{
59 u64 prev;
60 unsigned long flags;
61
62 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
63 local_irq_save(flags);
64 prev = *(u64 *)ptr;
65 if (prev == old)
66 *(u64 *)ptr = new;
67 local_irq_restore(flags);
68 return prev;
69}
70EXPORT_SYMBOL(cmpxchg_486_u64);
71#endif
72
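The new cmpxchg.c above provides interrupt-disabling fallbacks for CPUs that
lack the cmpxchg/cmpxchg8b instructions (386/486, uniprocessor only). As a
hedged caller-side sketch of the old/new/return-previous contract those helpers
implement (real code would go through the cmpxchg() macro rather than call the
fallback directly):

	/* Hypothetical retry loop built on that contract: re-read the
	 * current value, attempt the swap, and retry if another path
	 * (e.g. an interrupt handler) changed the word in between. */
	static void add_u32(volatile u32 *ptr, u32 delta)
	{
		u32 old, prev;

		do {
			old  = *ptr;
			prev = cmpxchg_386_u32((volatile void *)ptr, old, old + delta);
		} while (prev != old);
	}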
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 80ab20d4fa39..fb789dd9e691 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,27 +1,62 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
2#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
3#include <linux/delay.h> 10#include <linux/delay.h>
4#include <linux/smp.h> 11#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h> 12#include <linux/percpu.h>
7#include <linux/bootmem.h>
8#include <asm/processor.h>
9#include <asm/i387.h> 13#include <asm/i387.h>
10#include <asm/msr.h> 14#include <asm/msr.h>
11#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
12#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
13#include <asm/mtrr.h> 18#include <asm/mtrr.h>
14#include <asm/mce.h> 19#include <asm/mce.h>
15#include <asm/pat.h> 20#include <asm/pat.h>
21#include <asm/asm.h>
22#include <asm/numa.h>
16#ifdef CONFIG_X86_LOCAL_APIC 23#ifdef CONFIG_X86_LOCAL_APIC
17#include <asm/mpspec.h> 24#include <asm/mpspec.h>
18#include <asm/apic.h> 25#include <asm/apic.h>
19#include <mach_apic.h> 26#include <mach_apic.h>
27#include <asm/genapic.h>
20#endif 28#endif
21 29
30#include <asm/pda.h>
31#include <asm/pgtable.h>
32#include <asm/processor.h>
33#include <asm/desc.h>
34#include <asm/atomic.h>
35#include <asm/proto.h>
36#include <asm/sections.h>
37#include <asm/setup.h>
38
22#include "cpu.h" 39#include "cpu.h"
23 40
41static struct cpu_dev *this_cpu __cpuinitdata;
42
43#ifdef CONFIG_X86_64
44/* We need valid kernel segments for data and code in long mode too
45 * IRET will check the segment types kkeil 2000/10/28
46 * Also sysret mandates a special GDT layout
47 */
48/* The TLS descriptors are currently at a different place compared to i386.
49 Hopefully nobody expects them at a fixed place (Wine?) */
24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 50DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
51 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
52 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
53 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
54 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
55 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
56 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
57} };
58#else
59DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
25 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 60 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
26 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 61 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
27 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 62 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -55,17 +90,150 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
55 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 90 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
56 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 91 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
57} }; 92} };
93#endif
58EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 94EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
59 95
60__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 96#ifdef CONFIG_X86_32
61
62static int cachesize_override __cpuinitdata = -1; 97static int cachesize_override __cpuinitdata = -1;
63static int disable_x86_serial_nr __cpuinitdata = 1; 98static int disable_x86_serial_nr __cpuinitdata = 1;
64 99
65struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 100static int __init cachesize_setup(char *str)
101{
102 get_option(&str, &cachesize_override);
103 return 1;
104}
105__setup("cachesize=", cachesize_setup);
106
107static int __init x86_fxsr_setup(char *s)
108{
109 setup_clear_cpu_cap(X86_FEATURE_FXSR);
110 setup_clear_cpu_cap(X86_FEATURE_XMM);
111 return 1;
112}
113__setup("nofxsr", x86_fxsr_setup);
114
115static int __init x86_sep_setup(char *s)
116{
117 setup_clear_cpu_cap(X86_FEATURE_SEP);
118 return 1;
119}
120__setup("nosep", x86_sep_setup);
121
122/* Standard macro to see if a specific flag is changeable */
123static inline int flag_is_changeable_p(u32 flag)
124{
125 u32 f1, f2;
126
127 asm("pushfl\n\t"
128 "pushfl\n\t"
129 "popl %0\n\t"
130 "movl %0,%1\n\t"
131 "xorl %2,%0\n\t"
132 "pushl %0\n\t"
133 "popfl\n\t"
134 "pushfl\n\t"
135 "popl %0\n\t"
136 "popfl\n\t"
137 : "=&r" (f1), "=&r" (f2)
138 : "ir" (flag));
139
140 return ((f1^f2) & flag) != 0;
141}
142
143/* Probe for the CPUID instruction */
144static int __cpuinit have_cpuid_p(void)
145{
146 return flag_is_changeable_p(X86_EFLAGS_ID);
147}
148
149static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
150{
151 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
152 /* Disable processor serial number */
153 unsigned long lo, hi;
154 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
155 lo |= 0x200000;
156 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
157 printk(KERN_NOTICE "CPU serial number disabled.\n");
158 clear_cpu_cap(c, X86_FEATURE_PN);
159
160 /* Disabling the serial number may affect the cpuid level */
161 c->cpuid_level = cpuid_eax(0);
162 }
163}
164
165static int __init x86_serial_nr_setup(char *s)
166{
167 disable_x86_serial_nr = 0;
168 return 1;
169}
170__setup("serialnumber", x86_serial_nr_setup);
171#else
172static inline int flag_is_changeable_p(u32 flag)
173{
174 return 1;
175}
176/* Probe for the CPUID instruction */
177static inline int have_cpuid_p(void)
178{
179 return 1;
180}
181static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
182{
183}
184#endif
185
186/*
187 * Naming convention should be: <Name> [(<Codename>)]
188 * This table only is used unless init_<vendor>() below doesn't set it;
189 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
190 *
191 */
192
193/* Look up CPU names by table lookup. */
194static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
195{
196 struct cpu_model_info *info;
197
198 if (c->x86_model >= 16)
199 return NULL; /* Range check */
200
201 if (!this_cpu)
202 return NULL;
203
204 info = this_cpu->c_models;
205
206 while (info && info->family) {
207 if (info->family == c->x86)
208 return info->model_names[c->x86_model];
209 info++;
210 }
211 return NULL; /* Not found */
212}
213
214__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
215
216/* Current gdt points %fs at the "master" per-cpu area: after this,
217 * it's on the real one. */
218void switch_to_new_gdt(void)
219{
220 struct desc_ptr gdt_descr;
221
222 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
223 gdt_descr.size = GDT_SIZE - 1;
224 load_gdt(&gdt_descr);
225#ifdef CONFIG_X86_32
226 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
227#endif
228}
229
230static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
66 231
67static void __cpuinit default_init(struct cpuinfo_x86 *c) 232static void __cpuinit default_init(struct cpuinfo_x86 *c)
68{ 233{
234#ifdef CONFIG_X86_64
235 display_cacheinfo(c);
236#else
69 /* Not much we can do here... */ 237 /* Not much we can do here... */
70 /* Check if at least it has cpuid */ 238 /* Check if at least it has cpuid */
71 if (c->cpuid_level == -1) { 239 if (c->cpuid_level == -1) {
@@ -75,28 +243,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
75 else if (c->x86 == 3) 243 else if (c->x86 == 3)
76 strcpy(c->x86_model_id, "386"); 244 strcpy(c->x86_model_id, "386");
77 } 245 }
246#endif
78} 247}
79 248
80static struct cpu_dev __cpuinitdata default_cpu = { 249static struct cpu_dev __cpuinitdata default_cpu = {
81 .c_init = default_init, 250 .c_init = default_init,
82 .c_vendor = "Unknown", 251 .c_vendor = "Unknown",
252 .c_x86_vendor = X86_VENDOR_UNKNOWN,
83}; 253};
84static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
85 254
86static int __init cachesize_setup(char *str) 255static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
87{
88 get_option(&str, &cachesize_override);
89 return 1;
90}
91__setup("cachesize=", cachesize_setup);
92
93int __cpuinit get_model_name(struct cpuinfo_x86 *c)
94{ 256{
95 unsigned int *v; 257 unsigned int *v;
96 char *p, *q; 258 char *p, *q;
97 259
98 if (cpuid_eax(0x80000000) < 0x80000004) 260 if (c->extended_cpuid_level < 0x80000004)
99 return 0; 261 return;
100 262
101 v = (unsigned int *) c->x86_model_id; 263 v = (unsigned int *) c->x86_model_id;
102 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 264 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
@@ -115,30 +277,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
115 while (q <= &c->x86_model_id[48]) 277 while (q <= &c->x86_model_id[48])
116 *q++ = '\0'; /* Zero-pad the rest */ 278 *q++ = '\0'; /* Zero-pad the rest */
117 } 279 }
118
119 return 1;
120} 280}
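/*
 * For reference, a minimal user-space sketch of the brand-string assembly
 * that get_model_name() performs above; it assumes GCC/Clang's <cpuid.h>
 * helpers and an x86 host, and is illustrative only (not part of this
 * commit's code):
 */
#include <stdio.h>
#include <cpuid.h>

static void show_brand_string(void)
{
	unsigned int v[13] = { 0 };	/* 48 name bytes plus room for a NUL */
	unsigned int i;

	if (__get_cpuid_max(0x80000000, 0) < 0x80000004)
		return;			/* brand string not supported */

	for (i = 0; i < 3; i++)
		__get_cpuid(0x80000002 + i, &v[i * 4], &v[i * 4 + 1],
			    &v[i * 4 + 2], &v[i * 4 + 3]);

	printf("%s\n", (char *)v);
}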
121 281
122
123void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 282void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
124{ 283{
125 unsigned int n, dummy, ecx, edx, l2size; 284 unsigned int n, dummy, ebx, ecx, edx, l2size;
126 285
127 n = cpuid_eax(0x80000000); 286 n = c->extended_cpuid_level;
128 287
129 if (n >= 0x80000005) { 288 if (n >= 0x80000005) {
130 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); 289 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
131 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 290 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
132 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 291 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
133 c->x86_cache_size = (ecx>>24)+(edx>>24); 292 c->x86_cache_size = (ecx>>24) + (edx>>24);
293#ifdef CONFIG_X86_64
294 /* On K8 L1 TLB is inclusive, so don't count it */
295 c->x86_tlbsize = 0;
296#endif
134 } 297 }
135 298
136	if (n < 0x80000006)	/* Some chips just have a large L1. */ 299	if (n < 0x80000006)	/* Some chips just have a large L1. */
137 return; 300 return;
138 301
139 ecx = cpuid_ecx(0x80000006); 302 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
140 l2size = ecx >> 16; 303 l2size = ecx >> 16;
141 304
305#ifdef CONFIG_X86_64
306 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
307#else
142 /* do processor-specific cache resizing */ 308 /* do processor-specific cache resizing */
143 if (this_cpu->c_size_cache) 309 if (this_cpu->c_size_cache)
144 l2size = this_cpu->c_size_cache(c, l2size); 310 l2size = this_cpu->c_size_cache(c, l2size);
@@ -149,116 +315,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
149 315
150 if (l2size == 0) 316 if (l2size == 0)
151 return; /* Again, no L2 cache is possible */ 317 return; /* Again, no L2 cache is possible */
318#endif
152 319
153 c->x86_cache_size = l2size; 320 c->x86_cache_size = l2size;
154 321
155 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", 322 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
156 l2size, ecx & 0xFF); 323 l2size, ecx & 0xFF);
157} 324}
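/*
 * The cache-descriptor decode used by display_cacheinfo() above, restated
 * on captured register values: bits 31..24 of the 0x80000005 registers give
 * an L1 size in KB and bits 7..0 the line size; for leaf 0x80000006, ECX
 * bits 31..16 give the L2 size in KB.  (Standalone sketch; the sample
 * values in the trailing comment are invented.)
 */
#include <stdio.h>

static void decode_amd_cache(unsigned int l1_ecx, unsigned int l1_edx,
			     unsigned int l2_ecx)
{
	printf("L1: %uK + %uK (%u bytes/line)\n",
	       l1_edx >> 24, l1_ecx >> 24, l1_ecx & 0xff);
	printf("L2: %uK (%u bytes/line)\n", l2_ecx >> 16, l2_ecx & 0xff);
}

/*
 * e.g. decode_amd_cache(0x40020140, 0x40020140, 0x02006140) reports two 64K
 * L1 caches and a 512K L2, all with 64-byte lines.
 */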
158 325
159/* 326void __cpuinit detect_ht(struct cpuinfo_x86 *c)
160 * Naming convention should be: <Name> [(<Codename>)]
161 * This table only is used unless init_<vendor>() below doesn't set it;
162 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
163 *
164 */
165
166/* Look up CPU names by table lookup. */
167static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
168{ 327{
169 struct cpu_model_info *info; 328#ifdef CONFIG_X86_HT
329 u32 eax, ebx, ecx, edx;
330 int index_msb, core_bits;
170 331
171 if (c->x86_model >= 16) 332 if (!cpu_has(c, X86_FEATURE_HT))
172 return NULL; /* Range check */ 333 return;
173 334
174 if (!this_cpu) 335 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
175 return NULL; 336 goto out;
176 337
177 info = this_cpu->c_models; 338 if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
339 return;
178 340
179 while (info && info->family) { 341 cpuid(1, &eax, &ebx, &ecx, &edx);
180 if (info->family == c->x86) 342
181 return info->model_names[c->x86_model]; 343 smp_num_siblings = (ebx & 0xff0000) >> 16;
182 info++; 344
345 if (smp_num_siblings == 1) {
346 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
347 } else if (smp_num_siblings > 1) {
348
349 if (smp_num_siblings > NR_CPUS) {
350 printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
351 smp_num_siblings);
352 smp_num_siblings = 1;
353 return;
354 }
355
356 index_msb = get_count_order(smp_num_siblings);
357#ifdef CONFIG_X86_64
358 c->phys_proc_id = phys_pkg_id(index_msb);
359#else
360 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
361#endif
362
363 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
364
365 index_msb = get_count_order(smp_num_siblings);
366
367 core_bits = get_count_order(c->x86_max_cores);
368
369#ifdef CONFIG_X86_64
370 c->cpu_core_id = phys_pkg_id(index_msb) &
371 ((1 << core_bits) - 1);
372#else
373 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
374 ((1 << core_bits) - 1);
375#endif
183 } 376 }
184 return NULL; /* Not found */
185}
186 377
378out:
379 if ((c->x86_max_cores * smp_num_siblings) > 1) {
380 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
381 c->phys_proc_id);
382 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
383 c->cpu_core_id);
384 }
385#endif
386}
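/*
 * The topology math in detect_ht() above, restated standalone: given an
 * initial APIC id plus the thread ("sibling") and core counts reported by
 * CPUID, recover the package and core ids.  get_count_order() is the
 * kernel's ceil(log2(n)) helper; a naive version is open-coded here.
 * (Illustrative sketch only, not part of this commit.)
 */
static int count_order(unsigned int n)
{
	int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

static void split_apicid(unsigned int apicid, unsigned int siblings,
			 unsigned int max_cores,
			 unsigned int *pkg, unsigned int *core)
{
	int core_bits = count_order(max_cores);

	*pkg = apicid >> count_order(siblings);
	*core = (apicid >> count_order(siblings / max_cores)) &
		((1u << core_bits) - 1);
}

/*
 * e.g. split_apicid(5, 4, 2, ...) - two dual-threaded cores per package -
 * yields package 1, core 0.
 */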
187 387
188static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) 388static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
189{ 389{
190 char *v = c->x86_vendor_id; 390 char *v = c->x86_vendor_id;
191 int i; 391 int i;
192 static int printed; 392 static int printed;
193 393
194 for (i = 0; i < X86_VENDOR_NUM; i++) { 394 for (i = 0; i < X86_VENDOR_NUM; i++) {
195 if (cpu_devs[i]) { 395 if (!cpu_devs[i])
196 if (!strcmp(v, cpu_devs[i]->c_ident[0]) || 396 break;
197 (cpu_devs[i]->c_ident[1] && 397
198 !strcmp(v, cpu_devs[i]->c_ident[1]))) { 398 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
199 c->x86_vendor = i; 399 (cpu_devs[i]->c_ident[1] &&
200 if (!early) 400 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
201 this_cpu = cpu_devs[i]; 401 this_cpu = cpu_devs[i];
202 return; 402 c->x86_vendor = this_cpu->c_x86_vendor;
203 } 403 return;
204 } 404 }
205 } 405 }
406
206 if (!printed) { 407 if (!printed) {
207 printed++; 408 printed++;
208 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); 409 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
209 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 410 printk(KERN_ERR "CPU: Your system may be unstable.\n");
210 } 411 }
412
211 c->x86_vendor = X86_VENDOR_UNKNOWN; 413 c->x86_vendor = X86_VENDOR_UNKNOWN;
212 this_cpu = &default_cpu; 414 this_cpu = &default_cpu;
213} 415}
214 416
215 417void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
216static int __init x86_fxsr_setup(char *s)
217{
218 setup_clear_cpu_cap(X86_FEATURE_FXSR);
219 setup_clear_cpu_cap(X86_FEATURE_XMM);
220 return 1;
221}
222__setup("nofxsr", x86_fxsr_setup);
223
224
225static int __init x86_sep_setup(char *s)
226{
227 setup_clear_cpu_cap(X86_FEATURE_SEP);
228 return 1;
229}
230__setup("nosep", x86_sep_setup);
231
232
233/* Standard macro to see if a specific flag is changeable */
234static inline int flag_is_changeable_p(u32 flag)
235{
236 u32 f1, f2;
237
238 asm("pushfl\n\t"
239 "pushfl\n\t"
240 "popl %0\n\t"
241 "movl %0,%1\n\t"
242 "xorl %2,%0\n\t"
243 "pushl %0\n\t"
244 "popfl\n\t"
245 "pushfl\n\t"
246 "popl %0\n\t"
247 "popfl\n\t"
248 : "=&r" (f1), "=&r" (f2)
249 : "ir" (flag));
250
251 return ((f1^f2) & flag) != 0;
252}
253
254
255/* Probe for the CPUID instruction */
256static int __cpuinit have_cpuid_p(void)
257{
258 return flag_is_changeable_p(X86_EFLAGS_ID);
259}
260
261void __init cpu_detect(struct cpuinfo_x86 *c)
262{ 418{
263 /* Get vendor name */ 419 /* Get vendor name */
264 cpuid(0x00000000, (unsigned int *)&c->cpuid_level, 420 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -267,50 +423,87 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
267 (unsigned int *)&c->x86_vendor_id[4]); 423 (unsigned int *)&c->x86_vendor_id[4]);
268 424
269 c->x86 = 4; 425 c->x86 = 4;
426 /* Intel-defined flags: level 0x00000001 */
270 if (c->cpuid_level >= 0x00000001) { 427 if (c->cpuid_level >= 0x00000001) {
271 u32 junk, tfms, cap0, misc; 428 u32 junk, tfms, cap0, misc;
272 cpuid(0x00000001, &tfms, &misc, &junk, &cap0); 429 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
273 c->x86 = (tfms >> 8) & 15; 430 c->x86 = (tfms >> 8) & 0xf;
274 c->x86_model = (tfms >> 4) & 15; 431 c->x86_model = (tfms >> 4) & 0xf;
432 c->x86_mask = tfms & 0xf;
275 if (c->x86 == 0xf) 433 if (c->x86 == 0xf)
276 c->x86 += (tfms >> 20) & 0xff; 434 c->x86 += (tfms >> 20) & 0xff;
277 if (c->x86 >= 0x6) 435 if (c->x86 >= 0x6)
278 c->x86_model += ((tfms >> 16) & 0xF) << 4; 436 c->x86_model += ((tfms >> 16) & 0xf) << 4;
279 c->x86_mask = tfms & 15;
280 if (cap0 & (1<<19)) { 437 if (cap0 & (1<<19)) {
281 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
282 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 438 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
439 c->x86_cache_alignment = c->x86_clflush_size;
283 } 440 }
284 } 441 }
285} 442}
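/*
 * The leaf-1 signature decode that cpu_detect() performs above, as a
 * standalone helper ("stepping" is what the kernel stores in x86_mask;
 * illustrative sketch only):
 */
static void decode_signature(unsigned int tfms, unsigned int *family,
			     unsigned int *model, unsigned int *stepping)
{
	*family = (tfms >> 8) & 0xf;
	*model = (tfms >> 4) & 0xf;
	*stepping = tfms & 0xf;
	if (*family == 0xf)
		*family += (tfms >> 20) & 0xff;		/* extended family */
	if (*family >= 0x6)
		*model += ((tfms >> 16) & 0xf) << 4;	/* extended model */
}

/* e.g. tfms == 0x000006fb decodes to family 6, model 15, stepping 11. */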
286static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) 443
444static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
287{ 445{
288 u32 tfms, xlvl; 446 u32 tfms, xlvl;
289 unsigned int ebx; 447 u32 ebx;
290 448
291 memset(&c->x86_capability, 0, sizeof c->x86_capability); 449 /* Intel-defined flags: level 0x00000001 */
292 if (have_cpuid_p()) { 450 if (c->cpuid_level >= 0x00000001) {
293 /* Intel-defined flags: level 0x00000001 */ 451 u32 capability, excap;
294 if (c->cpuid_level >= 0x00000001) { 452 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
295 u32 capability, excap; 453 c->x86_capability[0] = capability;
296 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 454 c->x86_capability[4] = excap;
297 c->x86_capability[0] = capability; 455 }
298 c->x86_capability[4] = excap;
299 }
300 456
301 /* AMD-defined flags: level 0x80000001 */ 457 /* AMD-defined flags: level 0x80000001 */
302 xlvl = cpuid_eax(0x80000000); 458 xlvl = cpuid_eax(0x80000000);
303 if ((xlvl & 0xffff0000) == 0x80000000) { 459 c->extended_cpuid_level = xlvl;
304 if (xlvl >= 0x80000001) { 460 if ((xlvl & 0xffff0000) == 0x80000000) {
305 c->x86_capability[1] = cpuid_edx(0x80000001); 461 if (xlvl >= 0x80000001) {
306 c->x86_capability[6] = cpuid_ecx(0x80000001); 462 c->x86_capability[1] = cpuid_edx(0x80000001);
307 } 463 c->x86_capability[6] = cpuid_ecx(0x80000001);
308 } 464 }
465 }
309 466
467#ifdef CONFIG_X86_64
468 if (c->extended_cpuid_level >= 0x80000008) {
469 u32 eax = cpuid_eax(0x80000008);
470
471 c->x86_virt_bits = (eax >> 8) & 0xff;
472 c->x86_phys_bits = eax & 0xff;
310 } 473 }
474#endif
475
476 if (c->extended_cpuid_level >= 0x80000007)
477 c->x86_power = cpuid_edx(0x80000007);
311 478
312} 479}
313 480
481static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
482{
483#ifdef CONFIG_X86_32
484 int i;
485
486 /*
487 * First of all, decide if this is a 486 or higher
488 * It's a 486 if we can modify the AC flag
489 */
490 if (flag_is_changeable_p(X86_EFLAGS_AC))
491 c->x86 = 4;
492 else
493 c->x86 = 3;
494
495 for (i = 0; i < X86_VENDOR_NUM; i++)
496 if (cpu_devs[i] && cpu_devs[i]->c_identify) {
497 c->x86_vendor_id[0] = 0;
498 cpu_devs[i]->c_identify(c);
499 if (c->x86_vendor_id[0]) {
500 get_cpu_vendor(c);
501 break;
502 }
503 }
504#endif
505}
506
314/* 507/*
315 * Do minimum CPU detection early. 508 * Do minimum CPU detection early.
316 * Fields really needed: vendor, cpuid_level, family, model, mask, 509 * Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -320,109 +513,113 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
320 * WARNING: this function is only called on the BP. Don't add code here 513 * WARNING: this function is only called on the BP. Don't add code here
321 * that is supposed to run on all CPUs. 514 * that is supposed to run on all CPUs.
322 */ 515 */
323static void __init early_cpu_detect(void) 516static void __init early_identify_cpu(struct cpuinfo_x86 *c)
324{ 517{
325 struct cpuinfo_x86 *c = &boot_cpu_data; 518#ifdef CONFIG_X86_64
326 519 c->x86_clflush_size = 64;
327 c->x86_cache_alignment = 32; 520#else
328 c->x86_clflush_size = 32; 521 c->x86_clflush_size = 32;
522#endif
523 c->x86_cache_alignment = c->x86_clflush_size;
524
525 memset(&c->x86_capability, 0, sizeof c->x86_capability);
526 c->extended_cpuid_level = 0;
329 527
330 if (!have_cpuid_p()) 528 if (!have_cpuid_p())
529 identify_cpu_without_cpuid(c);
530
531 /* cyrix could have cpuid enabled via c_identify()*/
532 if (!have_cpuid_p())
331 return; 533 return;
332 534
333 cpu_detect(c); 535 cpu_detect(c);
334 536
335 get_cpu_vendor(c, 1); 537 get_cpu_vendor(c);
336 538
337 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 539 get_cpu_cap(c);
338 cpu_devs[c->x86_vendor]->c_early_init)
339 cpu_devs[c->x86_vendor]->c_early_init(c);
340 540
341 early_get_cap(c); 541 if (this_cpu->c_early_init)
542 this_cpu->c_early_init(c);
543
544 validate_pat_support(c);
342} 545}
343 546
344static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 547void __init early_cpu_init(void)
345{ 548{
346 u32 tfms, xlvl; 549 struct cpu_dev **cdev;
347 unsigned int ebx; 550 int count = 0;
348 551
349 if (have_cpuid_p()) { 552 printk("KERNEL supported cpus:\n");
350 /* Get vendor name */ 553 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
351 cpuid(0x00000000, (unsigned int *)&c->cpuid_level, 554 struct cpu_dev *cpudev = *cdev;
352 (unsigned int *)&c->x86_vendor_id[0], 555 unsigned int j;
353 (unsigned int *)&c->x86_vendor_id[8], 556
354 (unsigned int *)&c->x86_vendor_id[4]); 557 if (count >= X86_VENDOR_NUM)
355 558 break;
356 get_cpu_vendor(c, 0); 559 cpu_devs[count] = cpudev;
357 /* Initialize the standard set of capabilities */ 560 count++;
358 /* Note that the vendor-specific code below might override */ 561
359 /* Intel-defined flags: level 0x00000001 */ 562 for (j = 0; j < 2; j++) {
360 if (c->cpuid_level >= 0x00000001) { 563 if (!cpudev->c_ident[j])
361 u32 capability, excap; 564 continue;
362 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 565 printk(" %s %s\n", cpudev->c_vendor,
363 c->x86_capability[0] = capability; 566 cpudev->c_ident[j]);
364 c->x86_capability[4] = excap;
365 c->x86 = (tfms >> 8) & 15;
366 c->x86_model = (tfms >> 4) & 15;
367 if (c->x86 == 0xf)
368 c->x86 += (tfms >> 20) & 0xff;
369 if (c->x86 >= 0x6)
370 c->x86_model += ((tfms >> 16) & 0xF) << 4;
371 c->x86_mask = tfms & 15;
372 c->initial_apicid = (ebx >> 24) & 0xFF;
373#ifdef CONFIG_X86_HT
374 c->apicid = phys_pkg_id(c->initial_apicid, 0);
375 c->phys_proc_id = c->initial_apicid;
376#else
377 c->apicid = c->initial_apicid;
378#endif
379 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
380 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
381 } else {
382 /* Have CPUID level 0 only - unheard of */
383 c->x86 = 4;
384 }
385
386 /* AMD-defined flags: level 0x80000001 */
387 xlvl = cpuid_eax(0x80000000);
388 if ((xlvl & 0xffff0000) == 0x80000000) {
389 if (xlvl >= 0x80000001) {
390 c->x86_capability[1] = cpuid_edx(0x80000001);
391 c->x86_capability[6] = cpuid_ecx(0x80000001);
392 }
393 if (xlvl >= 0x80000004)
394 get_model_name(c); /* Default name */
395 } 567 }
396
397 init_scattered_cpuid_features(c);
398 } 568 }
399 569
570 early_identify_cpu(&boot_cpu_data);
400} 571}
401 572
402static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 573/*
574 * The NOPL instruction is supposed to exist on all CPUs with
575 * family >= 6; unfortunately, that's not true in practice because
576 * of early VIA chips and (more importantly) broken virtualizers that
577 * are not easy to detect. In the latter case it doesn't even *fail*
578 * reliably, so probing for it doesn't even work. Disable it completely
579 * unless we can find a reliable way to detect all the broken cases.
580 */
581static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
403{ 582{
404 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { 583 clear_cpu_cap(c, X86_FEATURE_NOPL);
405 /* Disable processor serial number */
406 unsigned long lo, hi;
407 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
408 lo |= 0x200000;
409 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
410 printk(KERN_NOTICE "CPU serial number disabled.\n");
411 clear_cpu_cap(c, X86_FEATURE_PN);
412
413 /* Disabling the serial number may affect the cpuid level */
414 c->cpuid_level = cpuid_eax(0);
415 }
416} 584}
417 585
418static int __init x86_serial_nr_setup(char *s) 586static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
419{ 587{
420 disable_x86_serial_nr = 0; 588 c->extended_cpuid_level = 0;
421 return 1; 589
422} 590 if (!have_cpuid_p())
423__setup("serialnumber", x86_serial_nr_setup); 591 identify_cpu_without_cpuid(c);
424 592
593 /* cyrix could have cpuid enabled via c_identify()*/
594 if (!have_cpuid_p())
595 return;
425 596
597 cpu_detect(c);
598
599 get_cpu_vendor(c);
600
601 get_cpu_cap(c);
602
603 if (c->cpuid_level >= 0x00000001) {
604 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
605#ifdef CONFIG_X86_32
606# ifdef CONFIG_X86_HT
607 c->apicid = phys_pkg_id(c->initial_apicid, 0);
608# else
609 c->apicid = c->initial_apicid;
610# endif
611#endif
612
613#ifdef CONFIG_X86_HT
614 c->phys_proc_id = c->initial_apicid;
615#endif
616 }
617
618 get_model_name(c); /* Default name */
619
620 init_scattered_cpuid_features(c);
621 detect_nopl(c);
622}
426 623
427/* 624/*
428 * This does the hard work of actually picking apart the CPU stuff... 625 * This does the hard work of actually picking apart the CPU stuff...
@@ -434,30 +631,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
434 c->loops_per_jiffy = loops_per_jiffy; 631 c->loops_per_jiffy = loops_per_jiffy;
435 c->x86_cache_size = -1; 632 c->x86_cache_size = -1;
436 c->x86_vendor = X86_VENDOR_UNKNOWN; 633 c->x86_vendor = X86_VENDOR_UNKNOWN;
437 c->cpuid_level = -1; /* CPUID not detected */
438 c->x86_model = c->x86_mask = 0; /* So far unknown... */ 634 c->x86_model = c->x86_mask = 0; /* So far unknown... */
439 c->x86_vendor_id[0] = '\0'; /* Unset */ 635 c->x86_vendor_id[0] = '\0'; /* Unset */
440 c->x86_model_id[0] = '\0'; /* Unset */ 636 c->x86_model_id[0] = '\0'; /* Unset */
441 c->x86_max_cores = 1; 637 c->x86_max_cores = 1;
638 c->x86_coreid_bits = 0;
639#ifdef CONFIG_X86_64
640 c->x86_clflush_size = 64;
641#else
642 c->cpuid_level = -1; /* CPUID not detected */
442 c->x86_clflush_size = 32; 643 c->x86_clflush_size = 32;
644#endif
645 c->x86_cache_alignment = c->x86_clflush_size;
443 memset(&c->x86_capability, 0, sizeof c->x86_capability); 646 memset(&c->x86_capability, 0, sizeof c->x86_capability);
444 647
445 if (!have_cpuid_p()) {
446 /*
447 * First of all, decide if this is a 486 or higher
448 * It's a 486 if we can modify the AC flag
449 */
450 if (flag_is_changeable_p(X86_EFLAGS_AC))
451 c->x86 = 4;
452 else
453 c->x86 = 3;
454 }
455
456 generic_identify(c); 648 generic_identify(c);
457 649
458 if (this_cpu->c_identify) 650 if (this_cpu->c_identify)
459 this_cpu->c_identify(c); 651 this_cpu->c_identify(c);
460 652
653#ifdef CONFIG_X86_64
654 c->apicid = phys_pkg_id(0);
655#endif
656
461 /* 657 /*
462 * Vendor-specific initialization. In this section we 658 * Vendor-specific initialization. In this section we
463 * canonicalize the feature flags, meaning if there are 659 * canonicalize the feature flags, meaning if there are
@@ -491,6 +687,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
491 c->x86, c->x86_model); 687 c->x86, c->x86_model);
492 } 688 }
493 689
690#ifdef CONFIG_X86_64
691 detect_ht(c);
692#endif
693
494 /* 694 /*
495 * On SMP, boot_cpu_data holds the common feature set between 695 * On SMP, boot_cpu_data holds the common feature set between
496 * all CPUs; so make sure that we indicate which features are 696 * all CPUs; so make sure that we indicate which features are
@@ -499,7 +699,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
499 */ 699 */
500 if (c != &boot_cpu_data) { 700 if (c != &boot_cpu_data) {
501 /* AND the already accumulated flags with these */ 701 /* AND the already accumulated flags with these */
502 for (i = 0 ; i < NCAPINTS ; i++) 702 for (i = 0; i < NCAPINTS; i++)
503 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 703 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
504 } 704 }
505 705
@@ -507,72 +707,79 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
507 for (i = 0; i < NCAPINTS; i++) 707 for (i = 0; i < NCAPINTS; i++)
508 c->x86_capability[i] &= ~cleared_cpu_caps[i]; 708 c->x86_capability[i] &= ~cleared_cpu_caps[i];
509 709
710#ifdef CONFIG_X86_MCE
510 /* Init Machine Check Exception if available. */ 711 /* Init Machine Check Exception if available. */
511 mcheck_init(c); 712 mcheck_init(c);
713#endif
512 714
513 select_idle_routine(c); 715 select_idle_routine(c);
716
717#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
718 numa_add_cpu(smp_processor_id());
719#endif
514} 720}
515 721
516void __init identify_boot_cpu(void) 722void __init identify_boot_cpu(void)
517{ 723{
518 identify_cpu(&boot_cpu_data); 724 identify_cpu(&boot_cpu_data);
725#ifdef CONFIG_X86_32
519 sysenter_setup(); 726 sysenter_setup();
520 enable_sep_cpu(); 727 enable_sep_cpu();
728#endif
521} 729}
522 730
523void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 731void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
524{ 732{
525 BUG_ON(c == &boot_cpu_data); 733 BUG_ON(c == &boot_cpu_data);
526 identify_cpu(c); 734 identify_cpu(c);
735#ifdef CONFIG_X86_32
527 enable_sep_cpu(); 736 enable_sep_cpu();
737#endif
528 mtrr_ap_init(); 738 mtrr_ap_init();
529} 739}
530 740
531#ifdef CONFIG_X86_HT 741struct msr_range {
532void __cpuinit detect_ht(struct cpuinfo_x86 *c) 742 unsigned min;
533{ 743 unsigned max;
534 u32 eax, ebx, ecx, edx; 744};
535 int index_msb, core_bits;
536
537 cpuid(1, &eax, &ebx, &ecx, &edx);
538
539 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
540 return;
541
542 smp_num_siblings = (ebx & 0xff0000) >> 16;
543 745
544 if (smp_num_siblings == 1) { 746static struct msr_range msr_range_array[] __cpuinitdata = {
545 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 747 { 0x00000000, 0x00000418},
546 } else if (smp_num_siblings > 1) { 748 { 0xc0000000, 0xc000040b},
749 { 0xc0010000, 0xc0010142},
750 { 0xc0011000, 0xc001103b},
751};
547 752
548 if (smp_num_siblings > NR_CPUS) { 753static void __cpuinit print_cpu_msr(void)
549 printk(KERN_WARNING "CPU: Unsupported number of the " 754{
550 "siblings %d", smp_num_siblings); 755 unsigned index;
551 smp_num_siblings = 1; 756 u64 val;
552 return; 757 int i;
758 unsigned index_min, index_max;
759
760 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
761 index_min = msr_range_array[i].min;
762 index_max = msr_range_array[i].max;
763 for (index = index_min; index < index_max; index++) {
764 if (rdmsrl_amd_safe(index, &val))
765 continue;
766 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
553 } 767 }
768 }
769}
554 770
555 index_msb = get_count_order(smp_num_siblings); 771static int show_msr __cpuinitdata;
556 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); 772static __init int setup_show_msr(char *arg)
557 773{
558 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 774 int num;
559 c->phys_proc_id);
560
561 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
562
563 index_msb = get_count_order(smp_num_siblings) ;
564 775
565 core_bits = get_count_order(c->x86_max_cores); 776 get_option(&arg, &num);
566 777
567 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & 778 if (num > 0)
568 ((1 << core_bits) - 1); 779 show_msr = num;
569 780 return 1;
570 if (c->x86_max_cores > 1)
571 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
572 c->cpu_core_id);
573 }
574} 781}
575#endif 782__setup("show_msr=", setup_show_msr);
576 783
577static __init int setup_noclflush(char *arg) 784static __init int setup_noclflush(char *arg)
578{ 785{
@@ -591,17 +798,25 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
591 vendor = c->x86_vendor_id; 798 vendor = c->x86_vendor_id;
592 799
593 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) 800 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
594 printk("%s ", vendor); 801 printk(KERN_CONT "%s ", vendor);
595 802
596 if (!c->x86_model_id[0]) 803 if (c->x86_model_id[0])
597 printk("%d86", c->x86); 804 printk(KERN_CONT "%s", c->x86_model_id);
598 else 805 else
599 printk("%s", c->x86_model_id); 806 printk(KERN_CONT "%d86", c->x86);
600 807
601 if (c->x86_mask || c->cpuid_level >= 0) 808 if (c->x86_mask || c->cpuid_level >= 0)
602 printk(" stepping %02x\n", c->x86_mask); 809 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
603 else 810 else
604 printk("\n"); 811 printk(KERN_CONT "\n");
812
813#ifdef CONFIG_SMP
814 if (c->cpu_index < show_msr)
815 print_cpu_msr();
816#else
817 if (show_msr)
818 print_cpu_msr();
819#endif
605} 820}
606 821
607static __init int setup_disablecpuid(char *arg) 822static __init int setup_disablecpuid(char *arg)
@@ -617,19 +832,89 @@ __setup("clearcpuid=", setup_disablecpuid);
617 832
618cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 833cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
619 834
620void __init early_cpu_init(void) 835#ifdef CONFIG_X86_64
836struct x8664_pda **_cpu_pda __read_mostly;
837EXPORT_SYMBOL(_cpu_pda);
838
839struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
840
841char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
842
843void __cpuinit pda_init(int cpu)
844{
845 struct x8664_pda *pda = cpu_pda(cpu);
846
847	/* Set up data that may be needed in __get_free_pages early */
848 loadsegment(fs, 0);
849 loadsegment(gs, 0);
850	/* Memory clobbers used to order PDA accesses */
851 mb();
852 wrmsrl(MSR_GS_BASE, pda);
853 mb();
854
855 pda->cpunumber = cpu;
856 pda->irqcount = -1;
857 pda->kernelstack = (unsigned long)stack_thread_info() -
858 PDA_STACKOFFSET + THREAD_SIZE;
859 pda->active_mm = &init_mm;
860 pda->mmu_state = 0;
861
862 if (cpu == 0) {
863 /* others are initialized in smpboot.c */
864 pda->pcurrent = &init_task;
865 pda->irqstackptr = boot_cpu_stack;
866 pda->irqstackptr += IRQSTACKSIZE - 64;
867 } else {
868 if (!pda->irqstackptr) {
869 pda->irqstackptr = (char *)
870 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
871 if (!pda->irqstackptr)
872 panic("cannot allocate irqstack for cpu %d",
873 cpu);
874 pda->irqstackptr += IRQSTACKSIZE - 64;
875 }
876
877 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
878 pda->nodenumber = cpu_to_node(cpu);
879 }
880}
881
882char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
883 DEBUG_STKSZ] __page_aligned_bss;
884
885extern asmlinkage void ignore_sysret(void);
886
887/* May not be marked __init: used by software suspend */
888void syscall_init(void)
621{ 889{
622 struct cpu_vendor_dev *cvdev; 890 /*
891 * LSTAR and STAR live in a bit strange symbiosis.
892 * They both write to the same internal register. STAR allows setting
893 * CS/DS, but only a 32bit target. LSTAR sets the 64bit rip.
894 */
895 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
896 wrmsrl(MSR_LSTAR, system_call);
897 wrmsrl(MSR_CSTAR, ignore_sysret);
623 898
624 for (cvdev = __x86cpuvendor_start ; 899#ifdef CONFIG_IA32_EMULATION
625 cvdev < __x86cpuvendor_end ; 900 syscall32_cpu_init();
626 cvdev++) 901#endif
627 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
628 902
629 early_cpu_detect(); 903 /* Flags to clear on syscall */
630 validate_pat_support(&boot_cpu_data); 904 wrmsrl(MSR_SYSCALL_MASK,
905 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
631} 906}
632 907
908unsigned long kernel_eflags;
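/*
 * For reference, how the MSR_STAR value written in syscall_init() above is
 * laid out: bits 63..48 seed the selectors SYSRET loads and bits 47..32 the
 * selectors SYSCALL loads.  A standalone sketch with the usual selector
 * values spelled out (the numbers are illustrative, taken from the stock
 * 64-bit GDT layout; not part of this commit):
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t user32_cs = 0x23;	/* __USER32_CS: GDT entry 4, RPL 3 */
	uint64_t kernel_cs = 0x10;	/* __KERNEL_CS: GDT entry 2, RPL 0 */
	uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

	printf("MSR_STAR = %#018llx\n", (unsigned long long)star);
	return 0;
}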
909
910/*
911 * Copies of the original ist values from the tss are only accessed during
912 * debugging, no special alignment required.
913 */
914DEFINE_PER_CPU(struct orig_ist, orig_ist);
915
916#else
917
633/* Make sure %fs is initialized properly in idle threads */ 918/* Make sure %fs is initialized properly in idle threads */
634struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 919struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
635{ 920{
@@ -637,25 +922,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
637 regs->fs = __KERNEL_PERCPU; 922 regs->fs = __KERNEL_PERCPU;
638 return regs; 923 return regs;
639} 924}
640 925#endif
641/* Current gdt points %fs at the "master" per-cpu area: after this,
642 * it's on the real one. */
643void switch_to_new_gdt(void)
644{
645 struct desc_ptr gdt_descr;
646
647 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
648 gdt_descr.size = GDT_SIZE - 1;
649 load_gdt(&gdt_descr);
650 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
651}
652 926
653/* 927/*
654 * cpu_init() initializes state that is per-CPU. Some data is already 928 * cpu_init() initializes state that is per-CPU. Some data is already
655 * initialized (naturally) in the bootstrap process, such as the GDT 929 * initialized (naturally) in the bootstrap process, such as the GDT
656 * and IDT. We reload them nevertheless, this function acts as a 930 * and IDT. We reload them nevertheless, this function acts as a
657 * 'CPU state barrier', nothing should get across. 931 * 'CPU state barrier', nothing should get across.
932 * A lot of state is already set up in PDA init for 64 bit
658 */ 933 */
934#ifdef CONFIG_X86_64
935void __cpuinit cpu_init(void)
936{
937 int cpu = stack_smp_processor_id();
938 struct tss_struct *t = &per_cpu(init_tss, cpu);
939 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
940 unsigned long v;
941 char *estacks = NULL;
942 struct task_struct *me;
943 int i;
944
945 /* CPU 0 is initialised in head64.c */
946 if (cpu != 0)
947 pda_init(cpu);
948 else
949 estacks = boot_exception_stacks;
950
951 me = current;
952
953 if (cpu_test_and_set(cpu, cpu_initialized))
954 panic("CPU#%d already initialized!\n", cpu);
955
956 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
957
958 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
959
960 /*
961 * Initialize the per-CPU GDT with the boot GDT,
962 * and set up the GDT descriptor:
963 */
964
965 switch_to_new_gdt();
966 load_idt((const struct desc_ptr *)&idt_descr);
967
968 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
969 syscall_init();
970
971 wrmsrl(MSR_FS_BASE, 0);
972 wrmsrl(MSR_KERNEL_GS_BASE, 0);
973 barrier();
974
975 check_efer();
976 if (cpu != 0 && x2apic)
977 enable_x2apic();
978
979 /*
980 * set up and load the per-CPU TSS
981 */
982 if (!orig_ist->ist[0]) {
983 static const unsigned int order[N_EXCEPTION_STACKS] = {
984 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
985 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
986 };
987 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
988 if (cpu) {
989 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
990 if (!estacks)
991 panic("Cannot allocate exception "
992 "stack %ld %d\n", v, cpu);
993 }
994 estacks += PAGE_SIZE << order[v];
995 orig_ist->ist[v] = t->x86_tss.ist[v] =
996 (unsigned long)estacks;
997 }
998 }
999
1000 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1001 /*
1002 * <= is required because the CPU will access up to
1003 * 8 bits beyond the end of the IO permission bitmap.
1004 */
1005 for (i = 0; i <= IO_BITMAP_LONGS; i++)
1006 t->io_bitmap[i] = ~0UL;
1007
1008 atomic_inc(&init_mm.mm_count);
1009 me->active_mm = &init_mm;
1010 if (me->mm)
1011 BUG();
1012 enter_lazy_tlb(&init_mm, me);
1013
1014 load_sp0(t, &current->thread);
1015 set_tss_desc(cpu, t);
1016 load_TR_desc();
1017 load_LDT(&init_mm.context);
1018
1019#ifdef CONFIG_KGDB
1020 /*
1021 * If the kgdb is connected no debug regs should be altered. This
1022 * is only applicable when KGDB and a KGDB I/O module are built
1023 * into the kernel and you are using early debugging with
1024 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1025 */
1026 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1027 arch_kgdb_ops.correct_hw_break();
1028 else {
1029#endif
1030 /*
1031 * Clear all 6 debug registers:
1032 */
1033
1034 set_debugreg(0UL, 0);
1035 set_debugreg(0UL, 1);
1036 set_debugreg(0UL, 2);
1037 set_debugreg(0UL, 3);
1038 set_debugreg(0UL, 6);
1039 set_debugreg(0UL, 7);
1040#ifdef CONFIG_KGDB
1041 /* If the kgdb is connected no debug regs should be altered. */
1042 }
1043#endif
1044
1045 fpu_init();
1046
1047 raw_local_save_flags(kernel_eflags);
1048
1049 if (is_uv_system())
1050 uv_cpu_init();
1051}
1052
1053#else
1054
659void __cpuinit cpu_init(void) 1055void __cpuinit cpu_init(void)
660{ 1056{
661 int cpu = smp_processor_id(); 1057 int cpu = smp_processor_id();
@@ -709,19 +1105,21 @@ void __cpuinit cpu_init(void)
709 /* 1105 /*
710 * Force FPU initialization: 1106 * Force FPU initialization:
711 */ 1107 */
712 current_thread_info()->status = 0; 1108 if (cpu_has_xsave)
1109 current_thread_info()->status = TS_XSAVE;
1110 else
1111 current_thread_info()->status = 0;
713 clear_used_math(); 1112 clear_used_math();
714 mxcsr_feature_mask_init(); 1113 mxcsr_feature_mask_init();
715}
716 1114
717#ifdef CONFIG_HOTPLUG_CPU 1115 /*
718void __cpuinit cpu_uninit(void) 1116 * Boot processor to setup the FP and extended state context info.
719{ 1117 */
720 int cpu = raw_smp_processor_id(); 1118 if (!smp_processor_id())
721 cpu_clear(cpu, cpu_initialized); 1119 init_thread_xstate();
722 1120
723 /* lazy TLB state */ 1121 xsave_init();
724 per_cpu(cpu_tlbstate, cpu).state = 0;
725 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
726} 1122}
1123
1124
727#endif 1125#endif
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
deleted file mode 100644
index dd6e3f15017e..000000000000
--- a/arch/x86/kernel/cpu/common_64.c
+++ /dev/null
@@ -1,670 +0,0 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
10#include <linux/delay.h>
11#include <linux/smp.h>
12#include <linux/percpu.h>
13#include <asm/i387.h>
14#include <asm/msr.h>
15#include <asm/io.h>
16#include <asm/linkage.h>
17#include <asm/mmu_context.h>
18#include <asm/mtrr.h>
19#include <asm/mce.h>
20#include <asm/pat.h>
21#include <asm/numa.h>
22#ifdef CONFIG_X86_LOCAL_APIC
23#include <asm/mpspec.h>
24#include <asm/apic.h>
25#include <mach_apic.h>
26#endif
27#include <asm/pda.h>
28#include <asm/pgtable.h>
29#include <asm/processor.h>
30#include <asm/desc.h>
31#include <asm/atomic.h>
32#include <asm/proto.h>
33#include <asm/sections.h>
34#include <asm/setup.h>
35#include <asm/genapic.h>
36
37#include "cpu.h"
38
39/* We need valid kernel segments for data and code in long mode too
40 * IRET will check the segment types kkeil 2000/10/28
41 * Also sysret mandates a special GDT layout
42 */
43/* The TLS descriptors are currently at a different place compared to i386.
44 Hopefully nobody expects them at a fixed place (Wine?) */
45DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
46 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
47 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
48 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
49 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
50 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
51 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
52} };
53EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
54
55__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
56
57/* Current gdt points %fs at the "master" per-cpu area: after this,
58 * it's on the real one. */
59void switch_to_new_gdt(void)
60{
61 struct desc_ptr gdt_descr;
62
63 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
64 gdt_descr.size = GDT_SIZE - 1;
65 load_gdt(&gdt_descr);
66}
67
68struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
69
70static void __cpuinit default_init(struct cpuinfo_x86 *c)
71{
72 display_cacheinfo(c);
73}
74
75static struct cpu_dev __cpuinitdata default_cpu = {
76 .c_init = default_init,
77 .c_vendor = "Unknown",
78};
79static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
80
81int __cpuinit get_model_name(struct cpuinfo_x86 *c)
82{
83 unsigned int *v;
84
85 if (c->extended_cpuid_level < 0x80000004)
86 return 0;
87
88 v = (unsigned int *) c->x86_model_id;
89 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
90 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
91 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
92 c->x86_model_id[48] = 0;
93 return 1;
94}
95
96
97void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
98{
99 unsigned int n, dummy, ebx, ecx, edx;
100
101 n = c->extended_cpuid_level;
102
103 if (n >= 0x80000005) {
104 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
105 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
106 "D cache %dK (%d bytes/line)\n",
107 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
108 c->x86_cache_size = (ecx>>24) + (edx>>24);
109 /* On K8 L1 TLB is inclusive, so don't count it */
110 c->x86_tlbsize = 0;
111 }
112
113 if (n >= 0x80000006) {
114 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
115 ecx = cpuid_ecx(0x80000006);
116 c->x86_cache_size = ecx >> 16;
117 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
118
119 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
120 c->x86_cache_size, ecx & 0xFF);
121 }
122}
123
124void __cpuinit detect_ht(struct cpuinfo_x86 *c)
125{
126#ifdef CONFIG_SMP
127 u32 eax, ebx, ecx, edx;
128 int index_msb, core_bits;
129
130 cpuid(1, &eax, &ebx, &ecx, &edx);
131
132
133 if (!cpu_has(c, X86_FEATURE_HT))
134 return;
135 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
136 goto out;
137
138 smp_num_siblings = (ebx & 0xff0000) >> 16;
139
140 if (smp_num_siblings == 1) {
141 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
142 } else if (smp_num_siblings > 1) {
143
144 if (smp_num_siblings > NR_CPUS) {
145 printk(KERN_WARNING "CPU: Unsupported number of "
146 "siblings %d", smp_num_siblings);
147 smp_num_siblings = 1;
148 return;
149 }
150
151 index_msb = get_count_order(smp_num_siblings);
152 c->phys_proc_id = phys_pkg_id(index_msb);
153
154 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
155
156 index_msb = get_count_order(smp_num_siblings);
157
158 core_bits = get_count_order(c->x86_max_cores);
159
160 c->cpu_core_id = phys_pkg_id(index_msb) &
161 ((1 << core_bits) - 1);
162 }
163out:
164 if ((c->x86_max_cores * smp_num_siblings) > 1) {
165 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
166 c->phys_proc_id);
167 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
168 c->cpu_core_id);
169 }
170
171#endif
172}
173
174static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
175{
176 char *v = c->x86_vendor_id;
177 int i;
178 static int printed;
179
180 for (i = 0; i < X86_VENDOR_NUM; i++) {
181 if (cpu_devs[i]) {
182 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
183 (cpu_devs[i]->c_ident[1] &&
184 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
185 c->x86_vendor = i;
186 this_cpu = cpu_devs[i];
187 return;
188 }
189 }
190 }
191 if (!printed) {
192 printed++;
193 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
194 printk(KERN_ERR "CPU: Your system may be unstable.\n");
195 }
196 c->x86_vendor = X86_VENDOR_UNKNOWN;
197}
198
199static void __init early_cpu_support_print(void)
200{
201 int i,j;
202 struct cpu_dev *cpu_devx;
203
204 printk("KERNEL supported cpus:\n");
205 for (i = 0; i < X86_VENDOR_NUM; i++) {
206 cpu_devx = cpu_devs[i];
207 if (!cpu_devx)
208 continue;
209 for (j = 0; j < 2; j++) {
210 if (!cpu_devx->c_ident[j])
211 continue;
212 printk(" %s %s\n", cpu_devx->c_vendor,
213 cpu_devx->c_ident[j]);
214 }
215 }
216}
217
218static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
219
220void __init early_cpu_init(void)
221{
222 struct cpu_vendor_dev *cvdev;
223
224 for (cvdev = __x86cpuvendor_start ;
225 cvdev < __x86cpuvendor_end ;
226 cvdev++)
227 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
228 early_cpu_support_print();
229 early_identify_cpu(&boot_cpu_data);
230}
231
232/* Do some early cpuid on the boot CPU to get some parameter that are
233 needed before check_bugs. Everything advanced is in identify_cpu
234 below. */
235static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
236{
237 u32 tfms, xlvl;
238
239 c->loops_per_jiffy = loops_per_jiffy;
240 c->x86_cache_size = -1;
241 c->x86_vendor = X86_VENDOR_UNKNOWN;
242 c->x86_model = c->x86_mask = 0; /* So far unknown... */
243 c->x86_vendor_id[0] = '\0'; /* Unset */
244 c->x86_model_id[0] = '\0'; /* Unset */
245 c->x86_clflush_size = 64;
246 c->x86_cache_alignment = c->x86_clflush_size;
247 c->x86_max_cores = 1;
248 c->x86_coreid_bits = 0;
249 c->extended_cpuid_level = 0;
250 memset(&c->x86_capability, 0, sizeof c->x86_capability);
251
252 /* Get vendor name */
253 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
254 (unsigned int *)&c->x86_vendor_id[0],
255 (unsigned int *)&c->x86_vendor_id[8],
256 (unsigned int *)&c->x86_vendor_id[4]);
257
258 get_cpu_vendor(c);
259
260 /* Initialize the standard set of capabilities */
261 /* Note that the vendor-specific code below might override */
262
263 /* Intel-defined flags: level 0x00000001 */
264 if (c->cpuid_level >= 0x00000001) {
265 __u32 misc;
266 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
267 &c->x86_capability[0]);
268 c->x86 = (tfms >> 8) & 0xf;
269 c->x86_model = (tfms >> 4) & 0xf;
270 c->x86_mask = tfms & 0xf;
271 if (c->x86 == 0xf)
272 c->x86 += (tfms >> 20) & 0xff;
273 if (c->x86 >= 0x6)
274 c->x86_model += ((tfms >> 16) & 0xF) << 4;
275 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
276 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
277 } else {
278 /* Have CPUID level 0 only - unheard of */
279 c->x86 = 4;
280 }
281
282 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
283#ifdef CONFIG_SMP
284 c->phys_proc_id = c->initial_apicid;
285#endif
286 /* AMD-defined flags: level 0x80000001 */
287 xlvl = cpuid_eax(0x80000000);
288 c->extended_cpuid_level = xlvl;
289 if ((xlvl & 0xffff0000) == 0x80000000) {
290 if (xlvl >= 0x80000001) {
291 c->x86_capability[1] = cpuid_edx(0x80000001);
292 c->x86_capability[6] = cpuid_ecx(0x80000001);
293 }
294 if (xlvl >= 0x80000004)
295 get_model_name(c); /* Default name */
296 }
297
298 /* Transmeta-defined flags: level 0x80860001 */
299 xlvl = cpuid_eax(0x80860000);
300 if ((xlvl & 0xffff0000) == 0x80860000) {
301 /* Don't set x86_cpuid_level here for now to not confuse. */
302 if (xlvl >= 0x80860001)
303 c->x86_capability[2] = cpuid_edx(0x80860001);
304 }
305
306 if (c->extended_cpuid_level >= 0x80000007)
307 c->x86_power = cpuid_edx(0x80000007);
308
309 if (c->extended_cpuid_level >= 0x80000008) {
310 u32 eax = cpuid_eax(0x80000008);
311
312 c->x86_virt_bits = (eax >> 8) & 0xff;
313 c->x86_phys_bits = eax & 0xff;
314 }
315
316 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
317 cpu_devs[c->x86_vendor]->c_early_init)
318 cpu_devs[c->x86_vendor]->c_early_init(c);
319
320 validate_pat_support(c);
321}
322
323/*
324 * This does the hard work of actually picking apart the CPU stuff...
325 */
326static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
327{
328 int i;
329
330 early_identify_cpu(c);
331
332 init_scattered_cpuid_features(c);
333
334 c->apicid = phys_pkg_id(0);
335
336 /*
337 * Vendor-specific initialization. In this section we
338 * canonicalize the feature flags, meaning if there are
339 * features a certain CPU supports which CPUID doesn't
340 * tell us, CPUID claiming incorrect flags, or other bugs,
341 * we handle them here.
342 *
343 * At the end of this section, c->x86_capability better
344 * indicate the features this CPU genuinely supports!
345 */
346 if (this_cpu->c_init)
347 this_cpu->c_init(c);
348
349 detect_ht(c);
350
351 /*
352 * On SMP, boot_cpu_data holds the common feature set between
353 * all CPUs; so make sure that we indicate which features are
354 * common between the CPUs. The first time this routine gets
355 * executed, c == &boot_cpu_data.
356 */
357 if (c != &boot_cpu_data) {
358 /* AND the already accumulated flags with these */
359 for (i = 0; i < NCAPINTS; i++)
360 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
361 }
362
363 /* Clear all flags overriden by options */
364 for (i = 0; i < NCAPINTS; i++)
365 c->x86_capability[i] &= ~cleared_cpu_caps[i];
366
367#ifdef CONFIG_X86_MCE
368 mcheck_init(c);
369#endif
370 select_idle_routine(c);
371
372#ifdef CONFIG_NUMA
373 numa_add_cpu(smp_processor_id());
374#endif
375
376}
377
378void __cpuinit identify_boot_cpu(void)
379{
380 identify_cpu(&boot_cpu_data);
381}
382
383void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
384{
385 BUG_ON(c == &boot_cpu_data);
386 identify_cpu(c);
387 mtrr_ap_init();
388}
389
390static __init int setup_noclflush(char *arg)
391{
392 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
393 return 1;
394}
395__setup("noclflush", setup_noclflush);
396
397void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
398{
399 if (c->x86_model_id[0])
400 printk(KERN_CONT "%s", c->x86_model_id);
401
402 if (c->x86_mask || c->cpuid_level >= 0)
403 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
404 else
405 printk(KERN_CONT "\n");
406}
407
408static __init int setup_disablecpuid(char *arg)
409{
410 int bit;
411 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
412 setup_clear_cpu_cap(bit);
413 else
414 return 0;
415 return 1;
416}
417__setup("clearcpuid=", setup_disablecpuid);
418
419cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
420
421struct x8664_pda **_cpu_pda __read_mostly;
422EXPORT_SYMBOL(_cpu_pda);
423
424struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
425
426char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
427
428unsigned long __supported_pte_mask __read_mostly = ~0UL;
429EXPORT_SYMBOL_GPL(__supported_pte_mask);
430
431static int do_not_nx __cpuinitdata;
432
433/* noexec=on|off
434Control non executable mappings for 64bit processes.
435
436on Enable(default)
437off Disable
438*/
439static int __init nonx_setup(char *str)
440{
441 if (!str)
442 return -EINVAL;
443 if (!strncmp(str, "on", 2)) {
444 __supported_pte_mask |= _PAGE_NX;
445 do_not_nx = 0;
446 } else if (!strncmp(str, "off", 3)) {
447 do_not_nx = 1;
448 __supported_pte_mask &= ~_PAGE_NX;
449 }
450 return 0;
451}
452early_param("noexec", nonx_setup);
453
454int force_personality32;
455
456/* noexec32=on|off
457Control non executable heap for 32bit processes.
458To control the stack too use noexec=off
459
460on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
461off PROT_READ implies PROT_EXEC
462*/
463static int __init nonx32_setup(char *str)
464{
465 if (!strcmp(str, "on"))
466 force_personality32 &= ~READ_IMPLIES_EXEC;
467 else if (!strcmp(str, "off"))
468 force_personality32 |= READ_IMPLIES_EXEC;
469 return 1;
470}
471__setup("noexec32=", nonx32_setup);
472
473void pda_init(int cpu)
474{
475 struct x8664_pda *pda = cpu_pda(cpu);
476
477 /* Setup up data that may be needed in __get_free_pages early */
478 loadsegment(fs, 0);
479 loadsegment(gs, 0);
480 /* Memory clobbers used to order PDA accessed */
481 mb();
482 wrmsrl(MSR_GS_BASE, pda);
483 mb();
484
485 pda->cpunumber = cpu;
486 pda->irqcount = -1;
487 pda->kernelstack = (unsigned long)stack_thread_info() -
488 PDA_STACKOFFSET + THREAD_SIZE;
489 pda->active_mm = &init_mm;
490 pda->mmu_state = 0;
491
492 if (cpu == 0) {
493 /* others are initialized in smpboot.c */
494 pda->pcurrent = &init_task;
495 pda->irqstackptr = boot_cpu_stack;
496 } else {
497 pda->irqstackptr = (char *)
498 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
499 if (!pda->irqstackptr)
500 panic("cannot allocate irqstack for cpu %d", cpu);
501
502 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
503 pda->nodenumber = cpu_to_node(cpu);
504 }
505
506 pda->irqstackptr += IRQSTACKSIZE-64;
507}
508
509char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
510 DEBUG_STKSZ] __page_aligned_bss;
511
512extern asmlinkage void ignore_sysret(void);
513
514/* May not be marked __init: used by software suspend */
515void syscall_init(void)
516{
517 /*
518 * LSTAR and STAR live in a bit strange symbiosis.
519 * They both write to the same internal register. STAR allows to
520 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
521 */
522 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
523 wrmsrl(MSR_LSTAR, system_call);
524 wrmsrl(MSR_CSTAR, ignore_sysret);
525
526#ifdef CONFIG_IA32_EMULATION
527 syscall32_cpu_init();
528#endif
529
530 /* Flags to clear on syscall */
531 wrmsrl(MSR_SYSCALL_MASK,
532 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
533}
534
535void __cpuinit check_efer(void)
536{
537 unsigned long efer;
538
539 rdmsrl(MSR_EFER, efer);
540 if (!(efer & EFER_NX) || do_not_nx)
541 __supported_pte_mask &= ~_PAGE_NX;
542}
543
544unsigned long kernel_eflags;
545
546/*
547 * Copies of the original ist values from the tss are only accessed during
548 * debugging, no special alignment required.
549 */
550DEFINE_PER_CPU(struct orig_ist, orig_ist);
551
552/*
553 * cpu_init() initializes state that is per-CPU. Some data is already
554 * initialized (naturally) in the bootstrap process, such as the GDT
555 * and IDT. We reload them nevertheless, this function acts as a
556 * 'CPU state barrier', nothing should get across.
557 * A lot of state is already set up in PDA init.
558 */
559void __cpuinit cpu_init(void)
560{
561 int cpu = stack_smp_processor_id();
562 struct tss_struct *t = &per_cpu(init_tss, cpu);
563 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
564 unsigned long v;
565 char *estacks = NULL;
566 struct task_struct *me;
567 int i;
568
569 /* CPU 0 is initialised in head64.c */
570 if (cpu != 0)
571 pda_init(cpu);
572 else
573 estacks = boot_exception_stacks;
574
575 me = current;
576
577 if (cpu_test_and_set(cpu, cpu_initialized))
578 panic("CPU#%d already initialized!\n", cpu);
579
580 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
581
582 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
583
584 /*
585 * Initialize the per-CPU GDT with the boot GDT,
586 * and set up the GDT descriptor:
587 */
588
589 switch_to_new_gdt();
590 load_idt((const struct desc_ptr *)&idt_descr);
591
592 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
593 syscall_init();
594
595 wrmsrl(MSR_FS_BASE, 0);
596 wrmsrl(MSR_KERNEL_GS_BASE, 0);
597 barrier();
598
599 check_efer();
600
601 /*
602 * set up and load the per-CPU TSS
603 */
604 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
605 static const unsigned int order[N_EXCEPTION_STACKS] = {
606 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
607 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
608 };
609 if (cpu) {
610 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
611 if (!estacks)
612 panic("Cannot allocate exception stack %ld %d\n",
613 v, cpu);
614 }
615 estacks += PAGE_SIZE << order[v];
616 orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
617 }
618
619 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
620 /*
621 * <= is required because the CPU will access up to
622 * 8 bits beyond the end of the IO permission bitmap.
623 */
624 for (i = 0; i <= IO_BITMAP_LONGS; i++)
625 t->io_bitmap[i] = ~0UL;
626
627 atomic_inc(&init_mm.mm_count);
628 me->active_mm = &init_mm;
629 if (me->mm)
630 BUG();
631 enter_lazy_tlb(&init_mm, me);
632
633 load_sp0(t, &current->thread);
634 set_tss_desc(cpu, t);
635 load_TR_desc();
636 load_LDT(&init_mm.context);
637
638#ifdef CONFIG_KGDB
639 /*
640 * If the kgdb is connected no debug regs should be altered. This
641 * is only applicable when KGDB and a KGDB I/O module are built
642 * into the kernel and you are using early debugging with
643 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
644 */
645 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
646 arch_kgdb_ops.correct_hw_break();
647 else {
648#endif
649 /*
650 * Clear all 6 debug registers:
651 */
652
653 set_debugreg(0UL, 0);
654 set_debugreg(0UL, 1);
655 set_debugreg(0UL, 2);
656 set_debugreg(0UL, 3);
657 set_debugreg(0UL, 6);
658 set_debugreg(0UL, 7);
659#ifdef CONFIG_KGDB
660 /* If the kgdb is connected no debug regs should be altered. */
661 }
662#endif
663
664 fpu_init();
665
666 raw_local_save_flags(kernel_eflags);
667
668 if (is_uv_system())
669 uv_cpu_init();
670}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4d894e8565fe..de4094a39210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -21,23 +21,16 @@ struct cpu_dev {
21 void (*c_init)(struct cpuinfo_x86 * c); 21 void (*c_init)(struct cpuinfo_x86 * c);
22 void (*c_identify)(struct cpuinfo_x86 * c); 22 void (*c_identify)(struct cpuinfo_x86 * c);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); 23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
24 int c_x86_vendor;
24}; 25};
25 26
26extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; 27#define cpu_dev_register(cpu_devX) \
28 static struct cpu_dev *__cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
30 &cpu_devX;
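/*
 * Hypothetical usage sketch: a vendor file defines its cpu_dev and
 * registers it; cpu_dev_register() drops a pointer into the
 * .x86_cpu_dev.init section, which early_cpu_init() walks to populate
 * cpu_devs[].  (The "demo" names below are invented for illustration.)
 */
static struct cpu_dev demo_cpu_dev __cpuinitdata = {
	.c_vendor	= "Demo",
	.c_ident	= { "GenuineDemo" },
	.c_x86_vendor	= X86_VENDOR_UNKNOWN,	/* placeholder vendor id */
};

cpu_dev_register(demo_cpu_dev);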
27 31
28struct cpu_vendor_dev { 32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
29 int vendor;
30 struct cpu_dev *cpu_dev;
31};
32
33#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \
34 static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \
35 __attribute__((__section__(".x86cpuvendor.init"))) = \
36 { cpu_vendor_id, cpu_dev }
37
38extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
39 33
40extern int get_model_name(struct cpuinfo_x86 *c);
41extern void display_cacheinfo(struct cpuinfo_x86 *c); 34extern void display_cacheinfo(struct cpuinfo_x86 *c);
42 35
43#endif 36#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index cb7a5715596d..efae3b22a0ff 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -235,9 +235,9 @@ config X86_LONGHAUL
235 If in doubt, say N. 235 If in doubt, say N.
236 236
237config X86_E_POWERSAVER 237config X86_E_POWERSAVER
238 tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" 238 tristate "VIA C7 Enhanced PowerSaver"
239 select CPU_FREQ_TABLE 239 select CPU_FREQ_TABLE
240 depends on X86_32 && EXPERIMENTAL 240 depends on X86_32
241 help 241 help
242 This adds the CPUFreq driver for VIA C7 processors. 242 This adds the CPUFreq driver for VIA C7 processors.
243 243
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ff2fff56f0a8..c24c4a487b7c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -200,12 +200,10 @@ static void drv_read(struct drv_cmd *cmd)
200static void drv_write(struct drv_cmd *cmd) 200static void drv_write(struct drv_cmd *cmd)
201{ 201{
202 cpumask_t saved_mask = current->cpus_allowed; 202 cpumask_t saved_mask = current->cpus_allowed;
203 cpumask_of_cpu_ptr_declare(cpu_mask);
204 unsigned int i; 203 unsigned int i;
205 204
206 for_each_cpu_mask_nr(i, cmd->mask) { 205 for_each_cpu_mask_nr(i, cmd->mask) {
207 cpumask_of_cpu_ptr_next(cpu_mask, i); 206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
208 set_cpus_allowed_ptr(current, cpu_mask);
209 do_drv_write(cmd); 207 do_drv_write(cmd);
210 } 208 }
211 209
@@ -258,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask)
258 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and 256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
259 * no meaning should be associated with absolute values of these MSRs. 257 * no meaning should be associated with absolute values of these MSRs.
260 */ 258 */
261static unsigned int get_measured_perf(unsigned int cpu) 259static unsigned int get_measured_perf(struct cpufreq_policy *policy,
260 unsigned int cpu)
262{ 261{
263 union { 262 union {
264 struct { 263 struct {
@@ -269,12 +268,11 @@ static unsigned int get_measured_perf(unsigned int cpu)
269 } aperf_cur, mperf_cur; 268 } aperf_cur, mperf_cur;
270 269
271 cpumask_t saved_mask; 270 cpumask_t saved_mask;
272 cpumask_of_cpu_ptr(cpu_mask, cpu);
273 unsigned int perf_percent; 271 unsigned int perf_percent;
274 unsigned int retval; 272 unsigned int retval;
275 273
276 saved_mask = current->cpus_allowed; 274 saved_mask = current->cpus_allowed;
277 set_cpus_allowed_ptr(current, cpu_mask); 275 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
278 if (get_cpu() != cpu) { 276 if (get_cpu() != cpu) {
279 /* We were not able to run on requested processor */ 277 /* We were not able to run on requested processor */
280 put_cpu(); 278 put_cpu();
@@ -329,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
329 327
330#endif 328#endif
331 329
332 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; 330 retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
333 331
334 put_cpu(); 332 put_cpu();
335 set_cpus_allowed_ptr(current, &saved_mask); 333 set_cpus_allowed_ptr(current, &saved_mask);
@@ -340,7 +338,6 @@ static unsigned int get_measured_perf(unsigned int cpu)
340 338
341static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 339static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
342{ 340{
343 cpumask_of_cpu_ptr(cpu_mask, cpu);
344 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); 341 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
345 unsigned int freq; 342 unsigned int freq;
346 unsigned int cached_freq; 343 unsigned int cached_freq;
@@ -353,7 +350,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
353 } 350 }
354 351
355 cached_freq = data->freq_table[data->acpi_data->state].frequency; 352 cached_freq = data->freq_table[data->acpi_data->state].frequency;
356 freq = extract_freq(get_cur_val(cpu_mask), data); 353 freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data);
357 if (freq != cached_freq) { 354 if (freq != cached_freq) {
358 /* 355 /*
359 * The dreaded BIOS frequency change behind our back. 356 * The dreaded BIOS frequency change behind our back.
@@ -789,7 +786,11 @@ static int __init acpi_cpufreq_init(void)
789 if (ret) 786 if (ret)
790 return ret; 787 return ret;
791 788
792 return cpufreq_register_driver(&acpi_cpufreq_driver); 789 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
790 if (ret)
791 free_percpu(acpi_perf_data);
792
793 return ret;
793} 794}
794 795
795static void __exit acpi_cpufreq_exit(void) 796static void __exit acpi_cpufreq_exit(void)
@@ -799,8 +800,6 @@ static void __exit acpi_cpufreq_exit(void)
799 cpufreq_unregister_driver(&acpi_cpufreq_driver); 800 cpufreq_unregister_driver(&acpi_cpufreq_driver);
800 801
801 free_percpu(acpi_perf_data); 802 free_percpu(acpi_perf_data);
802
803 return;
804} 803}
805 804
806module_param(acpi_pstate_strict, uint, 0644); 805module_param(acpi_pstate_strict, uint, 0644);
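The removed cpumask_of_cpu_ptr*() helpers are replaced throughout by passing &cpumask_of_cpu(cpu) directly to set_cpus_allowed_ptr(); the same conversion recurs in the powernow-k8 and speedstep hunks further down. Condensed, the converted call sites follow this save/pin/work/restore shape (a sketch with the per-CPU work elided):

static void run_msr_work_on(unsigned int cpu)
{
	cpumask_t saved_mask = current->cpus_allowed;

	/* Pin this task to the target CPU... */
	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
	if (smp_processor_id() == cpu) {
		/* ...do the per-CPU work (rdmsr()/wrmsr() etc.) here... */
	}
	/* ...then restore the caller's original affinity. */
	set_cpus_allowed_ptr(current, &saved_mask);
}
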
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 94619c22f563..fe613c93b366 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -25,8 +25,8 @@
25#include <linux/cpufreq.h> 25#include <linux/cpufreq.h>
26 26
27#include <asm/msr.h> 27#include <asm/msr.h>
28#include <asm/timex.h> 28#include <linux/timex.h>
29#include <asm/io.h> 29#include <linux/io.h>
30 30
31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ 31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ 32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
@@ -44,7 +44,7 @@ struct s_elan_multiplier {
44 * It is important that the frequencies 44 * It is important that the frequencies
45 * are listed in ascending order here! 45 * are listed in ascending order here!
46 */ 46 */
47struct s_elan_multiplier elan_multiplier[] = { 47static struct s_elan_multiplier elan_multiplier[] = {
48 {1000, 0x02, 0x18}, 48 {1000, 0x02, 0x18},
49 {2000, 0x02, 0x10}, 49 {2000, 0x02, 0x10},
50 {4000, 0x02, 0x08}, 50 {4000, 0x02, 0x08},
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
82 u8 clockspeed_reg; /* Clock Speed Register */ 82 u8 clockspeed_reg; /* Clock Speed Register */
83 83
84 local_irq_disable(); 84 local_irq_disable();
85 outb_p(0x80,REG_CSCIR); 85 outb_p(0x80, REG_CSCIR);
86 clockspeed_reg = inb_p(REG_CSCDR); 86 clockspeed_reg = inb_p(REG_CSCDR);
87 local_irq_enable(); 87 local_irq_enable();
88 88
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
98 } 98 }
99 99
100 /* 33 MHz is not 32 MHz... */ 100 /* 33 MHz is not 32 MHz... */
101 if ((clockspeed_reg & 0xE0)==0xA0) 101 if ((clockspeed_reg & 0xE0) == 0xA0)
102 return 33000; 102 return 33000;
103 103
104 return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); 104 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
105} 105}
106 106
107 107
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
117 * There is no return value. 117 * There is no return value.
118 */ 118 */
119 119
120static void elanfreq_set_cpu_state (unsigned int state) 120static void elanfreq_set_cpu_state(unsigned int state)
121{ 121{
122 struct cpufreq_freqs freqs; 122 struct cpufreq_freqs freqs;
123 123
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state)
144 */ 144 */
145 145
146 local_irq_disable(); 146 local_irq_disable();
147 outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ 147 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
148 outb_p(0x00,REG_CSCDR); 148 outb_p(0x00, REG_CSCDR);
149 local_irq_enable(); /* wait till internal pipelines and */ 149 local_irq_enable(); /* wait till internal pipelines and */
150 udelay(1000); /* buffers have cleaned up */ 150 udelay(1000); /* buffers have cleaned up */
151 151
152 local_irq_disable(); 152 local_irq_disable();
153 153
154 /* now, set the CPU clock speed register (0x80) */ 154 /* now, set the CPU clock speed register (0x80) */
155 outb_p(0x80,REG_CSCIR); 155 outb_p(0x80, REG_CSCIR);
156 outb_p(elan_multiplier[state].val80h,REG_CSCDR); 156 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
157 157
158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ 158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
159 outb_p(0x40,REG_CSCIR); 159 outb_p(0x40, REG_CSCIR);
160 outb_p(elan_multiplier[state].val40h,REG_CSCDR); 160 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
161 udelay(10000); 161 udelay(10000);
162 local_irq_enable(); 162 local_irq_enable();
163 163
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state)
173 * for the hardware supported by the driver. 173 * for the hardware supported by the driver.
174 */ 174 */
175 175
176static int elanfreq_verify (struct cpufreq_policy *policy) 176static int elanfreq_verify(struct cpufreq_policy *policy)
177{ 177{
178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); 178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
179} 179}
180 180
181static int elanfreq_target (struct cpufreq_policy *policy, 181static int elanfreq_target(struct cpufreq_policy *policy,
182 unsigned int target_freq, 182 unsigned int target_freq,
183 unsigned int relation) 183 unsigned int relation)
184{ 184{
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
205 205
206 /* capability check */ 206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) || 207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model!=10)) 208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV; 209 return -ENODEV;
210 210
211 /* max freq */ 211 /* max freq */
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
213 max_freq = elanfreq_get_cpu_frequency(0); 213 max_freq = elanfreq_get_cpu_frequency(0);
214 214
215 /* table init */ 215 /* table init */
216 for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { 216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq) 217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; 218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 } 219 }
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
224 224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); 225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result) 226 if (result)
227 return (result); 227 return result;
228 228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); 229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0; 230 return 0;
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup);
260#endif 260#endif
261 261
262 262
263static struct freq_attr* elanfreq_attr[] = { 263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs, 264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL, 265 NULL,
266}; 266};
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void)
284 284
285 /* Test if we have the right hardware */ 285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) || 286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model!=10)) { 287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); 288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV; 289 return -ENODEV;
290 } 290 }
291 return cpufreq_register_driver(&elanfreq_driver); 291 return cpufreq_register_driver(&elanfreq_driver);
292} 292}
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void)
298} 298}
299 299
300 300
301module_param (max_freq, int, 0444); 301module_param(max_freq, int, 0444);
302 302
303MODULE_LICENSE("GPL"); 303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); 304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
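As a quick check of the cleaned-up frequency decode above: bits 7:5 of the clock-speed register select a power-of-two megahertz value, and 0xA0 is special-cased because the part runs at 33 MHz rather than 32 MHz. A condensed sketch of just that decode (earlier branches of the real elanfreq_get_cpu_frequency() are omitted):

static unsigned int decode_elan_khz(unsigned char clockspeed_reg)
{
	if ((clockspeed_reg & 0xE0) == 0xA0)	/* 33 MHz is not 32 MHz... */
		return 33000;
	return (1 << ((clockspeed_reg & 0xE0) >> 5)) * 1000;
}

/* decode_elan_khz(0x80) == 16000 (16 MHz); decode_elan_khz(0xA0) == 33000. */
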
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index f1685fb91fbd..b8e05ee4f736 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
171 } 171 }
172 172
173 if (c->x86 != 0xF) { 173 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); 174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
175 return 0; 175 return 0;
176 } 176 }
177 177
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index eb9b62b0830c..b5ced806a316 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -15,12 +15,11 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include <asm/msr.h> 17#include <asm/msr.h>
18#include <asm/timex.h> 18#include <linux/timex.h>
19#include <asm/io.h> 19#include <linux/io.h>
20 20
21 21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long 22 as it is unused */
23 as it is unused */
24 23
25static unsigned int busfreq; /* FSB, in 10 kHz */ 24static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier; 25static unsigned int max_multiplier;
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void)
53 52
54 msrval = POWERNOW_IOPORT + 0x1; 53 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 54 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue=inl(POWERNOW_IOPORT + 0x8); 55 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0; 56 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 57 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59 58
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void)
67 * 66 *
68 * Tries to change the PowerNow! multiplier 67 * Tries to change the PowerNow! multiplier
69 */ 68 */
70static void powernow_k6_set_state (unsigned int best_i) 69static void powernow_k6_set_state(unsigned int best_i)
71{ 70{
72 unsigned long outvalue=0, invalue=0; 71 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval; 72 unsigned long msrval;
74 struct cpufreq_freqs freqs; 73 struct cpufreq_freqs freqs;
75 74
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i)
90 89
91 msrval = POWERNOW_IOPORT + 0x1; 90 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 91 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue=inl(POWERNOW_IOPORT + 0x8); 92 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf; 93 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue; 94 outvalue = outvalue | invalue;
96 outl(outvalue ,(POWERNOW_IOPORT + 0x8)); 95 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0; 96 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 97 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99 98
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
124 * 123 *
125 * sets a new CPUFreq policy 124 * sets a new CPUFreq policy
126 */ 125 */
127static int powernow_k6_target (struct cpufreq_policy *policy, 126static int powernow_k6_target(struct cpufreq_policy *policy,
128 unsigned int target_freq, 127 unsigned int target_freq,
129 unsigned int relation) 128 unsigned int relation)
130{ 129{
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
152 busfreq = cpu_khz / max_multiplier; 151 busfreq = cpu_khz / max_multiplier;
153 152
154 /* table init */ 153 /* table init */
155 for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { 154 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
156 if (clock_ratio[i].index > max_multiplier) 155 if (clock_ratio[i].index > max_multiplier)
157 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; 156 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
158 else 157 else
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
165 164
166 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 165 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
167 if (result) 166 if (result)
168 return (result); 167 return result;
169 168
170 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); 169 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
171 170
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
176static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) 175static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
177{ 176{
178 unsigned int i; 177 unsigned int i;
179 for (i=0; i<8; i++) { 178 for (i = 0; i < 8; i++) {
180 if (i==max_multiplier) 179 if (i == max_multiplier)
181 powernow_k6_set_state(i); 180 powernow_k6_set_state(i);
182 } 181 }
183 cpufreq_frequency_table_put_attr(policy->cpu); 182 cpufreq_frequency_table_put_attr(policy->cpu);
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu)
189 return busfreq * powernow_k6_get_cpu_multiplier(); 188 return busfreq * powernow_k6_get_cpu_multiplier();
190} 189}
191 190
192static struct freq_attr* powernow_k6_attr[] = { 191static struct freq_attr *powernow_k6_attr[] = {
193 &cpufreq_freq_attr_scaling_available_freqs, 192 &cpufreq_freq_attr_scaling_available_freqs,
194 NULL, 193 NULL,
195}; 194};
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void)
227 } 226 }
228 227
229 if (cpufreq_register_driver(&powernow_k6_driver)) { 228 if (cpufreq_register_driver(&powernow_k6_driver)) {
230 release_region (POWERNOW_IOPORT, 16); 229 release_region(POWERNOW_IOPORT, 16);
231 return -EINVAL; 230 return -EINVAL;
232 } 231 }
233 232
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void)
243static void __exit powernow_k6_exit(void) 242static void __exit powernow_k6_exit(void)
244{ 243{
245 cpufreq_unregister_driver(&powernow_k6_driver); 244 cpufreq_unregister_driver(&powernow_k6_driver);
246 release_region (POWERNOW_IOPORT, 16); 245 release_region(POWERNOW_IOPORT, 16);
247} 246}
248 247
249 248
250MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 249MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
251MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); 250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
252MODULE_LICENSE ("GPL"); 251MODULE_LICENSE("GPL");
253 252
254module_init(powernow_k6_init); 253module_init(powernow_k6_init);
255module_exit(powernow_k6_exit); 254module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 53c7b6936973..84bb395038d8 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -66,7 +66,6 @@ static u32 find_freq_from_fid(u32 fid)
66 return 800 + (fid * 100); 66 return 800 + (fid * 100);
67} 67}
68 68
69
70/* Return a frequency in KHz, given an input fid */ 69/* Return a frequency in KHz, given an input fid */
71static u32 find_khz_freq_from_fid(u32 fid) 70static u32 find_khz_freq_from_fid(u32 fid)
72{ 71{
@@ -78,7 +77,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p
78 return data[pstate].frequency; 77 return data[pstate].frequency;
79} 78}
80 79
81
82/* Return the vco fid for an input fid 80/* Return the vco fid for an input fid
83 * 81 *
84 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids 82 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
@@ -166,7 +164,6 @@ static void fidvid_msr_init(void)
166 wrmsr(MSR_FIDVID_CTL, lo, hi); 164 wrmsr(MSR_FIDVID_CTL, lo, hi);
167} 165}
168 166
169
170/* write the new fid value along with the other control fields to the msr */ 167/* write the new fid value along with the other control fields to the msr */
171static int write_new_fid(struct powernow_k8_data *data, u32 fid) 168static int write_new_fid(struct powernow_k8_data *data, u32 fid)
172{ 169{
@@ -479,12 +476,11 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi
479static int check_supported_cpu(unsigned int cpu) 476static int check_supported_cpu(unsigned int cpu)
480{ 477{
481 cpumask_t oldmask; 478 cpumask_t oldmask;
482 cpumask_of_cpu_ptr(cpu_mask, cpu);
483 u32 eax, ebx, ecx, edx; 479 u32 eax, ebx, ecx, edx;
484 unsigned int rc = 0; 480 unsigned int rc = 0;
485 481
486 oldmask = current->cpus_allowed; 482 oldmask = current->cpus_allowed;
487 set_cpus_allowed_ptr(current, cpu_mask); 483 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
488 484
489 if (smp_processor_id() != cpu) { 485 if (smp_processor_id() != cpu) {
490 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); 486 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
@@ -1017,7 +1013,6 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1017static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) 1013static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
1018{ 1014{
1019 cpumask_t oldmask; 1015 cpumask_t oldmask;
1020 cpumask_of_cpu_ptr(cpu_mask, pol->cpu);
1021 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1016 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1022 u32 checkfid; 1017 u32 checkfid;
1023 u32 checkvid; 1018 u32 checkvid;
@@ -1032,7 +1027,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
1032 1027
1033 /* only run on specific CPU from here on */ 1028 /* only run on specific CPU from here on */
1034 oldmask = current->cpus_allowed; 1029 oldmask = current->cpus_allowed;
1035 set_cpus_allowed_ptr(current, cpu_mask); 1030 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
1036 1031
1037 if (smp_processor_id() != pol->cpu) { 1032 if (smp_processor_id() != pol->cpu) {
1038 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1033 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1107,7 +1102,6 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1107{ 1102{
1108 struct powernow_k8_data *data; 1103 struct powernow_k8_data *data;
1109 cpumask_t oldmask; 1104 cpumask_t oldmask;
1110 cpumask_of_cpu_ptr_declare(newmask);
1111 int rc; 1105 int rc;
1112 1106
1113 if (!cpu_online(pol->cpu)) 1107 if (!cpu_online(pol->cpu))
@@ -1159,8 +1153,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1159 1153
1160 /* only run on specific CPU from here on */ 1154 /* only run on specific CPU from here on */
1161 oldmask = current->cpus_allowed; 1155 oldmask = current->cpus_allowed;
1162 cpumask_of_cpu_ptr_next(newmask, pol->cpu); 1156 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
1163 set_cpus_allowed_ptr(current, newmask);
1164 1157
1165 if (smp_processor_id() != pol->cpu) { 1158 if (smp_processor_id() != pol->cpu) {
1166 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1159 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1182,7 +1175,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1182 set_cpus_allowed_ptr(current, &oldmask); 1175 set_cpus_allowed_ptr(current, &oldmask);
1183 1176
1184 if (cpu_family == CPU_HW_PSTATE) 1177 if (cpu_family == CPU_HW_PSTATE)
1185 pol->cpus = *newmask; 1178 pol->cpus = cpumask_of_cpu(pol->cpu);
1186 else 1179 else
1187 pol->cpus = per_cpu(cpu_core_map, pol->cpu); 1180 pol->cpus = per_cpu(cpu_core_map, pol->cpu);
1188 data->available_cores = &(pol->cpus); 1181 data->available_cores = &(pol->cpus);
@@ -1248,7 +1241,6 @@ static unsigned int powernowk8_get (unsigned int cpu)
1248{ 1241{
1249 struct powernow_k8_data *data; 1242 struct powernow_k8_data *data;
1250 cpumask_t oldmask = current->cpus_allowed; 1243 cpumask_t oldmask = current->cpus_allowed;
1251 cpumask_of_cpu_ptr(newmask, cpu);
1252 unsigned int khz = 0; 1244 unsigned int khz = 0;
1253 unsigned int first; 1245 unsigned int first;
1254 1246
@@ -1258,7 +1250,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
1258 if (!data) 1250 if (!data)
1259 return -EINVAL; 1251 return -EINVAL;
1260 1252
1261 set_cpus_allowed_ptr(current, newmask); 1253 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
1262 if (smp_processor_id() != cpu) { 1254 if (smp_processor_id() != cpu) {
1263 printk(KERN_ERR PFX 1255 printk(KERN_ERR PFX
1264 "limiting to CPU %d failed in powernowk8_get\n", cpu); 1256 "limiting to CPU %d failed in powernowk8_get\n", cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index ca2ac13b7af2..3b5f06423e77 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,7 +26,7 @@
26#include <asm/cpufeature.h> 26#include <asm/cpufeature.h>
27 27
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@vger.kernel.org"
30 30
31#define dprintk(msg...) \ 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
@@ -324,10 +324,9 @@ static unsigned int get_cur_freq(unsigned int cpu)
324 unsigned l, h; 324 unsigned l, h;
325 unsigned clock_freq; 325 unsigned clock_freq;
326 cpumask_t saved_mask; 326 cpumask_t saved_mask;
327 cpumask_of_cpu_ptr(new_mask, cpu);
328 327
329 saved_mask = current->cpus_allowed; 328 saved_mask = current->cpus_allowed;
330 set_cpus_allowed_ptr(current, new_mask); 329 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
331 if (smp_processor_id() != cpu) 330 if (smp_processor_id() != cpu)
332 return 0; 331 return 0;
333 332
@@ -585,15 +584,12 @@ static int centrino_target (struct cpufreq_policy *policy,
585 * Best effort undo.. 584 * Best effort undo..
586 */ 585 */
587 586
588 if (!cpus_empty(*covered_cpus)) { 587 if (!cpus_empty(*covered_cpus))
589 cpumask_of_cpu_ptr_declare(new_mask);
590
591 for_each_cpu_mask_nr(j, *covered_cpus) { 588 for_each_cpu_mask_nr(j, *covered_cpus) {
592 cpumask_of_cpu_ptr_next(new_mask, j); 589 set_cpus_allowed_ptr(current,
593 set_cpus_allowed_ptr(current, new_mask); 590 &cpumask_of_cpu(j));
594 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 591 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
595 } 592 }
596 }
597 593
598 tmp = freqs.new; 594 tmp = freqs.new;
599 freqs.new = freqs.old; 595 freqs.new = freqs.old;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 2f3728dc24f6..191f7263c61d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -244,8 +244,7 @@ static unsigned int _speedstep_get(const cpumask_t *cpus)
244 244
245static unsigned int speedstep_get(unsigned int cpu) 245static unsigned int speedstep_get(unsigned int cpu)
246{ 246{
247 cpumask_of_cpu_ptr(newmask, cpu); 247 return _speedstep_get(&cpumask_of_cpu(cpu));
248 return _speedstep_get(newmask);
249} 248}
250 249
251/** 250/**
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 3fd7a67bb06a..ffd0f5ed071a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -15,13 +15,11 @@
15/* 15/*
16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU 16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
17 */ 17 */
18static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) 18static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
19{ 19{
20 unsigned char ccr2, ccr3; 20 unsigned char ccr2, ccr3;
21 unsigned long flags;
22 21
23 /* we test for DEVID by checking whether CCR3 is writable */ 22 /* we test for DEVID by checking whether CCR3 is writable */
24 local_irq_save(flags);
25 ccr3 = getCx86(CX86_CCR3); 23 ccr3 = getCx86(CX86_CCR3);
26 setCx86(CX86_CCR3, ccr3 ^ 0x80); 24 setCx86(CX86_CCR3, ccr3 ^ 0x80);
27 getCx86(0xc0); /* dummy to change bus */ 25 getCx86(0xc0); /* dummy to change bus */
@@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
44 *dir0 = getCx86(CX86_DIR0); 42 *dir0 = getCx86(CX86_DIR0);
45 *dir1 = getCx86(CX86_DIR1); 43 *dir1 = getCx86(CX86_DIR1);
46 } 44 }
47 local_irq_restore(flags);
48} 45}
49 46
47static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
48{
49 unsigned long flags;
50
51 local_irq_save(flags);
52 __do_cyrix_devid(dir0, dir1);
53 local_irq_restore(flags);
54}
50/* 55/*
51 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in 56 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
52 * order to identify the Cyrix CPU model after we're out of setup.c 57 * order to identify the Cyrix CPU model after we're out of setup.c
@@ -116,7 +121,7 @@ static void __cpuinit set_cx86_reorder(void)
116 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 121 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
117 122
118 /* Load/Store Serialize to mem access disable (=reorder it) */ 123 /* Load/Store Serialize to mem access disable (=reorder it) */
119 setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); 124 setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
120 /* set load/store serialize from 1GB to 4GB */ 125 /* set load/store serialize from 1GB to 4GB */
121 ccr3 |= 0xe0; 126 ccr3 |= 0xe0;
122 setCx86(CX86_CCR3, ccr3); 127 setCx86(CX86_CCR3, ccr3);
@@ -127,28 +132,11 @@ static void __cpuinit set_cx86_memwb(void)
127 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); 132 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
128 133
129 /* CCR2 bit 2: unlock NW bit */ 134 /* CCR2 bit 2: unlock NW bit */
130 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); 135 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
131 /* set 'Not Write-through' */ 136 /* set 'Not Write-through' */
132 write_cr0(read_cr0() | X86_CR0_NW); 137 write_cr0(read_cr0() | X86_CR0_NW);
133 /* CCR2 bit 2: lock NW bit and set WT1 */ 138 /* CCR2 bit 2: lock NW bit and set WT1 */
134 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); 139 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
135}
136
137static void __cpuinit set_cx86_inc(void)
138{
139 unsigned char ccr3;
140
141 printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n");
142
143 ccr3 = getCx86(CX86_CCR3);
144 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
145 /* PCR1 -- Performance Control */
146 /* Incrementor on, whatever that is */
147 setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02);
148 /* PCR0 -- Performance Control */
149 /* Incrementor Margin 10 */
150 setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
151 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
152} 140}
153 141
154/* 142/*
@@ -162,23 +150,40 @@ static void __cpuinit geode_configure(void)
162 local_irq_save(flags); 150 local_irq_save(flags);
163 151
164 /* Suspend on halt power saving and enable #SUSP pin */ 152 /* Suspend on halt power saving and enable #SUSP pin */
165 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); 153 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
166 154
167 ccr3 = getCx86(CX86_CCR3); 155 ccr3 = getCx86(CX86_CCR3);
168 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 156 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
169 157
170 158
171 /* FPU fast, DTE cache, Mem bypass */ 159 /* FPU fast, DTE cache, Mem bypass */
172 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); 160 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
173 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 161 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
174 162
175 set_cx86_memwb(); 163 set_cx86_memwb();
176 set_cx86_reorder(); 164 set_cx86_reorder();
177 set_cx86_inc();
178 165
179 local_irq_restore(flags); 166 local_irq_restore(flags);
180} 167}
181 168
169static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
170{
171 unsigned char dir0, dir0_msn, dir1 = 0;
172
173 __do_cyrix_devid(&dir0, &dir1);
174 dir0_msn = dir0 >> 4; /* identifies CPU "family" */
175
176 switch (dir0_msn) {
177 case 3: /* 6x86/6x86L */
178 /* Emulate MTRRs using Cyrix's ARRs. */
179 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
180 break;
181 case 5: /* 6x86MX/M II */
182 /* Emulate MTRRs using Cyrix's ARRs. */
183 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
184 break;
185 }
186}
182 187
183static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) 188static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
184{ 189{
@@ -286,7 +291,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
286 /* GXm supports extended cpuid levels 'ala' AMD */ 291 /* GXm supports extended cpuid levels 'ala' AMD */
287 if (c->cpuid_level == 2) { 292 if (c->cpuid_level == 2) {
288 /* Enable cxMMX extensions (GX1 Datasheet 54) */ 293 /* Enable cxMMX extensions (GX1 Datasheet 54) */
289 setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); 294 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
290 295
291 /* 296 /*
292 * GXm : 0x30 ... 0x5f GXm datasheet 51 297 * GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -296,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
296 */ 301 */
297 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) 302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
298 geode_configure(); 303 geode_configure();
299 get_model_name(c); /* get CPU marketing name */
300 return; 304 return;
301 } else { /* MediaGX */ 305 } else { /* MediaGX */
302 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; 306 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
@@ -309,7 +313,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
309 if (dir1 > 7) { 313 if (dir1 > 7) {
310 dir0_msn++; /* M II */ 314 dir0_msn++; /* M II */
311 /* Enable MMX extensions (App note 108) */ 315 /* Enable MMX extensions (App note 108) */
312 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); 316 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
313 } else { 317 } else {
314 c->coma_bug = 1; /* 6x86MX, it has the bug. */ 318 c->coma_bug = 1; /* 6x86MX, it has the bug. */
315 } 319 }
@@ -424,7 +428,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
424 local_irq_save(flags); 428 local_irq_save(flags);
425 ccr3 = getCx86(CX86_CCR3); 429 ccr3 = getCx86(CX86_CCR3);
426 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 430 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
427 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ 431 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */
428 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 432 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
429 local_irq_restore(flags); 433 local_irq_restore(flags);
430 } 434 }
@@ -434,16 +438,19 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
434static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { 438static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
435 .c_vendor = "Cyrix", 439 .c_vendor = "Cyrix",
436 .c_ident = { "CyrixInstead" }, 440 .c_ident = { "CyrixInstead" },
441 .c_early_init = early_init_cyrix,
437 .c_init = init_cyrix, 442 .c_init = init_cyrix,
438 .c_identify = cyrix_identify, 443 .c_identify = cyrix_identify,
444 .c_x86_vendor = X86_VENDOR_CYRIX,
439}; 445};
440 446
441cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev); 447cpu_dev_register(cyrix_cpu_dev);
442 448
443static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 449static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
444 .c_vendor = "NSC", 450 .c_vendor = "NSC",
445 .c_ident = { "Geode by NSC" }, 451 .c_ident = { "Geode by NSC" },
446 .c_init = init_nsc, 452 .c_init = init_nsc,
453 .c_x86_vendor = X86_VENDOR_NSC,
447}; 454};
448 455
449cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev); 456cpu_dev_register(nsc_cpu_dev);
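Splitting the DEVID probe into a raw __do_cyrix_devid() plus an irq-saving do_cyrix_devid() wrapper lets the new early_init_cyrix() hook call the raw helper directly (presumably because that early path already runs with interrupts off), while later callers keep the protected wrapper; the early hook then flags 6x86 and 6x86MX parts with X86_FEATURE_CYRIX_ARR so their ARR-based MTRR emulation is known from the start. The general idiom, as a sketch rather than the Cyrix-specific code:

/* Raw helper: touches the hardware; callers guarantee it runs safely
 * (typically with interrupts disabled). */
static void __probe_device_id(unsigned char *dir0, unsigned char *dir1)
{
	/* read the identification registers here */
}

/* Thin wrapper adding the irq discipline for ordinary callers. */
static void probe_device_id(unsigned char *dir0, unsigned char *dir1)
{
	unsigned long flags;

	local_irq_save(flags);
	__probe_device_id(dir0, dir1);
	local_irq_restore(flags);
}
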
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
deleted file mode 100644
index e43ad4ad4cba..000000000000
--- a/arch/x86/kernel/cpu/feature_names.c
+++ /dev/null
@@ -1,83 +0,0 @@
1/*
2 * Strings for the various x86 capability flags.
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9/*
10 * These flag bits must match the definitions in <asm/cpufeature.h>.
11 * NULL means this bit is undefined or reserved; either way it doesn't
12 * have meaning as far as Linux is concerned. Note that it's important
13 * to realize there is a difference between this table and CPUID -- if
14 * applications want to get the raw CPUID data, they should access
15 * /dev/cpu/<cpu_nr>/cpuid instead.
16 */
17const char * const x86_cap_flags[NCAPINTS*32] = {
18 /* Intel-defined */
19 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
20 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
21 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
22 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
23
24 /* AMD-defined */
25 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
26 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
27 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
28 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
29 "3dnowext", "3dnow",
30
31 /* Transmeta-defined */
32 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
33 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
34 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36
37 /* Other (Linux-defined) */
38 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
39 NULL, NULL, NULL, NULL,
40 "constant_tsc", "up", NULL, "arch_perfmon",
41 "pebs", "bts", NULL, NULL,
42 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
43 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
44
45 /* Intel-defined (#2) */
46 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
47 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
48 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
49 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
50
51 /* VIA/Cyrix/Centaur-defined */
52 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
53 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
54 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
55 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
56
57 /* AMD-defined (#2) */
58 "lahf_lm", "cmp_legacy", "svm", "extapic",
59 "cr8_legacy", "abm", "sse4a", "misalignsse",
60 "3dnowprefetch", "osvw", "ibs", "sse5",
61 "skinit", "wdt", NULL, NULL,
62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64
65 /* Auxiliary (Linux-defined) */
66 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
67 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
69 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
70};
71
72const char *const x86_power_flags[32] = {
73 "ts", /* temperature sensor */
74 "fid", /* frequency id control */
75 "vid", /* voltage id control */
76 "ttp", /* thermal trip */
77 "tm",
78 "stc",
79 "100mhzsteps",
80 "hwpstate",
81 "", /* tsc invariant mapped to constant_tsc */
82 /* nothing */
83};
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b75f2569b8f8..99468dbd08da 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17 17
18#ifdef CONFIG_X86_64
19#include <asm/topology.h>
20#include <asm/numa_64.h>
21#endif
22
18#include "cpu.h" 23#include "cpu.h"
19 24
20#ifdef CONFIG_X86_LOCAL_APIC 25#ifdef CONFIG_X86_LOCAL_APIC
@@ -23,23 +28,22 @@
23#include <mach_apic.h> 28#include <mach_apic.h>
24#endif 29#endif
25 30
26#ifdef CONFIG_X86_INTEL_USERCOPY
27/*
28 * Alignment at which movsl is preferred for bulk memory copies.
29 */
30struct movsl_mask movsl_mask __read_mostly;
31#endif
32
33static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
34{ 32{
35 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
36 if (c->x86 == 15 && c->x86_cache_alignment == 64)
37 c->x86_cache_alignment = 128;
38 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 33 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
39 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 34 (c->x86 == 0x6 && c->x86_model >= 0x0e))
40 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 35 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
36
37#ifdef CONFIG_X86_64
38 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
39#else
40 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
41 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128;
43#endif
41} 44}
42 45
46#ifdef CONFIG_X86_32
43/* 47/*
44 * Early probe support logic for ppro memory erratum #50 48 * Early probe support logic for ppro memory erratum #50
45 * 49 *
@@ -59,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void)
59 return 0; 63 return 0;
60} 64}
61 65
66#ifdef CONFIG_X86_F00F_BUG
67static void __cpuinit trap_init_f00f_bug(void)
68{
69 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
62 70
63/* 71 /*
64 * P4 Xeon errata 037 workaround. 72 * Update the IDT descriptor and reload the IDT so that
65 * Hardware prefetcher may cause stale data to be loaded into the cache. 73 * it uses the read-only mapped virtual address.
66 */ 74 */
67static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) 75 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
76 load_idt(&idt_descr);
77}
78#endif
79
80static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
68{ 81{
69 unsigned long lo, hi; 82 unsigned long lo, hi;
70 83
84#ifdef CONFIG_X86_F00F_BUG
85 /*
86 * All current models of Pentium and Pentium with MMX technology CPUs
87 * have the F0 0F bug, which lets nonprivileged users lock up the system.
88 * Note that the workaround only should be initialized once...
89 */
90 c->f00f_bug = 0;
91 if (!paravirt_enabled() && c->x86 == 5) {
92 static int f00f_workaround_enabled;
93
94 c->f00f_bug = 1;
95 if (!f00f_workaround_enabled) {
96 trap_init_f00f_bug();
97 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
98 f00f_workaround_enabled = 1;
99 }
100 }
101#endif
102
103 /*
104 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
105 * model 3 mask 3
106 */
107 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
108 clear_cpu_cap(c, X86_FEATURE_SEP);
109
110 /*
111 * P4 Xeon errata 037 workaround.
112 * Hardware prefetcher may cause stale data to be loaded into the cache.
113 */
71 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { 114 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
72 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); 115 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
73 if ((lo & (1<<9)) == 0) { 116 if ((lo & (1<<9)) == 0) {
@@ -77,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
77 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 120 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
78 } 121 }
79 } 122 }
123
124 /*
125 * See if we have a good local APIC by checking for buggy Pentia,
126 * i.e. all B steppings and the C2 stepping of P54C when using their
127 * integrated APIC (see 11AP erratum in "Pentium Processor
128 * Specification Update").
129 */
130 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
131 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
132 set_cpu_cap(c, X86_FEATURE_11AP);
133
134
135#ifdef CONFIG_X86_INTEL_USERCOPY
136 /*
137 * Set up the preferred alignment for movsl bulk memory moves
138 */
139 switch (c->x86) {
140 case 4: /* 486: untested */
141 break;
142 case 5: /* Old Pentia: untested */
143 break;
144 case 6: /* PII/PIII only like movsl with 8-byte alignment */
145 movsl_mask.mask = 7;
146 break;
147 case 15: /* P4 is OK down to 8-byte alignment */
148 movsl_mask.mask = 7;
149 break;
150 }
151#endif
152
153#ifdef CONFIG_X86_NUMAQ
154 numaq_tsc_disable();
155#endif
80} 156}
157#else
158static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
159{
160}
161#endif
81 162
163static void __cpuinit srat_detect_node(void)
164{
165#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
166 unsigned node;
167 int cpu = smp_processor_id();
168 int apicid = hard_smp_processor_id();
169
170 /* Don't do the funky fallback heuristics the AMD version employs
171 for now. */
172 node = apicid_to_node[apicid];
173 if (node == NUMA_NO_NODE || !node_online(node))
174 node = first_node(node_online_map);
175 numa_set_node(cpu, node);
176
177 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
178#endif
179}
82 180
83/* 181/*
84 * find out the number of processor cores on the die 182 * find out the number of processor cores on the die
85 */ 183 */
86static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) 184static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
87{ 185{
88 unsigned int eax, ebx, ecx, edx; 186 unsigned int eax, ebx, ecx, edx;
89 187
@@ -98,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
98 return 1; 196 return 1;
99} 197}
100 198
101#ifdef CONFIG_X86_F00F_BUG 199static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
102static void __cpuinit trap_init_f00f_bug(void)
103{ 200{
104 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 201 /* Intel VMX MSR indicated features */
105 202#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
106 /* 203#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
107 * Update the IDT descriptor and reload the IDT so that 204#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
108 * it uses the read-only mapped virtual address. 205#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
109 */ 206#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
110 idt_descr.address = fix_to_virt(FIX_F00F_IDT); 207#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
111 load_idt(&idt_descr); 208
209 u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
210
211 clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
212 clear_cpu_cap(c, X86_FEATURE_VNMI);
213 clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
214 clear_cpu_cap(c, X86_FEATURE_EPT);
215 clear_cpu_cap(c, X86_FEATURE_VPID);
216
217 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
218 msr_ctl = vmx_msr_high | vmx_msr_low;
219 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
220 set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
221 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
222 set_cpu_cap(c, X86_FEATURE_VNMI);
223 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
224 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
225 vmx_msr_low, vmx_msr_high);
226 msr_ctl2 = vmx_msr_high | vmx_msr_low;
227 if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
228 (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
229 set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
230 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
231 set_cpu_cap(c, X86_FEATURE_EPT);
232 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
233 set_cpu_cap(c, X86_FEATURE_VPID);
234 }
112} 235}
113#endif
114 236
115static void __cpuinit init_intel(struct cpuinfo_x86 *c) 237static void __cpuinit init_intel(struct cpuinfo_x86 *c)
116{ 238{
117 unsigned int l2 = 0; 239 unsigned int l2 = 0;
118 char *p = NULL;
119 240
120 early_init_intel(c); 241 early_init_intel(c);
121 242
122#ifdef CONFIG_X86_F00F_BUG 243 intel_workarounds(c);
123 /*
124 * All current models of Pentium and Pentium with MMX technology CPUs
125 * have the F0 0F bug, which lets nonprivileged users lock up the system.
126 * Note that the workaround only should be initialized once...
127 */
128 c->f00f_bug = 0;
129 if (!paravirt_enabled() && c->x86 == 5) {
130 static int f00f_workaround_enabled;
131
132 c->f00f_bug = 1;
133 if (!f00f_workaround_enabled) {
134 trap_init_f00f_bug();
135 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
136 f00f_workaround_enabled = 1;
137 }
138 }
139#endif
140 244
141 l2 = init_intel_cacheinfo(c); 245 l2 = init_intel_cacheinfo(c);
142 if (c->cpuid_level > 9) { 246 if (c->cpuid_level > 9) {
@@ -146,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
146 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 250 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
147 } 251 }
148 252
149 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ 253 if (cpu_has_xmm2)
150 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) 254 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
151 clear_cpu_cap(c, X86_FEATURE_SEP); 255 if (cpu_has_ds) {
256 unsigned int l1;
257 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
258 if (!(l1 & (1<<11)))
259 set_cpu_cap(c, X86_FEATURE_BTS);
260 if (!(l1 & (1<<12)))
261 set_cpu_cap(c, X86_FEATURE_PEBS);
262 ds_init_intel(c);
263 }
152 264
265#ifdef CONFIG_X86_64
266 if (c->x86 == 15)
267 c->x86_cache_alignment = c->x86_clflush_size * 2;
268 if (c->x86 == 6)
269 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
270#else
153 /* 271 /*
154 * Names for the Pentium II/Celeron processors 272 * Names for the Pentium II/Celeron processors
155 * detectable only by also checking the cache size. 273 * detectable only by also checking the cache size.
156 * Dixon is NOT a Celeron. 274 * Dixon is NOT a Celeron.
157 */ 275 */
158 if (c->x86 == 6) { 276 if (c->x86 == 6) {
277 char *p = NULL;
278
159 switch (c->x86_model) { 279 switch (c->x86_model) {
160 case 5: 280 case 5:
161 if (c->x86_mask == 0) { 281 if (c->x86_mask == 0) {
@@ -178,70 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
178 p = "Celeron (Coppermine)"; 298 p = "Celeron (Coppermine)";
179 break; 299 break;
180 } 300 }
181 }
182
183 if (p)
184 strcpy(c->x86_model_id, p);
185
186 c->x86_max_cores = num_cpu_cores(c);
187
188 detect_ht(c);
189 301
190 /* Work around errata */ 302 if (p)
191 Intel_errata_workarounds(c); 303 strcpy(c->x86_model_id, p);
192
193#ifdef CONFIG_X86_INTEL_USERCOPY
194 /*
195 * Set up the preferred alignment for movsl bulk memory moves
196 */
197 switch (c->x86) {
198 case 4: /* 486: untested */
199 break;
200 case 5: /* Old Pentia: untested */
201 break;
202 case 6: /* PII/PIII only like movsl with 8-byte alignment */
203 movsl_mask.mask = 7;
204 break;
205 case 15: /* P4 is OK down to 8-byte alignment */
206 movsl_mask.mask = 7;
207 break;
208 } 304 }
209#endif
210 305
211 if (cpu_has_xmm2) 306 if (c->x86 == 15)
212 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
213 if (c->x86 == 15) {
214 set_cpu_cap(c, X86_FEATURE_P4); 307 set_cpu_cap(c, X86_FEATURE_P4);
215 }
216 if (c->x86 == 6) 308 if (c->x86 == 6)
217 set_cpu_cap(c, X86_FEATURE_P3); 309 set_cpu_cap(c, X86_FEATURE_P3);
218 if (cpu_has_ds) {
219 unsigned int l1;
220 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
221 if (!(l1 & (1<<11)))
222 set_cpu_cap(c, X86_FEATURE_BTS);
223 if (!(l1 & (1<<12)))
224 set_cpu_cap(c, X86_FEATURE_PEBS);
225 }
226 310
227 if (cpu_has_bts) 311 if (cpu_has_bts)
228 ds_init_intel(c); 312 ptrace_bts_init_intel(c);
229 313
230 /* 314#endif
231 * See if we have a good local APIC by checking for buggy Pentia,
232 * i.e. all B steppings and the C2 stepping of P54C when using their
233 * integrated APIC (see 11AP erratum in "Pentium Processor
234 * Specification Update").
235 */
236 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
237 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
238 set_cpu_cap(c, X86_FEATURE_11AP);
239 315
240#ifdef CONFIG_X86_NUMAQ 316 detect_extended_topology(c);
241 numaq_tsc_disable(); 317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
318 /*
319 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
320 * detection.
321 */
322 c->x86_max_cores = intel_num_cpu_cores(c);
323#ifdef CONFIG_X86_32
324 detect_ht(c);
242#endif 325#endif
326 }
327
328 /* Work around errata */
329 srat_detect_node();
330
331 if (cpu_has(c, X86_FEATURE_VMX))
332 detect_vmx_virtcap(c);
243} 333}
244 334
335#ifdef CONFIG_X86_32
245static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) 336static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
246{ 337{
247 /* 338 /*
@@ -254,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
254 size = 256; 345 size = 256;
255 return size; 346 return size;
256} 347}
348#endif
257 349
258static struct cpu_dev intel_cpu_dev __cpuinitdata = { 350static struct cpu_dev intel_cpu_dev __cpuinitdata = {
259 .c_vendor = "Intel", 351 .c_vendor = "Intel",
260 .c_ident = { "GenuineIntel" }, 352 .c_ident = { "GenuineIntel" },
353#ifdef CONFIG_X86_32
261 .c_models = { 354 .c_models = {
262 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = 355 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
263 { 356 {
@@ -307,76 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
307 } 400 }
308 }, 401 },
309 }, 402 },
403 .c_size_cache = intel_size_cache,
404#endif
310 .c_early_init = early_init_intel, 405 .c_early_init = early_init_intel,
311 .c_init = init_intel, 406 .c_init = init_intel,
312 .c_size_cache = intel_size_cache, 407 .c_x86_vendor = X86_VENDOR_INTEL,
313}; 408};
314 409
315cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); 410cpu_dev_register(intel_cpu_dev);
316
317#ifndef CONFIG_X86_CMPXCHG
318unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
319{
320 u8 prev;
321 unsigned long flags;
322
323 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
324 local_irq_save(flags);
325 prev = *(u8 *)ptr;
326 if (prev == old)
327 *(u8 *)ptr = new;
328 local_irq_restore(flags);
329 return prev;
330}
331EXPORT_SYMBOL(cmpxchg_386_u8);
332
333unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
334{
335 u16 prev;
336 unsigned long flags;
337
338 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
339 local_irq_save(flags);
340 prev = *(u16 *)ptr;
341 if (prev == old)
342 *(u16 *)ptr = new;
343 local_irq_restore(flags);
344 return prev;
345}
346EXPORT_SYMBOL(cmpxchg_386_u16);
347
348unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
349{
350 u32 prev;
351 unsigned long flags;
352
353 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
354 local_irq_save(flags);
355 prev = *(u32 *)ptr;
356 if (prev == old)
357 *(u32 *)ptr = new;
358 local_irq_restore(flags);
359 return prev;
360}
361EXPORT_SYMBOL(cmpxchg_386_u32);
362#endif
363
364#ifndef CONFIG_X86_CMPXCHG64
365unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
366{
367 u64 prev;
368 unsigned long flags;
369
370 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
371 local_irq_save(flags);
372 prev = *(u64 *)ptr;
373 if (prev == old)
374 *(u64 *)ptr = new;
375 local_irq_restore(flags);
376 return prev;
377}
378EXPORT_SYMBOL(cmpxchg_486_u64);
379#endif
380
381/* arch_initcall(intel_cpu_init); */
382 411
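Once detect_vmx_virtcap() has filled in the synthetic flags, other code can test hardware virtualization capabilities with the ordinary cpufeature helpers instead of re-reading the VMX capability MSRs. A small usage sketch; report_vmx_caps() itself is illustrative:

static void __cpuinit report_vmx_caps(struct cpuinfo_x86 *c)
{
	if (!cpu_has(c, X86_FEATURE_VMX))
		return;

	/* flags set by detect_vmx_virtcap() above */
	if (cpu_has(c, X86_FEATURE_EPT))
		printk(KERN_INFO "VMX: EPT supported\n");
	if (cpu_has(c, X86_FEATURE_VPID))
		printk(KERN_INFO "VMX: VPID supported\n");
	if (cpu_has(c, X86_FEATURE_FLEXPRIORITY))
		printk(KERN_INFO "VMX: FlexPriority (TPR shadow + virtual APIC) supported\n");
}
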
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
deleted file mode 100644
index 1019c58d39f0..000000000000
--- a/arch/x86/kernel/cpu/intel_64.c
+++ /dev/null
@@ -1,95 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3#include <asm/processor.h>
4#include <asm/ptrace.h>
5#include <asm/topology.h>
6#include <asm/numa_64.h>
7
8#include "cpu.h"
9
10static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
11{
12 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
13 (c->x86 == 0x6 && c->x86_model >= 0x0e))
14 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15
16 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
17}
18
19/*
20 * find out the number of processor cores on the die
21 */
22static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
23{
24 unsigned int eax, t;
25
26 if (c->cpuid_level < 4)
27 return 1;
28
29 cpuid_count(4, 0, &eax, &t, &t, &t);
30
31 if (eax & 0x1f)
32 return ((eax >> 26) + 1);
33 else
34 return 1;
35}
36
37static void __cpuinit srat_detect_node(void)
38{
39#ifdef CONFIG_NUMA
40 unsigned node;
41 int cpu = smp_processor_id();
42 int apicid = hard_smp_processor_id();
43
44 /* Don't do the funky fallback heuristics the AMD version employs
45 for now. */
46 node = apicid_to_node[apicid];
47 if (node == NUMA_NO_NODE || !node_online(node))
48 node = first_node(node_online_map);
49 numa_set_node(cpu, node);
50
51 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
52#endif
53}
54
55static void __cpuinit init_intel(struct cpuinfo_x86 *c)
56{
57 init_intel_cacheinfo(c);
58 if (c->cpuid_level > 9) {
59 unsigned eax = cpuid_eax(10);
60 /* Check for version and the number of counters */
61 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
62 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
63 }
64
65 if (cpu_has_ds) {
66 unsigned int l1, l2;
67 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
68 if (!(l1 & (1<<11)))
69 set_cpu_cap(c, X86_FEATURE_BTS);
70 if (!(l1 & (1<<12)))
71 set_cpu_cap(c, X86_FEATURE_PEBS);
72 }
73
74
75 if (cpu_has_bts)
76 ds_init_intel(c);
77
78 if (c->x86 == 15)
79 c->x86_cache_alignment = c->x86_clflush_size * 2;
80 if (c->x86 == 6)
81 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
82 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
83 c->x86_max_cores = intel_num_cpu_cores(c);
84
85 srat_detect_node();
86}
87
88static struct cpu_dev intel_cpu_dev __cpuinitdata = {
89 .c_vendor = "Intel",
90 .c_ident = { "GenuineIntel" },
91 .c_early_init = early_init_intel,
92 .c_init = init_intel,
93};
94cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
95
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 650d40f7912b..3f46afbb1cf1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Routines to indentify caches on Intel CPU. 2 * Routines to indentify caches on Intel CPU.
3 * 3 *
4 * Changes: 4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4) 5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. 6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */ 8 */
@@ -13,6 +13,7 @@
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/pci.h>
16 17
17#include <asm/processor.h> 18#include <asm/processor.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
@@ -130,9 +131,18 @@ struct _cpuid4_info {
130 union _cpuid4_leaf_ebx ebx; 131 union _cpuid4_leaf_ebx ebx;
131 union _cpuid4_leaf_ecx ecx; 132 union _cpuid4_leaf_ecx ecx;
132 unsigned long size; 133 unsigned long size;
134 unsigned long can_disable;
133 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ 135 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
134}; 136};
135 137
138#ifdef CONFIG_PCI
139static struct pci_device_id k8_nb_id[] = {
140 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
141 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
142 {}
143};
144#endif
145
136unsigned short num_cache_leaves; 146unsigned short num_cache_leaves;
137 147
138/* AMD doesn't have CPUID4. Emulate it here to report the same 148/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -182,9 +192,10 @@ static unsigned short assocs[] __cpuinitdata = {
182static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; 192static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
183static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; 193static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
184 194
185static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 195static void __cpuinit
186 union _cpuid4_leaf_ebx *ebx, 196amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
187 union _cpuid4_leaf_ecx *ecx) 197 union _cpuid4_leaf_ebx *ebx,
198 union _cpuid4_leaf_ecx *ecx)
188{ 199{
189 unsigned dummy; 200 unsigned dummy;
190 unsigned line_size, lines_per_tag, assoc, size_in_kb; 201 unsigned line_size, lines_per_tag, assoc, size_in_kb;
@@ -251,27 +262,40 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
251 (ebx->split.ways_of_associativity + 1) - 1; 262 (ebx->split.ways_of_associativity + 1) - 1;
252} 263}
253 264
254static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 265static void __cpuinit
266amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
267{
268 if (index < 3)
269 return;
270 this_leaf->can_disable = 1;
271}
272
273static int
274__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
255{ 275{
256 union _cpuid4_leaf_eax eax; 276 union _cpuid4_leaf_eax eax;
257 union _cpuid4_leaf_ebx ebx; 277 union _cpuid4_leaf_ebx ebx;
258 union _cpuid4_leaf_ecx ecx; 278 union _cpuid4_leaf_ecx ecx;
259 unsigned edx; 279 unsigned edx;
260 280
261 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 281 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
262 amd_cpuid4(index, &eax, &ebx, &ecx); 282 amd_cpuid4(index, &eax, &ebx, &ecx);
263 else 283 if (boot_cpu_data.x86 >= 0x10)
264 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 284 amd_check_l3_disable(index, this_leaf);
285 } else {
286 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
287 }
288
265 if (eax.split.type == CACHE_TYPE_NULL) 289 if (eax.split.type == CACHE_TYPE_NULL)
266 return -EIO; /* better error ? */ 290 return -EIO; /* better error ? */
267 291
268 this_leaf->eax = eax; 292 this_leaf->eax = eax;
269 this_leaf->ebx = ebx; 293 this_leaf->ebx = ebx;
270 this_leaf->ecx = ecx; 294 this_leaf->ecx = ecx;
271 this_leaf->size = (ecx.split.number_of_sets + 1) * 295 this_leaf->size = (ecx.split.number_of_sets + 1) *
272 (ebx.split.coherency_line_size + 1) * 296 (ebx.split.coherency_line_size + 1) *
273 (ebx.split.physical_line_partition + 1) * 297 (ebx.split.physical_line_partition + 1) *
274 (ebx.split.ways_of_associativity + 1); 298 (ebx.split.ways_of_associativity + 1);
275 return 0; 299 return 0;
276} 300}
277 301
@@ -453,7 +477,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
453 477
454/* pointer to _cpuid4_info array (for each cache leaf) */ 478/* pointer to _cpuid4_info array (for each cache leaf) */
455static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 479static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
456#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 480#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
457 481
458#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
459static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) 483static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -490,7 +514,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
490 514
491 this_leaf = CPUID4_INFO_IDX(cpu, index); 515 this_leaf = CPUID4_INFO_IDX(cpu, index);
492 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { 516 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
493 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 517 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
494 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 518 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
495 } 519 }
496} 520}
@@ -516,7 +540,6 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
516 unsigned long j; 540 unsigned long j;
517 int retval; 541 int retval;
518 cpumask_t oldmask; 542 cpumask_t oldmask;
519 cpumask_of_cpu_ptr(newmask, cpu);
520 543
521 if (num_cache_leaves == 0) 544 if (num_cache_leaves == 0)
522 return -ENOENT; 545 return -ENOENT;
@@ -527,7 +550,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
527 return -ENOMEM; 550 return -ENOMEM;
528 551
529 oldmask = current->cpus_allowed; 552 oldmask = current->cpus_allowed;
530 retval = set_cpus_allowed_ptr(current, newmask); 553 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
531 if (retval) 554 if (retval)
532 goto out; 555 goto out;
533 556
@@ -573,7 +596,7 @@ struct _index_kobject {
573 596
574/* pointer to array of kobjects for cpuX/cache/indexY */ 597/* pointer to array of kobjects for cpuX/cache/indexY */
575static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); 598static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
576#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) 599#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
577 600
578#define show_one_plus(file_name, object, val) \ 601#define show_one_plus(file_name, object, val) \
579static ssize_t show_##file_name \ 602static ssize_t show_##file_name \
@@ -638,6 +661,99 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
638 } 661 }
639} 662}
640 663
664#define to_object(k) container_of(k, struct _index_kobject, kobj)
665#define to_attr(a) container_of(a, struct _cache_attr, attr)
666
667#ifdef CONFIG_PCI
668static struct pci_dev *get_k8_northbridge(int node)
669{
670 struct pci_dev *dev = NULL;
671 int i;
672
673 for (i = 0; i <= node; i++) {
674 do {
675 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
676 if (!dev)
677 break;
678 } while (!pci_match_id(&k8_nb_id[0], dev));
679 if (!dev)
680 break;
681 }
682 return dev;
683}
684#else
685static struct pci_dev *get_k8_northbridge(int node)
686{
687 return NULL;
688}
689#endif
690
691static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
692{
693 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
694 struct pci_dev *dev = NULL;
695 ssize_t ret = 0;
696 int i;
697
698 if (!this_leaf->can_disable)
699 return sprintf(buf, "Feature not enabled\n");
700
701 dev = get_k8_northbridge(node);
702 if (!dev) {
703 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
704 return -EINVAL;
705 }
706
707 for (i = 0; i < 2; i++) {
708 unsigned int reg;
709
710 pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
711
712 ret += sprintf(buf, "%sEntry: %d\n", buf, i);
713 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n",
714 buf,
715 reg & 0x80000000 ? "Disabled" : "Allowed",
716 reg & 0x40000000 ? "Disabled" : "Allowed");
717 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
718 buf, (reg & 0x30000) >> 16, reg & 0xfff);
719 }
720 return ret;
721}
722
723static ssize_t
724store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
725 size_t count)
726{
727 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
728 struct pci_dev *dev = NULL;
729 unsigned int ret, index, val;
730
731 if (!this_leaf->can_disable)
732 return 0;
733
734 if (strlen(buf) > 15)
735 return -EINVAL;
736
737 ret = sscanf(buf, "%x %x", &index, &val);
738 if (ret != 2)
739 return -EINVAL;
740 if (index > 1)
741 return -EINVAL;
742
743 val |= 0xc0000000;
744 dev = get_k8_northbridge(node);
745 if (!dev) {
746 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
747 return -EINVAL;
748 }
749
750 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
751 wbinvd();
752 pci_write_config_dword(dev, 0x1BC + index * 4, val);
753
754 return 1;
755}
756
641struct _cache_attr { 757struct _cache_attr {
642 struct attribute attr; 758 struct attribute attr;
643 ssize_t (*show)(struct _cpuid4_info *, char *); 759 ssize_t (*show)(struct _cpuid4_info *, char *);
@@ -658,6 +774,8 @@ define_one_ro(size);
658define_one_ro(shared_cpu_map); 774define_one_ro(shared_cpu_map);
659define_one_ro(shared_cpu_list); 775define_one_ro(shared_cpu_list);
660 776
777static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
778
661static struct attribute * default_attrs[] = { 779static struct attribute * default_attrs[] = {
662 &type.attr, 780 &type.attr,
663 &level.attr, 781 &level.attr,
@@ -668,12 +786,10 @@ static struct attribute * default_attrs[] = {
668 &size.attr, 786 &size.attr,
669 &shared_cpu_map.attr, 787 &shared_cpu_map.attr,
670 &shared_cpu_list.attr, 788 &shared_cpu_list.attr,
789 &cache_disable.attr,
671 NULL 790 NULL
672}; 791};
673 792
674#define to_object(k) container_of(k, struct _index_kobject, kobj)
675#define to_attr(a) container_of(a, struct _cache_attr, attr)
676
677static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) 793static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
678{ 794{
679 struct _cache_attr *fattr = to_attr(attr); 795 struct _cache_attr *fattr = to_attr(attr);
@@ -683,14 +799,22 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
683 ret = fattr->show ? 799 ret = fattr->show ?
684 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 800 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
685 buf) : 801 buf) :
686 0; 802 0;
687 return ret; 803 return ret;
688} 804}
689 805
690static ssize_t store(struct kobject * kobj, struct attribute * attr, 806static ssize_t store(struct kobject * kobj, struct attribute * attr,
691 const char * buf, size_t count) 807 const char * buf, size_t count)
692{ 808{
693 return 0; 809 struct _cache_attr *fattr = to_attr(attr);
810 struct _index_kobject *this_leaf = to_object(kobj);
811 ssize_t ret;
812
813 ret = fattr->store ?
814 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
815 buf, count) :
816 0;
817 return ret;
694} 818}
695 819
696static struct sysfs_ops sysfs_ops = { 820static struct sysfs_ops sysfs_ops = {
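
The new cache_disable attribute dumps the two L3 index-disable registers in the K8 northbridge config space (0x1BC and 0x1C0) and accepts writes of the form "<entry> <value>", parsed with sscanf("%x %x"); only leaves with can_disable set (the L3 leaf, index 3 and up) honour it. A usage sketch; the sysfs path and the value written are illustrative assumptions:

#include <stdio.h>

int main(void)
{
	/* Assumed per-cpu L3 leaf path; adjust cpu/index for the machine. */
	const char *path =
		"/sys/devices/system/cpu/cpu0/cache/index3/cache_disable";
	FILE *f = fopen(path, "w");

	if (!f)
		return 1;
	/* entry 0 (register 0x1BC), illustrative cache-index value 0x3ff */
	fprintf(f, "0 3ff\n");
	fclose(f);
	return 0;
}
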
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 65a339678ece..4b031a4ac856 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -759,6 +759,7 @@ static struct sysdev_class mce_sysclass = {
759}; 759};
760 760
761DEFINE_PER_CPU(struct sys_device, device_mce); 761DEFINE_PER_CPU(struct sys_device, device_mce);
762void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
762 763
763/* Why are there no generic functions for this? */ 764/* Why are there no generic functions for this? */
764#define ACCESSOR(name, var, start) \ 765#define ACCESSOR(name, var, start) \
@@ -859,7 +860,7 @@ error:
859 return err; 860 return err;
860} 861}
861 862
862static void mce_remove_device(unsigned int cpu) 863static __cpuinit void mce_remove_device(unsigned int cpu)
863{ 864{
864 int i; 865 int i;
865 866
@@ -883,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
883 case CPU_ONLINE: 884 case CPU_ONLINE:
884 case CPU_ONLINE_FROZEN: 885 case CPU_ONLINE_FROZEN:
885 mce_create_device(cpu); 886 mce_create_device(cpu);
887 if (threshold_cpu_callback)
888 threshold_cpu_callback(action, cpu);
886 break; 889 break;
887 case CPU_DEAD: 890 case CPU_DEAD:
888 case CPU_DEAD_FROZEN: 891 case CPU_DEAD_FROZEN:
892 if (threshold_cpu_callback)
893 threshold_cpu_callback(action, cpu);
889 mce_remove_device(cpu); 894 mce_remove_device(cpu);
890 break; 895 break;
891 } 896 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 88736cadbaa6..5eb390a4b2e9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
628 deallocate_threshold_block(cpu, bank); 628 deallocate_threshold_block(cpu, bank);
629 629
630free_out: 630free_out:
631 kobject_del(b->kobj);
631 kobject_put(b->kobj); 632 kobject_put(b->kobj);
632 kfree(b); 633 kfree(b);
633 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
@@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu)
645} 646}
646 647
647/* get notified when a cpu comes on/off */ 648/* get notified when a cpu comes on/off */
648static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, 649static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action,
649 unsigned long action, void *hcpu) 650 unsigned int cpu)
650{ 651{
651 /* cpu was unsigned int to begin with */
652 unsigned int cpu = (unsigned long)hcpu;
653
654 if (cpu >= NR_CPUS) 652 if (cpu >= NR_CPUS)
655 goto out; 653 return;
656 654
657 switch (action) { 655 switch (action) {
658 case CPU_ONLINE: 656 case CPU_ONLINE:
@@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
666 default: 664 default:
667 break; 665 break;
668 } 666 }
669 out:
670 return NOTIFY_OK;
671} 667}
672 668
673static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
674 .notifier_call = threshold_cpu_callback,
675};
676
677static __init int threshold_init_device(void) 669static __init int threshold_init_device(void)
678{ 670{
679 unsigned lcpu = 0; 671 unsigned lcpu = 0;
@@ -684,7 +676,7 @@ static __init int threshold_init_device(void)
684 if (err) 676 if (err)
685 return err; 677 return err;
686 } 678 }
687 register_hotcpu_notifier(&threshold_cpu_notifier); 679 threshold_cpu_callback = amd_64_threshold_cpu_callback;
688 return 0; 680 return 0;
689} 681}
690 682
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
new file mode 100644
index 000000000000..dfea390e1608
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -0,0 +1,32 @@
1#!/usr/bin/perl
2#
3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
4#
5
6($in, $out) = @ARGV;
7
8open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
9open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
10
11print OUT "#include <asm/cpufeature.h>\n\n";
12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
13
14while (defined($line = <IN>)) {
15 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
16 $macro = $1;
17 $feature = $2;
18 $tail = $3;
19 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
20 $feature = $1;
21 }
22
23 if ($feature ne '') {
24 printf OUT "\t%-32s = \"%s\",\n",
25 "[$macro]", "\L$feature";
26 }
27 }
28}
29print OUT "};\n";
30
31close(IN);
32close(OUT);
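
The script scans cpufeature.h for X86_FEATURE_* defines, prefers a quoted name from the trailing comment when one is present, lowercases the result, and skips entries whose quoted name is empty. The generated file therefore consists of designated initializers roughly like the sketch below (the two feature lines are hypothetical examples, not quoted from cpufeature.h):

/* Illustrative excerpt of the generated table (entries are examples): */
#include <asm/cpufeature.h>

const char * const x86_cap_flags[NCAPINTS*32] = {
	[X86_FEATURE_FPU]	= "fpu",	/* lowercased macro suffix */
	[X86_FEATURE_XMM2]	= "sse2",	/* a quoted name in the comment wins */
};
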
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 509bd3d9eacd..4e8d77f01eeb 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -379,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
379 unsigned long *size, mtrr_type *type) 379 unsigned long *size, mtrr_type *type)
380{ 380{
381 unsigned int mask_lo, mask_hi, base_lo, base_hi; 381 unsigned int mask_lo, mask_hi, base_lo, base_hi;
382 unsigned int tmp, hi;
382 383
383 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 384 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
384 if ((mask_lo & 0x800) == 0) { 385 if ((mask_lo & 0x800) == 0) {
@@ -392,8 +393,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
392 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); 393 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
393 394
394 /* Work out the shifted address mask. */ 395 /* Work out the shifted address mask. */
395 mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) 396 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
396 | mask_lo >> PAGE_SHIFT; 397 mask_lo = size_or_mask | tmp;
398 /* Expand tmp with high bits to all 1s*/
399 hi = fls(tmp);
400 if (hi > 0) {
401 tmp |= ~((1<<(hi - 1)) - 1);
402
403 if (tmp != mask_lo) {
404 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
405 mask_lo = tmp;
406 }
407 }
397 408
398 /* This works correctly if size is a power of two, i.e. a 409 /* This works correctly if size is a power of two, i.e. a
399 contiguous range. */ 410 contiguous range. */
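
The widening works because fls() returns the 1-based position of the highest set bit, so ~((1 << (hi - 1)) - 1) covers that bit and everything above it. A worked, user-space check of the arithmetic (the sample value is illustrative; fls(x) equals 32 - __builtin_clz(x) for non-zero x):

#include <stdio.h>

int main(void)
{
	unsigned int tmp = 0x00ff0000;	/* mask whose high bits the BIOS left clear */
	unsigned int hi = 32 - __builtin_clz(tmp);	/* == fls(tmp) == 24 */

	tmp |= ~((1u << (hi - 1)) - 1);	/* force every bit from bit 23 up to 1 */
	printf("%#x\n", tmp);		/* prints 0xffff0000 */
	return 0;
}
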
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb3715..4c4214690dd1 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
405 } 405 }
406 /* RED-PEN: base can be > 32bit */ 406 /* RED-PEN: base can be > 32bit */
407 len += seq_printf(seq, 407 len += seq_printf(seq,
408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
409 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
410 mtrr_attrib_to_str(type), mtrr_usage_table[i]); 410 mtrr_usage_table[i], mtrr_attrib_to_str(type));
411 } 411 }
412 } 412 }
413 return 0; 413 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6f23969c8faf..c78c04821ea1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -729,7 +729,7 @@ struct var_mtrr_range_state {
729 mtrr_type type; 729 mtrr_type type;
730}; 730};
731 731
732struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 732static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
733static int __initdata debug_print; 733static int __initdata debug_print;
734 734
735static int __init 735static int __init
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
759 /* take out UC ranges */ 759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) { 760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type; 761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE) 762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
763 continue; 764 continue;
764 size = range_state[i].size_pfn; 765 size = range_state[i].size_pfn;
765 if (!size) 766 if (!size)
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str)
834 enable_mtrr_cleanup = 1; 835 enable_mtrr_cleanup = 1;
835 return 0; 836 return 0;
836} 837}
837early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); 838early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
839
840static int __init mtrr_cleanup_debug_setup(char *str)
841{
842 debug_print = 1;
843 return 0;
844}
845early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
838 846
839struct var_mtrr_state { 847struct var_mtrr_state {
840 unsigned long range_startk; 848 unsigned long range_startk;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
898 } 906 }
899} 907}
900 908
909static unsigned long to_size_factor(unsigned long sizek, char *factorp)
910{
911 char factor;
912 unsigned long base = sizek;
913
914 if (base & ((1<<10) - 1)) {
915 /* not MB alignment */
916 factor = 'K';
917 } else if (base & ((1<<20) - 1)){
918 factor = 'M';
919 base >>= 10;
920 } else {
921 factor = 'G';
922 base >>= 20;
923 }
924
925 *factorp = factor;
926
927 return base;
928}
929
901static unsigned int __init 930static unsigned int __init
902range_to_mtrr(unsigned int reg, unsigned long range_startk, 931range_to_mtrr(unsigned int reg, unsigned long range_startk,
903 unsigned long range_sizek, unsigned char type) 932 unsigned long range_sizek, unsigned char type)
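
to_size_factor() simply picks the largest of K/M/G that still divides the kilobyte count exactly; a few worked cases (illustrative values, sizes in KB):

/* Worked examples for to_size_factor():
 *   sizek = 1536     -> 1536, 'K'  (not a whole number of MB)
 *   sizek = 2048     -> 2,    'M'  (whole MB, not whole GB)
 *   sizek = 4194304  -> 4,    'G'  (whole GB)
 */
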
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
919 align = max_align; 948 align = max_align;
920 949
921 sizek = 1 << align; 950 sizek = 1 << align;
922 if (debug_print) 951 if (debug_print) {
952 char start_factor = 'K', size_factor = 'K';
953 unsigned long start_base, size_base;
954
955 start_base = to_size_factor(range_startk, &start_factor),
956 size_base = to_size_factor(sizek, &size_factor),
957
923 printk(KERN_DEBUG "Setting variable MTRR %d, " 958 printk(KERN_DEBUG "Setting variable MTRR %d, "
924 "base: %ldMB, range: %ldMB, type %s\n", 959 "base: %ld%cB, range: %ld%cB, type %s\n",
925 reg, range_startk >> 10, sizek >> 10, 960 reg, start_base, start_factor,
961 size_base, size_factor,
926 (type == MTRR_TYPE_UNCACHABLE)?"UC": 962 (type == MTRR_TYPE_UNCACHABLE)?"UC":
927 ((type == MTRR_TYPE_WRBACK)?"WB":"Other") 963 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
928 ); 964 );
965 }
929 save_var_mtrr(reg++, range_startk, sizek, type); 966 save_var_mtrr(reg++, range_startk, sizek, type);
930 range_startk += sizek; 967 range_startk += sizek;
931 range_sizek -= sizek; 968 range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
970 /* try to append some small hole */ 1007 /* try to append some small hole */
971 range0_basek = state->range_startk; 1008 range0_basek = state->range_startk;
972 range0_sizek = ALIGN(state->range_sizek, chunk_sizek); 1009 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1010
1011 /* no increase */
973 if (range0_sizek == state->range_sizek) { 1012 if (range0_sizek == state->range_sizek) {
974 if (debug_print) 1013 if (debug_print)
975 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", 1014 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
980 return 0; 1019 return 0;
981 } 1020 }
982 1021
983 range0_sizek -= chunk_sizek; 1022 /* only cut back, when it is not the last */
984 if (range0_sizek && sizek) { 1023 if (sizek) {
985 while (range0_basek + range0_sizek > (basek + sizek)) { 1024 while (range0_basek + range0_sizek > (basek + sizek)) {
986 range0_sizek -= chunk_sizek; 1025 if (range0_sizek >= chunk_sizek)
987 if (!range0_sizek) 1026 range0_sizek -= chunk_sizek;
988 break; 1027 else
989 } 1028 range0_sizek = 0;
1029
1030 if (!range0_sizek)
1031 break;
1032 }
1033 }
1034
1035second_try:
1036 range_basek = range0_basek + range0_sizek;
1037
1038 /* one hole in the middle */
1039 if (range_basek > basek && range_basek <= (basek + sizek))
1040 second_sizek = range_basek - basek;
1041
1042 if (range0_sizek > state->range_sizek) {
1043
1044 /* one hole in middle or at end */
1045 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1046
1047 /* hole size should be less than half of range0 size */
1048 if (hole_sizek >= (range0_sizek >> 1) &&
1049 range0_sizek >= chunk_sizek) {
1050 range0_sizek -= chunk_sizek;
1051 second_sizek = 0;
1052 hole_sizek = 0;
1053
1054 goto second_try;
1055 }
990 } 1056 }
991 1057
992 if (range0_sizek) { 1058 if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
996 (range0_basek + range0_sizek)<<10); 1062 (range0_basek + range0_sizek)<<10);
997 state->reg = range_to_mtrr(state->reg, range0_basek, 1063 state->reg = range_to_mtrr(state->reg, range0_basek,
998 range0_sizek, MTRR_TYPE_WRBACK); 1064 range0_sizek, MTRR_TYPE_WRBACK);
999
1000 }
1001
1002 range_basek = range0_basek + range0_sizek;
1003 range_sizek = chunk_sizek;
1004
1005 if (range_basek + range_sizek > basek &&
1006 range_basek + range_sizek <= (basek + sizek)) {
1007 /* one hole */
1008 second_basek = basek;
1009 second_sizek = range_basek + range_sizek - basek;
1010 } 1065 }
1011 1066
1012 /* if last piece, only could one hole near end */ 1067 if (range0_sizek < state->range_sizek) {
1013 if ((second_basek || !basek) && 1068 /* need to handle left over */
1014 range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
1015 (chunk_sizek >> 1)) {
1016 /*
1017 * one hole in middle (second_sizek is 0) or at end
1018 * (second_sizek is 0 )
1019 */
1020 hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
1021 - second_sizek;
1022 hole_basek = range_basek + range_sizek - hole_sizek
1023 - second_sizek;
1024 } else {
1025 /* fallback for big hole, or several holes */
1026 range_sizek = state->range_sizek - range0_sizek; 1069 range_sizek = state->range_sizek - range0_sizek;
1027 second_basek = 0; 1070
1028 second_sizek = 0; 1071 if (debug_print)
1072 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1073 range_basek<<10,
1074 (range_basek + range_sizek)<<10);
1075 state->reg = range_to_mtrr(state->reg, range_basek,
1076 range_sizek, MTRR_TYPE_WRBACK);
1029 } 1077 }
1030 1078
1031 if (debug_print)
1032 printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
1033 (range_basek + range_sizek)<<10);
1034 state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
1035 MTRR_TYPE_WRBACK);
1036 if (hole_sizek) { 1079 if (hole_sizek) {
1080 hole_basek = range_basek - hole_sizek - second_sizek;
1037 if (debug_print) 1081 if (debug_print)
1038 printk(KERN_DEBUG "hole: %016lx - %016lx\n", 1082 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1039 hole_basek<<10, (hole_basek + hole_sizek)<<10); 1083 hole_basek<<10,
1040 state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, 1084 (hole_basek + hole_sizek)<<10);
1041 MTRR_TYPE_UNCACHABLE); 1085 state->reg = range_to_mtrr(state->reg, hole_basek,
1042 1086 hole_sizek, MTRR_TYPE_UNCACHABLE);
1043 } 1087 }
1044 1088
1045 return second_sizek; 1089 return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
1154}; 1198};
1155 1199
1156/* 1200/*
1157 * gran_size: 1M, 2M, ..., 2G 1201 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1158 * chunk size: gran_size, ..., 4G 1202 * chunk size: gran_size, ..., 2G
1159 * so we need (2+13)*6 1203 * so we need (1+16)*8
1160 */ 1204 */
1161#define NUM_RESULT 90 1205#define NUM_RESULT 136
1162#define PSHIFT (PAGE_SHIFT - 10) 1206#define PSHIFT (PAGE_SHIFT - 10)
1163 1207
1164static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
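
The new bound follows from the search loops further down: gran_size walks the 16 powers of two from 64K (1<<16) to 2G (1<<31), and for each one chunk_size walks from gran_size up to 2G, giving 16 + 15 + ... + 1 = 136 combinations. A throwaway user-space check of that count (assumption: plain C mirroring only the loop bounds):

#include <stdio.h>

int main(void)
{
	unsigned long long gran, chunk;
	int n = 0;

	for (gran = 1ULL << 16; gran < (1ULL << 32); gran <<= 1)
		for (chunk = gran; chunk < (1ULL << 32); chunk <<= 1)
			n++;
	printf("%d\n", n);	/* prints 136 */
	return 0;
}
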
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1168static int __init mtrr_cleanup(unsigned address_bits) 1212static int __init mtrr_cleanup(unsigned address_bits)
1169{ 1213{
1170 unsigned long extra_remove_base, extra_remove_size; 1214 unsigned long extra_remove_base, extra_remove_size;
1171 unsigned long i, base, size, def, dummy; 1215 unsigned long base, size, def, dummy;
1172 mtrr_type type; 1216 mtrr_type type;
1173 int nr_range, nr_range_new; 1217 int nr_range, nr_range_new;
1174 u64 chunk_size, gran_size; 1218 u64 chunk_size, gran_size;
1175 unsigned long range_sums, range_sums_new; 1219 unsigned long range_sums, range_sums_new;
1176 int index_good; 1220 int index_good;
1177 int num_reg_good; 1221 int num_reg_good;
1222 int i;
1178 1223
1179 /* extra one for all 0 */ 1224 /* extra one for all 0 */
1180 int num[MTRR_NUM_TYPES + 1]; 1225 int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1204 continue; 1249 continue;
1205 if (!size) 1250 if (!size)
1206 type = MTRR_NUM_TYPES; 1251 type = MTRR_NUM_TYPES;
1252 if (type == MTRR_TYPE_WRPROT)
1253 type = MTRR_TYPE_UNCACHABLE;
1207 num[type]++; 1254 num[type]++;
1208 } 1255 }
1209 1256
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
1216 num_var_ranges - num[MTRR_NUM_TYPES]) 1263 num_var_ranges - num[MTRR_NUM_TYPES])
1217 return 0; 1264 return 0;
1218 1265
1266 /* print original var MTRRs at first, for debugging: */
1267 printk(KERN_DEBUG "original variable MTRRs\n");
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1273 if (!size_base)
1274 continue;
1275
1276 size_base = to_size_factor(size_base, &size_factor),
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1278 start_base = to_size_factor(start_base, &start_factor),
1279 type = range_state[i].type;
1280
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1282 i, start_base, start_factor,
1283 size_base, size_factor,
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 }
1289
1219 memset(range, 0, sizeof(range)); 1290 memset(range, 0, sizeof(range));
1220 extra_remove_size = 0; 1291 extra_remove_size = 0;
1221 if (mtrr_tom2) { 1292 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1222 extra_remove_base = 1 << (32 - PAGE_SHIFT); 1293 if (mtrr_tom2)
1223 extra_remove_size = 1294 extra_remove_size =
1224 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; 1295 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1225 }
1226 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, 1296 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1227 extra_remove_size); 1297 extra_remove_size);
1298 /*
 1299 * [0, 1M) should always be covered by var mtrr with WB
 1300 * and fixed mtrrs should take effect before var mtrr for it
1301 */
1302 nr_range = add_range_with_merge(range, nr_range, 0,
1303 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1304 /* sort the ranges */
1305 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1306
1228 range_sums = sum_ranges(range, nr_range); 1307 range_sums = sum_ranges(range, nr_range);
1229 printk(KERN_INFO "total RAM coverred: %ldM\n", 1308 printk(KERN_INFO "total RAM coverred: %ldM\n",
1230 range_sums >> (20 - PAGE_SHIFT)); 1309 range_sums >> (20 - PAGE_SHIFT));
1231 1310
1232 if (mtrr_chunk_size && mtrr_gran_size) { 1311 if (mtrr_chunk_size && mtrr_gran_size) {
1233 int num_reg; 1312 int num_reg;
1313 char gran_factor, chunk_factor, lose_factor;
1314 unsigned long gran_base, chunk_base, lose_base;
1234 1315
1235 debug_print = 1; 1316 debug_print++;
1236 /* convert ranges to var ranges state */ 1317 /* convert ranges to var ranges state */
1237 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, 1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1238 mtrr_gran_size); 1319 mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
1256 result[i].lose_cover_sizek = 1337 result[i].lose_cover_sizek =
1257 (range_sums - range_sums_new) << PSHIFT; 1338 (range_sums - range_sums_new) << PSHIFT;
1258 1339
1259 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1260 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, 1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1261 result[i].chunk_sizek >> 10); 1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1262 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", 1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1263 result[i].num_reg, result[i].bad?"-":"", 1347 result[i].num_reg, result[i].bad?"-":"",
1264 result[i].lose_cover_sizek >> 10); 1348 lose_base, lose_factor);
1265 if (!result[i].bad) { 1349 if (!result[i].bad) {
1266 set_var_mtrr_all(address_bits); 1350 set_var_mtrr_all(address_bits);
1267 return 1; 1351 return 1;
1268 } 1352 }
1269 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1270 "will find optimal one\n"); 1354 "will find optimal one\n");
1271 debug_print = 0; 1355 debug_print--;
1272 memset(result, 0, sizeof(result[0])); 1356 memset(result, 0, sizeof(result[0]));
1273 } 1357 }
1274 1358
1275 i = 0; 1359 i = 0;
1276 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1277 memset(result, 0, sizeof(result)); 1361 memset(result, 0, sizeof(result));
1278 for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { 1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1279 for (chunk_size = gran_size; chunk_size < (1ULL<<33); 1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1280 chunk_size <<= 1) { 1370 chunk_size <<= 1) {
1281 int num_reg; 1371 int num_reg;
1282 1372
1283 if (debug_print) 1373 if (debug_print) {
1284 printk(KERN_INFO 1374 char chunk_factor;
1285 "\ngran_size: %lldM chunk_size_size: %lldM\n", 1375 unsigned long chunk_base;
1286 gran_size >> 20, chunk_size >> 20); 1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1287 if (i >= NUM_RESULT) 1382 if (i >= NUM_RESULT)
1288 continue; 1383 continue;
1289 1384
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
1326 1421
1327 /* print out all */ 1422 /* print out all */
1328 for (i = 0; i < NUM_RESULT; i++) { 1423 for (i = 0; i < NUM_RESULT; i++) {
1329 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1424 char gran_factor, chunk_factor, lose_factor;
1330 result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, 1425 unsigned long gran_base, chunk_base, lose_base;
1331 result[i].chunk_sizek >> 10); 1426
1332 printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", 1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1333 result[i].num_reg, result[i].bad?"-":"", 1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1334 result[i].lose_cover_sizek >> 10); 1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1335 } 1436 }
1336 1437
1337 /* try to find the optimal index */ 1438 /* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1339 nr_mtrr_spare_reg = num_var_ranges - 1; 1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1340 num_reg_good = -1; 1441 num_reg_good = -1;
1341 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { 1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1342 if (!min_loss_pfn[i]) { 1443 if (!min_loss_pfn[i])
1343 num_reg_good = i; 1444 num_reg_good = i;
1344 break;
1345 }
1346 } 1445 }
1347 1446
1348 index_good = -1; 1447 index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
1358 } 1457 }
1359 1458
1360 if (index_good != -1) { 1459 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1361 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1362 i = index_good; 1464 i = index_good;
1363 printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", 1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1364 result[i].gran_sizek >> 10, 1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1365 result[i].chunk_sizek >> 10); 1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1366 printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", 1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1367 result[i].num_reg, 1469 gran_base, gran_factor, chunk_base, chunk_factor);
1368 result[i].lose_cover_sizek >> 10); 1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1369 /* convert ranges to var ranges state */ 1472 /* convert ranges to var ranges state */
1370 chunk_size = result[i].chunk_sizek; 1473 chunk_size = result[i].chunk_sizek;
1371 chunk_size <<= 10; 1474 chunk_size <<= 10;
1372 gran_size = result[i].gran_sizek; 1475 gran_size = result[i].gran_sizek;
1373 gran_size <<= 10; 1476 gran_size <<= 10;
1374 debug_print = 1; 1477 debug_print++;
1375 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1376 set_var_mtrr_all(address_bits); 1480 set_var_mtrr_all(address_bits);
1377 return 1; 1481 return 1;
1378 } 1482 }
@@ -1496,11 +1600,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1496 1600
1497 /* kvm/qemu doesn't have mtrr set right, don't trim them all */ 1601 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1498 if (!highest_pfn) { 1602 if (!highest_pfn) {
1499 if (!kvm_para_available()) { 1603 WARN(!kvm_para_available(), KERN_WARNING
1500 printk(KERN_WARNING
1501 "WARNING: strange, CPU MTRRs all blank?\n"); 1604 "WARNING: strange, CPU MTRRs all blank?\n");
1502 WARN_ON(1);
1503 }
1504 return 0; 1605 return 0;
1505 } 1606 }
1506 1607
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index de7439f82b92..6bff382094f5 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -295,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
295 /* setup the timer */ 295 /* setup the timer */
296 wrmsr(evntsel_msr, evntsel, 0); 296 wrmsr(evntsel_msr, evntsel, 0);
297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); 297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301 298
299 /* initialize the wd struct before enabling */
302 wd->perfctr_msr = perfctr_msr; 300 wd->perfctr_msr = perfctr_msr;
303 wd->evntsel_msr = evntsel_msr; 301 wd->evntsel_msr = evntsel_msr;
304 wd->cccr_msr = 0; /* unused */ 302 wd->cccr_msr = 0; /* unused */
303
304 /* ok, everything is initialized, announce that we're set */
305 cpu_nmi_set_wd_enabled();
306
307 apic_write(APIC_LVTPC, APIC_DM_NMI);
308 evntsel |= K7_EVNTSEL_ENABLE;
309 wrmsr(evntsel_msr, evntsel, 0);
310
305 return 1; 311 return 1;
306} 312}
307 313
@@ -379,13 +385,19 @@ static int setup_p6_watchdog(unsigned nmi_hz)
379 wrmsr(evntsel_msr, evntsel, 0); 385 wrmsr(evntsel_msr, evntsel, 0);
380 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 386 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
381 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); 387 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
382 apic_write(APIC_LVTPC, APIC_DM_NMI);
383 evntsel |= P6_EVNTSEL0_ENABLE;
384 wrmsr(evntsel_msr, evntsel, 0);
385 388
389 /* initialize the wd struct before enabling */
386 wd->perfctr_msr = perfctr_msr; 390 wd->perfctr_msr = perfctr_msr;
387 wd->evntsel_msr = evntsel_msr; 391 wd->evntsel_msr = evntsel_msr;
388 wd->cccr_msr = 0; /* unused */ 392 wd->cccr_msr = 0; /* unused */
393
394 /* ok, everything is initialized, announce that we're set */
395 cpu_nmi_set_wd_enabled();
396
397 apic_write(APIC_LVTPC, APIC_DM_NMI);
398 evntsel |= P6_EVNTSEL0_ENABLE;
399 wrmsr(evntsel_msr, evntsel, 0);
400
389 return 1; 401 return 1;
390} 402}
391 403
@@ -432,6 +444,27 @@ static const struct wd_ops p6_wd_ops = {
432#define P4_CCCR_ENABLE (1 << 12) 444#define P4_CCCR_ENABLE (1 << 12)
433#define P4_CCCR_OVF (1 << 31) 445#define P4_CCCR_OVF (1 << 31)
434 446
447#define P4_CONTROLS 18
448static unsigned int p4_controls[18] = {
449 MSR_P4_BPU_CCCR0,
450 MSR_P4_BPU_CCCR1,
451 MSR_P4_BPU_CCCR2,
452 MSR_P4_BPU_CCCR3,
453 MSR_P4_MS_CCCR0,
454 MSR_P4_MS_CCCR1,
455 MSR_P4_MS_CCCR2,
456 MSR_P4_MS_CCCR3,
457 MSR_P4_FLAME_CCCR0,
458 MSR_P4_FLAME_CCCR1,
459 MSR_P4_FLAME_CCCR2,
460 MSR_P4_FLAME_CCCR3,
461 MSR_P4_IQ_CCCR0,
462 MSR_P4_IQ_CCCR1,
463 MSR_P4_IQ_CCCR2,
464 MSR_P4_IQ_CCCR3,
465 MSR_P4_IQ_CCCR4,
466 MSR_P4_IQ_CCCR5,
467};
435/* 468/*
436 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 469 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
437 * CRU_ESCR0 (with any non-null event selector) through a complemented 470 * CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,12 +506,38 @@ static int setup_p4_watchdog(unsigned nmi_hz)
473 evntsel_msr = MSR_P4_CRU_ESCR0; 506 evntsel_msr = MSR_P4_CRU_ESCR0;
474 cccr_msr = MSR_P4_IQ_CCCR0; 507 cccr_msr = MSR_P4_IQ_CCCR0;
475 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); 508 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
509
510 /*
511 * If we're on the kdump kernel or other situation, we may
512 * still have other performance counter registers set to
513 * interrupt and they'll keep interrupting forever because
514 * of the P4_CCCR_OVF quirk. So we need to ACK all the
515 * pending interrupts and disable all the registers here,
516 * before reenabling the NMI delivery. Refer to p4_rearm()
517 * about the P4_CCCR_OVF quirk.
518 */
519 if (reset_devices) {
520 unsigned int low, high;
521 int i;
522
523 for (i = 0; i < P4_CONTROLS; i++) {
524 rdmsr(p4_controls[i], low, high);
525 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
526 wrmsr(p4_controls[i], low, high);
527 }
528 }
476 } else { 529 } else {
477 /* logical cpu 1 */ 530 /* logical cpu 1 */
478 perfctr_msr = MSR_P4_IQ_PERFCTR1; 531 perfctr_msr = MSR_P4_IQ_PERFCTR1;
479 evntsel_msr = MSR_P4_CRU_ESCR0; 532 evntsel_msr = MSR_P4_CRU_ESCR0;
480 cccr_msr = MSR_P4_IQ_CCCR1; 533 cccr_msr = MSR_P4_IQ_CCCR1;
481 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); 534
535 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
536 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
537 cccr_val = P4_CCCR_OVF_PMI0;
538 else
539 cccr_val = P4_CCCR_OVF_PMI1;
540 cccr_val |= P4_CCCR_ESCR_SELECT(4);
482 } 541 }
483 542
484 evntsel = P4_ESCR_EVENT_SELECT(0x3F) 543 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
@@ -493,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
493 wrmsr(evntsel_msr, evntsel, 0); 552 wrmsr(evntsel_msr, evntsel, 0);
494 wrmsr(cccr_msr, cccr_val, 0); 553 wrmsr(cccr_msr, cccr_val, 0);
495 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); 554 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
496 apic_write(APIC_LVTPC, APIC_DM_NMI); 555
497 cccr_val |= P4_CCCR_ENABLE;
498 wrmsr(cccr_msr, cccr_val, 0);
499 wd->perfctr_msr = perfctr_msr; 556 wd->perfctr_msr = perfctr_msr;
500 wd->evntsel_msr = evntsel_msr; 557 wd->evntsel_msr = evntsel_msr;
501 wd->cccr_msr = cccr_msr; 558 wd->cccr_msr = cccr_msr;
559
560 /* ok, everything is initialized, announce that we're set */
561 cpu_nmi_set_wd_enabled();
562
563 apic_write(APIC_LVTPC, APIC_DM_NMI);
564 cccr_val |= P4_CCCR_ENABLE;
565 wrmsr(cccr_msr, cccr_val, 0);
502 return 1; 566 return 1;
503} 567}
504 568
@@ -614,13 +678,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
614 wrmsr(evntsel_msr, evntsel, 0); 678 wrmsr(evntsel_msr, evntsel, 0);
615 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 679 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
616 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); 680 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
617 apic_write(APIC_LVTPC, APIC_DM_NMI);
618 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
619 wrmsr(evntsel_msr, evntsel, 0);
620 681
621 wd->perfctr_msr = perfctr_msr; 682 wd->perfctr_msr = perfctr_msr;
622 wd->evntsel_msr = evntsel_msr; 683 wd->evntsel_msr = evntsel_msr;
623 wd->cccr_msr = 0; /* unused */ 684 wd->cccr_msr = 0; /* unused */
685
686 /* ok, everything is initialized, announce that we're set */
687 cpu_nmi_set_wd_enabled();
688
689 apic_write(APIC_LVTPC, APIC_DM_NMI);
690 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
691 wrmsr(evntsel_msr, evntsel, 0);
624 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 692 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
625 return 1; 693 return 1;
626} 694}
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
new file mode 100644
index 000000000000..5abbea297e0c
--- /dev/null
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -0,0 +1,20 @@
1/*
2 * Strings for the various x86 power flags
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9const char *const x86_power_flags[32] = {
10 "ts", /* temperature sensor */
11 "fid", /* frequency id control */
12 "vid", /* voltage id control */
13 "ttp", /* thermal trip */
14 "tm",
15 "stc",
16 "100mhzsteps",
17 "hwpstate",
18 "", /* tsc invariant mapped to constant_tsc */
19 /* nothing */
20};
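
x86_power_flags[] is indexed by bit number from the power-management CPUID leaf (EDX of leaf 0x80000007 on AMD); bit 8 is left blank on purpose because invariant TSC is reported as constant_tsc elsewhere. A user-space sketch that decodes the leaf with a local copy of the table (assumes GCC's <cpuid.h>; not kernel code):

#include <stdio.h>
#include <cpuid.h>

/* Local copy of the strings above, for illustration only. */
static const char *const power_flags[32] = {
	"ts", "fid", "vid", "ttp", "tm", "stc", "100mhzsteps", "hwpstate", "",
};

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	int i;

	if (__get_cpuid_max(0x80000000, 0) < 0x80000007)
		return 1;
	__cpuid(0x80000007, eax, ebx, ecx, edx);	/* advanced power management */

	for (i = 0; i < 32; i++)
		if ((edx & (1u << i)) && power_flags[i] && power_flags[i][0])
			printf("%s ", power_flags[i]);
	printf("\n");
	return 0;
}
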
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index b911a2c61b8f..52b3fefbd5af 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include "cpu.h" 6#include "cpu.h"
7 7
8static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
9{
10 u32 xlvl;
11
12 /* Transmeta-defined flags: level 0x80860001 */
13 xlvl = cpuid_eax(0x80860000);
14 if ((xlvl & 0xffff0000) == 0x80860000) {
15 if (xlvl >= 0x80860001)
16 c->x86_capability[2] = cpuid_edx(0x80860001);
17 }
18}
19
8static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) 20static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 21{
10 unsigned int cap_mask, uk, max, dummy; 22 unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; 24 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 25 char cpu_info[65];
14 26
15 get_model_name(c); /* Same as AMD/Cyrix */ 27 early_init_transmeta(c);
28
16 display_cacheinfo(c); 29 display_cacheinfo(c);
17 30
18 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
@@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
85#endif 98#endif
86} 99}
87 100
88static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
89{
90 u32 xlvl;
91
92 /* Transmeta-defined flags: level 0x80860001 */
93 xlvl = cpuid_eax(0x80860000);
94 if ((xlvl & 0xffff0000) == 0x80860000) {
95 if (xlvl >= 0x80860001)
96 c->x86_capability[2] = cpuid_edx(0x80860001);
97 }
98}
99
100static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
101 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
102 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta,
103 .c_init = init_transmeta, 105 .c_init = init_transmeta,
104 .c_identify = transmeta_identify, 106 .c_x86_vendor = X86_VENDOR_TRANSMETA,
105}; 107};
106 108
107cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev); 109cpu_dev_register(transmeta_cpu_dev);
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index b1fc90989d75..e777f79e0960 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = {
19 } 19 }
20 }, 20 },
21 }, 21 },
22 .c_x86_vendor = X86_VENDOR_UMC,
22}; 23};
23 24
24cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev); 25cpu_dev_register(umc_cpu_dev);
25 26
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 14b11b3be31c..6a44d6465991 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/major.h> 37#include <linux/major.h>
38#include <linux/fs.h> 38#include <linux/fs.h>
39#include <linux/smp_lock.h>
40#include <linux/device.h> 39#include <linux/device.h>
41#include <linux/cpu.h> 40#include <linux/cpu.h>
42#include <linux/notifier.h> 41#include <linux/notifier.h>
@@ -89,6 +88,8 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
89 struct cpuid_regs cmd; 88 struct cpuid_regs cmd;
90 int cpu = iminor(file->f_path.dentry->d_inode); 89 int cpu = iminor(file->f_path.dentry->d_inode);
91 u64 pos = *ppos; 90 u64 pos = *ppos;
91 ssize_t bytes = 0;
92 int err = 0;
92 93
93 if (count % 16) 94 if (count % 16)
94 return -EINVAL; /* Invalid chunk size */ 95 return -EINVAL; /* Invalid chunk size */
@@ -96,14 +97,19 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
96 for (; count; count -= 16) { 97 for (; count; count -= 16) {
97 cmd.eax = pos; 98 cmd.eax = pos;
98 cmd.ecx = pos >> 32; 99 cmd.ecx = pos >> 32;
99 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); 100 err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
100 if (copy_to_user(tmp, &cmd, 16)) 101 if (err)
101 return -EFAULT; 102 break;
103 if (copy_to_user(tmp, &cmd, 16)) {
104 err = -EFAULT;
105 break;
106 }
102 tmp += 16; 107 tmp += 16;
108 bytes += 16;
103 *ppos = ++pos; 109 *ppos = ++pos;
104 } 110 }
105 111
106 return tmp - buf; 112 return bytes ? bytes : err;
107} 113}
108 114
109static int cpuid_open(struct inode *inode, struct file *file) 115static int cpuid_open(struct inode *inode, struct file *file)
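
With this change a failed smp_call_function_single() or copy_to_user() ends the loop, and the read returns the bytes already copied, or the error when nothing was copied, instead of always claiming the full count. From user space, each 16-byte read of /dev/cpu/N/cpuid returns EAX/EBX/ECX/EDX for the leaf given by the low 32 bits of the file offset (the high 32 bits select the subleaf). A sketch, assuming the cpuid driver is loaded and the device node exists:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];
	off_t pos = 0x1;	/* leaf 1; the offset's high 32 bits would select the subleaf */
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0)
		return 1;
	if (pread(fd, regs, sizeof(regs), pos) != sizeof(regs)) {
		close(fd);
		return 1;
	}
	printf("eax=%#x ebx=%#x ecx=%#x edx=%#x\n",
	       regs[0], regs[1], regs[2], regs[3]);
	close(fd);
	return 0;
}
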
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a46..e90a60ef10c2 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,8 @@
7 7
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/crash_dump.h> 9#include <linux/crash_dump.h>
10 10#include <linux/uaccess.h>
11#include <asm/uaccess.h> 11#include <linux/io.h>
12#include <asm/io.h>
13 12
14/** 13/**
15 * copy_oldmem_page - copy one page from "oldmem" 14 * copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +24,7 @@
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic. 24 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */ 25 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf, 26ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf) 27 size_t csize, unsigned long offset, int userbuf)
29{ 28{
30 void *vaddr; 29 void *vaddr;
31 30
@@ -33,14 +32,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
33 return 0; 32 return 0;
34 33
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 34 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
35 if (!vaddr)
36 return -ENOMEM;
36 37
37 if (userbuf) { 38 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) { 39 if (copy_to_user(buf, vaddr + offset, csize)) {
39 iounmap(vaddr); 40 iounmap(vaddr);
40 return -EFAULT; 41 return -EFAULT;
41 } 42 }
42 } else 43 } else
43 memcpy(buf, (vaddr + offset), csize); 44 memcpy(buf, vaddr + offset, csize);
44 45
45 iounmap(vaddr); 46 iounmap(vaddr);
46 return csize; 47 return csize;
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index a47798b59f07..395acb12b0d1 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
66 .ds = __USER_DS, 66 .ds = __USER_DS,
67 .fs = __KERNEL_PERCPU, 67 .fs = __KERNEL_PERCPU,
68 68
69 .__cr3 = __pa(swapper_pg_dir) 69 .__cr3 = __phys_addr_const((unsigned long)swapper_pg_dir)
70 } 70 }
71}; 71};
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48d..2b69994fd3a8 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
2 * Debug Store support 2 * Debug Store support
3 * 3 *
4 * This provides a low-level interface to the hardware's Debug Store 4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and 5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * Different architectures use a different DS layout/pointer size. 8 * It manages:
9 * The below functions therefore work on a void*. 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional)
11 * - buffer overflow handling
12 * - buffer access
10 * 13 *
14 * It assumes:
15 * - get_task_struct on all parameter tasks
16 * - current is allowed to trace parameter tasks
11 * 17 *
12 * Since there is no user for PEBS, yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 * 18 *
15 * 19 * Copyright (C) 2007-2008 Intel Corporation.
16 * Copyright (C) 2007 Intel Corporation. 20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */ 21 */
19 22
23
24#ifdef CONFIG_X86_DS
25
20#include <asm/ds.h> 26#include <asm/ds.h>
21 27
22#include <linux/errno.h> 28#include <linux/errno.h>
23#include <linux/string.h> 29#include <linux/string.h>
24#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/sched.h>
32#include <linux/mm.h>
33
34
35/*
36 * The configuration for a particular DS hardware implementation.
37 */
38struct ds_configuration {
39 /* the size of the DS structure in bytes */
40 unsigned char sizeof_ds;
41 /* the size of one pointer-typed field in the DS structure in bytes;
42 this covers the first 8 fields related to buffer management. */
43 unsigned char sizeof_field;
44 /* the size of a BTS/PEBS record in bytes */
45 unsigned char sizeof_rec[2];
46};
47static struct ds_configuration ds_cfg;
25 48
26 49
27/* 50/*
@@ -44,378 +67,747 @@
44 * (interrupt occurs when write pointer passes interrupt pointer) 67 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow 68 * - value to which counter is reset following counter overflow
46 * 69 *
47 * On later architectures, the last branch recording hardware uses 70 * Later architectures use 64bit pointers throughout, whereas earlier
48 * 64bit pointers even in 32bit mode. 71 * architectures use 32bit pointers in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 * 72 *
56 * Netburst supported a predicated bit that had been dropped in later
57 * architectures. We do not suppor it.
58 * 73 *
74 * We compute the base address for the first 8 fields based on:
75 * - the field size stored in the DS configuration
76 * - the relative field position
77 * - an offset giving the start of the respective region
59 * 78 *
60 * In order to abstract from the actual DS and BTS layout, we describe 79 * This offset is further used to index various arrays holding
61 * the access to the relevant fields. 80 * information for BTS and PEBS at the respective index.
62 * Thanks to Andi Kleen for proposing this design.
63 * 81 *
64 * The implementation, however, is not as general as it might seem. In 82 * On later 32bit processors, we only access the lower 32bit of the
65 * order to stay somewhat simple and efficient, we assume an 83 * 64bit pointer fields. The upper halves will be zeroed out.
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */ 84 */
69 85
70/* 86enum ds_field {
71 * A special from_ip address to indicate that the BTS record is an 87 ds_buffer_base = 0,
72 * info record that needs to be interpreted or skipped. 88 ds_index,
73 */ 89 ds_absolute_maximum,
74#define BTS_ESCAPE_ADDRESS (-1) 90 ds_interrupt_threshold,
91};
75 92
76/* 93enum ds_qualifier {
77 * A field access descriptor 94 ds_bts = 0,
78 */ 95 ds_pebs
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82}; 96};
83 97
98static inline unsigned long ds_get(const unsigned char *base,
99 enum ds_qualifier qual, enum ds_field field)
100{
101 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
102 return *(unsigned long *)base;
103}
104
105static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
106 enum ds_field field, unsigned long value)
107{
108 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
109 (*(unsigned long *)base) = value;
110}
111
112
84/* 113/*
85 * The configuration for a particular DS/BTS hardware implementation. 114 * Locking is done only for allocating BTS or PEBS resources and for
115 * guarding context and buffer memory allocation.
116 *
117 * Most functions require the current task to own the ds context part
118 * they are going to access. All the locking is done when validating
119 * access to the context.
86 */ 120 */
87struct ds_configuration { 121static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104 122
105/* 123/*
106 * The global configuration used by the below accessor functions 124 * Validate that the current task is allowed to access the BTS/PEBS
125 * buffer of the parameter task.
126 *
127 * Returns 0, if access is granted; -Eerrno, otherwise.
107 */ 128 */
108static struct ds_configuration ds_cfg; 129static inline int ds_validate_access(struct ds_context *context,
130 enum ds_qualifier qual)
131{
132 if (!context)
133 return -EPERM;
134
135 if (context->owner[qual] == current)
136 return 0;
137
138 return -EPERM;
139}
140
109 141
110/* 142/*
111 * Accessor functions for some DS and BTS fields using the above 143 * We either support (system-wide) per-cpu or per-thread allocation.
112 * global ptrace_bts_cfg. 144 * We distinguish the two based on the task_struct pointer, where a
145 * NULL pointer indicates per-cpu allocation for the current cpu.
146 *
147 * Allocations are use-counted. As soon as resources are allocated,
148 * further allocations must be of the same type (per-cpu or
 149 * per-thread). We model this with a single counter, counting the
 150 * tracers of one allocation type negatively:
151 * =0 no tracers
152 * >0 number of per-thread tracers
153 * <0 number of per-cpu tracers
154 *
155 * The below functions to get and put tracers and to check the
156 * allocation type require the ds_lock to be held by the caller.
157 *
158 * Tracers essentially gives the number of ds contexts for a certain
159 * type of allocation.
113 */ 160 */
114static inline unsigned long get_bts_buffer_base(char *base) 161static long tracers;
162
163static inline void get_tracer(struct task_struct *task)
115{ 164{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); 165 tracers += (task ? 1 : -1);
117} 166}
118static inline void set_bts_buffer_base(char *base, unsigned long value) 167
168static inline void put_tracer(struct task_struct *task)
119{ 169{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; 170 tracers -= (task ? 1 : -1);
121} 171}
122static inline unsigned long get_bts_index(char *base) 172
173static inline int check_tracer(struct task_struct *task)
123{ 174{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset); 175 return (task ? (tracers >= 0) : (tracers <= 0));
125} 176}
126static inline void set_bts_index(char *base, unsigned long value) 177
178
179/*
180 * The DS context is either attached to a thread or to a cpu:
181 * - in the former case, the thread_struct contains a pointer to the
182 * attached context.
183 * - in the latter case, we use a static array of per-cpu context
184 * pointers.
185 *
186 * Contexts are use-counted. They are allocated on first access and
187 * deallocated when the last user puts the context.
188 *
189 * We distinguish between an allocating and a non-allocating get of a
190 * context:
191 * - the allocating get is used for requesting BTS/PEBS resources. It
192 * requires the caller to hold the global ds_lock.
193 * - the non-allocating get is used for all other cases. A
194 * non-existing context indicates an error. It acquires and releases
195 * the ds_lock itself for obtaining the context.
196 *
197 * A context and its DS configuration are allocated and deallocated
198 * together. A context always has a DS configuration of the
199 * appropriate size.
200 */
201static DEFINE_PER_CPU(struct ds_context *, system_context);
202
203#define this_system_context per_cpu(system_context, smp_processor_id())
204
205/*
206 * Returns the pointer to the parameter task's context or to the
207 * system-wide context, if task is NULL.
208 *
209 * Increases the use count of the returned context, if not NULL.
210 */
211static inline struct ds_context *ds_get_context(struct task_struct *task)
127{ 212{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; 213 struct ds_context *context;
214
215 spin_lock(&ds_lock);
216
217 context = (task ? task->thread.ds_ctx : this_system_context);
218 if (context)
219 context->count++;
220
221 spin_unlock(&ds_lock);
222
223 return context;
129} 224}
130static inline unsigned long get_bts_absolute_maximum(char *base) 225
226/*
 227 * Same as ds_get_context, but allocates the context and its DS
 228 * structure, if necessary; returns NULL, if out of memory.
229 *
230 * pre: requires ds_lock to be held
231 */
232static inline struct ds_context *ds_alloc_context(struct task_struct *task)
131{ 233{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); 234 struct ds_context **p_context =
235 (task ? &task->thread.ds_ctx : &this_system_context);
236 struct ds_context *context = *p_context;
237
238 if (!context) {
239 context = kzalloc(sizeof(*context), GFP_KERNEL);
240
241 if (!context)
242 return NULL;
243
244 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
245 if (!context->ds) {
246 kfree(context);
247 return NULL;
248 }
249
250 *p_context = context;
251
252 context->this = p_context;
253 context->task = task;
254
255 if (task)
256 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
257
258 if (!task || (task == current))
259 wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
260
261 get_tracer(task);
262 }
263
264 context->count++;
265
266 return context;
133} 267}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value) 268
269/*
270 * Decreases the use count of the parameter context, if not NULL.
271 * Deallocates the context, if the use count reaches zero.
272 */
273static inline void ds_put_context(struct ds_context *context)
135{ 274{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; 275 if (!context)
276 return;
277
278 spin_lock(&ds_lock);
279
280 if (--context->count)
281 goto out;
282
283 *(context->this) = NULL;
284
285 if (context->task)
286 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
287
288 if (!context->task || (context->task == current))
289 wrmsrl(MSR_IA32_DS_AREA, 0);
290
291 put_tracer(context->task);
292
293 /* free any leftover buffers from tracers that did not
294 * deallocate them properly. */
295 kfree(context->buffer[ds_bts]);
296 kfree(context->buffer[ds_pebs]);
297 kfree(context->ds);
298 kfree(context);
299 out:
300 spin_unlock(&ds_lock);
137} 301}
138static inline unsigned long get_bts_interrupt_threshold(char *base) 302
303
304/*
305 * Handle a buffer overflow
306 *
307 * task: the task whose buffers are overflowing;
308 * NULL for a buffer overflow on the current cpu
309 * context: the ds context
310 * qual: the buffer type
311 */
312static void ds_overflow(struct task_struct *task, struct ds_context *context,
313 enum ds_qualifier qual)
139{ 314{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); 315 if (!context)
316 return;
317
318 if (context->callback[qual])
319 (*context->callback[qual])(task);
320
321 /* todo: do some more overflow handling */
141} 322}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value) 323
324
325/*
326 * Allocate a non-pageable buffer of the parameter size.
327 * Checks the memory and the locked memory rlimit.
328 *
329 * Returns the buffer, if successful;
330 * NULL, if out of memory or rlimit exceeded.
331 *
332 * size: the requested buffer size in bytes
333 * pages (out): if not NULL, contains the number of pages reserved
334 */
335static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
143{ 336{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; 337 unsigned long rlim, vm, pgsz;
338 void *buffer;
339
340 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
341
342 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
343 vm = current->mm->total_vm + pgsz;
344 if (rlim < vm)
345 return NULL;
346
347 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
348 vm = current->mm->locked_vm + pgsz;
349 if (rlim < vm)
350 return NULL;
351
352 buffer = kzalloc(size, GFP_KERNEL);
353 if (!buffer)
354 return NULL;
355
356 current->mm->total_vm += pgsz;
357 current->mm->locked_vm += pgsz;
358
359 if (pages)
360 *pages = pgsz;
361
362 return buffer;
145} 363}
146static inline unsigned long get_from_ip(char *base) 364
365static int ds_request(struct task_struct *task, void *base, size_t size,
366 ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
147{ 367{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset); 368 struct ds_context *context;
369 unsigned long buffer, adj;
370 const unsigned long alignment = (1 << 3);
371 int error = 0;
372
373 if (!ds_cfg.sizeof_ds)
374 return -EOPNOTSUPP;
375
376 /* we require some space to do alignment adjustments below */
377 if (size < (alignment + ds_cfg.sizeof_rec[qual]))
378 return -EINVAL;
379
380 /* buffer overflow notification is not yet implemented */
381 if (ovfl)
382 return -EOPNOTSUPP;
383
384
385 spin_lock(&ds_lock);
386
 387 if (!check_tracer(task)) {
 	spin_unlock(&ds_lock);
 	return -EPERM;
 }
389
390 error = -ENOMEM;
391 context = ds_alloc_context(task);
392 if (!context)
393 goto out_unlock;
394
395 error = -EALREADY;
396 if (context->owner[qual] == current)
397 goto out_unlock;
398 error = -EPERM;
399 if (context->owner[qual] != NULL)
400 goto out_unlock;
401 context->owner[qual] = current;
402
403 spin_unlock(&ds_lock);
404
405
406 error = -ENOMEM;
407 if (!base) {
408 base = ds_allocate_buffer(size, &context->pages[qual]);
409 if (!base)
410 goto out_release;
411
412 context->buffer[qual] = base;
413 }
414 error = 0;
415
416 context->callback[qual] = ovfl;
417
418 /* adjust the buffer address and size to meet alignment
419 * constraints:
420 * - buffer is double-word aligned
421 * - size is multiple of record size
422 *
423 * We checked the size at the very beginning; we have enough
424 * space to do the adjustment.
425 */
426 buffer = (unsigned long)base;
427
428 adj = ALIGN(buffer, alignment) - buffer;
429 buffer += adj;
430 size -= adj;
431
432 size /= ds_cfg.sizeof_rec[qual];
433 size *= ds_cfg.sizeof_rec[qual];
434
435 ds_set(context->ds, qual, ds_buffer_base, buffer);
436 ds_set(context->ds, qual, ds_index, buffer);
437 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
438
439 if (ovfl) {
440 /* todo: select a suitable interrupt threshold */
441 } else
442 ds_set(context->ds, qual,
443 ds_interrupt_threshold, buffer + size + 1);
444
445 /* we keep the context until ds_release */
446 return error;
447
448 out_release:
449 context->owner[qual] = NULL;
450 ds_put_context(context);
451 return error;
452
453 out_unlock:
454 spin_unlock(&ds_lock);
455 ds_put_context(context);
456 return error;
149} 457}
150static inline void set_from_ip(char *base, unsigned long value) 458
459int ds_request_bts(struct task_struct *task, void *base, size_t size,
460 ds_ovfl_callback_t ovfl)
151{ 461{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; 462 return ds_request(task, base, size, ovfl, ds_bts);
153} 463}
154static inline unsigned long get_to_ip(char *base) 464
465int ds_request_pebs(struct task_struct *task, void *base, size_t size,
466 ds_ovfl_callback_t ovfl)
155{ 467{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset); 468 return ds_request(task, base, size, ovfl, ds_pebs);
157} 469}
158static inline void set_to_ip(char *base, unsigned long value) 470
471static int ds_release(struct task_struct *task, enum ds_qualifier qual)
159{ 472{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; 473 struct ds_context *context;
474 int error;
475
476 context = ds_get_context(task);
477 error = ds_validate_access(context, qual);
478 if (error < 0)
479 goto out;
480
481 kfree(context->buffer[qual]);
482 context->buffer[qual] = NULL;
483
484 current->mm->total_vm -= context->pages[qual];
485 current->mm->locked_vm -= context->pages[qual];
486 context->pages[qual] = 0;
487 context->owner[qual] = NULL;
488
489 /*
490 * we put the context twice:
491 * once for the ds_get_context
492 * once for the corresponding ds_request
493 */
494 ds_put_context(context);
495 out:
496 ds_put_context(context);
497 return error;
161} 498}
162static inline unsigned char get_info_type(char *base) 499
500int ds_release_bts(struct task_struct *task)
163{ 501{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset); 502 return ds_release(task, ds_bts);
165} 503}
166static inline void set_info_type(char *base, unsigned char value) 504
505int ds_release_pebs(struct task_struct *task)
167{ 506{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; 507 return ds_release(task, ds_pebs);
169} 508}
170static inline unsigned long get_info_data(char *base) 509
510static int ds_get_index(struct task_struct *task, size_t *pos,
511 enum ds_qualifier qual)
171{ 512{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset); 513 struct ds_context *context;
514 unsigned long base, index;
515 int error;
516
517 context = ds_get_context(task);
518 error = ds_validate_access(context, qual);
519 if (error < 0)
520 goto out;
521
522 base = ds_get(context->ds, qual, ds_buffer_base);
523 index = ds_get(context->ds, qual, ds_index);
524
525 error = ((index - base) / ds_cfg.sizeof_rec[qual]);
526 if (pos)
527 *pos = error;
528 out:
529 ds_put_context(context);
530 return error;
173} 531}
174static inline void set_info_data(char *base, unsigned long value) 532
533int ds_get_bts_index(struct task_struct *task, size_t *pos)
175{ 534{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; 535 return ds_get_index(task, pos, ds_bts);
177} 536}
178 537
538int ds_get_pebs_index(struct task_struct *task, size_t *pos)
539{
540 return ds_get_index(task, pos, ds_pebs);
541}
179 542
180int ds_allocate(void **dsp, size_t bts_size_in_bytes) 543static int ds_get_end(struct task_struct *task, size_t *pos,
544 enum ds_qualifier qual)
181{ 545{
182 size_t bts_size_in_records; 546 struct ds_context *context;
183 unsigned long bts; 547 unsigned long base, end;
184 void *ds; 548 int error;
549
550 context = ds_get_context(task);
551 error = ds_validate_access(context, qual);
552 if (error < 0)
553 goto out;
554
555 base = ds_get(context->ds, qual, ds_buffer_base);
556 end = ds_get(context->ds, qual, ds_absolute_maximum);
557
558 error = ((end - base) / ds_cfg.sizeof_rec[qual]);
559 if (pos)
560 *pos = error;
561 out:
562 ds_put_context(context);
563 return error;
564}
185 565
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 566int ds_get_bts_end(struct task_struct *task, size_t *pos)
187 return -EOPNOTSUPP; 567{
568 return ds_get_end(task, pos, ds_bts);
569}
188 570
189 if (bts_size_in_bytes < 0) 571int ds_get_pebs_end(struct task_struct *task, size_t *pos)
190 return -EINVAL; 572{
573 return ds_get_end(task, pos, ds_pebs);
574}
191 575
192 bts_size_in_records = 576static int ds_access(struct task_struct *task, size_t index,
193 bts_size_in_bytes / ds_cfg.sizeof_bts; 577 const void **record, enum ds_qualifier qual)
194 bts_size_in_bytes = 578{
195 bts_size_in_records * ds_cfg.sizeof_bts; 579 struct ds_context *context;
580 unsigned long base, idx;
581 int error;
196 582
197 if (bts_size_in_bytes <= 0) 583 if (!record)
198 return -EINVAL; 584 return -EINVAL;
199 585
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); 586 context = ds_get_context(task);
201 587 error = ds_validate_access(context, qual);
202 if (!bts) 588 if (error < 0)
203 return -ENOMEM; 589 goto out;
204 590
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); 591 base = ds_get(context->ds, qual, ds_buffer_base);
592 idx = base + (index * ds_cfg.sizeof_rec[qual]);
206 593
207 if (!ds) { 594 error = -EINVAL;
208 kfree((void *)bts); 595 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
209 return -ENOMEM; 596 goto out;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216 597
217 *dsp = ds; 598 *record = (const void *)idx;
218 return 0; 599 error = ds_cfg.sizeof_rec[qual];
600 out:
601 ds_put_context(context);
602 return error;
219} 603}
220 604
221int ds_free(void **dsp) 605int ds_access_bts(struct task_struct *task, size_t index, const void **record)
222{ 606{
223 if (*dsp) { 607 return ds_access(task, index, record, ds_bts);
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = NULL;
227 }
228 return 0;
229} 608}
230 609
231int ds_get_bts_size(void *ds) 610int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
232{ 611{
233 int size_in_bytes; 612 return ds_access(task, index, record, ds_pebs);
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245} 613}
246 614
247int ds_get_bts_end(void *ds) 615static int ds_write(struct task_struct *task, const void *record, size_t size,
616 enum ds_qualifier qual, int force)
248{ 617{
249 int size_in_bytes = ds_get_bts_size(ds); 618 struct ds_context *context;
250 619 int error;
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253 620
254 return size_in_bytes / ds_cfg.sizeof_bts; 621 if (!record)
255} 622 return -EINVAL;
256 623
257int ds_get_bts_index(void *ds) 624 error = -EPERM;
258{ 625 context = ds_get_context(task);
259 int index_offset_in_bytes; 626 if (!context)
627 goto out;
260 628
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 629 if (!force) {
262 return -EOPNOTSUPP; 630 error = ds_validate_access(context, qual);
631 if (error < 0)
632 goto out;
633 }
263 634
264 index_offset_in_bytes = 635 error = 0;
265 get_bts_index(ds) - 636 while (size) {
266 get_bts_buffer_base(ds); 637 unsigned long base, index, end, write_end, int_th;
638 unsigned long write_size, adj_write_size;
639
640 /*
641 * write as much as possible without producing an
642 * overflow interrupt.
643 *
644 * interrupt_threshold must either be
645 * - bigger than absolute_maximum or
646 * - point to a record between buffer_base and absolute_maximum
647 *
648 * index points to a valid record.
649 */
650 base = ds_get(context->ds, qual, ds_buffer_base);
651 index = ds_get(context->ds, qual, ds_index);
652 end = ds_get(context->ds, qual, ds_absolute_maximum);
653 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
654
655 write_end = min(end, int_th);
656
657 /* if we are already beyond the interrupt threshold,
658 * we fill the entire buffer */
659 if (write_end <= index)
660 write_end = end;
661
662 if (write_end <= index)
663 goto out;
664
665 write_size = min((unsigned long) size, write_end - index);
666 memcpy((void *)index, record, write_size);
667
668 record = (const char *)record + write_size;
669 size -= write_size;
670 error += write_size;
671
672 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
673 adj_write_size *= ds_cfg.sizeof_rec[qual];
674
675 /* zero out trailing bytes */
676 memset((char *)index + write_size, 0,
677 adj_write_size - write_size);
678 index += adj_write_size;
679
680 if (index >= end)
681 index = base;
682 ds_set(context->ds, qual, ds_index, index);
683
684 if (index >= int_th)
685 ds_overflow(task, context, qual);
686 }
267 687
268 return index_offset_in_bytes / ds_cfg.sizeof_bts; 688 out:
689 ds_put_context(context);
690 return error;
269} 691}
270 692
271int ds_set_overflow(void *ds, int method) 693int ds_write_bts(struct task_struct *task, const void *record, size_t size)
272{ 694{
273 switch (method) { 695 return ds_write(task, record, size, ds_bts, /* force = */ 0);
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281} 696}
282 697
283int ds_get_overflow(void *ds) 698int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
284{ 699{
285 return DS_O_WRAP; 700 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
286} 701}
287 702
288int ds_clear(void *ds) 703int ds_unchecked_write_bts(struct task_struct *task,
704 const void *record, size_t size)
289{ 705{
290 int bts_size = ds_get_bts_size(ds); 706 return ds_write(task, record, size, ds_bts, /* force = */ 1);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301} 707}
302 708
303int ds_read_bts(void *ds, int index, struct bts_struct *out) 709int ds_unchecked_write_pebs(struct task_struct *task,
710 const void *record, size_t size)
304{ 711{
305 void *bts; 712 return ds_write(task, record, size, ds_pebs, /* force = */ 1);
713}
306 714
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 715static int ds_reset_or_clear(struct task_struct *task,
308 return -EOPNOTSUPP; 716 enum ds_qualifier qual, int clear)
717{
718 struct ds_context *context;
719 unsigned long base, end;
720 int error;
309 721
310 if (index < 0) 722 context = ds_get_context(task);
311 return -EINVAL; 723 error = ds_validate_access(context, qual);
724 if (error < 0)
725 goto out;
312 726
313 if (index >= ds_get_bts_size(ds)) 727 base = ds_get(context->ds, qual, ds_buffer_base);
314 return -EINVAL; 728 end = ds_get(context->ds, qual, ds_absolute_maximum);
315 729
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); 730 if (clear)
731 memset((void *)base, 0, end - base);
317 732
318 memset(out, 0, sizeof(*out)); 733 ds_set(context->ds, qual, ds_index, base);
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327 734
328 return sizeof(*out);; 735 error = 0;
736 out:
737 ds_put_context(context);
738 return error;
329} 739}
330 740
331int ds_write_bts(void *ds, const struct bts_struct *in) 741int ds_reset_bts(struct task_struct *task)
332{ 742{
333 unsigned long bts; 743 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
334 744}
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340 745
341 bts = get_bts_index(ds); 746int ds_reset_pebs(struct task_struct *task)
747{
748 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
749}
342 750
343 memset((void *)bts, 0, ds_cfg.sizeof_bts); 751int ds_clear_bts(struct task_struct *task)
344 switch (in->qualifier) { 752{
345 case BTS_INVALID: 753 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
346 break; 754}
347 755
348 case BTS_BRANCH: 756int ds_clear_pebs(struct task_struct *task)
349 set_from_ip((void *)bts, in->variant.lbr.from_ip); 757{
350 set_to_ip((void *)bts, in->variant.lbr.to_ip); 758 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
351 break; 759}
352 760
353 case BTS_TASK_ARRIVES: 761int ds_get_pebs_reset(struct task_struct *task, u64 *value)
354 case BTS_TASK_DEPARTS: 762{
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); 763 struct ds_context *context;
356 set_info_type((void *)bts, in->qualifier); 764 int error;
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359 765
360 default: 766 if (!value)
361 return -EINVAL; 767 return -EINVAL;
362 }
363 768
364 bts = bts + ds_cfg.sizeof_bts; 769 context = ds_get_context(task);
365 if (bts >= get_bts_absolute_maximum(ds)) 770 error = ds_validate_access(context, ds_pebs);
366 bts = get_bts_buffer_base(ds); 771 if (error < 0)
367 set_bts_index(ds, bts); 772 goto out;
368 773
369 return ds_cfg.sizeof_bts; 774 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
775
776 error = 0;
777 out:
778 ds_put_context(context);
779 return error;
370} 780}
371 781
372unsigned long ds_debugctl_mask(void) 782int ds_set_pebs_reset(struct task_struct *task, u64 value)
373{ 783{
374 return ds_cfg.debugctl_mask; 784 struct ds_context *context;
375} 785 int error;
376 786
377#ifdef __i386__ 787 context = ds_get_context(task);
378static const struct ds_configuration ds_cfg_netburst = { 788 error = ds_validate_access(context, ds_pebs);
379 .sizeof_ds = 9 * 4, 789 if (error < 0)
380 .bts_buffer_base = { 0, 4 }, 790 goto out;
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391 791
392static const struct ds_configuration ds_cfg_pentium_m = { 792 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
393 .sizeof_ds = 9 * 4, 793
394 .bts_buffer_base = { 0, 4 }, 794 error = 0;
395 .bts_index = { 4, 4 }, 795 out:
396 .bts_absolute_maximum = { 8, 4 }, 796 ds_put_context(context);
397 .bts_interrupt_threshold = { 12, 4 }, 797 return error;
398 .sizeof_bts = 3 * 4, 798}
399 .from_ip = { 0, 4 }, 799
400 .to_ip = { 4, 4 }, 800static const struct ds_configuration ds_cfg_var = {
401 .info_type = { 4, 1 }, 801 .sizeof_ds = sizeof(long) * 12,
402 .info_data = { 8, 4 }, 802 .sizeof_field = sizeof(long),
403 .debugctl_mask = (1<<6)|(1<<7) 803 .sizeof_rec[ds_bts] = sizeof(long) * 3,
804 .sizeof_rec[ds_pebs] = sizeof(long) * 10
404}; 805};
405#endif /* _i386_ */ 806static const struct ds_configuration ds_cfg_64 = {
406 807 .sizeof_ds = 8 * 12,
407static const struct ds_configuration ds_cfg_core2 = { 808 .sizeof_field = 8,
408 .sizeof_ds = 9 * 8, 809 .sizeof_rec[ds_bts] = 8 * 3,
409 .bts_buffer_base = { 0, 8 }, 810 .sizeof_rec[ds_pebs] = 8 * 10
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419}; 811};
420 812
421static inline void 813static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
429 switch (c->x86) { 821 switch (c->x86) {
430 case 0x6: 822 case 0x6:
431 switch (c->x86_model) { 823 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD: 824 case 0xD:
434 case 0xE: /* Pentium M */ 825 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m); 826 ds_configure(&ds_cfg_var);
436 break; 827 break;
437#endif /* _i386_ */
438 case 0xF: /* Core2 */ 828 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2); 829 case 0x1C: /* Atom */
830 ds_configure(&ds_cfg_64);
440 break; 831 break;
441 default: 832 default:
442 /* sorry, don't know about them */ 833 /* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
445 break; 836 break;
446 case 0xF: 837 case 0xF:
447 switch (c->x86_model) { 838 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0: 839 case 0x0:
450 case 0x1: 840 case 0x1:
451 case 0x2: /* Netburst */ 841 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst); 842 ds_configure(&ds_cfg_var);
453 break; 843 break;
454#endif /* _i386_ */
455 default: 844 default:
456 /* sorry, don't know about them */ 845 /* sorry, don't know about them */
457 break; 846 break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
462 break; 851 break;
463 } 852 }
464} 853}
854
855void ds_free(struct ds_context *context)
856{
857 /* This is called when the task owning the parameter context
858 * is dying. There should not be any user of that context left
 859 * to disturb us anymore. */
860 unsigned long leftovers = context->count;
861 while (leftovers--)
862 ds_put_context(context);
863}
864#endif /* CONFIG_X86_DS */
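Taken together, the functions above form a request/release style interface. A hedged sketch of a caller (the function below is hypothetical and error handling is trimmed; per the header comment the caller must hold a reference on task and be allowed to trace it):

	/* Hypothetical consumer of the interface above. Passing a NULL
	 * buffer asks ds_request_bts() to allocate one; the overflow
	 * callback must be NULL for now, as notification is not yet
	 * implemented. */
	static int example_bts_session(struct task_struct *task)
	{
		size_t nrec;
		const void *rec;
		int err;

		err = ds_request_bts(task, NULL, PAGE_SIZE, NULL);
		if (err < 0)
			return err;

		/* ... let the task run for a while ... */

		ds_get_bts_index(task, &nrec);		/* records written */
		if (nrec)
			ds_access_bts(task, 0, &rec);	/* first record */

		return ds_release_bts(task);
	}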
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9af89078f7bb..78e642feac30 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -148,6 +148,9 @@ void __init e820_print_map(char *who)
148 case E820_NVS: 148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n"); 149 printk(KERN_CONT "(ACPI NVS)\n");
150 break; 150 break;
151 case E820_UNUSABLE:
 152 printk(KERN_CONT "(unusable)\n");
153 break;
151 default: 154 default:
152 printk(KERN_CONT "type %u\n", e820.map[i].type); 155 printk(KERN_CONT "type %u\n", e820.map[i].type);
153 break; 156 break;
@@ -1203,7 +1206,7 @@ static int __init parse_memmap_opt(char *p)
1203 if (!p) 1206 if (!p)
1204 return -EINVAL; 1207 return -EINVAL;
1205 1208
1206 if (!strcmp(p, "exactmap")) { 1209 if (!strncmp(p, "exactmap", 8)) {
1207#ifdef CONFIG_CRASH_DUMP 1210#ifdef CONFIG_CRASH_DUMP
1208 /* 1211 /*
1209 * If we are doing a crash dump, we still need to know 1212 * If we are doing a crash dump, we still need to know
@@ -1260,6 +1263,7 @@ static inline const char *e820_type_to_string(int e820_type)
1260 case E820_RAM: return "System RAM"; 1263 case E820_RAM: return "System RAM";
1261 case E820_ACPI: return "ACPI Tables"; 1264 case E820_ACPI: return "ACPI Tables";
1262 case E820_NVS: return "ACPI Non-volatile Storage"; 1265 case E820_NVS: return "ACPI Non-volatile Storage";
1266 case E820_UNUSABLE: return "Unusable memory";
1263 default: return "reserved"; 1267 default: return "reserved";
1264 } 1268 }
1265} 1269}
@@ -1267,6 +1271,7 @@ static inline const char *e820_type_to_string(int e820_type)
1267/* 1271/*
1268 * Mark e820 reserved areas as busy for the resource manager. 1272 * Mark e820 reserved areas as busy for the resource manager.
1269 */ 1273 */
1274static struct resource __initdata *e820_res;
1270void __init e820_reserve_resources(void) 1275void __init e820_reserve_resources(void)
1271{ 1276{
1272 int i; 1277 int i;
@@ -1274,6 +1279,7 @@ void __init e820_reserve_resources(void)
1274 u64 end; 1279 u64 end;
1275 1280
1276 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); 1281 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
1282 e820_res = res;
1277 for (i = 0; i < e820.nr_map; i++) { 1283 for (i = 0; i < e820.nr_map; i++) {
1278 end = e820.map[i].addr + e820.map[i].size - 1; 1284 end = e820.map[i].addr + e820.map[i].size - 1;
1279#ifndef CONFIG_RESOURCES_64BIT 1285#ifndef CONFIG_RESOURCES_64BIT
@@ -1287,7 +1293,14 @@ void __init e820_reserve_resources(void)
1287 res->end = end; 1293 res->end = end;
1288 1294
1289 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 1295 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1290 insert_resource(&iomem_resource, res); 1296
1297 /*
 1298 * don't register regions that could conflict with PCI device
 1299 * BAR resources; insert them later, in
 1300 * pcibios_resource_survey()
1301 */
1302 if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20))
1303 insert_resource(&iomem_resource, res);
1291 res++; 1304 res++;
1292 } 1305 }
1293 1306
@@ -1299,6 +1312,19 @@ void __init e820_reserve_resources(void)
1299 } 1312 }
1300} 1313}
1301 1314
1315void __init e820_reserve_resources_late(void)
1316{
1317 int i;
1318 struct resource *res;
1319
1320 res = e820_res;
1321 for (i = 0; i < e820.nr_map; i++) {
1322 if (!res->parent && res->end)
1323 reserve_region_with_split(&iomem_resource, res->start, res->end, res->name);
1324 res++;
1325 }
1326}
1327
1302char *__init default_machine_specific_memory_setup(void) 1328char *__init default_machine_specific_memory_setup(void)
1303{ 1329{
1304 char *who = "BIOS-e820"; 1330 char *who = "BIOS-e820";
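The net effect of the two hunks above is a two-phase registration: e820_reserve_resources() now skips E820_RESERVED entries above 1MiB, the ones most likely to collide with PCI BARs, and the new e820_reserve_resources_late() picks up whatever is still unclaimed. A rough sketch of the intended ordering (the late call site is the one named in the comment above; the wrapper function itself is hypothetical):

	static void __init example_resource_setup(void)
	{
		e820_reserve_resources();	/* early: all entries except
						 * E820_RESERVED above 1MiB */

		/* ... PCI enumeration assigns BARs ... */

		e820_reserve_resources_late();	/* e.g. from pcibios_resource_survey():
						 * register the deferred entries via
						 * reserve_region_with_split() */
	}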
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 4353cf5e6fac..733c4f8d42ea 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -95,6 +95,66 @@ static void __init nvidia_bugs(int num, int slot, int func)
95 95
96} 96}
97 97
98static u32 ati_ixp4x0_rev(int num, int slot, int func)
99{
100 u32 d;
101 u8 b;
102
103 b = read_pci_config_byte(num, slot, func, 0xac);
104 b &= ~(1<<5);
105 write_pci_config_byte(num, slot, func, 0xac, b);
106
107 d = read_pci_config(num, slot, func, 0x70);
108 d |= 1<<8;
109 write_pci_config(num, slot, func, 0x70, d);
110
111 d = read_pci_config(num, slot, func, 0x8);
112 d &= 0xff;
113 return d;
114}
115
116static void __init ati_bugs(int num, int slot, int func)
117{
118#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC)
119 u32 d;
120 u8 b;
121
122 if (acpi_use_timer_override)
123 return;
124
125 d = ati_ixp4x0_rev(num, slot, func);
126 if (d < 0x82)
127 acpi_skip_timer_override = 1;
128 else {
129 /* check for IRQ0 interrupt swap */
130 outb(0x72, 0xcd6); b = inb(0xcd7);
131 if (!(b & 0x2))
132 acpi_skip_timer_override = 1;
133 }
134
135 if (acpi_skip_timer_override) {
136 printk(KERN_INFO "SB4X0 revision 0x%x\n", d);
137 printk(KERN_INFO "Ignoring ACPI timer override.\n");
 138 printk(KERN_INFO "If you get timer trouble "
139 "try acpi_use_timer_override\n");
140 }
141#endif
142}
143
144#ifdef CONFIG_DMAR
145static void __init intel_g33_dmar(int num, int slot, int func)
146{
147 struct acpi_table_header *dmar_tbl;
148 acpi_status status;
149
150 status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
151 if (ACPI_SUCCESS(status)) {
152 printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
153 dmar_disabled = 1;
154 }
155}
156#endif
157
98#define QFLAG_APPLY_ONCE 0x1 158#define QFLAG_APPLY_ONCE 0x1
99#define QFLAG_APPLIED 0x2 159#define QFLAG_APPLIED 0x2
100#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 160#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -114,6 +174,12 @@ static struct chipset early_qrk[] __initdata = {
114 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 174 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
115 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 175 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
116 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 176 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
177 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
178 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
179#ifdef CONFIG_DMAR
180 { PCI_VENDOR_ID_INTEL, 0x29c0,
181 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
182#endif
117 {} 183 {}
118}; 184};
119 185
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index ff9e7350da54..34ad997d3834 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -3,11 +3,19 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/screen_info.h> 5#include <linux/screen_info.h>
6#include <linux/usb/ch9.h>
7#include <linux/pci_regs.h>
8#include <linux/pci_ids.h>
9#include <linux/errno.h>
6#include <asm/io.h> 10#include <asm/io.h>
7#include <asm/processor.h> 11#include <asm/processor.h>
8#include <asm/fcntl.h> 12#include <asm/fcntl.h>
9#include <asm/setup.h> 13#include <asm/setup.h>
10#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h>
16#include <asm/pgtable.h>
17#include <asm/fixmap.h>
18#include <linux/usb/ehci_def.h>
11 19
12/* Simple VGA output */ 20/* Simple VGA output */
13#define VGABASE (__ISA_IO_base + 0xb8000) 21#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
78static int early_serial_putc(unsigned char ch) 86static int early_serial_putc(unsigned char ch)
79{ 87{
80 unsigned timeout = 0xffff; 88 unsigned timeout = 0xffff;
89
81 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) 90 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
82 cpu_relax(); 91 cpu_relax();
83 outb(ch, early_serial_base + TXR); 92 outb(ch, early_serial_base + TXR);
@@ -111,7 +120,7 @@ static __init void early_serial_init(char *s)
111 if (!strncmp(s, "0x", 2)) { 120 if (!strncmp(s, "0x", 2)) {
112 early_serial_base = simple_strtoul(s, &e, 16); 121 early_serial_base = simple_strtoul(s, &e, 16);
113 } else { 122 } else {
114 static int bases[] = { 0x3f8, 0x2f8 }; 123 static const int __initconst bases[] = { 0x3f8, 0x2f8 };
115 124
116 if (!strncmp(s, "ttyS", 4)) 125 if (!strncmp(s, "ttyS", 4))
117 s += 4; 126 s += 4;
@@ -151,6 +160,721 @@ static struct console early_serial_console = {
151 .index = -1, 160 .index = -1,
152}; 161};
153 162
163#ifdef CONFIG_EARLY_PRINTK_DBGP
164
165static struct ehci_caps __iomem *ehci_caps;
166static struct ehci_regs __iomem *ehci_regs;
167static struct ehci_dbg_port __iomem *ehci_debug;
168static unsigned int dbgp_endpoint_out;
169
170struct ehci_dev {
171 u32 bus;
172 u32 slot;
173 u32 func;
174};
175
176static struct ehci_dev ehci_dev;
177
178#define USB_DEBUG_DEVNUM 127
179
180#define DBGP_DATA_TOGGLE 0x8800
181
182static inline u32 dbgp_pid_update(u32 x, u32 tok)
183{
184 return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
185}
186
187static inline u32 dbgp_len_update(u32 x, u32 len)
188{
189 return (x & ~0x0f) | (len & 0x0f);
190}
191
192/*
193 * USB Packet IDs (PIDs)
194 */
195
196/* token */
197#define USB_PID_OUT 0xe1
198#define USB_PID_IN 0x69
199#define USB_PID_SOF 0xa5
200#define USB_PID_SETUP 0x2d
201/* handshake */
202#define USB_PID_ACK 0xd2
203#define USB_PID_NAK 0x5a
204#define USB_PID_STALL 0x1e
205#define USB_PID_NYET 0x96
206/* data */
207#define USB_PID_DATA0 0xc3
208#define USB_PID_DATA1 0x4b
209#define USB_PID_DATA2 0x87
210#define USB_PID_MDATA 0x0f
211/* Special */
212#define USB_PID_PREAMBLE 0x3c
213#define USB_PID_ERR 0x3c
214#define USB_PID_SPLIT 0x78
215#define USB_PID_PING 0xb4
216#define USB_PID_UNDEF_0 0xf0
217
218#define USB_PID_DATA_TOGGLE 0x88
219#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
220
221#define PCI_CAP_ID_EHCI_DEBUG 0xa
222
223#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
224#define HUB_SHORT_RESET_TIME 10
225#define HUB_LONG_RESET_TIME 200
226#define HUB_RESET_TIMEOUT 500
227
228#define DBGP_MAX_PACKET 8
229
230static int dbgp_wait_until_complete(void)
231{
232 u32 ctrl;
233 int loop = 0x100000;
234
235 do {
236 ctrl = readl(&ehci_debug->control);
237 /* Stop when the transaction is finished */
238 if (ctrl & DBGP_DONE)
239 break;
240 } while (--loop > 0);
241
242 if (!loop)
243 return -1;
244
245 /*
246 * Now that we have observed the completed transaction,
247 * clear the done bit.
248 */
249 writel(ctrl | DBGP_DONE, &ehci_debug->control);
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251}
252
253static void dbgp_mdelay(int ms)
254{
255 int i;
256
257 while (ms--) {
258 for (i = 0; i < 1000; i++)
259 outb(0x1, 0x80);
260 }
261}
262
263static void dbgp_breath(void)
264{
265 /* Sleep to give the debug port a chance to breathe */
266}
267
268static int dbgp_wait_until_done(unsigned ctrl)
269{
270 u32 pids, lpid;
271 int ret;
272 int loop = 3;
273
274retry:
275 writel(ctrl | DBGP_GO, &ehci_debug->control);
276 ret = dbgp_wait_until_complete();
277 pids = readl(&ehci_debug->pids);
278 lpid = DBGP_PID_GET(pids);
279
280 if (ret < 0)
281 return ret;
282
283 /*
284 * If the port is getting full or it has dropped data
285 * start pacing ourselves, not necessary but it's friendly.
286 */
287 if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
288 dbgp_breath();
289
290 /* If I get a NACK reissue the transmission */
291 if (lpid == USB_PID_NAK) {
292 if (--loop > 0)
293 goto retry;
294 }
295
296 return ret;
297}
298
299static void dbgp_set_data(const void *buf, int size)
300{
301 const unsigned char *bytes = buf;
302 u32 lo, hi;
303 int i;
304
305 lo = hi = 0;
306 for (i = 0; i < 4 && i < size; i++)
307 lo |= bytes[i] << (8*i);
308 for (; i < 8 && i < size; i++)
309 hi |= bytes[i] << (8*(i - 4));
310 writel(lo, &ehci_debug->data03);
311 writel(hi, &ehci_debug->data47);
312}
313
314static void dbgp_get_data(void *buf, int size)
315{
316 unsigned char *bytes = buf;
317 u32 lo, hi;
318 int i;
319
320 lo = readl(&ehci_debug->data03);
321 hi = readl(&ehci_debug->data47);
322 for (i = 0; i < 4 && i < size; i++)
323 bytes[i] = (lo >> (8*i)) & 0xff;
324 for (; i < 8 && i < size; i++)
325 bytes[i] = (hi >> (8*(i - 4))) & 0xff;
326}
327
328static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
329 const char *bytes, int size)
330{
331 u32 pids, addr, ctrl;
332 int ret;
333
334 if (size > DBGP_MAX_PACKET)
335 return -1;
336
337 addr = DBGP_EPADDR(devnum, endpoint);
338
339 pids = readl(&ehci_debug->pids);
340 pids = dbgp_pid_update(pids, USB_PID_OUT);
341
342 ctrl = readl(&ehci_debug->control);
343 ctrl = dbgp_len_update(ctrl, size);
344 ctrl |= DBGP_OUT;
345 ctrl |= DBGP_GO;
346
347 dbgp_set_data(bytes, size);
348 writel(addr, &ehci_debug->address);
349 writel(pids, &ehci_debug->pids);
350
351 ret = dbgp_wait_until_done(ctrl);
352 if (ret < 0)
353 return ret;
354
355 return ret;
356}
357
358static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size)
360{
361 u32 pids, addr, ctrl;
362 int ret;
363
364 if (size > DBGP_MAX_PACKET)
365 return -1;
366
367 addr = DBGP_EPADDR(devnum, endpoint);
368
369 pids = readl(&ehci_debug->pids);
370 pids = dbgp_pid_update(pids, USB_PID_IN);
371
372 ctrl = readl(&ehci_debug->control);
373 ctrl = dbgp_len_update(ctrl, size);
374 ctrl &= ~DBGP_OUT;
375 ctrl |= DBGP_GO;
376
377 writel(addr, &ehci_debug->address);
378 writel(pids, &ehci_debug->pids);
379 ret = dbgp_wait_until_done(ctrl);
380 if (ret < 0)
381 return ret;
382
383 if (size > ret)
384 size = ret;
385 dbgp_get_data(data, size);
386 return ret;
387}
388
389static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
390 int value, int index, void *data, int size)
391{
392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req;
394 int read;
395 int ret;
396
397 read = (requesttype & USB_DIR_IN) != 0;
398 if (size > (read ? DBGP_MAX_PACKET:0))
399 return -1;
400
401 /* Compute the control message */
402 req.bRequestType = requesttype;
403 req.bRequest = request;
404 req.wValue = cpu_to_le16(value);
405 req.wIndex = cpu_to_le16(index);
406 req.wLength = cpu_to_le16(size);
407
408 pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
409 addr = DBGP_EPADDR(devnum, 0);
410
411 ctrl = readl(&ehci_debug->control);
412 ctrl = dbgp_len_update(ctrl, sizeof(req));
413 ctrl |= DBGP_OUT;
414 ctrl |= DBGP_GO;
415
416 /* Send the setup message */
417 dbgp_set_data(&req, sizeof(req));
418 writel(addr, &ehci_debug->address);
419 writel(pids, &ehci_debug->pids);
420 ret = dbgp_wait_until_done(ctrl);
421 if (ret < 0)
422 return ret;
423
424 /* Read the result */
425 return dbgp_bulk_read(devnum, 0, data, size);
426}
427
428
429/* Find a PCI capability */
430static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
431{
432 u8 pos;
433 int bytes;
434
435 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
436 PCI_STATUS_CAP_LIST))
437 return 0;
438
439 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
440 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
441 u8 id;
442
443 pos &= ~3;
444 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
445 if (id == 0xff)
446 break;
447 if (id == cap)
448 return pos;
449
450 pos = read_pci_config_byte(num, slot, func,
451 pos+PCI_CAP_LIST_NEXT);
452 }
453 return 0;
454}
455
456static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
457{
458 u32 class;
459
460 class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
461 if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
462 return 0;
463
464 return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
465}
466
467static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
468{
469 u32 bus, slot, func;
470
471 for (bus = 0; bus < 256; bus++) {
472 for (slot = 0; slot < 32; slot++) {
473 for (func = 0; func < 8; func++) {
474 unsigned cap;
475
476 cap = __find_dbgp(bus, slot, func);
477
478 if (!cap)
479 continue;
480 if (ehci_num-- != 0)
481 continue;
482 *rbus = bus;
483 *rslot = slot;
484 *rfunc = func;
485 return cap;
486 }
487 }
488 }
489 return 0;
490}
491
492static int ehci_reset_port(int port)
493{
494 u32 portsc;
495 u32 delay_time, delay;
496 int loop;
497
498 /* Reset the usb debug port */
499 portsc = readl(&ehci_regs->port_status[port - 1]);
500 portsc &= ~PORT_PE;
501 portsc |= PORT_RESET;
502 writel(portsc, &ehci_regs->port_status[port - 1]);
503
504 delay = HUB_ROOT_RESET_TIME;
505 for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
506 delay_time += delay) {
507 dbgp_mdelay(delay);
508
509 portsc = readl(&ehci_regs->port_status[port - 1]);
510 if (portsc & PORT_RESET) {
511 /* force reset to complete */
512 loop = 2;
513 writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
514 &ehci_regs->port_status[port - 1]);
515 do {
516 portsc = readl(&ehci_regs->port_status[port-1]);
517 } while ((portsc & PORT_RESET) && (--loop > 0));
518 }
519
520 /* Device went away? */
521 if (!(portsc & PORT_CONNECT))
522 return -ENOTCONN;
523
 524 /* bomb out completely if something weird happened */
525 if ((portsc & PORT_CSC))
526 return -EINVAL;
527
528 /* If we've finished resetting, then break out of the loop */
529 if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
530 return 0;
531 }
532 return -EBUSY;
533}
534
535static int ehci_wait_for_port(int port)
536{
537 u32 status;
538 int ret, reps;
539
540 for (reps = 0; reps < 3; reps++) {
541 dbgp_mdelay(100);
542 status = readl(&ehci_regs->status);
543 if (status & STS_PCD) {
544 ret = ehci_reset_port(port);
545 if (ret == 0)
546 return 0;
547 }
548 }
549 return -ENOTCONN;
550}
551
552#ifdef DBGP_DEBUG
553# define dbgp_printk early_printk
554#else
555static inline void dbgp_printk(const char *fmt, ...) { }
556#endif
557
558typedef void (*set_debug_port_t)(int port);
559
560static void default_set_debug_port(int port)
561{
562}
563
564static set_debug_port_t set_debug_port = default_set_debug_port;
565
566static void nvidia_set_debug_port(int port)
567{
568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
570 0x74);
571 dword &= ~(0x0f<<12);
572 dword |= ((port & 0x0f)<<12);
573 write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
574 dword);
575 dbgp_printk("set debug port to %d\n", port);
576}
577
578static void __init detect_set_debug_port(void)
579{
580 u32 vendorid;
581
582 vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
583 0x00);
584
585 if ((vendorid & 0xffff) == 0x10de) {
586 dbgp_printk("using nvidia set_debug_port\n");
587 set_debug_port = nvidia_set_debug_port;
588 }
589}
590
591static int __init ehci_setup(void)
592{
593 struct usb_debug_descriptor dbgp_desc;
594 u32 cmd, ctrl, status, portsc, hcs_params;
595 u32 debug_port, new_debug_port = 0, n_ports;
596 u32 devnum;
597 int ret, i;
598 int loop;
599 int port_map_tried;
600 int playtimes = 3;
601
602try_next_time:
603 port_map_tried = 0;
604
605try_next_port:
606
607 hcs_params = readl(&ehci_caps->hcs_params);
608 debug_port = HCS_DEBUG_PORT(hcs_params);
609 n_ports = HCS_N_PORTS(hcs_params);
610
611 dbgp_printk("debug_port: %d\n", debug_port);
612 dbgp_printk("n_ports: %d\n", n_ports);
613
614 for (i = 1; i <= n_ports; i++) {
615 portsc = readl(&ehci_regs->port_status[i-1]);
616 dbgp_printk("portstatus%d: %08x\n", i, portsc);
617 }
618
619 if (port_map_tried && (new_debug_port != debug_port)) {
620 if (--playtimes) {
621 set_debug_port(new_debug_port);
622 goto try_next_time;
623 }
624 return -1;
625 }
626
627 loop = 10;
628 /* Reset the EHCI controller */
629 cmd = readl(&ehci_regs->command);
630 cmd |= CMD_RESET;
631 writel(cmd, &ehci_regs->command);
632 do {
633 cmd = readl(&ehci_regs->command);
634 } while ((cmd & CMD_RESET) && (--loop > 0));
635
636 if (!loop) {
637 dbgp_printk("can not reset ehci\n");
638 return -1;
639 }
640 dbgp_printk("ehci reset done\n");
641
642 /* Claim ownership, but do not enable yet */
643 ctrl = readl(&ehci_debug->control);
644 ctrl |= DBGP_OWNER;
645 ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
646 writel(ctrl, &ehci_debug->control);
647
648 /* Start the ehci running */
649 cmd = readl(&ehci_regs->command);
650 cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
651 cmd |= CMD_RUN;
652 writel(cmd, &ehci_regs->command);
653
654 /* Ensure everything is routed to the EHCI */
655 writel(FLAG_CF, &ehci_regs->configured_flag);
656
657 /* Wait until the controller is no longer halted */
658 loop = 10;
659 do {
660 status = readl(&ehci_regs->status);
661 } while ((status & STS_HALT) && (--loop > 0));
662
663 if (!loop) {
 664 dbgp_printk("ehci can not be started\n");
665 return -1;
666 }
667 dbgp_printk("ehci started\n");
668
669 /* Wait for a device to show up in the debug port */
670 ret = ehci_wait_for_port(debug_port);
671 if (ret < 0) {
672 dbgp_printk("No device found in debug port\n");
673 goto next_debug_port;
674 }
675 dbgp_printk("ehci wait for port done\n");
676
677 /* Enable the debug port */
678 ctrl = readl(&ehci_debug->control);
679 ctrl |= DBGP_CLAIM;
680 writel(ctrl, &ehci_debug->control);
681 ctrl = readl(&ehci_debug->control);
682 if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
683 dbgp_printk("No device in debug port\n");
684 writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
685 goto err;
686 }
 687 dbgp_printk("debug port enabled\n");
688
689 /* Completely transfer the debug device to the debug controller */
690 portsc = readl(&ehci_regs->port_status[debug_port - 1]);
691 portsc &= ~PORT_PE;
692 writel(portsc, &ehci_regs->port_status[debug_port - 1]);
693
694 dbgp_mdelay(100);
695
696 /* Find the debug device and make it device number 127 */
697 for (devnum = 0; devnum <= 127; devnum++) {
698 ret = dbgp_control_msg(devnum,
699 USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
700 USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
701 &dbgp_desc, sizeof(dbgp_desc));
702 if (ret > 0)
703 break;
704 }
705 if (devnum > 127) {
706 dbgp_printk("Could not find attached debug device\n");
707 goto err;
708 }
709 if (ret < 0) {
710 dbgp_printk("Attached device is not a debug device\n");
711 goto err;
712 }
713 dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
714
715 /* Move the device to 127 if it isn't already there */
716 if (devnum != USB_DEBUG_DEVNUM) {
717 ret = dbgp_control_msg(devnum,
718 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
719 USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
720 if (ret < 0) {
721 dbgp_printk("Could not move attached device to %d\n",
722 USB_DEBUG_DEVNUM);
723 goto err;
724 }
725 devnum = USB_DEBUG_DEVNUM;
726 dbgp_printk("debug device renamed to 127\n");
727 }
728
729 /* Enable the debug interface */
730 ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
731 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
732 USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
733 if (ret < 0) {
734 dbgp_printk(" Could not enable the debug device\n");
735 goto err;
736 }
737 dbgp_printk("debug interface enabled\n");
738
739 /* Perform a small write to get the even/odd data state in sync
740 */
741 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
742 if (ret < 0) {
743 dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
744 goto err;
745 }
 746 dbgp_printk("small write done\n");
747
748 return 0;
749err:
750 /* Things didn't work so remove my claim */
751 ctrl = readl(&ehci_debug->control);
752 ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
753 writel(ctrl, &ehci_debug->control);
754 return -1;
755
756next_debug_port:
757 port_map_tried |= (1<<(debug_port - 1));
758 new_debug_port = ((debug_port-1+1)%n_ports) + 1;
759 if (port_map_tried != ((1<<n_ports) - 1)) {
760 set_debug_port(new_debug_port);
761 goto try_next_port;
762 }
763 if (--playtimes) {
764 set_debug_port(new_debug_port);
765 goto try_next_time;
766 }
767
768 return -1;
769}
770
771static int __init early_dbgp_init(char *s)
772{
773 u32 debug_port, bar, offset;
774 u32 bus, slot, func, cap;
775 void __iomem *ehci_bar;
776 u32 dbgp_num;
777 u32 bar_val;
778 char *e;
779 int ret;
780 u8 byte;
781
782 if (!early_pci_allowed())
783 return -1;
784
785 dbgp_num = 0;
786 if (*s)
787 dbgp_num = simple_strtoul(s, &e, 10);
788 dbgp_printk("dbgp_num: %d\n", dbgp_num);
789
790 cap = find_dbgp(dbgp_num, &bus, &slot, &func);
791 if (!cap)
792 return -1;
793
794 dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
795 func);
796
797 debug_port = read_pci_config(bus, slot, func, cap);
798 bar = (debug_port >> 29) & 0x7;
799 bar = (bar * 4) + 0xc;
800 offset = (debug_port >> 16) & 0xfff;
801 dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
802 if (bar != PCI_BASE_ADDRESS_0) {
803 dbgp_printk("only debug ports on bar 1 handled.\n");
804
805 return -1;
806 }
807
808 bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
809 dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
810 if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
811 dbgp_printk("only simple 32bit mmio bars supported\n");
812
813 return -1;
814 }
815
816 /* double check if the mem space is enabled */
817 byte = read_pci_config_byte(bus, slot, func, 0x04);
818 if (!(byte & 0x2)) {
819 byte |= 0x02;
820 write_pci_config_byte(bus, slot, func, 0x04, byte);
821 dbgp_printk("mmio for ehci enabled\n");
822 }
823
824 /*
825 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
826 * than enough. 1K is the biggest I have seen.
827 */
828 set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
829 ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
830 ehci_bar += bar_val & ~PAGE_MASK;
831 dbgp_printk("ehci_bar: %p\n", ehci_bar);
832
833 ehci_caps = ehci_bar;
834 ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
835 ehci_debug = ehci_bar + offset;
836 ehci_dev.bus = bus;
837 ehci_dev.slot = slot;
838 ehci_dev.func = func;
839
840 detect_set_debug_port();
841
842 ret = ehci_setup();
843 if (ret < 0) {
844 dbgp_printk("ehci_setup failed\n");
845 ehci_debug = NULL;
846
847 return -1;
848 }
849
850 return 0;
851}
852
853static void early_dbgp_write(struct console *con, const char *str, u32 n)
854{
855 int chunk, ret;
856
857 if (!ehci_debug)
858 return;
859 while (n > 0) {
860 chunk = n;
861 if (chunk > DBGP_MAX_PACKET)
862 chunk = DBGP_MAX_PACKET;
863 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
864 dbgp_endpoint_out, str, chunk);
865 str += chunk;
866 n -= chunk;
867 }
868}
869
870static struct console early_dbgp_console = {
871 .name = "earlydbg",
872 .write = early_dbgp_write,
873 .flags = CON_PRINTBUFFER,
874 .index = -1,
875};
876#endif
877
154/* Console interface to a host file on AMD's SimNow! */ 878/* Console interface to a host file on AMD's SimNow! */
155 879
156static int simnow_fd; 880static int simnow_fd;
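One detail of the debug-port protocol above that is easy to miss: dbgp_pid_update() XORs DBGP_DATA_TOGGLE (0x8800) into the pids register, so each bulk read or write flips the data PID byte between DATA0 and DATA1 while the low byte carries the new token. Written out with the constants defined above (illustration only):

	/* USB_PID_DATA0 = 0xc3, USB_PID_DATA_TOGGLE = 0x88 */
	0xc3 ^ 0x88 == 0x4b	/* DATA0 -> DATA1 */
	0x4b ^ 0x88 == 0xc3	/* DATA1 -> DATA0 */

This is also why ehci_setup() ends with a one-byte dummy write: it brings the host's even/odd data state in sync with the device before the console starts streaming.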
@@ -165,6 +889,7 @@ enum {
165static noinline long simnow(long cmd, long a, long b, long c) 889static noinline long simnow(long cmd, long a, long b, long c)
166{ 890{
167 long ret; 891 long ret;
892
168 asm volatile("cpuid" : 893 asm volatile("cpuid" :
169 "=a" (ret) : 894 "=a" (ret) :
170 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); 895 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
174static void __init simnow_init(char *str) 899static void __init simnow_init(char *str)
175{ 900{
176 char *fn = "klog"; 901 char *fn = "klog";
902
177 if (*str == '=') 903 if (*str == '=')
178 fn = ++str; 904 fn = ++str;
179 /* error ignored */ 905 /* error ignored */
@@ -194,7 +920,7 @@ static struct console simnow_console = {
194 920
195/* Direct interface for emergencies */ 921/* Direct interface for emergencies */
196static struct console *early_console = &early_vga_console; 922static struct console *early_console = &early_vga_console;
197static int early_console_initialized; 923static int __initdata early_console_initialized;
198 924
199asmlinkage void early_printk(const char *fmt, ...) 925asmlinkage void early_printk(const char *fmt, ...)
200{ 926{
@@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...)
208 va_end(ap); 934 va_end(ap);
209} 935}
210 936
211static int __initdata keep_early;
212 937
213static int __init setup_early_printk(char *buf) 938static int __init setup_early_printk(char *buf)
214{ 939{
940 int keep_early;
941
215 if (!buf) 942 if (!buf)
216 return 0; 943 return 0;
217 944
@@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf)
219 return 0; 946 return 0;
220 early_console_initialized = 1; 947 early_console_initialized = 1;
221 948
222 if (strstr(buf, "keep")) 949 keep_early = (strstr(buf, "keep") != NULL);
223 keep_early = 1;
224 950
225 if (!strncmp(buf, "serial", 6)) { 951 if (!strncmp(buf, "serial", 6)) {
226 early_serial_init(buf + 6); 952 early_serial_init(buf + 6);
@@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf)
238 simnow_init(buf + 6); 964 simnow_init(buf + 6);
239 early_console = &simnow_console; 965 early_console = &simnow_console;
240 keep_early = 1; 966 keep_early = 1;
967#ifdef CONFIG_EARLY_PRINTK_DBGP
968 } else if (!strncmp(buf, "dbgp", 4)) {
969 if (early_dbgp_init(buf+4) < 0)
970 return 0;
971 early_console = &early_dbgp_console;
972 /*
973		 * The USB subsystem will reset the EHCI controller, so don't
974		 * keep this early console around.
975 */
976 keep_early = 0;
977#endif
241#ifdef CONFIG_HVC_XEN 978#ifdef CONFIG_HVC_XEN
242 } else if (!strncmp(buf, "xen", 3)) { 979 } else if (!strncmp(buf, "xen", 3)) {
243 early_console = &xenboot_console; 980 early_console = &xenboot_console;
@@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf)
251 register_console(early_console); 988 register_console(early_console);
252 return 0; 989 return 0;
253} 990}
991
254early_param("earlyprintk", setup_early_printk); 992early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b1..945a31cdd81f 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -414,9 +414,11 @@ void __init efi_init(void)
414 if (memmap.map == NULL) 414 if (memmap.map == NULL)
415 printk(KERN_ERR "Could not map the EFI memory map!\n"); 415 printk(KERN_ERR "Could not map the EFI memory map!\n");
416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); 416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
417
417 if (memmap.desc_size != sizeof(efi_memory_desc_t)) 418 if (memmap.desc_size != sizeof(efi_memory_desc_t))
418 printk(KERN_WARNING "Kernel-defined memdesc" 419 printk(KERN_WARNING
419 "doesn't match the one from EFI!\n"); 420 "Kernel-defined memdesc doesn't match the one from EFI!\n");
421
420 if (add_efi_memmap) 422 if (add_efi_memmap)
421 do_add_efi_memmap(); 423 do_add_efi_memmap();
422 424
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index 4b63c8e1f13b..5cab48ee61a4 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -53,7 +53,7 @@ void efi_call_phys_prelog(void)
53 * directory. If I have PAE, I just need to duplicate one entry in 53 * directory. If I have PAE, I just need to duplicate one entry in
54 * page directory. 54 * page directory.
55 */ 55 */
56 cr4 = read_cr4(); 56 cr4 = read_cr4_safe();
57 57
58 if (cr4 & X86_CR4_PAE) { 58 if (cr4 & X86_CR4_PAE) {
59 efi_bak_pg_dir_pointer[0].pgd = 59 efi_bak_pg_dir_pointer[0].pgd =
@@ -91,7 +91,7 @@ void efi_call_phys_epilog(void)
91 gdt_descr.size = GDT_SIZE - 1; 91 gdt_descr.size = GDT_SIZE - 1;
92 load_gdt(&gdt_descr); 92 load_gdt(&gdt_descr);
93 93
94 cr4 = read_cr4(); 94 cr4 = read_cr4_safe();
95 95
96 if (cr4 & X86_CR4_PAE) { 96 if (cr4 & X86_CR4_PAE) {
97 swapper_pg_dir[pgd_index(0)].pgd = 97 swapper_pg_dir[pgd_index(0)].pgd =
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 89434d439605..cf3a0b2d0059 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -275,9 +275,9 @@ ENTRY(native_usergs_sysret64)
275ENTRY(ret_from_fork) 275ENTRY(ret_from_fork)
276 CFI_DEFAULT_STACK 276 CFI_DEFAULT_STACK
277 push kernel_eflags(%rip) 277 push kernel_eflags(%rip)
278 CFI_ADJUST_CFA_OFFSET 4 278 CFI_ADJUST_CFA_OFFSET 8
279 popf # reset kernel eflags 279 popf # reset kernel eflags
280 CFI_ADJUST_CFA_OFFSET -4 280 CFI_ADJUST_CFA_OFFSET -8
281 call schedule_tail 281 call schedule_tail
282 GET_THREAD_INFO(%rcx) 282 GET_THREAD_INFO(%rcx)
283 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 283 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/kernel/es7000_32.c
index 50189af14b85..849e5cd485b8 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -39,10 +39,93 @@
39#include <asm/nmi.h> 39#include <asm/nmi.h>
40#include <asm/smp.h> 40#include <asm/smp.h>
41#include <asm/apicdef.h> 41#include <asm/apicdef.h>
42#include "es7000.h"
43#include <mach_mpparse.h> 42#include <mach_mpparse.h>
44 43
45/* 44/*
45 * ES7000 chipsets
46 */
47
48#define NON_UNISYS 0
49#define ES7000_CLASSIC 1
50#define ES7000_ZORRO 2
51
52
53#define MIP_REG 1
54#define MIP_PSAI_REG 4
55
56#define MIP_BUSY 1
57#define MIP_SPIN 0xf0000
58#define MIP_VALID 0x0100000000000000ULL
59#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
60
61#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
62
63struct mip_reg_info {
64 unsigned long long mip_info;
65 unsigned long long delivery_info;
66 unsigned long long host_reg;
67 unsigned long long mip_reg;
68};
69
70struct part_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char part_id;
74 unsigned char apic_mode;
75 unsigned long snum;
76 char ptype[16];
77 char sname[64];
78 char pname[64];
79};
80
81struct psai {
82 unsigned long long entry_type;
83 unsigned long long addr;
84 unsigned long long bep_addr;
85};
86
87struct es7000_mem_info {
88 unsigned char type;
89 unsigned char length;
90 unsigned char resv[6];
91 unsigned long long start;
92 unsigned long long size;
93};
94
95struct es7000_oem_table {
96 unsigned long long hdr;
97 struct mip_reg_info mip;
98 struct part_info pif;
99 struct es7000_mem_info shm;
100 struct psai psai;
101};
102
103#ifdef CONFIG_ACPI
104
105struct oem_table {
106 struct acpi_table_header Header;
107 u32 OEMTableAddr;
108 u32 OEMTableSize;
109};
110
111extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
112#endif
113
114struct mip_reg {
115 unsigned long long off_0;
116 unsigned long long off_8;
117 unsigned long long off_10;
118 unsigned long long off_18;
119 unsigned long long off_20;
120 unsigned long long off_28;
121 unsigned long long off_30;
122 unsigned long long off_38;
123};
124
125#define MIP_SW_APIC 0x1020b
126#define MIP_FUNC(VALUE) (VALUE & 0xff)
127
128/*
46 * ES7000 Globals 129 * ES7000 Globals
47 */ 130 */
48 131
@@ -72,7 +155,7 @@ es7000_rename_gsi(int ioapic, int gsi)
72 base += nr_ioapic_registers[i]; 155 base += nr_ioapic_registers[i];
73 } 156 }
74 157
75 if (!ioapic && (gsi < 16)) 158 if (!ioapic && (gsi < 16))
76 gsi += base; 159 gsi += base;
77 return gsi; 160 return gsi;
78} 161}
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 1fa8be5bd217..6c9bfc9e1e95 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -16,86 +16,63 @@
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/dmar.h>
19 20
20#include <asm/smp.h> 21#include <asm/smp.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
22#include <asm/genapic.h> 23#include <asm/genapic.h>
23 24
24#ifdef CONFIG_ACPI 25extern struct genapic apic_flat;
25#include <acpi/acpi_bus.h> 26extern struct genapic apic_physflat;
26#endif 27extern struct genapic apic_x2apic_uv_x;
27 28extern struct genapic apic_x2apic_phys;
28DEFINE_PER_CPU(int, x2apic_extra_bits); 29extern struct genapic apic_x2apic_cluster;
29 30
30struct genapic __read_mostly *genapic = &apic_flat; 31struct genapic __read_mostly *genapic = &apic_flat;
31 32
32static enum uv_system_type uv_system_type; 33static struct genapic *apic_probe[] __initdata = {
34 &apic_x2apic_uv_x,
35 &apic_x2apic_phys,
36 &apic_x2apic_cluster,
37 &apic_physflat,
38 NULL,
39};
33 40
34/* 41/*
35 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 42 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
36 */ 43 */
37void __init setup_apic_routing(void) 44void __init setup_apic_routing(void)
38{ 45{
39 if (uv_system_type == UV_NON_UNIQUE_APIC) 46 if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
40 genapic = &apic_x2apic_uv_x; 47 if (!intr_remapping_enabled)
41 else 48 genapic = &apic_flat;
42#ifdef CONFIG_ACPI 49 }
43 /*
44 * Quirk: some x86_64 machines can only use physical APIC mode
45 * regardless of how many processors are present (x86_64 ES7000
46 * is an example).
47 */
48 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
49 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
50 genapic = &apic_physflat;
51 else
52#endif
53
54 if (max_physical_apicid < 8)
55 genapic = &apic_flat;
56 else
57 genapic = &apic_physflat;
58 50
59 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 51 if (genapic == &apic_flat) {
52 if (max_physical_apicid >= 8)
53 genapic = &apic_physflat;
54 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
55 }
60} 56}
61 57
62/* Same for both flat and physical. */ 58/* Same for both flat and physical. */
63 59
64void send_IPI_self(int vector) 60void apic_send_IPI_self(int vector)
65{ 61{
66 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); 62 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
67} 63}
68 64
69int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 65int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
70{ 66{
71 if (!strcmp(oem_id, "SGI")) { 67 int i;
72 if (!strcmp(oem_table_id, "UVL")) 68
73 uv_system_type = UV_LEGACY_APIC; 69 for (i = 0; apic_probe[i]; ++i) {
74 else if (!strcmp(oem_table_id, "UVX")) 70 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
75 uv_system_type = UV_X2APIC; 71 genapic = apic_probe[i];
76 else if (!strcmp(oem_table_id, "UVH")) 72 printk(KERN_INFO "Setting APIC routing to %s.\n",
77 uv_system_type = UV_NON_UNIQUE_APIC; 73 genapic->name);
74 return 1;
75 }
78 } 76 }
79 return 0; 77 return 0;
80} 78}
81
82unsigned int read_apic_id(void)
83{
84 unsigned int id;
85
86 WARN_ON(preemptible() && num_online_cpus() > 1);
87 id = apic_read(APIC_ID);
88 if (uv_system_type >= UV_X2APIC)
89 id |= __get_cpu_var(x2apic_extra_bits);
90 return id;
91}
92
93enum uv_system_type get_uv_system_type(void)
94{
95 return uv_system_type;
96}
97
98int is_uv_system(void)
99{
100 return uv_system_type != UV_NONE;
101}
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 786548a62d38..9eca5ba7a6b1 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -15,9 +15,20 @@
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
19#include <asm/ipi.h> 20#include <asm/ipi.h>
20#include <asm/genapic.h> 21#include <asm/genapic.h>
22#include <mach_apicdef.h>
23
24#ifdef CONFIG_ACPI
25#include <acpi/acpi_bus.h>
26#endif
27
28static int __init flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
29{
30 return 1;
31}
21 32
22static cpumask_t flat_target_cpus(void) 33static cpumask_t flat_target_cpus(void)
23{ 34{
@@ -95,9 +106,33 @@ static void flat_send_IPI_all(int vector)
95 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); 106 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
96} 107}
97 108
109static unsigned int get_apic_id(unsigned long x)
110{
111 unsigned int id;
112
113 id = (((x)>>24) & 0xFFu);
114 return id;
115}
116
117static unsigned long set_apic_id(unsigned int id)
118{
119 unsigned long x;
120
121 x = ((id & 0xFFu)<<24);
122 return x;
123}
124
125static unsigned int read_xapic_id(void)
126{
127 unsigned int id;
128
129 id = get_apic_id(apic_read(APIC_ID));
130 return id;
131}
132
98static int flat_apic_id_registered(void) 133static int flat_apic_id_registered(void)
99{ 134{
100 return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map); 135 return physid_isset(read_xapic_id(), phys_cpu_present_map);
101} 136}
102 137
103static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) 138static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
@@ -112,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb)
112 147
113struct genapic apic_flat = { 148struct genapic apic_flat = {
114 .name = "flat", 149 .name = "flat",
150 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
115 .int_delivery_mode = dest_LowestPrio, 151 .int_delivery_mode = dest_LowestPrio,
116 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 152 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
117 .target_cpus = flat_target_cpus, 153 .target_cpus = flat_target_cpus,
@@ -121,8 +157,12 @@ struct genapic apic_flat = {
121 .send_IPI_all = flat_send_IPI_all, 157 .send_IPI_all = flat_send_IPI_all,
122 .send_IPI_allbutself = flat_send_IPI_allbutself, 158 .send_IPI_allbutself = flat_send_IPI_allbutself,
123 .send_IPI_mask = flat_send_IPI_mask, 159 .send_IPI_mask = flat_send_IPI_mask,
160 .send_IPI_self = apic_send_IPI_self,
124 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 161 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
125 .phys_pkg_id = phys_pkg_id, 162 .phys_pkg_id = phys_pkg_id,
163 .get_apic_id = get_apic_id,
164 .set_apic_id = set_apic_id,
165 .apic_id_mask = (0xFFu<<24),
126}; 166};
127 167
128/* 168/*
@@ -130,6 +170,21 @@ struct genapic apic_flat = {
130 * We cannot use logical delivery in this case because the mask 170 * We cannot use logical delivery in this case because the mask
131 * overflows, so use physical mode. 171 * overflows, so use physical mode.
132 */ 172 */
173static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
174{
175#ifdef CONFIG_ACPI
176 /*
177 * Quirk: some x86_64 machines can only use physical APIC mode
178 * regardless of how many processors are present (x86_64 ES7000
179 * is an example).
180 */
181 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
182 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
183 return 1;
184#endif
185
186 return 0;
187}
133 188
134static cpumask_t physflat_target_cpus(void) 189static cpumask_t physflat_target_cpus(void)
135{ 190{
@@ -176,6 +231,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
176 231
177struct genapic apic_physflat = { 232struct genapic apic_physflat = {
178 .name = "physical flat", 233 .name = "physical flat",
234 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
179 .int_delivery_mode = dest_Fixed, 235 .int_delivery_mode = dest_Fixed,
180 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 236 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
181 .target_cpus = physflat_target_cpus, 237 .target_cpus = physflat_target_cpus,
@@ -185,6 +241,10 @@ struct genapic apic_physflat = {
185 .send_IPI_all = physflat_send_IPI_all, 241 .send_IPI_all = physflat_send_IPI_all,
186 .send_IPI_allbutself = physflat_send_IPI_allbutself, 242 .send_IPI_allbutself = physflat_send_IPI_allbutself,
187 .send_IPI_mask = physflat_send_IPI_mask, 243 .send_IPI_mask = physflat_send_IPI_mask,
244 .send_IPI_self = apic_send_IPI_self,
188 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 245 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
189 .phys_pkg_id = phys_pkg_id, 246 .phys_pkg_id = phys_pkg_id,
247 .get_apic_id = get_apic_id,
248 .set_apic_id = set_apic_id,
249 .apic_id_mask = (0xFFu<<24),
190}; 250};
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
new file mode 100644
index 000000000000..e4bf2cc0d743
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -0,0 +1,159 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14
15static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{
17 if (cpu_has_x2apic)
18 return 1;
19
20 return 0;
21}
22
23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
24
25static cpumask_t x2apic_target_cpus(void)
26{
27 return cpumask_of_cpu(0);
28}
29
30/*
31 * For now, each logical CPU is in its own vector allocation domain.
32 */
33static cpumask_t x2apic_vector_allocation_domain(int cpu)
34{
35 cpumask_t domain = CPU_MASK_NONE;
36 cpu_set(cpu, domain);
37 return domain;
38}
39
40static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
41 unsigned int dest)
42{
43 unsigned long cfg;
44
45 cfg = __prepare_ICR(0, vector, dest);
46
47 /*
48 * send the IPI.
49 */
50 x2apic_icr_write(cfg, apicid);
51}
52
53/*
54 * For now, we send the IPIs one by one in the cpumask.
55 * TBD: based on the cpu mask, we could send the IPIs to the whole
56 * cluster group at once. We have 16 CPUs in a cluster. This would
57 * minimize IPI register writes.
58 */
59static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
60{
61 unsigned long flags;
62 unsigned long query_cpu;
63
64 local_irq_save(flags);
65 for_each_cpu_mask(query_cpu, mask) {
66 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
67 vector, APIC_DEST_LOGICAL);
68 }
69 local_irq_restore(flags);
70}
71
72static void x2apic_send_IPI_allbutself(int vector)
73{
74 cpumask_t mask = cpu_online_map;
75
76 cpu_clear(smp_processor_id(), mask);
77
78 if (!cpus_empty(mask))
79 x2apic_send_IPI_mask(mask, vector);
80}
81
82static void x2apic_send_IPI_all(int vector)
83{
84 x2apic_send_IPI_mask(cpu_online_map, vector);
85}
86
87static int x2apic_apic_id_registered(void)
88{
89 return 1;
90}
91
92static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
93{
94 int cpu;
95
96 /*
97 * We're using fixed IRQ delivery, can only return one phys APIC ID.
98 * May as well be the first.
99 */
100 cpu = first_cpu(cpumask);
101 if ((unsigned)cpu < NR_CPUS)
102 return per_cpu(x86_cpu_to_logical_apicid, cpu);
103 else
104 return BAD_APICID;
105}
106
107static unsigned int get_apic_id(unsigned long x)
108{
109 unsigned int id;
110
111 id = x;
112 return id;
113}
114
115static unsigned long set_apic_id(unsigned int id)
116{
117 unsigned long x;
118
119 x = id;
120 return x;
121}
122
123static unsigned int phys_pkg_id(int index_msb)
124{
125 return current_cpu_data.initial_apicid >> index_msb;
126}
127
128static void x2apic_send_IPI_self(int vector)
129{
130 apic_write(APIC_SELF_IPI, vector);
131}
132
133static void init_x2apic_ldr(void)
134{
135 int cpu = smp_processor_id();
136
137 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
138 return;
139}
140
141struct genapic apic_x2apic_cluster = {
142 .name = "cluster x2apic",
143 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
144 .int_delivery_mode = dest_LowestPrio,
145 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
146 .target_cpus = x2apic_target_cpus,
147 .vector_allocation_domain = x2apic_vector_allocation_domain,
148 .apic_id_registered = x2apic_apic_id_registered,
149 .init_apic_ldr = init_x2apic_ldr,
150 .send_IPI_all = x2apic_send_IPI_all,
151 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
152 .send_IPI_mask = x2apic_send_IPI_mask,
153 .send_IPI_self = x2apic_send_IPI_self,
154 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
155 .phys_pkg_id = phys_pkg_id,
156 .get_apic_id = get_apic_id,
157 .set_apic_id = set_apic_id,
158 .apic_id_mask = (0xFFFFFFFFu),
159};
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
new file mode 100644
index 000000000000..8f1343df2627
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -0,0 +1,154 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13static int x2apic_phys;
14
15static int set_x2apic_phys_mode(char *arg)
16{
17 x2apic_phys = 1;
18 return 0;
19}
20early_param("x2apic_phys", set_x2apic_phys_mode);
21
22static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
23{
24 if (cpu_has_x2apic && x2apic_phys)
25 return 1;
26
27 return 0;
28}
29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
31
32static cpumask_t x2apic_target_cpus(void)
33{
34 return cpumask_of_cpu(0);
35}
36
37static cpumask_t x2apic_vector_allocation_domain(int cpu)
38{
39 cpumask_t domain = CPU_MASK_NONE;
40 cpu_set(cpu, domain);
41 return domain;
42}
43
44static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
45 unsigned int dest)
46{
47 unsigned long cfg;
48
49 cfg = __prepare_ICR(0, vector, dest);
50
51 /*
52 * send the IPI.
53 */
54 x2apic_icr_write(cfg, apicid);
55}
56
57static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
58{
59 unsigned long flags;
60 unsigned long query_cpu;
61
62 local_irq_save(flags);
63 for_each_cpu_mask(query_cpu, mask) {
64 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
65 vector, APIC_DEST_PHYSICAL);
66 }
67 local_irq_restore(flags);
68}
69
70static void x2apic_send_IPI_allbutself(int vector)
71{
72 cpumask_t mask = cpu_online_map;
73
74 cpu_clear(smp_processor_id(), mask);
75
76 if (!cpus_empty(mask))
77 x2apic_send_IPI_mask(mask, vector);
78}
79
80static void x2apic_send_IPI_all(int vector)
81{
82 x2apic_send_IPI_mask(cpu_online_map, vector);
83}
84
85static int x2apic_apic_id_registered(void)
86{
87 return 1;
88}
89
90static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
91{
92 int cpu;
93
94 /*
95 * We're using fixed IRQ delivery, can only return one phys APIC ID.
96 * May as well be the first.
97 */
98 cpu = first_cpu(cpumask);
99 if ((unsigned)cpu < NR_CPUS)
100 return per_cpu(x86_cpu_to_apicid, cpu);
101 else
102 return BAD_APICID;
103}
104
105static unsigned int get_apic_id(unsigned long x)
106{
107 unsigned int id;
108
109 id = x;
110 return id;
111}
112
113static unsigned long set_apic_id(unsigned int id)
114{
115 unsigned long x;
116
117 x = id;
118 return x;
119}
120
121static unsigned int phys_pkg_id(int index_msb)
122{
123 return current_cpu_data.initial_apicid >> index_msb;
124}
125
126void x2apic_send_IPI_self(int vector)
127{
128 apic_write(APIC_SELF_IPI, vector);
129}
130
131void init_x2apic_ldr(void)
132{
133 return;
134}
135
136struct genapic apic_x2apic_phys = {
137 .name = "physical x2apic",
138 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
139 .int_delivery_mode = dest_Fixed,
140 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
141 .target_cpus = x2apic_target_cpus,
142 .vector_allocation_domain = x2apic_vector_allocation_domain,
143 .apic_id_registered = x2apic_apic_id_registered,
144 .init_apic_ldr = init_x2apic_ldr,
145 .send_IPI_all = x2apic_send_IPI_all,
146 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
147 .send_IPI_mask = x2apic_send_IPI_mask,
148 .send_IPI_self = x2apic_send_IPI_self,
149 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
150 .phys_pkg_id = phys_pkg_id,
151 .get_apic_id = get_apic_id,
152 .set_apic_id = set_apic_id,
153 .apic_id_mask = (0xFFFFFFFFu),
154};
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 2cfcbded888a..ae2ffc8a400c 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -12,12 +12,12 @@
12#include <linux/threads.h> 12#include <linux/threads.h>
13#include <linux/cpumask.h> 13#include <linux/cpumask.h>
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/sched.h> 17#include <linux/sched.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
20#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/hardirq.h>
21#include <asm/smp.h> 21#include <asm/smp.h>
22#include <asm/ipi.h> 22#include <asm/ipi.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
@@ -26,6 +26,36 @@
26#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27#include <asm/uv/bios.h> 27#include <asm/uv/bios.h>
28 28
29DEFINE_PER_CPU(int, x2apic_extra_bits);
30
31static enum uv_system_type uv_system_type;
32
33static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
34{
35 if (!strcmp(oem_id, "SGI")) {
36 if (!strcmp(oem_table_id, "UVL"))
37 uv_system_type = UV_LEGACY_APIC;
38 else if (!strcmp(oem_table_id, "UVX"))
39 uv_system_type = UV_X2APIC;
40 else if (!strcmp(oem_table_id, "UVH")) {
41 uv_system_type = UV_NON_UNIQUE_APIC;
42 return 1;
43 }
44 }
45 return 0;
46}
47
48enum uv_system_type get_uv_system_type(void)
49{
50 return uv_system_type;
51}
52
53int is_uv_system(void)
54{
55 return uv_system_type != UV_NONE;
56}
57EXPORT_SYMBOL_GPL(is_uv_system);
58
29DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 59DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
30EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 60EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
31 61
@@ -123,6 +153,10 @@ static int uv_apic_id_registered(void)
123 return 1; 153 return 1;
124} 154}
125 155
156static void uv_init_apic_ldr(void)
157{
158}
159
126static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) 160static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
127{ 161{
128 int cpu; 162 int cpu;
@@ -138,9 +172,34 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
138 return BAD_APICID; 172 return BAD_APICID;
139} 173}
140 174
175static unsigned int get_apic_id(unsigned long x)
176{
177 unsigned int id;
178
179 WARN_ON(preemptible() && num_online_cpus() > 1);
180 id = x | __get_cpu_var(x2apic_extra_bits);
181
182 return id;
183}
184
185static unsigned long set_apic_id(unsigned int id)
186{
187 unsigned long x;
188
189	/* mask out x2apic_extra_bits? */
190 x = id;
191 return x;
192}
193
194static unsigned int uv_read_apic_id(void)
195{
196
197 return get_apic_id(apic_read(APIC_ID));
198}
199
141static unsigned int phys_pkg_id(int index_msb) 200static unsigned int phys_pkg_id(int index_msb)
142{ 201{
143 return GET_APIC_ID(read_apic_id()) >> index_msb; 202 return uv_read_apic_id() >> index_msb;
144} 203}
145 204
146#ifdef ZZZ /* Needs x2apic patch */ 205#ifdef ZZZ /* Needs x2apic patch */
@@ -152,17 +211,22 @@ static void uv_send_IPI_self(int vector)
152 211
153struct genapic apic_x2apic_uv_x = { 212struct genapic apic_x2apic_uv_x = {
154 .name = "UV large system", 213 .name = "UV large system",
214 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
155 .int_delivery_mode = dest_Fixed, 215 .int_delivery_mode = dest_Fixed,
156 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 216 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
157 .target_cpus = uv_target_cpus, 217 .target_cpus = uv_target_cpus,
158 .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ 218 .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */
159 .apic_id_registered = uv_apic_id_registered, 219 .apic_id_registered = uv_apic_id_registered,
220 .init_apic_ldr = uv_init_apic_ldr,
160 .send_IPI_all = uv_send_IPI_all, 221 .send_IPI_all = uv_send_IPI_all,
161 .send_IPI_allbutself = uv_send_IPI_allbutself, 222 .send_IPI_allbutself = uv_send_IPI_allbutself,
162 .send_IPI_mask = uv_send_IPI_mask, 223 .send_IPI_mask = uv_send_IPI_mask,
163 /* ZZZ.send_IPI_self = uv_send_IPI_self, */ 224 /* ZZZ.send_IPI_self = uv_send_IPI_self, */
164 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 225 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
165 .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ 226 .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */
227 .get_apic_id = get_apic_id,
228 .set_apic_id = set_apic_id,
229 .apic_id_mask = (0xFFFFFFFFu),
166}; 230};
167 231
168static __cpuinit void set_x2apic_extra_bits(int pnode) 232static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -222,7 +286,7 @@ static __init void map_low_mmrs(void)
222 286
223enum map_type {map_wb, map_uc}; 287enum map_type {map_wb, map_uc};
224 288
225static void map_high(char *id, unsigned long base, int shift, enum map_type map_type) 289static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
226{ 290{
227 unsigned long bytes, paddr; 291 unsigned long bytes, paddr;
228 292
@@ -293,7 +357,9 @@ static __init void uv_rtc_init(void)
293 sn_rtc_cycles_per_second = ticks_per_sec; 357 sn_rtc_cycles_per_second = ticks_per_sec;
294} 358}
295 359
296static __init void uv_system_init(void) 360static bool uv_system_inited;
361
362void __init uv_system_init(void)
297{ 363{
298 union uvh_si_addr_map_config_u m_n_config; 364 union uvh_si_addr_map_config_u m_n_config;
299 union uvh_node_id_u node_id; 365 union uvh_node_id_u node_id;
@@ -383,6 +449,7 @@ static __init void uv_system_init(void)
383 map_mmr_high(max_pnode); 449 map_mmr_high(max_pnode);
384 map_config_high(max_pnode); 450 map_config_high(max_pnode);
385 map_mmioh_high(max_pnode); 451 map_mmioh_high(max_pnode);
452 uv_system_inited = true;
386} 453}
387 454
388/* 455/*
@@ -391,11 +458,12 @@ static __init void uv_system_init(void)
391 */ 458 */
392void __cpuinit uv_cpu_init(void) 459void __cpuinit uv_cpu_init(void)
393{ 460{
394 if (!uv_node_to_blade) 461 BUG_ON(!uv_system_inited);
395 uv_system_init();
396 462
397 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; 463 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
398 464
399 if (get_uv_system_type() == UV_NON_UNIQUE_APIC) 465 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
400 set_x2apic_extra_bits(uv_hub_info->pnode); 466 set_x2apic_extra_bits(uv_hub_info->pnode);
401} 467}
468
469
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 1b318e903bf6..d16084f90649 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -88,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
88 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); 88 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
89 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 89 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
90 (__START_KERNEL & PGDIR_MASK))); 90 (__START_KERNEL & PGDIR_MASK)));
91 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
91 92
92 /* clear bss before set_intr_gate with early_idt_handler */ 93 /* clear bss before set_intr_gate with early_idt_handler */
93 clear_bss(); 94 clear_bss();
@@ -107,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
107 } 108 }
108 load_idt((const struct desc_ptr *)&idt_descr); 109 load_idt((const struct desc_ptr *)&idt_descr);
109 110
110 early_printk("Kernel alive\n"); 111 if (console_loglevel == 10)
112 early_printk("Kernel alive\n");
111 113
112 x86_64_init_pda(); 114 x86_64_init_pda();
113 115
114 early_printk("Kernel really alive\n");
115
116 x86_64_start_reservations(real_mode_data); 116 x86_64_start_reservations(real_mode_data);
117} 117}
118 118
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f67e93441caf..e835b4eea70b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
172 * 172 *
173 * Note that the stack is not yet set up! 173 * Note that the stack is not yet set up!
174 */ 174 */
175#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
176#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
177#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
178
179default_entry: 175default_entry:
180#ifdef CONFIG_X86_PAE 176#ifdef CONFIG_X86_PAE
181 177
@@ -196,9 +192,9 @@ default_entry:
196 movl $pa(pg0), %edi 192 movl $pa(pg0), %edi
197 movl %edi, pa(init_pg_tables_start) 193 movl %edi, pa(init_pg_tables_start)
198 movl $pa(swapper_pg_pmd), %edx 194 movl $pa(swapper_pg_pmd), %edx
199 movl $PTE_ATTR, %eax 195 movl $PTE_IDENT_ATTR, %eax
20010: 19610:
201 leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ 197 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
202 movl %ecx,(%edx) /* Store PMD entry */ 198 movl %ecx,(%edx) /* Store PMD entry */
203 /* Upper half already zero */ 199 /* Upper half already zero */
204 addl $8,%edx 200 addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
215 * End condition: we must map up to and including INIT_MAP_BEYOND_END 211 * End condition: we must map up to and including INIT_MAP_BEYOND_END
216 * bytes beyond the end of our own page tables. 212 * bytes beyond the end of our own page tables.
217 */ 213 */
218 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 214 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
219 cmpl %ebp,%eax 215 cmpl %ebp,%eax
220 jb 10b 216 jb 10b
2211: 2171:
@@ -224,7 +220,7 @@ default_entry:
224 movl %eax, pa(max_pfn_mapped) 220 movl %eax, pa(max_pfn_mapped)
225 221
226 /* Do early initialization of the fixmap area */ 222 /* Do early initialization of the fixmap area */
227 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 223 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
228 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 224 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
229#else /* Not PAE */ 225#else /* Not PAE */
230 226
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
233 movl $pa(pg0), %edi 229 movl $pa(pg0), %edi
234 movl %edi, pa(init_pg_tables_start) 230 movl %edi, pa(init_pg_tables_start)
235 movl $pa(swapper_pg_dir), %edx 231 movl $pa(swapper_pg_dir), %edx
236 movl $PTE_ATTR, %eax 232 movl $PTE_IDENT_ATTR, %eax
23710: 23310:
238 leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ 234 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
239 movl %ecx,(%edx) /* Store identity PDE entry */ 235 movl %ecx,(%edx) /* Store identity PDE entry */
240 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ 236 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
241 addl $4,%edx 237 addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
249 * bytes beyond the end of our own page tables; the +0x007 is 245 * bytes beyond the end of our own page tables; the +0x007 is
250 * the attribute bits 246 * the attribute bits
251 */ 247 */
252 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 248 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
253 cmpl %ebp,%eax 249 cmpl %ebp,%eax
254 jb 10b 250 jb 10b
255 movl %edi,pa(init_pg_tables_end) 251 movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 253 movl %eax, pa(max_pfn_mapped)
258 254
259 /* Do early initialization of the fixmap area */ 255 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 256 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 257 movl %eax,pa(swapper_pg_dir+0xffc)
262#endif 258#endif
263 jmp 3f 259 jmp 3f
@@ -456,9 +452,6 @@ is386: movl $2,%ecx # set MP
4561: 4521:
457#endif /* CONFIG_SMP */ 453#endif /* CONFIG_SMP */
458 jmp *(initial_code) 454 jmp *(initial_code)
459.align 4
460ENTRY(initial_code)
461 .long i386_start_kernel
462 455
463/* 456/*
464 * We depend on ET to be correct. This checks for 287/387. 457 * We depend on ET to be correct. This checks for 287/387.
@@ -601,6 +594,11 @@ ignore_int:
601#endif 594#endif
602 iret 595 iret
603 596
597.section .cpuinit.data,"wa"
598.align 4
599ENTRY(initial_code)
600 .long i386_start_kernel
601
604.section .text 602.section .text
605/* 603/*
606 * Real beginning of normal "text" segment 604 * Real beginning of normal "text" segment
@@ -632,19 +630,19 @@ ENTRY(empty_zero_page)
632 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
633 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
634ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
635 .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ 633 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
636# if KPMDS == 3 634# if KPMDS == 3
637 .long pa(swapper_pg_pmd+PGD_ATTR),0 635 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
638 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 636 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
639 .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 637 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
640# elif KPMDS == 2 638# elif KPMDS == 2
641 .long 0,0 639 .long 0,0
642 .long pa(swapper_pg_pmd+PGD_ATTR),0 640 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
643 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 641 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
644# elif KPMDS == 1 642# elif KPMDS == 1
645 .long 0,0 643 .long 0,0
646 .long 0,0 644 .long 0,0
647 .long pa(swapper_pg_pmd+PGD_ATTR),0 645 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
648# else 646# else
649# error "Kernel PMDs should be 1, 2 or 3" 647# error "Kernel PMDs should be 1, 2 or 3"
650# endif 648# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index db3280afe886..26cfdc1d7c7f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
110 movq %rdi, %rax 110 movq %rdi, %rax
111 shrq $PMD_SHIFT, %rax 111 shrq $PMD_SHIFT, %rax
112 andq $(PTRS_PER_PMD - 1), %rax 112 andq $(PTRS_PER_PMD - 1), %rax
113 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx 113 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
114 leaq level2_spare_pgt(%rip), %rbx 114 leaq level2_spare_pgt(%rip), %rbx
115 movq %rdx, 0(%rbx, %rax, 8) 115 movq %rdx, 0(%rbx, %rax, 8)
116ident_complete: 116ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
374 /* Since I easily can, map the first 1G. 374 /* Since I easily can, map the first 1G.
375 * Don't set NX because code runs from these pages. 375 * Don't set NX because code runs from these pages.
376 */ 376 */
377 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) 377 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
378 378
379NEXT_PAGE(level2_kernel_pgt) 379NEXT_PAGE(level2_kernel_pgt)
380 /* 380 /*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad2b15a1334d..73deaffadd03 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -210,8 +210,8 @@ static void hpet_legacy_clockevent_register(void)
210 /* Calculate the min / max delta */ 210 /* Calculate the min / max delta */
211 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, 211 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
212 &hpet_clockevent); 212 &hpet_clockevent);
213 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, 213 /* 5 usec minimum reprogramming delta. */
214 &hpet_clockevent); 214 hpet_clockevent.min_delta_ns = 5000;
215 215
216 /* 216 /*
217 * Start hpet with the boot cpu mask and make it 217 * Start hpet with the boot cpu mask and make it
@@ -270,15 +270,22 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode,
270} 270}
271 271
272static int hpet_legacy_next_event(unsigned long delta, 272static int hpet_legacy_next_event(unsigned long delta,
273 struct clock_event_device *evt) 273 struct clock_event_device *evt)
274{ 274{
275 unsigned long cnt; 275 u32 cnt;
276 276
277 cnt = hpet_readl(HPET_COUNTER); 277 cnt = hpet_readl(HPET_COUNTER);
278 cnt += delta; 278 cnt += (u32) delta;
279 hpet_writel(cnt, HPET_T0_CMP); 279 hpet_writel(cnt, HPET_T0_CMP);
280 280
281 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; 281 /*
282 * We need to read back the CMP register to make sure that
283 * what we wrote hit the chip before we compare it to the
284 * counter.
285 */
286 WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt);
287
288 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
282} 289}
283 290
284/* 291/*
@@ -359,6 +366,7 @@ static int hpet_clocksource_register(void)
359int __init hpet_enable(void) 366int __init hpet_enable(void)
360{ 367{
361 unsigned long id; 368 unsigned long id;
369 int i;
362 370
363 if (!is_hpet_capable()) 371 if (!is_hpet_capable())
364 return 0; 372 return 0;
@@ -369,6 +377,29 @@ int __init hpet_enable(void)
369 * Read the period and check for a sane value: 377 * Read the period and check for a sane value:
370 */ 378 */
371 hpet_period = hpet_readl(HPET_PERIOD); 379 hpet_period = hpet_readl(HPET_PERIOD);
380
381 /*
382 * AMD SB700 based systems with spread spectrum enabled use a
383 * SMM based HPET emulation to provide proper frequency
384 * setting. The SMM code is initialized with the first HPET
385 * register access and takes some time to complete. During
386 * this time the config register reads 0xffffffff. We check
387 * for max. 1000 loops whether the config register reads a non
388 * 0xffffffff value to make sure that HPET is up and running
389 * before we go further. A counting loop is safe, as the HPET
390 * access takes thousands of CPU cycles. On non SB700 based
391 * machines this check is only done once and has no side
392 * effects.
393 */
394 for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
395 if (i == 1000) {
396 printk(KERN_WARNING
397 "HPET config register value = 0xFFFFFFFF. "
398 "Disabling HPET\n");
399 goto out_nohpet;
400 }
401 }
402
372 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) 403 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
373 goto out_nohpet; 404 goto out_nohpet;
374 405
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index eb9ddd8efb82..1f20608d4ca8 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -21,9 +21,12 @@
21# include <asm/sigcontext32.h> 21# include <asm/sigcontext32.h>
22# include <asm/user32.h> 22# include <asm/user32.h>
23#else 23#else
24# define save_i387_ia32 save_i387 24# define save_i387_xstate_ia32 save_i387_xstate
25# define restore_i387_ia32 restore_i387 25# define restore_i387_xstate_ia32 restore_i387_xstate
26# define _fpstate_ia32 _fpstate 26# define _fpstate_ia32 _fpstate
27# define _xstate_ia32 _xstate
28# define sig_xstate_ia32_size sig_xstate_size
29# define fx_sw_reserved_ia32 fx_sw_reserved
27# define user_i387_ia32_struct user_i387_struct 30# define user_i387_ia32_struct user_i387_struct
28# define user32_fxsr_struct user_fxsr_struct 31# define user32_fxsr_struct user_fxsr_struct
29#endif 32#endif
@@ -36,6 +39,7 @@
36 39
37static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; 40static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
38unsigned int xstate_size; 41unsigned int xstate_size;
42unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
39static struct i387_fxsave_struct fx_scratch __cpuinitdata; 43static struct i387_fxsave_struct fx_scratch __cpuinitdata;
40 44
41void __cpuinit mxcsr_feature_mask_init(void) 45void __cpuinit mxcsr_feature_mask_init(void)
@@ -61,6 +65,11 @@ void __init init_thread_xstate(void)
61 return; 65 return;
62 } 66 }
63 67
68 if (cpu_has_xsave) {
69 xsave_cntxt_init();
70 return;
71 }
72
64 if (cpu_has_fxsr) 73 if (cpu_has_fxsr)
65 xstate_size = sizeof(struct i387_fxsave_struct); 74 xstate_size = sizeof(struct i387_fxsave_struct);
66#ifdef CONFIG_X86_32 75#ifdef CONFIG_X86_32
@@ -83,9 +92,19 @@ void __cpuinit fpu_init(void)
83 92
84 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 93 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
85 94
95 /*
96	 * On the boot processor, set up the FP and extended state context info.
97 */
98 if (!smp_processor_id())
99 init_thread_xstate();
100 xsave_init();
101
86 mxcsr_feature_mask_init(); 102 mxcsr_feature_mask_init();
87 /* clean state in init */ 103 /* clean state in init */
88 current_thread_info()->status = 0; 104 if (cpu_has_xsave)
105 current_thread_info()->status = TS_XSAVE;
106 else
107 current_thread_info()->status = 0;
89 clear_used_math(); 108 clear_used_math();
90} 109}
91#endif /* CONFIG_X86_64 */ 110#endif /* CONFIG_X86_64 */
@@ -195,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
195 */ 214 */
196 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 215 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
197 216
217 /*
218 * update the header bits in the xsave header, indicating the
219 * presence of FP and SSE state.
220 */
221 if (cpu_has_xsave)
222 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
223
198 return ret; 224 return ret;
199} 225}
200 226
@@ -395,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
395 if (!ret) 421 if (!ret)
396 convert_to_fxsr(target, &env); 422 convert_to_fxsr(target, &env);
397 423
424 /*
425 * update the header bit in the xsave header, indicating the
426 * presence of FP.
427 */
428 if (cpu_has_xsave)
429 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
398 return ret; 430 return ret;
399} 431}
400 432
@@ -407,7 +439,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
407 struct task_struct *tsk = current; 439 struct task_struct *tsk = current;
408 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 440 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
409 441
410 unlazy_fpu(tsk);
411 fp->status = fp->swd; 442 fp->status = fp->swd;
412 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) 443 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
413 return -1; 444 return -1;
@@ -421,8 +452,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
421 struct user_i387_ia32_struct env; 452 struct user_i387_ia32_struct env;
422 int err = 0; 453 int err = 0;
423 454
424 unlazy_fpu(tsk);
425
426 convert_from_fxsr(&env, tsk); 455 convert_from_fxsr(&env, tsk);
427 if (__copy_to_user(buf, &env, sizeof(env))) 456 if (__copy_to_user(buf, &env, sizeof(env)))
428 return -1; 457 return -1;
@@ -432,16 +461,54 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
432 if (err) 461 if (err)
433 return -1; 462 return -1;
434 463
435 if (__copy_to_user(&buf->_fxsr_env[0], fx, 464 if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
436 sizeof(struct i387_fxsave_struct))) 465 return -1;
466 return 1;
467}
468
469static int save_i387_xsave(void __user *buf)
470{
471 struct task_struct *tsk = current;
472 struct _fpstate_ia32 __user *fx = buf;
473 int err = 0;
474
475 /*
476	 * For legacy compatibility, we always set the FP/SSE bits in the
477	 * bit vector while saving the state to the user context.
478	 * This lets us capture any changes (during sigreturn) made to
479	 * the FP/SSE bits by legacy applications which don't touch
480	 * xstate_bv in the xsave header.
481	 *
482	 * xsave-aware applications can change the xstate_bv in the xsave
483	 * header as well as any contents of the memory layout.
484	 * xrestore, as part of sigreturn, will capture all those changes.
485 */
486 tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
487
488 if (save_i387_fxsave(fx) < 0)
489 return -1;
490
491 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
492 sizeof(struct _fpx_sw_bytes));
493 err |= __put_user(FP_XSTATE_MAGIC2,
494 (__u32 __user *) (buf + sig_xstate_ia32_size
495 - FP_XSTATE_MAGIC2_SIZE));
496 if (err)
437 return -1; 497 return -1;
498
438 return 1; 499 return 1;
439} 500}
440 501
441int save_i387_ia32(struct _fpstate_ia32 __user *buf) 502int save_i387_xstate_ia32(void __user *buf)
442{ 503{
504 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
505 struct task_struct *tsk = current;
506
443 if (!used_math()) 507 if (!used_math())
444 return 0; 508 return 0;
509
510 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
511 return -EACCES;
445 /* 512 /*
446 * This will cause a "finit" to be triggered by the next 513 * This will cause a "finit" to be triggered by the next
447 * attempted FPU operation by the 'current' process. 514 * attempted FPU operation by the 'current' process.
@@ -451,13 +518,17 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf)
451 if (!HAVE_HWFP) { 518 if (!HAVE_HWFP) {
452 return fpregs_soft_get(current, NULL, 519 return fpregs_soft_get(current, NULL,
453 0, sizeof(struct user_i387_ia32_struct), 520 0, sizeof(struct user_i387_ia32_struct),
454 NULL, buf) ? -1 : 1; 521 NULL, fp) ? -1 : 1;
455 } 522 }
456 523
524 unlazy_fpu(tsk);
525
526 if (cpu_has_xsave)
527 return save_i387_xsave(fp);
457 if (cpu_has_fxsr) 528 if (cpu_has_fxsr)
458 return save_i387_fxsave(buf); 529 return save_i387_fxsave(fp);
459 else 530 else
460 return save_i387_fsave(buf); 531 return save_i387_fsave(fp);
461} 532}
462 533
463static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) 534static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
@@ -468,14 +539,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
468 sizeof(struct i387_fsave_struct)); 539 sizeof(struct i387_fsave_struct));
469} 540}
470 541
471static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) 542static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
543 unsigned int size)
472{ 544{
473 struct task_struct *tsk = current; 545 struct task_struct *tsk = current;
474 struct user_i387_ia32_struct env; 546 struct user_i387_ia32_struct env;
475 int err; 547 int err;
476 548
477 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], 549 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
478 sizeof(struct i387_fxsave_struct)); 550 size);
479 /* mxcsr reserved bits must be masked to zero for security reasons */ 551 /* mxcsr reserved bits must be masked to zero for security reasons */
480 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 552 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
481 if (err || __copy_from_user(&env, buf, sizeof(env))) 553 if (err || __copy_from_user(&env, buf, sizeof(env)))
@@ -485,14 +557,69 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
485 return 0; 557 return 0;
486} 558}
487 559
488int restore_i387_ia32(struct _fpstate_ia32 __user *buf) 560static int restore_i387_xsave(void __user *buf)
561{
562 struct _fpx_sw_bytes fx_sw_user;
563 struct _fpstate_ia32 __user *fx_user =
564 ((struct _fpstate_ia32 __user *) buf);
565 struct i387_fxsave_struct __user *fx =
566 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
567 struct xsave_hdr_struct *xsave_hdr =
568 &current->thread.xstate->xsave.xsave_hdr;
569 u64 mask;
570 int err;
571
572 if (check_for_xstate(fx, buf, &fx_sw_user))
573 goto fx_only;
574
575 mask = fx_sw_user.xstate_bv;
576
577 err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
578
579 xsave_hdr->xstate_bv &= pcntxt_mask;
580 /*
581 * These bits must be zero.
582 */
583 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
584
585 /*
586 * Init the state that is not present in the memory layout
587 * and enabled by the OS.
588 */
589 mask = ~(pcntxt_mask & ~mask);
590 xsave_hdr->xstate_bv &= mask;
591
592 return err;
593fx_only:
594 /*
595 * Couldn't find the extended state information in the memory
596 * layout. Restore the FP/SSE and init the other extended state
597 * enabled by the OS.
598 */
599 xsave_hdr->xstate_bv = XSTATE_FPSSE;
600 return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
601}
602
603int restore_i387_xstate_ia32(void __user *buf)
489{ 604{
490 int err; 605 int err;
491 struct task_struct *tsk = current; 606 struct task_struct *tsk = current;
607 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
492 608
493 if (HAVE_HWFP) 609 if (HAVE_HWFP)
494 clear_fpu(tsk); 610 clear_fpu(tsk);
495 611
612 if (!buf) {
613 if (used_math()) {
614 clear_fpu(tsk);
615 clear_used_math();
616 }
617
618 return 0;
619 } else
620 if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
621 return -EACCES;
622
496 if (!used_math()) { 623 if (!used_math()) {
497 err = init_fpu(tsk); 624 err = init_fpu(tsk);
498 if (err) 625 if (err)
@@ -500,14 +627,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
500 } 627 }
501 628
502 if (HAVE_HWFP) { 629 if (HAVE_HWFP) {
503 if (cpu_has_fxsr) 630 if (cpu_has_xsave)
504 err = restore_i387_fxsave(buf); 631 err = restore_i387_xsave(buf);
632 else if (cpu_has_fxsr)
633 err = restore_i387_fxsave(fp, sizeof(struct
634 i387_fxsave_struct));
505 else 635 else
506 err = restore_i387_fsave(buf); 636 err = restore_i387_fsave(fp);
507 } else { 637 } else {
508 err = fpregs_soft_set(current, NULL, 638 err = fpregs_soft_set(current, NULL,
509 0, sizeof(struct user_i387_ia32_struct), 639 0, sizeof(struct user_i387_ia32_struct),
510 NULL, buf) != 0; 640 NULL, fp) != 0;
511 } 641 }
512 set_used_math(); 642 set_used_math();
513 643
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index dc92b49d9204..4b8a53d841f7 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -282,6 +282,30 @@ static int __init i8259A_init_sysfs(void)
282 282
283device_initcall(i8259A_init_sysfs); 283device_initcall(i8259A_init_sysfs);
284 284
285void mask_8259A(void)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&i8259A_lock, flags);
290
291 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
292 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
293
294 spin_unlock_irqrestore(&i8259A_lock, flags);
295}
296
297void unmask_8259A(void)
298{
299 unsigned long flags;
300
301 spin_lock_irqsave(&i8259A_lock, flags);
302
303 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
304 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
305
306 spin_unlock_irqrestore(&i8259A_lock, flags);
307}
308
285void init_8259A(int auto_eoi) 309void init_8259A(int auto_eoi)
286{ 310{
287 unsigned long flags; 311 unsigned long flags;
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index de9aa0e3a9c5..e710289f673e 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -46,10 +46,13 @@
46#include <asm/nmi.h> 46#include <asm/nmi.h>
47#include <asm/msidef.h> 47#include <asm/msidef.h>
48#include <asm/hypertransport.h> 48#include <asm/hypertransport.h>
49#include <asm/setup.h>
49 50
50#include <mach_apic.h> 51#include <mach_apic.h>
51#include <mach_apicdef.h> 52#include <mach_apicdef.h>
52 53
54#define __apicdebuginit(type) static type __init
55
53int (*ioapic_renumber_irq)(int ioapic, int irq); 56int (*ioapic_renumber_irq)(int ioapic, int irq);
54atomic_t irq_mis_count; 57atomic_t irq_mis_count;
55 58
@@ -57,7 +60,7 @@ atomic_t irq_mis_count;
57static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; 60static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
58 61
59static DEFINE_SPINLOCK(ioapic_lock); 62static DEFINE_SPINLOCK(ioapic_lock);
60static DEFINE_SPINLOCK(vector_lock); 63DEFINE_SPINLOCK(vector_lock);
61 64
62int timer_through_8259 __initdata; 65int timer_through_8259 __initdata;
63 66
@@ -1209,10 +1212,6 @@ static int assign_irq_vector(int irq)
1209 return vector; 1212 return vector;
1210} 1213}
1211 1214
1212void setup_vector_irq(int cpu)
1213{
1214}
1215
1216static struct irq_chip ioapic_chip; 1215static struct irq_chip ioapic_chip;
1217 1216
1218#define IOAPIC_AUTO -1 1217#define IOAPIC_AUTO -1
@@ -1345,7 +1344,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1345 ioapic_write_entry(apic, pin, entry); 1344 ioapic_write_entry(apic, pin, entry);
1346} 1345}
1347 1346
1348void __init print_IO_APIC(void) 1347
1348__apicdebuginit(void) print_IO_APIC(void)
1349{ 1349{
1350 int apic, i; 1350 int apic, i;
1351 union IO_APIC_reg_00 reg_00; 1351 union IO_APIC_reg_00 reg_00;
@@ -1460,9 +1460,7 @@ void __init print_IO_APIC(void)
1460 return; 1460 return;
1461} 1461}
1462 1462
1463#if 0 1463__apicdebuginit(void) print_APIC_bitfield(int base)
1464
1465static void print_APIC_bitfield(int base)
1466{ 1464{
1467 unsigned int v; 1465 unsigned int v;
1468 int i, j; 1466 int i, j;
@@ -1483,9 +1481,10 @@ static void print_APIC_bitfield(int base)
1483 } 1481 }
1484} 1482}
1485 1483
1486void /*__init*/ print_local_APIC(void *dummy) 1484__apicdebuginit(void) print_local_APIC(void *dummy)
1487{ 1485{
1488 unsigned int v, ver, maxlvt; 1486 unsigned int v, ver, maxlvt;
1487 u64 icr;
1489 1488
1490 if (apic_verbosity == APIC_QUIET) 1489 if (apic_verbosity == APIC_QUIET)
1491 return; 1490 return;
@@ -1494,7 +1493,7 @@ void /*__init*/ print_local_APIC(void *dummy)
1494 smp_processor_id(), hard_smp_processor_id()); 1493 smp_processor_id(), hard_smp_processor_id());
1495 v = apic_read(APIC_ID); 1494 v = apic_read(APIC_ID);
1496 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, 1495 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
1497 GET_APIC_ID(read_apic_id())); 1496 GET_APIC_ID(v));
1498 v = apic_read(APIC_LVR); 1497 v = apic_read(APIC_LVR);
1499 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1498 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1500 ver = GET_APIC_VERSION(v); 1499 ver = GET_APIC_VERSION(v);
@@ -1536,10 +1535,9 @@ void /*__init*/ print_local_APIC(void *dummy)
1536 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1535 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1537 } 1536 }
1538 1537
1539 v = apic_read(APIC_ICR); 1538 icr = apic_icr_read();
1540 printk(KERN_DEBUG "... APIC ICR: %08x\n", v); 1539 printk(KERN_DEBUG "... APIC ICR: %08x\n", icr);
1541 v = apic_read(APIC_ICR2); 1540 printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32);
1542 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1543 1541
1544 v = apic_read(APIC_LVTT); 1542 v = apic_read(APIC_LVTT);
1545 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); 1543 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1567,12 +1565,12 @@ void /*__init*/ print_local_APIC(void *dummy)
1567 printk("\n"); 1565 printk("\n");
1568} 1566}
1569 1567
1570void print_all_local_APICs(void) 1568__apicdebuginit(void) print_all_local_APICs(void)
1571{ 1569{
1572 on_each_cpu(print_local_APIC, NULL, 1); 1570 on_each_cpu(print_local_APIC, NULL, 1);
1573} 1571}
1574 1572
1575void /*__init*/ print_PIC(void) 1573__apicdebuginit(void) print_PIC(void)
1576{ 1574{
1577 unsigned int v; 1575 unsigned int v;
1578 unsigned long flags; 1576 unsigned long flags;
@@ -1604,7 +1602,17 @@ void /*__init*/ print_PIC(void)
1604 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1602 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1605} 1603}
1606 1604
1607#endif /* 0 */ 1605__apicdebuginit(int) print_all_ICs(void)
1606{
1607 print_PIC();
1608 print_all_local_APICs();
1609 print_IO_APIC();
1610
1611 return 0;
1612}
1613
1614fs_initcall(print_all_ICs);
1615
1608 1616
1609static void __init enable_IO_APIC(void) 1617static void __init enable_IO_APIC(void)
1610{ 1618{
@@ -1702,8 +1710,7 @@ void disable_IO_APIC(void)
1702 entry.dest_mode = 0; /* Physical */ 1710 entry.dest_mode = 0; /* Physical */
1703 entry.delivery_mode = dest_ExtINT; /* ExtInt */ 1711 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1704 entry.vector = 0; 1712 entry.vector = 0;
1705 entry.dest.physical.physical_dest = 1713 entry.dest.physical.physical_dest = read_apic_id();
1706 GET_APIC_ID(read_apic_id());
1707 1714
1708 /* 1715 /*
1709 * Add it to the IO-APIC irq-routing table: 1716 * Add it to the IO-APIC irq-routing table:
@@ -1729,10 +1736,8 @@ static void __init setup_ioapic_ids_from_mpc(void)
1729 unsigned char old_id; 1736 unsigned char old_id;
1730 unsigned long flags; 1737 unsigned long flags;
1731 1738
1732#ifdef CONFIG_X86_NUMAQ 1739 if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
1733 if (found_numaq)
1734 return; 1740 return;
1735#endif
1736 1741
1737 /* 1742 /*
1738 * Don't check I/O APIC IDs for xAPIC systems. They have 1743 * Don't check I/O APIC IDs for xAPIC systems. They have
@@ -2333,8 +2338,6 @@ void __init setup_IO_APIC(void)
2333 setup_IO_APIC_irqs(); 2338 setup_IO_APIC_irqs();
2334 init_IO_APIC_traps(); 2339 init_IO_APIC_traps();
2335 check_timer(); 2340 check_timer();
2336 if (!acpi_ioapic)
2337 print_IO_APIC();
2338} 2341}
2339 2342
2340/* 2343/*
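
Illustrative sketch (not part of the patch): the __apicdebuginit() macro introduced above simply marks each debug printer as a file-local __init function, and fs_initcall(print_all_ICs) runs the whole set once during boot instead of calling print_IO_APIC() from setup_IO_APIC(). Below is a minimal, runnable userspace analogue of that pattern; the output strings and the main() stand-in for fs_initcall() are made up for the demo.

#include <stdio.h>

/* In the kernel, __init code is discarded after boot; here it is a no-op. */
#define __init
#define __apicdebuginit(type) static type __init

__apicdebuginit(void) print_PIC(void)             { puts("PIC state"); }
__apicdebuginit(void) print_all_local_APICs(void) { puts("local APIC state"); }
__apicdebuginit(void) print_IO_APIC(void)         { puts("IO-APIC state"); }

__apicdebuginit(int) print_all_ICs(void)
{
	print_PIC();
	print_all_local_APICs();
	print_IO_APIC();
	return 0;
}

/* stand-in for fs_initcall(print_all_ICs) */
int main(void)
{
	return print_all_ICs();
}
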
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 8269434d1707..02063ae042f7 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -37,6 +37,7 @@
37#include <acpi/acpi_bus.h> 37#include <acpi/acpi_bus.h>
38#endif 38#endif
39#include <linux/bootmem.h> 39#include <linux/bootmem.h>
40#include <linux/dmar.h>
40 41
41#include <asm/idle.h> 42#include <asm/idle.h>
42#include <asm/io.h> 43#include <asm/io.h>
@@ -49,10 +50,13 @@
49#include <asm/nmi.h> 50#include <asm/nmi.h>
50#include <asm/msidef.h> 51#include <asm/msidef.h>
51#include <asm/hypertransport.h> 52#include <asm/hypertransport.h>
53#include <asm/irq_remapping.h>
52 54
53#include <mach_ipi.h> 55#include <mach_ipi.h>
54#include <mach_apic.h> 56#include <mach_apic.h>
55 57
58#define __apicdebuginit(type) static type __init
59
56struct irq_cfg { 60struct irq_cfg {
57 cpumask_t domain; 61 cpumask_t domain;
58 cpumask_t old_domain; 62 cpumask_t old_domain;
@@ -87,8 +91,6 @@ int first_system_vector = 0xfe;
87 91
88char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; 92char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
89 93
90#define __apicdebuginit __init
91
92int sis_apic_bug; /* not actually supported, dummy for compile */ 94int sis_apic_bug; /* not actually supported, dummy for compile */
93 95
94static int no_timer_check; 96static int no_timer_check;
@@ -101,13 +103,16 @@ int timer_through_8259 __initdata;
101static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; 103static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
102 104
103static DEFINE_SPINLOCK(ioapic_lock); 105static DEFINE_SPINLOCK(ioapic_lock);
104DEFINE_SPINLOCK(vector_lock); 106static DEFINE_SPINLOCK(vector_lock);
105 107
106/* 108/*
107 * # of IRQ routing registers 109 * # of IRQ routing registers
108 */ 110 */
109int nr_ioapic_registers[MAX_IO_APICS]; 111int nr_ioapic_registers[MAX_IO_APICS];
110 112
113/* I/O APIC RTE contents at the OS boot up */
114struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
115
111/* I/O APIC entries */ 116/* I/O APIC entries */
112struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; 117struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
113int nr_ioapics; 118int nr_ioapics;
@@ -303,7 +308,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
303 pin = entry->pin; 308 pin = entry->pin;
304 if (pin == -1) 309 if (pin == -1)
305 break; 310 break;
306 io_apic_write(apic, 0x11 + pin*2, dest); 311 /*
312 * With interrupt-remapping, destination information comes
313 * from interrupt-remapping table entry.
314 */
315 if (!irq_remapped(irq))
316 io_apic_write(apic, 0x11 + pin*2, dest);
307 reg = io_apic_read(apic, 0x10 + pin*2); 317 reg = io_apic_read(apic, 0x10 + pin*2);
308 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 318 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
309 reg |= vector; 319 reg |= vector;
@@ -440,6 +450,69 @@ static void clear_IO_APIC (void)
440 clear_IO_APIC_pin(apic, pin); 450 clear_IO_APIC_pin(apic, pin);
441} 451}
442 452
453/*
454 * Saves and masks all the unmasked IO-APIC RTE's
455 */
456int save_mask_IO_APIC_setup(void)
457{
458 union IO_APIC_reg_01 reg_01;
459 unsigned long flags;
460 int apic, pin;
461
462 /*
463 * The number of IO-APIC IRQ registers (== #pins):
464 */
465 for (apic = 0; apic < nr_ioapics; apic++) {
466 spin_lock_irqsave(&ioapic_lock, flags);
467 reg_01.raw = io_apic_read(apic, 1);
468 spin_unlock_irqrestore(&ioapic_lock, flags);
469 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
470 }
471
472 for (apic = 0; apic < nr_ioapics; apic++) {
473 early_ioapic_entries[apic] =
474 kzalloc(sizeof(struct IO_APIC_route_entry) *
475 nr_ioapic_registers[apic], GFP_KERNEL);
476 if (!early_ioapic_entries[apic])
477 return -ENOMEM;
478 }
479
480 for (apic = 0; apic < nr_ioapics; apic++)
481 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
482 struct IO_APIC_route_entry entry;
483
484 entry = early_ioapic_entries[apic][pin] =
485 ioapic_read_entry(apic, pin);
486 if (!entry.mask) {
487 entry.mask = 1;
488 ioapic_write_entry(apic, pin, entry);
489 }
490 }
491 return 0;
492}
493
494void restore_IO_APIC_setup(void)
495{
496 int apic, pin;
497
498 for (apic = 0; apic < nr_ioapics; apic++)
499 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
500 ioapic_write_entry(apic, pin,
501 early_ioapic_entries[apic][pin]);
502}
503
504void reinit_intr_remapped_IO_APIC(int intr_remapping)
505{
506 /*
 507 * For now, plain restore of previous settings.
 508 * TBD: In the case of OS enabling interrupt-remapping,
 509 * IO-APIC RTE's need to be set up to point to interrupt-remapping
 510 * table entries. For now, do a plain restore, and wait for
511 * the setup_IO_APIC_irqs() to do proper initialization.
512 */
513 restore_IO_APIC_setup();
514}
515
443int skip_ioapic_setup; 516int skip_ioapic_setup;
444int ioapic_force; 517int ioapic_force;
445 518
@@ -697,6 +770,19 @@ static int pin_2_irq(int idx, int apic, int pin)
697 return irq; 770 return irq;
698} 771}
699 772
773void lock_vector_lock(void)
774{
 775 /* Used to ensure the online set of cpus does not change
776 * during assign_irq_vector.
777 */
778 spin_lock(&vector_lock);
779}
780
781void unlock_vector_lock(void)
782{
783 spin_unlock(&vector_lock);
784}
785
700static int __assign_irq_vector(int irq, cpumask_t mask) 786static int __assign_irq_vector(int irq, cpumask_t mask)
701{ 787{
702 /* 788 /*
@@ -802,7 +888,7 @@ static void __clear_irq_vector(int irq)
802 cpus_clear(cfg->domain); 888 cpus_clear(cfg->domain);
803} 889}
804 890
805static void __setup_vector_irq(int cpu) 891void __setup_vector_irq(int cpu)
806{ 892{
807 /* Initialize vector_irq on a new cpu */ 893 /* Initialize vector_irq on a new cpu */
808 /* This function must be called with vector_lock held */ 894 /* This function must be called with vector_lock held */
@@ -825,27 +911,99 @@ static void __setup_vector_irq(int cpu)
825 } 911 }
826} 912}
827 913
828void setup_vector_irq(int cpu)
829{
830 spin_lock(&vector_lock);
831 __setup_vector_irq(smp_processor_id());
832 spin_unlock(&vector_lock);
833}
834
835
836static struct irq_chip ioapic_chip; 914static struct irq_chip ioapic_chip;
915#ifdef CONFIG_INTR_REMAP
916static struct irq_chip ir_ioapic_chip;
917#endif
837 918
838static void ioapic_register_intr(int irq, unsigned long trigger) 919static void ioapic_register_intr(int irq, unsigned long trigger)
839{ 920{
840 if (trigger) { 921 if (trigger)
841 irq_desc[irq].status |= IRQ_LEVEL; 922 irq_desc[irq].status |= IRQ_LEVEL;
842 set_irq_chip_and_handler_name(irq, &ioapic_chip, 923 else
843 handle_fasteoi_irq, "fasteoi");
844 } else {
845 irq_desc[irq].status &= ~IRQ_LEVEL; 924 irq_desc[irq].status &= ~IRQ_LEVEL;
925
926#ifdef CONFIG_INTR_REMAP
927 if (irq_remapped(irq)) {
928 irq_desc[irq].status |= IRQ_MOVE_PCNTXT;
929 if (trigger)
930 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
931 handle_fasteoi_irq,
932 "fasteoi");
933 else
934 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
935 handle_edge_irq, "edge");
936 return;
937 }
938#endif
939 if (trigger)
940 set_irq_chip_and_handler_name(irq, &ioapic_chip,
941 handle_fasteoi_irq,
942 "fasteoi");
943 else
846 set_irq_chip_and_handler_name(irq, &ioapic_chip, 944 set_irq_chip_and_handler_name(irq, &ioapic_chip,
847 handle_edge_irq, "edge"); 945 handle_edge_irq, "edge");
946}
947
948static int setup_ioapic_entry(int apic, int irq,
949 struct IO_APIC_route_entry *entry,
950 unsigned int destination, int trigger,
951 int polarity, int vector)
952{
953 /*
954 * add it to the IO-APIC irq-routing table:
955 */
956 memset(entry,0,sizeof(*entry));
957
958#ifdef CONFIG_INTR_REMAP
959 if (intr_remapping_enabled) {
960 struct intel_iommu *iommu = map_ioapic_to_ir(apic);
961 struct irte irte;
962 struct IR_IO_APIC_route_entry *ir_entry =
963 (struct IR_IO_APIC_route_entry *) entry;
964 int index;
965
966 if (!iommu)
967 panic("No mapping iommu for ioapic %d\n", apic);
968
969 index = alloc_irte(iommu, irq, 1);
970 if (index < 0)
971 panic("Failed to allocate IRTE for ioapic %d\n", apic);
972
973 memset(&irte, 0, sizeof(irte));
974
975 irte.present = 1;
976 irte.dst_mode = INT_DEST_MODE;
977 irte.trigger_mode = trigger;
978 irte.dlvry_mode = INT_DELIVERY_MODE;
979 irte.vector = vector;
980 irte.dest_id = IRTE_DEST(destination);
981
982 modify_irte(irq, &irte);
983
984 ir_entry->index2 = (index >> 15) & 0x1;
985 ir_entry->zero = 0;
986 ir_entry->format = 1;
987 ir_entry->index = (index & 0x7fff);
988 } else
989#endif
990 {
991 entry->delivery_mode = INT_DELIVERY_MODE;
992 entry->dest_mode = INT_DEST_MODE;
993 entry->dest = destination;
848 } 994 }
995
996 entry->mask = 0; /* enable IRQ */
997 entry->trigger = trigger;
998 entry->polarity = polarity;
999 entry->vector = vector;
1000
1001 /* Mask level triggered irqs.
1002 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
1003 */
1004 if (trigger)
1005 entry->mask = 1;
1006 return 0;
849} 1007}
850 1008
851static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1009static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
@@ -870,24 +1028,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
870 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, 1028 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
871 irq, trigger, polarity); 1029 irq, trigger, polarity);
872 1030
873 /*
874 * add it to the IO-APIC irq-routing table:
875 */
876 memset(&entry,0,sizeof(entry));
877
878 entry.delivery_mode = INT_DELIVERY_MODE;
879 entry.dest_mode = INT_DEST_MODE;
880 entry.dest = cpu_mask_to_apicid(mask);
881 entry.mask = 0; /* enable IRQ */
882 entry.trigger = trigger;
883 entry.polarity = polarity;
884 entry.vector = cfg->vector;
885 1031
886 /* Mask level triggered irqs. 1032 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
887 * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 1033 cpu_mask_to_apicid(mask), trigger, polarity,
888 */ 1034 cfg->vector)) {
889 if (trigger) 1035 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
890 entry.mask = 1; 1036 mp_ioapics[apic].mp_apicid, pin);
1037 __clear_irq_vector(irq);
1038 return;
1039 }
891 1040
892 ioapic_register_intr(irq, trigger); 1041 ioapic_register_intr(irq, trigger);
893 if (irq < 16) 1042 if (irq < 16)
@@ -939,6 +1088,9 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
939{ 1088{
940 struct IO_APIC_route_entry entry; 1089 struct IO_APIC_route_entry entry;
941 1090
1091 if (intr_remapping_enabled)
1092 return;
1093
942 memset(&entry, 0, sizeof(entry)); 1094 memset(&entry, 0, sizeof(entry));
943 1095
944 /* 1096 /*
@@ -965,7 +1117,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
965 ioapic_write_entry(apic, pin, entry); 1117 ioapic_write_entry(apic, pin, entry);
966} 1118}
967 1119
968void __apicdebuginit print_IO_APIC(void) 1120
1121__apicdebuginit(void) print_IO_APIC(void)
969{ 1122{
970 int apic, i; 1123 int apic, i;
971 union IO_APIC_reg_00 reg_00; 1124 union IO_APIC_reg_00 reg_00;
@@ -1059,9 +1212,7 @@ void __apicdebuginit print_IO_APIC(void)
1059 return; 1212 return;
1060} 1213}
1061 1214
1062#if 0 1215__apicdebuginit(void) print_APIC_bitfield(int base)
1063
1064static __apicdebuginit void print_APIC_bitfield (int base)
1065{ 1216{
1066 unsigned int v; 1217 unsigned int v;
1067 int i, j; 1218 int i, j;
@@ -1082,9 +1233,10 @@ static __apicdebuginit void print_APIC_bitfield (int base)
1082 } 1233 }
1083} 1234}
1084 1235
1085void __apicdebuginit print_local_APIC(void * dummy) 1236__apicdebuginit(void) print_local_APIC(void *dummy)
1086{ 1237{
1087 unsigned int v, ver, maxlvt; 1238 unsigned int v, ver, maxlvt;
1239 unsigned long icr;
1088 1240
1089 if (apic_verbosity == APIC_QUIET) 1241 if (apic_verbosity == APIC_QUIET)
1090 return; 1242 return;
@@ -1092,7 +1244,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
1092 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1244 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1093 smp_processor_id(), hard_smp_processor_id()); 1245 smp_processor_id(), hard_smp_processor_id());
1094 v = apic_read(APIC_ID); 1246 v = apic_read(APIC_ID);
1095 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); 1247 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
1096 v = apic_read(APIC_LVR); 1248 v = apic_read(APIC_LVR);
1097 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1249 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1098 ver = GET_APIC_VERSION(v); 1250 ver = GET_APIC_VERSION(v);
@@ -1128,10 +1280,9 @@ void __apicdebuginit print_local_APIC(void * dummy)
1128 v = apic_read(APIC_ESR); 1280 v = apic_read(APIC_ESR);
1129 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1281 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1130 1282
1131 v = apic_read(APIC_ICR); 1283 icr = apic_icr_read();
1132 printk(KERN_DEBUG "... APIC ICR: %08x\n", v); 1284 printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
1133 v = apic_read(APIC_ICR2); 1285 printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
1134 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1135 1286
1136 v = apic_read(APIC_LVTT); 1287 v = apic_read(APIC_LVTT);
1137 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); 1288 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1159,12 +1310,12 @@ void __apicdebuginit print_local_APIC(void * dummy)
1159 printk("\n"); 1310 printk("\n");
1160} 1311}
1161 1312
1162void print_all_local_APICs (void) 1313__apicdebuginit(void) print_all_local_APICs(void)
1163{ 1314{
1164 on_each_cpu(print_local_APIC, NULL, 1); 1315 on_each_cpu(print_local_APIC, NULL, 1);
1165} 1316}
1166 1317
1167void __apicdebuginit print_PIC(void) 1318__apicdebuginit(void) print_PIC(void)
1168{ 1319{
1169 unsigned int v; 1320 unsigned int v;
1170 unsigned long flags; 1321 unsigned long flags;
@@ -1196,7 +1347,17 @@ void __apicdebuginit print_PIC(void)
1196 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1347 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1197} 1348}
1198 1349
1199#endif /* 0 */ 1350__apicdebuginit(int) print_all_ICs(void)
1351{
1352 print_PIC();
1353 print_all_local_APICs();
1354 print_IO_APIC();
1355
1356 return 0;
1357}
1358
1359fs_initcall(print_all_ICs);
1360
1200 1361
1201void __init enable_IO_APIC(void) 1362void __init enable_IO_APIC(void)
1202{ 1363{
@@ -1286,7 +1447,7 @@ void disable_IO_APIC(void)
1286 entry.dest_mode = 0; /* Physical */ 1447 entry.dest_mode = 0; /* Physical */
1287 entry.delivery_mode = dest_ExtINT; /* ExtInt */ 1448 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1288 entry.vector = 0; 1449 entry.vector = 0;
1289 entry.dest = GET_APIC_ID(read_apic_id()); 1450 entry.dest = read_apic_id();
1290 1451
1291 /* 1452 /*
1292 * Add it to the IO-APIC irq-routing table: 1453 * Add it to the IO-APIC irq-routing table:
@@ -1392,6 +1553,147 @@ static int ioapic_retrigger_irq(unsigned int irq)
1392 */ 1553 */
1393 1554
1394#ifdef CONFIG_SMP 1555#ifdef CONFIG_SMP
1556
1557#ifdef CONFIG_INTR_REMAP
1558static void ir_irq_migration(struct work_struct *work);
1559
1560static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
1561
1562/*
1563 * Migrate the IO-APIC irq in the presence of intr-remapping.
1564 *
1565 * For edge triggered, irq migration is a simple atomic update(of vector
1566 * and cpu destination) of IRTE and flush the hardware cache.
1567 *
 1568 * For level triggered, we need to modify the io-apic RTE as well with the updated
 1569 * vector information, along with modifying the IRTE with vector and destination.
 1570 * So irq migration for level triggered is a little bit more complex compared to
1571 * edge triggered migration. But the good news is, we use the same algorithm
1572 * for level triggered migration as we have today, only difference being,
1573 * we now initiate the irq migration from process context instead of the
1574 * interrupt context.
1575 *
1576 * In future, when we do a directed EOI (combined with cpu EOI broadcast
1577 * suppression) to the IO-APIC, level triggered irq migration will also be
1578 * as simple as edge triggered migration and we can do the irq migration
1579 * with a simple atomic update to IO-APIC RTE.
1580 */
1581static void migrate_ioapic_irq(int irq, cpumask_t mask)
1582{
1583 struct irq_cfg *cfg = irq_cfg + irq;
1584 struct irq_desc *desc = irq_desc + irq;
1585 cpumask_t tmp, cleanup_mask;
1586 struct irte irte;
1587 int modify_ioapic_rte = desc->status & IRQ_LEVEL;
1588 unsigned int dest;
1589 unsigned long flags;
1590
1591 cpus_and(tmp, mask, cpu_online_map);
1592 if (cpus_empty(tmp))
1593 return;
1594
1595 if (get_irte(irq, &irte))
1596 return;
1597
1598 if (assign_irq_vector(irq, mask))
1599 return;
1600
1601 cpus_and(tmp, cfg->domain, mask);
1602 dest = cpu_mask_to_apicid(tmp);
1603
1604 if (modify_ioapic_rte) {
1605 spin_lock_irqsave(&ioapic_lock, flags);
1606 __target_IO_APIC_irq(irq, dest, cfg->vector);
1607 spin_unlock_irqrestore(&ioapic_lock, flags);
1608 }
1609
1610 irte.vector = cfg->vector;
1611 irte.dest_id = IRTE_DEST(dest);
1612
1613 /*
 1614 * Modify the IRTE and flush the interrupt entry cache.
1615 */
1616 modify_irte(irq, &irte);
1617
1618 if (cfg->move_in_progress) {
1619 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
1620 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
1621 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
1622 cfg->move_in_progress = 0;
1623 }
1624
1625 irq_desc[irq].affinity = mask;
1626}
1627
1628static int migrate_irq_remapped_level(int irq)
1629{
1630 int ret = -1;
1631
1632 mask_IO_APIC_irq(irq);
1633
1634 if (io_apic_level_ack_pending(irq)) {
1635 /*
1636 * Interrupt in progress. Migrating irq now will change the
1637 * vector information in the IO-APIC RTE and that will confuse
1638 * the EOI broadcast performed by cpu.
1639 * So, delay the irq migration to the next instance.
1640 */
1641 schedule_delayed_work(&ir_migration_work, 1);
1642 goto unmask;
1643 }
1644
 1645 /* everything is clear, we have right of way */
1646 migrate_ioapic_irq(irq, irq_desc[irq].pending_mask);
1647
1648 ret = 0;
1649 irq_desc[irq].status &= ~IRQ_MOVE_PENDING;
1650 cpus_clear(irq_desc[irq].pending_mask);
1651
1652unmask:
1653 unmask_IO_APIC_irq(irq);
1654 return ret;
1655}
1656
1657static void ir_irq_migration(struct work_struct *work)
1658{
1659 int irq;
1660
1661 for (irq = 0; irq < NR_IRQS; irq++) {
1662 struct irq_desc *desc = irq_desc + irq;
1663 if (desc->status & IRQ_MOVE_PENDING) {
1664 unsigned long flags;
1665
1666 spin_lock_irqsave(&desc->lock, flags);
1667 if (!desc->chip->set_affinity ||
1668 !(desc->status & IRQ_MOVE_PENDING)) {
1669 desc->status &= ~IRQ_MOVE_PENDING;
1670 spin_unlock_irqrestore(&desc->lock, flags);
1671 continue;
1672 }
1673
1674 desc->chip->set_affinity(irq,
1675 irq_desc[irq].pending_mask);
1676 spin_unlock_irqrestore(&desc->lock, flags);
1677 }
1678 }
1679}
1680
1681/*
1682 * Migrates the IRQ destination in the process context.
1683 */
1684static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
1685{
1686 if (irq_desc[irq].status & IRQ_LEVEL) {
1687 irq_desc[irq].status |= IRQ_MOVE_PENDING;
1688 irq_desc[irq].pending_mask = mask;
1689 migrate_irq_remapped_level(irq);
1690 return;
1691 }
1692
1693 migrate_ioapic_irq(irq, mask);
1694}
1695#endif
1696
1395asmlinkage void smp_irq_move_cleanup_interrupt(void) 1697asmlinkage void smp_irq_move_cleanup_interrupt(void)
1396{ 1698{
1397 unsigned vector, me; 1699 unsigned vector, me;
@@ -1448,6 +1750,17 @@ static void irq_complete_move(unsigned int irq)
1448#else 1750#else
1449static inline void irq_complete_move(unsigned int irq) {} 1751static inline void irq_complete_move(unsigned int irq) {}
1450#endif 1752#endif
1753#ifdef CONFIG_INTR_REMAP
1754static void ack_x2apic_level(unsigned int irq)
1755{
1756 ack_x2APIC_irq();
1757}
1758
1759static void ack_x2apic_edge(unsigned int irq)
1760{
1761 ack_x2APIC_irq();
1762}
1763#endif
1451 1764
1452static void ack_apic_edge(unsigned int irq) 1765static void ack_apic_edge(unsigned int irq)
1453{ 1766{
@@ -1522,6 +1835,21 @@ static struct irq_chip ioapic_chip __read_mostly = {
1522 .retrigger = ioapic_retrigger_irq, 1835 .retrigger = ioapic_retrigger_irq,
1523}; 1836};
1524 1837
1838#ifdef CONFIG_INTR_REMAP
1839static struct irq_chip ir_ioapic_chip __read_mostly = {
1840 .name = "IR-IO-APIC",
1841 .startup = startup_ioapic_irq,
1842 .mask = mask_IO_APIC_irq,
1843 .unmask = unmask_IO_APIC_irq,
1844 .ack = ack_x2apic_edge,
1845 .eoi = ack_x2apic_level,
1846#ifdef CONFIG_SMP
1847 .set_affinity = set_ir_ioapic_affinity_irq,
1848#endif
1849 .retrigger = ioapic_retrigger_irq,
1850};
1851#endif
1852
1525static inline void init_IO_APIC_traps(void) 1853static inline void init_IO_APIC_traps(void)
1526{ 1854{
1527 int irq; 1855 int irq;
@@ -1707,6 +2035,8 @@ static inline void __init check_timer(void)
1707 * 8259A. 2035 * 8259A.
1708 */ 2036 */
1709 if (pin1 == -1) { 2037 if (pin1 == -1) {
2038 if (intr_remapping_enabled)
2039 panic("BIOS bug: timer not connected to IO-APIC");
1710 pin1 = pin2; 2040 pin1 = pin2;
1711 apic1 = apic2; 2041 apic1 = apic2;
1712 no_pin1 = 1; 2042 no_pin1 = 1;
@@ -1733,6 +2063,8 @@ static inline void __init check_timer(void)
1733 clear_IO_APIC_pin(0, pin1); 2063 clear_IO_APIC_pin(0, pin1);
1734 goto out; 2064 goto out;
1735 } 2065 }
2066 if (intr_remapping_enabled)
2067 panic("timer doesn't work through Interrupt-remapped IO-APIC");
1736 clear_IO_APIC_pin(apic1, pin1); 2068 clear_IO_APIC_pin(apic1, pin1);
1737 if (!no_pin1) 2069 if (!no_pin1)
1738 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " 2070 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -1849,8 +2181,6 @@ void __init setup_IO_APIC(void)
1849 setup_IO_APIC_irqs(); 2181 setup_IO_APIC_irqs();
1850 init_IO_APIC_traps(); 2182 init_IO_APIC_traps();
1851 check_timer(); 2183 check_timer();
1852 if (!acpi_ioapic)
1853 print_IO_APIC();
1854} 2184}
1855 2185
1856struct sysfs_ioapic_data { 2186struct sysfs_ioapic_data {
@@ -1972,6 +2302,9 @@ void destroy_irq(unsigned int irq)
1972 2302
1973 dynamic_irq_cleanup(irq); 2303 dynamic_irq_cleanup(irq);
1974 2304
2305#ifdef CONFIG_INTR_REMAP
2306 free_irte(irq);
2307#endif
1975 spin_lock_irqsave(&vector_lock, flags); 2308 spin_lock_irqsave(&vector_lock, flags);
1976 __clear_irq_vector(irq); 2309 __clear_irq_vector(irq);
1977 spin_unlock_irqrestore(&vector_lock, flags); 2310 spin_unlock_irqrestore(&vector_lock, flags);
@@ -1990,10 +2323,41 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
1990 2323
1991 tmp = TARGET_CPUS; 2324 tmp = TARGET_CPUS;
1992 err = assign_irq_vector(irq, tmp); 2325 err = assign_irq_vector(irq, tmp);
1993 if (!err) { 2326 if (err)
1994 cpus_and(tmp, cfg->domain, tmp); 2327 return err;
1995 dest = cpu_mask_to_apicid(tmp); 2328
2329 cpus_and(tmp, cfg->domain, tmp);
2330 dest = cpu_mask_to_apicid(tmp);
2331
2332#ifdef CONFIG_INTR_REMAP
2333 if (irq_remapped(irq)) {
2334 struct irte irte;
2335 int ir_index;
2336 u16 sub_handle;
2337
2338 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
2339 BUG_ON(ir_index == -1);
1996 2340
2341 memset (&irte, 0, sizeof(irte));
2342
2343 irte.present = 1;
2344 irte.dst_mode = INT_DEST_MODE;
2345 irte.trigger_mode = 0; /* edge */
2346 irte.dlvry_mode = INT_DELIVERY_MODE;
2347 irte.vector = cfg->vector;
2348 irte.dest_id = IRTE_DEST(dest);
2349
2350 modify_irte(irq, &irte);
2351
2352 msg->address_hi = MSI_ADDR_BASE_HI;
2353 msg->data = sub_handle;
2354 msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
2355 MSI_ADDR_IR_SHV |
2356 MSI_ADDR_IR_INDEX1(ir_index) |
2357 MSI_ADDR_IR_INDEX2(ir_index);
2358 } else
2359#endif
2360 {
1997 msg->address_hi = MSI_ADDR_BASE_HI; 2361 msg->address_hi = MSI_ADDR_BASE_HI;
1998 msg->address_lo = 2362 msg->address_lo =
1999 MSI_ADDR_BASE_LO | 2363 MSI_ADDR_BASE_LO |
@@ -2044,6 +2408,55 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2044 write_msi_msg(irq, &msg); 2408 write_msi_msg(irq, &msg);
2045 irq_desc[irq].affinity = mask; 2409 irq_desc[irq].affinity = mask;
2046} 2410}
2411
2412#ifdef CONFIG_INTR_REMAP
2413/*
2414 * Migrate the MSI irq to another cpumask. This migration is
2415 * done in the process context using interrupt-remapping hardware.
2416 */
2417static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2418{
2419 struct irq_cfg *cfg = irq_cfg + irq;
2420 unsigned int dest;
2421 cpumask_t tmp, cleanup_mask;
2422 struct irte irte;
2423
2424 cpus_and(tmp, mask, cpu_online_map);
2425 if (cpus_empty(tmp))
2426 return;
2427
2428 if (get_irte(irq, &irte))
2429 return;
2430
2431 if (assign_irq_vector(irq, mask))
2432 return;
2433
2434 cpus_and(tmp, cfg->domain, mask);
2435 dest = cpu_mask_to_apicid(tmp);
2436
2437 irte.vector = cfg->vector;
2438 irte.dest_id = IRTE_DEST(dest);
2439
2440 /*
2441 * atomically update the IRTE with the new destination and vector.
2442 */
2443 modify_irte(irq, &irte);
2444
2445 /*
2446 * After this point, all the interrupts will start arriving
 2447 * at the new destination. So, time to clean up the previous
2448 * vector allocation.
2449 */
2450 if (cfg->move_in_progress) {
2451 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
2452 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
2453 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2454 cfg->move_in_progress = 0;
2455 }
2456
2457 irq_desc[irq].affinity = mask;
2458}
2459#endif
2047#endif /* CONFIG_SMP */ 2460#endif /* CONFIG_SMP */
2048 2461
2049/* 2462/*
@@ -2061,26 +2474,157 @@ static struct irq_chip msi_chip = {
2061 .retrigger = ioapic_retrigger_irq, 2474 .retrigger = ioapic_retrigger_irq,
2062}; 2475};
2063 2476
2064int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) 2477#ifdef CONFIG_INTR_REMAP
2478static struct irq_chip msi_ir_chip = {
2479 .name = "IR-PCI-MSI",
2480 .unmask = unmask_msi_irq,
2481 .mask = mask_msi_irq,
2482 .ack = ack_x2apic_edge,
2483#ifdef CONFIG_SMP
2484 .set_affinity = ir_set_msi_irq_affinity,
2485#endif
2486 .retrigger = ioapic_retrigger_irq,
2487};
2488
2489/*
2490 * Map the PCI dev to the corresponding remapping hardware unit
2491 * and allocate 'nvec' consecutive interrupt-remapping table entries
2492 * in it.
2493 */
2494static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
2065{ 2495{
2496 struct intel_iommu *iommu;
2497 int index;
2498
2499 iommu = map_dev_to_ir(dev);
2500 if (!iommu) {
2501 printk(KERN_ERR
2502 "Unable to map PCI %s to iommu\n", pci_name(dev));
2503 return -ENOENT;
2504 }
2505
2506 index = alloc_irte(iommu, irq, nvec);
2507 if (index < 0) {
2508 printk(KERN_ERR
2509 "Unable to allocate %d IRTE for PCI %s\n", nvec,
2510 pci_name(dev));
2511 return -ENOSPC;
2512 }
2513 return index;
2514}
2515#endif
2516
2517static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
2518{
2519 int ret;
2066 struct msi_msg msg; 2520 struct msi_msg msg;
2521
2522 ret = msi_compose_msg(dev, irq, &msg);
2523 if (ret < 0)
2524 return ret;
2525
2526 set_irq_msi(irq, desc);
2527 write_msi_msg(irq, &msg);
2528
2529#ifdef CONFIG_INTR_REMAP
2530 if (irq_remapped(irq)) {
2531 struct irq_desc *desc = irq_desc + irq;
2532 /*
2533 * irq migration in process context
2534 */
2535 desc->status |= IRQ_MOVE_PCNTXT;
2536 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
2537 } else
2538#endif
2539 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
2540
2541 return 0;
2542}
2543
2544int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2545{
2067 int irq, ret; 2546 int irq, ret;
2547
2068 irq = create_irq(); 2548 irq = create_irq();
2069 if (irq < 0) 2549 if (irq < 0)
2070 return irq; 2550 return irq;
2071 2551
2072 ret = msi_compose_msg(dev, irq, &msg); 2552#ifdef CONFIG_INTR_REMAP
2553 if (!intr_remapping_enabled)
2554 goto no_ir;
2555
2556 ret = msi_alloc_irte(dev, irq, 1);
2557 if (ret < 0)
2558 goto error;
2559no_ir:
2560#endif
2561 ret = setup_msi_irq(dev, desc, irq);
2073 if (ret < 0) { 2562 if (ret < 0) {
2074 destroy_irq(irq); 2563 destroy_irq(irq);
2075 return ret; 2564 return ret;
2076 } 2565 }
2566 return 0;
2077 2567
2078 set_irq_msi(irq, desc); 2568#ifdef CONFIG_INTR_REMAP
2079 write_msi_msg(irq, &msg); 2569error:
2570 destroy_irq(irq);
2571 return ret;
2572#endif
2573}
2574
2575int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
2576{
2577 int irq, ret, sub_handle;
2578 struct msi_desc *desc;
2579#ifdef CONFIG_INTR_REMAP
2580 struct intel_iommu *iommu = 0;
2581 int index = 0;
2582#endif
2080 2583
2081 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 2584 sub_handle = 0;
2585 list_for_each_entry(desc, &dev->msi_list, list) {
2586 irq = create_irq();
2587 if (irq < 0)
2588 return irq;
2589#ifdef CONFIG_INTR_REMAP
2590 if (!intr_remapping_enabled)
2591 goto no_ir;
2082 2592
2593 if (!sub_handle) {
2594 /*
2595 * allocate the consecutive block of IRTE's
2596 * for 'nvec'
2597 */
2598 index = msi_alloc_irte(dev, irq, nvec);
2599 if (index < 0) {
2600 ret = index;
2601 goto error;
2602 }
2603 } else {
2604 iommu = map_dev_to_ir(dev);
2605 if (!iommu) {
2606 ret = -ENOENT;
2607 goto error;
2608 }
2609 /*
 2610 * set up the mapping between the irq and the IRTE
 2611 * base index, with the sub_handle pointing to the
2612 * appropriate interrupt remap table entry.
2613 */
2614 set_irte_irq(irq, iommu, index, sub_handle);
2615 }
2616no_ir:
2617#endif
2618 ret = setup_msi_irq(dev, desc, irq);
2619 if (ret < 0)
2620 goto error;
2621 sub_handle++;
2622 }
2083 return 0; 2623 return 0;
2624
2625error:
2626 destroy_irq(irq);
2627 return ret;
2084} 2628}
2085 2629
2086void arch_teardown_msi_irq(unsigned int irq) 2630void arch_teardown_msi_irq(unsigned int irq)
@@ -2328,6 +2872,10 @@ void __init setup_ioapic_dest(void)
2328 setup_IO_APIC_irq(ioapic, pin, irq, 2872 setup_IO_APIC_irq(ioapic, pin, irq,
2329 irq_trigger(irq_entry), 2873 irq_trigger(irq_entry),
2330 irq_polarity(irq_entry)); 2874 irq_polarity(irq_entry));
2875#ifdef CONFIG_INTR_REMAP
2876 else if (intr_remapping_enabled)
2877 set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
2878#endif
2331 else 2879 else
2332 set_ioapic_affinity_irq(irq, TARGET_CPUS); 2880 set_ioapic_affinity_irq(irq, TARGET_CPUS);
2333 } 2881 }
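
Illustrative sketch (not part of the patch): the new save_mask_IO_APIC_setup(), restore_IO_APIC_setup() and reinit_intr_remapped_IO_APIC() helpers above are meant to bracket the attempt to enable interrupt remapping — save and mask the RTEs first, restore them if enabling fails, and re-initialize them once remapping is on. The real call site lives in the interrupt-remapping enable path; the runnable userspace mock below replaces the kernel functions with trivial stubs, and enable_intr_remapping_hw() is an assumed stand-in, purely to show the intended ordering.

#include <stdio.h>

static int  save_mask_IO_APIC_setup(void)        { puts("save + mask unmasked RTEs"); return 0; }
static void restore_IO_APIC_setup(void)          { puts("restore saved RTEs"); }
static void reinit_intr_remapped_IO_APIC(int on) { (void)on; puts("re-init RTEs with remapping on"); }
static int  enable_intr_remapping_hw(void)       { puts("enable remapping hw (assumed)"); return 0; }

int main(void)
{
	if (save_mask_IO_APIC_setup())
		return 1;

	if (enable_intr_remapping_hw()) {
		/* enabling failed: put the IO-APIC back exactly as found */
		restore_IO_APIC_setup();
		return 1;
	}

	/* remapping is on; for now this too is a plain restore */
	reinit_intr_remapped_IO_APIC(1);
	return 0;
}
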
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 1c3a66a67f83..720d2607aacb 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
92 DMI_MATCH(DMI_BOARD_NAME, "30BF") 92 DMI_MATCH(DMI_BOARD_NAME, "30BF")
93 } 93 }
94 }, 94 },
95 {
96 .callback = dmi_io_delay_0xed_port,
97 .ident = "Presario F700",
98 .matches = {
99 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
100 DMI_MATCH(DMI_BOARD_NAME, "30D3")
101 }
102 },
95 { } 103 { }
96}; 104};
97 105
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 50e5e4a31c85..191914302744 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,6 +14,7 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <asm/syscalls.h>
17 18
18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ 19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
19static void set_bitmap(unsigned long *bitmap, unsigned int base, 20static void set_bitmap(unsigned long *bitmap, unsigned int base,
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 3f7537b669d3..f1c688e46f35 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -20,6 +20,8 @@
20 20
21#ifdef CONFIG_X86_32 21#ifdef CONFIG_X86_32
22#include <mach_apic.h> 22#include <mach_apic.h>
23#include <mach_ipi.h>
24
23/* 25/*
24 * the following functions deal with sending IPIs between CPUs. 26 * the following functions deal with sending IPIs between CPUs.
25 * 27 *
@@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
147} 149}
148 150
149/* must come after the send_IPI functions above for inlining */ 151/* must come after the send_IPI functions above for inlining */
150#include <mach_ipi.h>
151static int convert_apicid_to_cpu(int apic_id) 152static int convert_apicid_to_cpu(int apic_id)
152{ 153{
153 int i; 154 int i;
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 1cf8c1fcc088..b71e02d42f4f 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -325,7 +325,7 @@ skip:
325 for_each_online_cpu(j) 325 for_each_online_cpu(j)
326 seq_printf(p, "%10u ", 326 seq_printf(p, "%10u ",
327 per_cpu(irq_stat,j).irq_call_count); 327 per_cpu(irq_stat,j).irq_call_count);
328 seq_printf(p, " function call interrupts\n"); 328 seq_printf(p, " Function call interrupts\n");
329 seq_printf(p, "TLB: "); 329 seq_printf(p, "TLB: ");
330 for_each_online_cpu(j) 330 for_each_online_cpu(j)
331 seq_printf(p, "%10u ", 331 seq_printf(p, "%10u ",
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1f78b238d8d2..f065fe9071b9 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -129,7 +129,7 @@ skip:
129 seq_printf(p, "CAL: "); 129 seq_printf(p, "CAL: ");
130 for_each_online_cpu(j) 130 for_each_online_cpu(j)
131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); 131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
132 seq_printf(p, " function call interrupts\n"); 132 seq_printf(p, " Function call interrupts\n");
133 seq_printf(p, "TLB: "); 133 seq_printf(p, "TLB: ");
134 for_each_online_cpu(j) 134 for_each_online_cpu(j)
135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); 135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index d66914287ee1..9200a1e2752d 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -74,6 +74,15 @@ void __init init_ISA_irqs (void)
74 } 74 }
75} 75}
76 76
77/*
78 * IRQ2 is cascade interrupt to second interrupt controller
79 */
80static struct irqaction irq2 = {
81 .handler = no_action,
82 .mask = CPU_MASK_NONE,
83 .name = "cascade",
84};
85
77/* Overridden in paravirt.c */ 86/* Overridden in paravirt.c */
78void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 87void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
79 88
@@ -98,6 +107,46 @@ void __init native_init_IRQ(void)
98 set_intr_gate(vector, interrupt[i]); 107 set_intr_gate(vector, interrupt[i]);
99 } 108 }
100 109
110#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
111 /*
112 * IRQ0 must be given a fixed assignment and initialized,
113 * because it's used before the IO-APIC is set up.
114 */
115 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
116
117 /*
118 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
119 * IPI, driven by wakeup.
120 */
121 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
122
123 /* IPI for invalidation */
124 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
125
126 /* IPI for generic function call */
127 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
128
129 /* IPI for single call function */
130 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
131#endif
132
133#ifdef CONFIG_X86_LOCAL_APIC
134 /* self generated IPI for local APIC timer */
135 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
136
137 /* IPI vectors for APIC spurious and error interrupts */
138 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
139 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
140#endif
141
142#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
143 /* thermal monitor LVT interrupt */
144 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
145#endif
146
147 if (!acpi_ioapic)
148 setup_irq(2, &irq2);
149
101 /* setup after call gates are initialised (usually add in 150 /* setup after call gates are initialised (usually add in
102 * the architecture specific gates) 151 * the architecture specific gates)
103 */ 152 */
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb21335..304d8bad6559 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
16static u32 *flush_words; 16static u32 *flush_words;
17 17
18struct pci_device_id k8_nb_ids[] = { 18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
21 {} 22 {}
22}; 23};
23EXPORT_SYMBOL(k8_nb_ids); 24EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index f2d43bc75514..ff7d3b0124f1 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -139,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
139 if (PageHighMem(pg)) { 139 if (PageHighMem(pg)) {
140 data = ioremap_cache(pa_data, sizeof(*data)); 140 data = ioremap_cache(pa_data, sizeof(*data));
141 if (!data) { 141 if (!data) {
142 kfree(node);
142 error = -ENXIO; 143 error = -ENXIO;
143 goto err_dir; 144 goto err_dir;
144 } 145 }
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b8..10435a120d22 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
69 */ 69 */
70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) 70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
71{ 71{
72#ifndef CONFIG_X86_32
73 u32 *gdb_regs32 = (u32 *)gdb_regs;
74#endif
72 gdb_regs[GDB_AX] = regs->ax; 75 gdb_regs[GDB_AX] = regs->ax;
73 gdb_regs[GDB_BX] = regs->bx; 76 gdb_regs[GDB_BX] = regs->bx;
74 gdb_regs[GDB_CX] = regs->cx; 77 gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
76 gdb_regs[GDB_SI] = regs->si; 79 gdb_regs[GDB_SI] = regs->si;
77 gdb_regs[GDB_DI] = regs->di; 80 gdb_regs[GDB_DI] = regs->di;
78 gdb_regs[GDB_BP] = regs->bp; 81 gdb_regs[GDB_BP] = regs->bp;
79 gdb_regs[GDB_PS] = regs->flags;
80 gdb_regs[GDB_PC] = regs->ip; 82 gdb_regs[GDB_PC] = regs->ip;
81#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
84 gdb_regs[GDB_PS] = regs->flags;
82 gdb_regs[GDB_DS] = regs->ds; 85 gdb_regs[GDB_DS] = regs->ds;
83 gdb_regs[GDB_ES] = regs->es; 86 gdb_regs[GDB_ES] = regs->es;
84 gdb_regs[GDB_CS] = regs->cs; 87 gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
94 gdb_regs[GDB_R13] = regs->r13; 97 gdb_regs[GDB_R13] = regs->r13;
95 gdb_regs[GDB_R14] = regs->r14; 98 gdb_regs[GDB_R14] = regs->r14;
96 gdb_regs[GDB_R15] = regs->r15; 99 gdb_regs[GDB_R15] = regs->r15;
100 gdb_regs32[GDB_PS] = regs->flags;
101 gdb_regs32[GDB_CS] = regs->cs;
102 gdb_regs32[GDB_SS] = regs->ss;
97#endif 103#endif
98 gdb_regs[GDB_SP] = regs->sp; 104 gdb_regs[GDB_SP] = regs->sp;
99} 105}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
112 */ 118 */
113void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) 119void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
114{ 120{
121#ifndef CONFIG_X86_32
122 u32 *gdb_regs32 = (u32 *)gdb_regs;
123#endif
115 gdb_regs[GDB_AX] = 0; 124 gdb_regs[GDB_AX] = 0;
116 gdb_regs[GDB_BX] = 0; 125 gdb_regs[GDB_BX] = 0;
117 gdb_regs[GDB_CX] = 0; 126 gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
129 gdb_regs[GDB_FS] = 0xFFFF; 138 gdb_regs[GDB_FS] = 0xFFFF;
130 gdb_regs[GDB_GS] = 0xFFFF; 139 gdb_regs[GDB_GS] = 0xFFFF;
131#else 140#else
132 gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 141 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
133 gdb_regs[GDB_PC] = 0; 142 gdb_regs32[GDB_CS] = __KERNEL_CS;
143 gdb_regs32[GDB_SS] = __KERNEL_DS;
144 gdb_regs[GDB_PC] = p->thread.ip;
134 gdb_regs[GDB_R8] = 0; 145 gdb_regs[GDB_R8] = 0;
135 gdb_regs[GDB_R9] = 0; 146 gdb_regs[GDB_R9] = 0;
136 gdb_regs[GDB_R10] = 0; 147 gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
153 */ 164 */
154void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) 165void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
155{ 166{
167#ifndef CONFIG_X86_32
168 u32 *gdb_regs32 = (u32 *)gdb_regs;
169#endif
156 regs->ax = gdb_regs[GDB_AX]; 170 regs->ax = gdb_regs[GDB_AX];
157 regs->bx = gdb_regs[GDB_BX]; 171 regs->bx = gdb_regs[GDB_BX];
158 regs->cx = gdb_regs[GDB_CX]; 172 regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
160 regs->si = gdb_regs[GDB_SI]; 174 regs->si = gdb_regs[GDB_SI];
161 regs->di = gdb_regs[GDB_DI]; 175 regs->di = gdb_regs[GDB_DI];
162 regs->bp = gdb_regs[GDB_BP]; 176 regs->bp = gdb_regs[GDB_BP];
163 regs->flags = gdb_regs[GDB_PS];
164 regs->ip = gdb_regs[GDB_PC]; 177 regs->ip = gdb_regs[GDB_PC];
165#ifdef CONFIG_X86_32 178#ifdef CONFIG_X86_32
179 regs->flags = gdb_regs[GDB_PS];
166 regs->ds = gdb_regs[GDB_DS]; 180 regs->ds = gdb_regs[GDB_DS];
167 regs->es = gdb_regs[GDB_ES]; 181 regs->es = gdb_regs[GDB_ES];
168 regs->cs = gdb_regs[GDB_CS]; 182 regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
175 regs->r13 = gdb_regs[GDB_R13]; 189 regs->r13 = gdb_regs[GDB_R13];
176 regs->r14 = gdb_regs[GDB_R14]; 190 regs->r14 = gdb_regs[GDB_R14];
177 regs->r15 = gdb_regs[GDB_R15]; 191 regs->r15 = gdb_regs[GDB_R15];
192 regs->flags = gdb_regs32[GDB_PS];
193 regs->cs = gdb_regs32[GDB_CS];
194 regs->ss = gdb_regs32[GDB_SS];
178#endif 195#endif
179} 196}
180 197
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 if (remcomInBuffer[0] == 's') { 395 if (remcomInBuffer[0] == 's') {
379 linux_regs->flags |= X86_EFLAGS_TF; 396 linux_regs->flags |= X86_EFLAGS_TF;
380 kgdb_single_step = 1; 397 kgdb_single_step = 1;
381 if (kgdb_contthread) { 398 atomic_set(&kgdb_cpu_doing_single_step,
382 atomic_set(&kgdb_cpu_doing_single_step, 399 raw_smp_processor_id());
383 raw_smp_processor_id());
384 }
385 } 400 }
386 401
387 get_debugreg(dr6, 6); 402 get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
440 return NOTIFY_DONE; 455 return NOTIFY_DONE;
441 456
442 case DIE_NMI_IPI: 457 case DIE_NMI_IPI:
443 if (atomic_read(&kgdb_active) != -1) { 458 /* Just ignore, we will handle the roundup on DIE_NMI. */
444 /* KGDB CPU roundup */
445 kgdb_nmicallback(raw_smp_processor_id(), regs);
446 was_in_debug_nmi[raw_smp_processor_id()] = 1;
447 touch_nmi_watchdog();
448 }
449 return NOTIFY_DONE; 459 return NOTIFY_DONE;
450 460
451 case DIE_NMIUNKNOWN: 461 case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
466 476
467 case DIE_DEBUG: 477 case DIE_DEBUG:
468 if (atomic_read(&kgdb_cpu_doing_single_step) == 478 if (atomic_read(&kgdb_cpu_doing_single_step) ==
469 raw_smp_processor_id() && 479 raw_smp_processor_id()) {
470 user_mode(regs)) 480 if (user_mode(regs))
471 return single_step_cont(regs, args); 481 return single_step_cont(regs, args);
482 break;
483 } else if (test_thread_flag(TIF_SINGLESTEP))
484 /* This means a user thread is single stepping
485 * a system call which should be ignored
486 */
487 return NOTIFY_DONE;
472 /* fall through */ 488 /* fall through */
473 default: 489 default:
474 if (user_mode(regs)) 490 if (user_mode(regs))
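
Illustrative sketch (not part of the patch): on x86-64 the gdb remote protocol carries flags, cs and ss as 32-bit slots, so the kgdb change above views the unsigned long gdb_regs[] array through a u32 pointer (gdb_regs32) and fills those slots via 32-bit indices. The small standalone demo below shows that aliasing trick with made-up indices; the kernel builds with -fno-strict-aliasing, which is what makes this access pattern safe there.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned long gdb_regs[4] = { 0 };	/* stand-in for the kgdb register array */
	uint32_t *gdb_regs32 = (uint32_t *)gdb_regs;

	/* two 32-bit protocol slots share one 64-bit array element */
	gdb_regs32[2] = 0x246;	/* e.g. flags */
	gdb_regs32[3] = 0x10;	/* e.g. cs */

	printf("gdb_regs[1] packs both 32-bit slots: %#lx\n", gdb_regs[1]);
	return 0;
}
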
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2b..478bca986eca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb); 178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
179} 179}
180 180
181static void kvm_release_pt(u32 pfn) 181static void kvm_release_pt(unsigned long pfn)
182{ 182{
183 struct kvm_mmu_op_release_pt rpt = { 183 struct kvm_mmu_op_release_pt rpt = {
184 .header.op = KVM_MMU_OP_RELEASE_PT, 184 .header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 3fee2aa50f3f..eee32b43fee3 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
18#include <asm/ldt.h> 18#include <asm/ldt.h>
19#include <asm/desc.h> 19#include <asm/desc.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/syscalls.h>
21 22
22#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
23static void flush_ldt(void *current_mm) 24static void flush_ldt(void *current_mm)
@@ -51,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
51 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, 52 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
52 (mincount - oldsize) * LDT_ENTRY_SIZE); 53 (mincount - oldsize) * LDT_ENTRY_SIZE);
53 54
55 paravirt_alloc_ldt(newldt, mincount);
56
54#ifdef CONFIG_X86_64 57#ifdef CONFIG_X86_64
55 /* CHECKME: Do we really need this ? */ 58 /* CHECKME: Do we really need this ? */
56 wmb(); 59 wmb();
@@ -62,12 +65,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
62 65
63 if (reload) { 66 if (reload) {
64#ifdef CONFIG_SMP 67#ifdef CONFIG_SMP
65 cpumask_of_cpu_ptr_declare(mask);
66
67 preempt_disable(); 68 preempt_disable();
68 load_LDT(pc); 69 load_LDT(pc);
69 cpumask_of_cpu_ptr_next(mask, smp_processor_id()); 70 if (!cpus_equal(current->mm->cpu_vm_mask,
70 if (!cpus_equal(current->mm->cpu_vm_mask, *mask)) 71 cpumask_of_cpu(smp_processor_id())))
71 smp_call_function(flush_ldt, current->mm, 1); 72 smp_call_function(flush_ldt, current->mm, 1);
72 preempt_enable(); 73 preempt_enable();
73#else 74#else
@@ -75,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
75#endif 76#endif
76 } 77 }
77 if (oldsize) { 78 if (oldsize) {
79 paravirt_free_ldt(oldldt, oldsize);
78 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) 80 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
79 vfree(oldldt); 81 vfree(oldldt);
80 else 82 else
@@ -86,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
86static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 88static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
87{ 89{
88 int err = alloc_ldt(new, old->size, 0); 90 int err = alloc_ldt(new, old->size, 0);
91 int i;
89 92
90 if (err < 0) 93 if (err < 0)
91 return err; 94 return err;
92 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); 95
96 for(i = 0; i < old->size; i++)
97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
93 return 0; 98 return 0;
94} 99}
95 100
@@ -126,6 +131,7 @@ void destroy_context(struct mm_struct *mm)
126 if (mm == current->active_mm) 131 if (mm == current->active_mm)
127 clear_LDT(); 132 clear_LDT();
128#endif 133#endif
134 paravirt_free_ldt(mm->context.ldt, mm->context.size);
129 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) 135 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
130 vfree(mm->context.ldt); 136 vfree(mm->context.ldt);
131 else 137 else
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55af..0732adba05ca 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -12,6 +12,7 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/suspend.h>
15 16
16#include <asm/pgtable.h> 17#include <asm/pgtable.h>
17#include <asm/pgalloc.h> 18#include <asm/pgalloc.h>
@@ -22,6 +23,7 @@
22#include <asm/cpufeature.h> 23#include <asm/cpufeature.h>
23#include <asm/desc.h> 24#include <asm/desc.h>
24#include <asm/system.h> 25#include <asm/system.h>
26#include <asm/cacheflush.h>
25 27
26#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 28#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
27static u32 kexec_pgd[1024] PAGE_ALIGNED; 29static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -77,7 +79,7 @@ static void load_segments(void)
77/* 79/*
 78 * A architecture hook called to validate the 80 * A architecture hook called to validate the
 79 * proposed image and prepare the control pages 81 * proposed image and prepare the control pages
 80 * as needed. The pages for KEXEC_CONTROL_CODE_SIZE 82 * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE
 81 * have been allocated, but the segments have yet 83 * have been allocated, but the segments have yet
 82 * been copied into the kernel. 84 * been copied into the kernel.
83 * 85 *
@@ -85,10 +87,12 @@ static void load_segments(void)
 85 * reboot code buffer to allow us to avoid allocations 87 * reboot code buffer to allow us to avoid allocations
 86 * later. 88 * later.
87 * 89 *
88 * Currently nothing. 90 * Make control page executable.
89 */ 91 */
90int machine_kexec_prepare(struct kimage *image) 92int machine_kexec_prepare(struct kimage *image)
91{ 93{
94 if (nx_enabled)
95 set_pages_x(image->control_code_page, 1);
92 return 0; 96 return 0;
93} 97}
94 98
@@ -98,27 +102,54 @@ int machine_kexec_prepare(struct kimage *image)
98 */ 102 */
99void machine_kexec_cleanup(struct kimage *image) 103void machine_kexec_cleanup(struct kimage *image)
100{ 104{
105 if (nx_enabled)
106 set_pages_nx(image->control_code_page, 1);
101} 107}
102 108
103/* 109/*
104 * Do not allocate memory (or fail in any way) in machine_kexec(). 110 * Do not allocate memory (or fail in any way) in machine_kexec().
105 * We are past the point of no return, committed to rebooting now. 111 * We are past the point of no return, committed to rebooting now.
106 */ 112 */
107NORET_TYPE void machine_kexec(struct kimage *image) 113void machine_kexec(struct kimage *image)
108{ 114{
109 unsigned long page_list[PAGES_NR]; 115 unsigned long page_list[PAGES_NR];
110 void *control_page; 116 void *control_page;
117 int save_ftrace_enabled;
118 asmlinkage unsigned long
119 (*relocate_kernel_ptr)(unsigned long indirection_page,
120 unsigned long control_page,
121 unsigned long start_address,
122 unsigned int has_pae,
123 unsigned int preserve_context);
124
125#ifdef CONFIG_KEXEC_JUMP
126 if (kexec_image->preserve_context)
127 save_processor_state();
128#endif
111 129
112 tracer_disable(); 130 save_ftrace_enabled = __ftrace_enabled_save();
113 131
114 /* Interrupts aren't acceptable while we reboot */ 132 /* Interrupts aren't acceptable while we reboot */
115 local_irq_disable(); 133 local_irq_disable();
116 134
135 if (image->preserve_context) {
136#ifdef CONFIG_X86_IO_APIC
137 /* We need to put APICs in legacy mode so that we can
 138 * get timer interrupts in the second kernel. kexec/kdump
 139 * paths already have calls to disable_IO_APIC() in
 140 * one form or another. The kexec jump path also needs
 141 * one.
142 */
143 disable_IO_APIC();
144#endif
145 }
146
117 control_page = page_address(image->control_code_page); 147 control_page = page_address(image->control_code_page);
118 memcpy(control_page, relocate_kernel, PAGE_SIZE); 148 memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
119 149
150 relocate_kernel_ptr = control_page;
120 page_list[PA_CONTROL_PAGE] = __pa(control_page); 151 page_list[PA_CONTROL_PAGE] = __pa(control_page);
121 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 152 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
122 page_list[PA_PGD] = __pa(kexec_pgd); 153 page_list[PA_PGD] = __pa(kexec_pgd);
123 page_list[VA_PGD] = (unsigned long)kexec_pgd; 154 page_list[VA_PGD] = (unsigned long)kexec_pgd;
124#ifdef CONFIG_X86_PAE 155#ifdef CONFIG_X86_PAE
@@ -131,6 +162,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
131 page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 162 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
132 page_list[PA_PTE_1] = __pa(kexec_pte1); 163 page_list[PA_PTE_1] = __pa(kexec_pte1);
133 page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 164 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
165 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
134 166
135 /* The segment registers are funny things, they have both a 167 /* The segment registers are funny things, they have both a
136 * visible and an invisible part. Whenever the visible part is 168 * visible and an invisible part. Whenever the visible part is
@@ -149,8 +181,17 @@ NORET_TYPE void machine_kexec(struct kimage *image)
149 set_idt(phys_to_virt(0),0); 181 set_idt(phys_to_virt(0),0);
150 182
151 /* now call it */ 183 /* now call it */
152 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 184 image->start = relocate_kernel_ptr((unsigned long)image->head,
153 image->start, cpu_has_pae); 185 (unsigned long)page_list,
186 image->start, cpu_has_pae,
187 image->preserve_context);
188
189#ifdef CONFIG_KEXEC_JUMP
190 if (kexec_image->preserve_context)
191 restore_processor_state();
192#endif
193
194 __ftrace_enabled_restore(save_ftrace_enabled);
154} 195}
155 196
156void arch_crash_save_vmcoreinfo(void) 197void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a3..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
181 * Do not allocate memory (or fail in any way) in machine_kexec(). 181 * Do not allocate memory (or fail in any way) in machine_kexec().
182 * We are past the point of no return, committed to rebooting now. 182 * We are past the point of no return, committed to rebooting now.
183 */ 183 */
184NORET_TYPE void machine_kexec(struct kimage *image) 184void machine_kexec(struct kimage *image)
185{ 185{
186 unsigned long page_list[PAGES_NR]; 186 unsigned long page_list[PAGES_NR];
187 void *control_page; 187 void *control_page;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 07c0f828f488..3b599518c322 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -33,6 +33,8 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <asm/geode.h> 34#include <asm/geode.h>
35 35
36#define MFGPT_DEFAULT_IRQ 7
37
36static struct mfgpt_timer_t { 38static struct mfgpt_timer_t {
37 unsigned int avail:1; 39 unsigned int avail:1;
38} mfgpt_timers[MFGPT_MAX_TIMERS]; 40} mfgpt_timers[MFGPT_MAX_TIMERS];
@@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
157} 159}
158EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); 160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
159 161
160int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) 162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
161{ 163{
162 u32 val, dummy; 164 u32 zsel, lpc, dummy;
163 int offset; 165 int shift;
164 166
165 if (timer < 0 || timer >= MFGPT_MAX_TIMERS) 167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
166 return -EIO; 168 return -EIO;
167 169
168 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) 170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
173 * 2, and we mustn't use nor change it.
174 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
175 * IRQ of the 1st. This can only happen if forcing an IRQ, calling this
176 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
177 */
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
169 return -EIO; 181 return -EIO;
170 182
171 rdmsr(MSR_PIC_ZSEL_LOW, val, dummy); 183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
172 188
173 offset = (timer % 4) * 4; 189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
174 190 if (*irq < 1 || *irq == 2 || *irq > 15)
175 val &= ~((0xF << offset) | (0xF << (offset + 16))); 191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
176 195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
177 if (enable) { 199 if (enable) {
178 val |= (irq & 0x0F) << (offset); 200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
179 val |= (irq & 0x0F) << (offset + 16); 201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
180 } 202 }
181 203
182 wrmsr(MSR_PIC_ZSEL_LOW, val, dummy);
183 return 0; 204 return 0;
184} 205}
185 206
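
The ZSEL routing for the MFGPT comparators is packed into 4-bit fields of MSR_PIC_ZSEL_LOW, and the shift arithmetic in the hunk above is easy to misread. Below is a minimal user-space sketch of the same nibble arithmetic; the register value and the layout assumption (CMP1 fields in the low 16 bits, CMP2 fields in the high 16 bits, as the shift expression implies) are for illustration only:

    /* Illustrative sketch of the MFGPT ZSEL nibble arithmetic (not kernel code). */
    #include <stdio.h>

    #define MFGPT_CMP1 0
    #define MFGPT_CMP2 1

    /* Same expression as the hunk: CMP2 fields sit 16 bits above CMP1 fields. */
    static int zsel_shift(int timer, int cmp)
    {
    	return ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
    }

    int main(void)
    {
    	unsigned int zsel = 0x00200000;	/* pretend VSA routed timer 1/CMP2 to IRQ 2 */
    	int timer = 1, cmp = MFGPT_CMP2, irq = 7;
    	int shift = zsel_shift(timer, cmp);

    	if (((zsel >> shift) & 0xF) == 2) {
    		printf("timer %d CMP%d is owned by VSA (IRQ 2), leave it alone\n",
    		       timer, cmp + 1);
    		return 1;
    	}
    	/* otherwise patch our IRQ into the 4-bit field, as the enable path does */
    	zsel = (zsel & ~(0xFu << shift)) | ((unsigned int)irq << shift);
    	printf("new ZSEL value: 0x%08x\n", zsel);
    	return 0;
    }
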
@@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
242static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; 263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
243static u16 mfgpt_event_clock; 264static u16 mfgpt_event_clock;
244 265
245static int irq = 7; 266static int irq;
246static int __init mfgpt_setup(char *str) 267static int __init mfgpt_setup(char *str)
247{ 268{
248 get_option(&str, &irq); 269 get_option(&str, &irq);
@@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void)
346 mfgpt_event_clock = timer; 367 mfgpt_event_clock = timer;
347 368
348 /* Set up the IRQ on the MFGPT side */ 369 /* Set up the IRQ on the MFGPT side */
349 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) { 370 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
350 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); 371 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
351 return -EIO; 372 return -EIO;
352 } 373 }
@@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void)
374 &mfgpt_clockevent); 395 &mfgpt_clockevent);
375 396
376 printk(KERN_INFO 397 printk(KERN_INFO
377 "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); 398 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
399 timer, irq);
378 clockevents_register_device(&mfgpt_clockevent); 400 clockevents_register_device(&mfgpt_clockevent);
379 401
380 return 0; 402 return 0;
381 403
382err: 404err:
383 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq); 405 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
384 printk(KERN_ERR 406 printk(KERN_ERR
385 "mfgpt-timer: Unable to set up the MFGPT clock source\n"); 407 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
386 return -EIO; 408 return -EIO;
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
deleted file mode 100644
index 6994c751590e..000000000000
--- a/arch/x86/kernel/microcode.c
+++ /dev/null
@@ -1,860 +0,0 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
 11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to speculative
59 * nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73
74//#define DEBUG /* pr_debug */
75#include <linux/capability.h>
76#include <linux/kernel.h>
77#include <linux/init.h>
78#include <linux/sched.h>
79#include <linux/smp_lock.h>
80#include <linux/cpumask.h>
81#include <linux/module.h>
82#include <linux/slab.h>
83#include <linux/vmalloc.h>
84#include <linux/miscdevice.h>
85#include <linux/spinlock.h>
86#include <linux/mm.h>
87#include <linux/fs.h>
88#include <linux/mutex.h>
89#include <linux/cpu.h>
90#include <linux/firmware.h>
91#include <linux/platform_device.h>
92
93#include <asm/msr.h>
94#include <asm/uaccess.h>
95#include <asm/processor.h>
96
97MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
98MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
99MODULE_LICENSE("GPL");
100
101#define MICROCODE_VERSION "1.14a"
102
103#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
104#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
105#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
106#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
107#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
108#define DWSIZE (sizeof (u32))
109#define get_totalsize(mc) \
110 (((microcode_t *)mc)->hdr.totalsize ? \
111 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
112#define get_datasize(mc) \
113 (((microcode_t *)mc)->hdr.datasize ? \
114 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
115
116#define sigmatch(s1, s2, p1, p2) \
117 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
118
119#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
120
121/* serialize access to the physical write to MSR 0x79 */
122static DEFINE_SPINLOCK(microcode_update_lock);
123
124/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
125static DEFINE_MUTEX(microcode_mutex);
126
127static struct ucode_cpu_info {
128 int valid;
129 unsigned int sig;
130 unsigned int pf;
131 unsigned int rev;
132 microcode_t *mc;
133} ucode_cpu_info[NR_CPUS];
134
135static void collect_cpu_info(int cpu_num)
136{
137 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
138 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
139 unsigned int val[2];
140
141 /* We should bind the task to the CPU */
142 BUG_ON(raw_smp_processor_id() != cpu_num);
143 uci->pf = uci->rev = 0;
144 uci->mc = NULL;
145 uci->valid = 1;
146
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
150 "processor\n", cpu_num);
151 uci->valid = 0;
152 return;
153 }
154
155 uci->sig = cpuid_eax(0x00000001);
156
157 if ((c->x86_model >= 5) || (c->x86 > 6)) {
158 /* get processor flags from MSR 0x17 */
159 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
160 uci->pf = 1 << ((val[1] >> 18) & 7);
161 }
162
163 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
164 /* see notes above for revision 1.07. Apparent chip bug */
165 sync_core();
166 /* get the current revision from MSR 0x8B */
167 rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
168 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
169 uci->sig, uci->pf, uci->rev);
170}
171
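
The pf value computed above is a one-hot platform mask taken from three bits of the high word of MSR_IA32_PLATFORM_ID, and the sigmatch() macro later only requires the CPU's mask and the update's mask to overlap. A minimal user-space sketch of the same arithmetic (all values are hypothetical):

    /* Illustrative sketch of the platform-flags / sigmatch() logic (not kernel code). */
    #include <stdio.h>

    #define sigmatch(s1, s2, p1, p2) \
    	(((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))

    int main(void)
    {
    	/* high dword of MSR 0x17; the driver uses bits 20:18 of it as the platform id */
    	unsigned int platform_id_high = 0x00100000;	/* hypothetical: platform id 4 */
    	unsigned int cpu_pf = 1u << ((platform_id_high >> 18) & 7);	/* one-hot: 0x10 */
    	unsigned int cpu_sig = 0x000006fb;	/* hypothetical CPUID(1).EAX */

    	/* an update matches if the signature is equal and the pf masks overlap */
    	unsigned int mc_sig = 0x000006fb, mc_pf = 0x12;	/* valid for platforms 1 and 4 */

    	printf("match: %d\n", sigmatch(mc_sig, cpu_sig, mc_pf, cpu_pf));	/* prints 1 */
    	return 0;
    }
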
172static inline int microcode_update_match(int cpu_num,
173 microcode_header_t *mc_header, int sig, int pf)
174{
175 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
176
177 if (!sigmatch(sig, uci->sig, pf, uci->pf)
178 || mc_header->rev <= uci->rev)
179 return 0;
180 return 1;
181}
182
183static int microcode_sanity_check(void *mc)
184{
185 microcode_header_t *mc_header = mc;
186 struct extended_sigtable *ext_header = NULL;
187 struct extended_signature *ext_sig;
188 unsigned long total_size, data_size, ext_table_size;
189 int sum, orig_sum, ext_sigcount = 0, i;
190
191 total_size = get_totalsize(mc_header);
192 data_size = get_datasize(mc_header);
193 if (data_size + MC_HEADER_SIZE > total_size) {
194 printk(KERN_ERR "microcode: error! "
195 "Bad data size in microcode data file\n");
196 return -EINVAL;
197 }
198
199 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
200 printk(KERN_ERR "microcode: error! "
201 "Unknown microcode update format\n");
202 return -EINVAL;
203 }
204 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
205 if (ext_table_size) {
206 if ((ext_table_size < EXT_HEADER_SIZE)
207 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
208 printk(KERN_ERR "microcode: error! "
209 "Small exttable size in microcode data file\n");
210 return -EINVAL;
211 }
212 ext_header = mc + MC_HEADER_SIZE + data_size;
213 if (ext_table_size != exttable_size(ext_header)) {
214 printk(KERN_ERR "microcode: error! "
215 "Bad exttable size in microcode data file\n");
216 return -EFAULT;
217 }
218 ext_sigcount = ext_header->count;
219 }
220
221 /* check extended table checksum */
222 if (ext_table_size) {
223 int ext_table_sum = 0;
224 int *ext_tablep = (int *)ext_header;
225
226 i = ext_table_size / DWSIZE;
227 while (i--)
228 ext_table_sum += ext_tablep[i];
229 if (ext_table_sum) {
230 printk(KERN_WARNING "microcode: aborting, "
231 "bad extended signature table checksum\n");
232 return -EINVAL;
233 }
234 }
235
236 /* calculate the checksum */
237 orig_sum = 0;
238 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
239 while (i--)
240 orig_sum += ((int *)mc)[i];
241 if (orig_sum) {
242 printk(KERN_ERR "microcode: aborting, bad checksum\n");
243 return -EINVAL;
244 }
245 if (!ext_table_size)
246 return 0;
247 /* check extended signature checksum */
248 for (i = 0; i < ext_sigcount; i++) {
249 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
250 EXT_SIGNATURE_SIZE * i;
251 sum = orig_sum
252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
254 if (sum) {
255 printk(KERN_ERR "microcode: aborting, bad checksum\n");
256 return -EINVAL;
257 }
258 }
259 return 0;
260}
261
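
microcode_sanity_check() relies on the convention that the 32-bit words of a valid update sum to zero (the header's cksum field is chosen to make this true); for extended signatures the same zero-sum is re-derived above by swapping the base sig/pf/cksum for the extended ones. A minimal sketch of the zero-sum rule, with made-up words:

    /* Illustrative sketch of the Intel ucode dword checksum rule (not kernel code). */
    #include <stdio.h>

    int main(void)
    {
    	/* hypothetical 4-dword "update": the cksum word makes the whole image sum to 0 */
    	unsigned int mc[4] = { 0x11111111, 0x22222222, 0x33333333, 0 };
    	unsigned int sum = 0;

    	mc[3] = -(mc[0] + mc[1] + mc[2]);	/* what the header's cksum field accomplishes */
    	for (int i = 0; i < 4; i++)
    		sum += mc[i];

    	printf("dword sum = %u\n", sum);	/* 0 means the image passes the check */
    	return 0;
    }
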
262/*
263 * return 0 - no update found
264 * return 1 - found update
265 * return < 0 - error
266 */
267static int get_maching_microcode(void *mc, int cpu)
268{
269 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
270 microcode_header_t *mc_header = mc;
271 struct extended_sigtable *ext_header;
272 unsigned long total_size = get_totalsize(mc_header);
273 int ext_sigcount, i;
274 struct extended_signature *ext_sig;
275 void *new_mc;
276
277 if (microcode_update_match(cpu, mc_header,
278 mc_header->sig, mc_header->pf))
279 goto find;
280
281 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
282 return 0;
283
284 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
285 ext_sigcount = ext_header->count;
286 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
287 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf))
290 goto find;
291 ext_sig++;
292 }
293 return 0;
294find:
295 pr_debug("microcode: CPU%d found a matching microcode update with"
296 " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
297 new_mc = vmalloc(total_size);
298 if (!new_mc) {
299 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
300 return -ENOMEM;
301 }
302
303 /* free previous update file */
304 vfree(uci->mc);
305
306 memcpy(new_mc, mc, total_size);
307 uci->mc = new_mc;
308 return 1;
309}
310
311static void apply_microcode(int cpu)
312{
313 unsigned long flags;
314 unsigned int val[2];
315 int cpu_num = raw_smp_processor_id();
316 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
317
318 /* We should bind the task to the CPU */
319 BUG_ON(cpu_num != cpu);
320
321 if (uci->mc == NULL)
322 return;
323
324 /* serialize access to the physical write to MSR 0x79 */
325 spin_lock_irqsave(&microcode_update_lock, flags);
326
327 /* write microcode via MSR 0x79 */
328 wrmsr(MSR_IA32_UCODE_WRITE,
329 (unsigned long) uci->mc->bits,
330 (unsigned long) uci->mc->bits >> 16 >> 16);
331 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
332
333 /* see notes above for revision 1.07. Apparent chip bug */
334 sync_core();
335
336 /* get the current revision from MSR 0x8B */
337 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
338
339 spin_unlock_irqrestore(&microcode_update_lock, flags);
340 if (val[1] != uci->mc->hdr.rev) {
341 printk(KERN_ERR "microcode: CPU%d update from revision "
342 "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
343 return;
344 }
345 printk(KERN_INFO "microcode: CPU%d updated from revision "
346 "0x%x to 0x%x, date = %08x \n",
347 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
348 uci->rev = val[1];
349}
350
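
The wrmsr in apply_microcode() passes the address of the microcode bits in the EAX/EDX pair; on a 32-bit kernel the high half is simply zero, and the ">> 16 >> 16" idiom gives a well-defined shift-by-32 even when unsigned long is only 32 bits wide (a plain ">> 32" would be undefined behaviour there). A user-space sketch of the split, with a hypothetical address:

    /* Illustrative sketch of splitting an address into EAX/EDX halves (not kernel code). */
    #include <stdio.h>

    int main(void)
    {
    	unsigned long addr = 0xc1234560UL;	/* hypothetical address of the ucode bits */
    	unsigned int lo = (unsigned int)addr;
    	unsigned int hi = (unsigned int)(addr >> 16 >> 16);	/* 0 on 32-bit kernels */

    	printf("eax=0x%08x edx=0x%08x\n", lo, hi);
    	return 0;
    }
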
351#ifdef CONFIG_MICROCODE_OLD_INTERFACE
352static void __user *user_buffer; /* user area microcode data buffer */
353static unsigned int user_buffer_size; /* it's size */
354
355static long get_next_ucode(void **mc, long offset)
356{
357 microcode_header_t mc_header;
358 unsigned long total_size;
359
360 /* No more data */
361 if (offset >= user_buffer_size)
362 return 0;
363 if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
364 printk(KERN_ERR "microcode: error! Can not read user data\n");
365 return -EFAULT;
366 }
367 total_size = get_totalsize(&mc_header);
368 if (offset + total_size > user_buffer_size) {
369 printk(KERN_ERR "microcode: error! Bad total size in microcode "
370 "data file\n");
371 return -EINVAL;
372 }
373 *mc = vmalloc(total_size);
374 if (!*mc)
375 return -ENOMEM;
376 if (copy_from_user(*mc, user_buffer + offset, total_size)) {
377 printk(KERN_ERR "microcode: error! Can not read user data\n");
378 vfree(*mc);
379 return -EFAULT;
380 }
381 return offset + total_size;
382}
383
384static int do_microcode_update (void)
385{
386 long cursor = 0;
387 int error = 0;
388 void *new_mc = NULL;
389 int cpu;
390 cpumask_t old;
391 cpumask_of_cpu_ptr_declare(newmask);
392
393 old = current->cpus_allowed;
394
395 while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
396 error = microcode_sanity_check(new_mc);
397 if (error)
398 goto out;
399 /*
400 * It's possible the data file has multiple matching ucode,
401 * lets keep searching till the latest version
402 */
403 for_each_online_cpu(cpu) {
404 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
405
406 if (!uci->valid)
407 continue;
408 cpumask_of_cpu_ptr_next(newmask, cpu);
409 set_cpus_allowed_ptr(current, newmask);
410 error = get_maching_microcode(new_mc, cpu);
411 if (error < 0)
412 goto out;
413 if (error == 1)
414 apply_microcode(cpu);
415 }
416 vfree(new_mc);
417 }
418out:
419 if (cursor > 0)
420 vfree(new_mc);
421 if (cursor < 0)
422 error = cursor;
423 set_cpus_allowed_ptr(current, &old);
424 return error;
425}
426
427static int microcode_open (struct inode *unused1, struct file *unused2)
428{
429 cycle_kernel_lock();
430 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
431}
432
433static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
434{
435 ssize_t ret;
436
437 if ((len >> PAGE_SHIFT) > num_physpages) {
438 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
439 return -EINVAL;
440 }
441
442 get_online_cpus();
443 mutex_lock(&microcode_mutex);
444
445 user_buffer = (void __user *) buf;
446 user_buffer_size = (int) len;
447
448 ret = do_microcode_update();
449 if (!ret)
450 ret = (ssize_t)len;
451
452 mutex_unlock(&microcode_mutex);
453 put_online_cpus();
454
455 return ret;
456}
457
458static const struct file_operations microcode_fops = {
459 .owner = THIS_MODULE,
460 .write = microcode_write,
461 .open = microcode_open,
462};
463
464static struct miscdevice microcode_dev = {
465 .minor = MICROCODE_MINOR,
466 .name = "microcode",
467 .fops = &microcode_fops,
468};
469
470static int __init microcode_dev_init (void)
471{
472 int error;
473
474 error = misc_register(&microcode_dev);
475 if (error) {
476 printk(KERN_ERR
477 "microcode: can't misc_register on minor=%d\n",
478 MICROCODE_MINOR);
479 return error;
480 }
481
482 return 0;
483}
484
485static void microcode_dev_exit (void)
486{
487 misc_deregister(&microcode_dev);
488}
489
490MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
491#else
492#define microcode_dev_init() 0
493#define microcode_dev_exit() do { } while(0)
494#endif
495
496static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
497 unsigned long size, long offset)
498{
499 microcode_header_t *mc_header;
500 unsigned long total_size;
501
502 /* No more data */
503 if (offset >= size)
504 return 0;
505 mc_header = (microcode_header_t *)(buf + offset);
506 total_size = get_totalsize(mc_header);
507
508 if (offset + total_size > size) {
509 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
510 return -EINVAL;
511 }
512
513 *mc = vmalloc(total_size);
514 if (!*mc) {
515 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
516 return -ENOMEM;
517 }
518 memcpy(*mc, buf + offset, total_size);
519 return offset + total_size;
520}
521
522/* fake device for request_firmware */
523static struct platform_device *microcode_pdev;
524
525static int cpu_request_microcode(int cpu)
526{
527 char name[30];
528 struct cpuinfo_x86 *c = &cpu_data(cpu);
529 const struct firmware *firmware;
530 const u8 *buf;
531 unsigned long size;
532 long offset = 0;
533 int error;
534 void *mc;
535
536 /* We should bind the task to the CPU */
537 BUG_ON(cpu != raw_smp_processor_id());
538 sprintf(name,"intel-ucode/%02x-%02x-%02x",
539 c->x86, c->x86_model, c->x86_mask);
540 error = request_firmware(&firmware, name, &microcode_pdev->dev);
541 if (error) {
542 pr_debug("microcode: data file %s load failed\n", name);
543 return error;
544 }
545 buf = firmware->data;
546 size = firmware->size;
547 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
548 > 0) {
549 error = microcode_sanity_check(mc);
550 if (error)
551 break;
552 error = get_maching_microcode(mc, cpu);
553 if (error < 0)
554 break;
555 /*
556 * It's possible the data file has multiple matching ucode,
557 * lets keep searching till the latest version
558 */
559 if (error == 1) {
560 apply_microcode(cpu);
561 error = 0;
562 }
563 vfree(mc);
564 }
565 if (offset > 0)
566 vfree(mc);
567 if (offset < 0)
568 error = offset;
569 release_firmware(firmware);
570
571 return error;
572}
573
574static int apply_microcode_check_cpu(int cpu)
575{
576 struct cpuinfo_x86 *c = &cpu_data(cpu);
577 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
578 cpumask_t old;
579 cpumask_of_cpu_ptr(newmask, cpu);
580 unsigned int val[2];
581 int err = 0;
582
583 /* Check if the microcode is available */
584 if (!uci->mc)
585 return 0;
586
587 old = current->cpus_allowed;
588 set_cpus_allowed_ptr(current, newmask);
589
590 /* Check if the microcode we have in memory matches the CPU */
591 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
592 cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
593 err = -EINVAL;
594
595 if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
596 /* get processor flags from MSR 0x17 */
597 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
598 if (uci->pf != (1 << ((val[1] >> 18) & 7)))
599 err = -EINVAL;
600 }
601
602 if (!err) {
603 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
604 /* see notes above for revision 1.07. Apparent chip bug */
605 sync_core();
606 /* get the current revision from MSR 0x8B */
607 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
608 if (uci->rev != val[1])
609 err = -EINVAL;
610 }
611
612 if (!err)
613 apply_microcode(cpu);
614 else
615 printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
616 " sig=0x%x, pf=0x%x, rev=0x%x\n",
617 cpu, uci->sig, uci->pf, uci->rev);
618
619 set_cpus_allowed_ptr(current, &old);
620 return err;
621}
622
623static void microcode_init_cpu(int cpu, int resume)
624{
625 cpumask_t old;
626 cpumask_of_cpu_ptr(newmask, cpu);
627 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
628
629 old = current->cpus_allowed;
630
631 set_cpus_allowed_ptr(current, newmask);
632 mutex_lock(&microcode_mutex);
633 collect_cpu_info(cpu);
634 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
635 cpu_request_microcode(cpu);
636 mutex_unlock(&microcode_mutex);
637 set_cpus_allowed_ptr(current, &old);
638}
639
640static void microcode_fini_cpu(int cpu)
641{
642 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
643
644 mutex_lock(&microcode_mutex);
645 uci->valid = 0;
646 vfree(uci->mc);
647 uci->mc = NULL;
648 mutex_unlock(&microcode_mutex);
649}
650
651static ssize_t reload_store(struct sys_device *dev,
652 struct sysdev_attribute *attr,
653 const char *buf, size_t sz)
654{
655 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
656 char *end;
657 unsigned long val = simple_strtoul(buf, &end, 0);
658 int err = 0;
659 int cpu = dev->id;
660
661 if (end == buf)
662 return -EINVAL;
663 if (val == 1) {
664 cpumask_t old;
665 cpumask_of_cpu_ptr(newmask, cpu);
666
667 old = current->cpus_allowed;
668
669 get_online_cpus();
670 set_cpus_allowed_ptr(current, newmask);
671
672 mutex_lock(&microcode_mutex);
673 if (uci->valid)
674 err = cpu_request_microcode(cpu);
675 mutex_unlock(&microcode_mutex);
676 put_online_cpus();
677 set_cpus_allowed_ptr(current, &old);
678 }
679 if (err)
680 return err;
681 return sz;
682}
683
684static ssize_t version_show(struct sys_device *dev,
685 struct sysdev_attribute *attr, char *buf)
686{
687 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
688
689 return sprintf(buf, "0x%x\n", uci->rev);
690}
691
692static ssize_t pf_show(struct sys_device *dev,
693 struct sysdev_attribute *attr, char *buf)
694{
695 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
696
697 return sprintf(buf, "0x%x\n", uci->pf);
698}
699
700static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
701static SYSDEV_ATTR(version, 0400, version_show, NULL);
702static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
703
704static struct attribute *mc_default_attrs[] = {
705 &attr_reload.attr,
706 &attr_version.attr,
707 &attr_processor_flags.attr,
708 NULL
709};
710
711static struct attribute_group mc_attr_group = {
712 .attrs = mc_default_attrs,
713 .name = "microcode",
714};
715
716static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
717{
718 int err, cpu = sys_dev->id;
719 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
720
721 if (!cpu_online(cpu))
722 return 0;
723
724 pr_debug("microcode: CPU%d added\n", cpu);
725 memset(uci, 0, sizeof(*uci));
726
727 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
728 if (err)
729 return err;
730
731 microcode_init_cpu(cpu, resume);
732
733 return 0;
734}
735
736static int mc_sysdev_add(struct sys_device *sys_dev)
737{
738 return __mc_sysdev_add(sys_dev, 0);
739}
740
741static int mc_sysdev_remove(struct sys_device *sys_dev)
742{
743 int cpu = sys_dev->id;
744
745 if (!cpu_online(cpu))
746 return 0;
747
748 pr_debug("microcode: CPU%d removed\n", cpu);
749 microcode_fini_cpu(cpu);
750 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
751 return 0;
752}
753
754static int mc_sysdev_resume(struct sys_device *dev)
755{
756 int cpu = dev->id;
757
758 if (!cpu_online(cpu))
759 return 0;
760 pr_debug("microcode: CPU%d resumed\n", cpu);
761 /* only CPU 0 will apply ucode here */
762 apply_microcode(0);
763 return 0;
764}
765
766static struct sysdev_driver mc_sysdev_driver = {
767 .add = mc_sysdev_add,
768 .remove = mc_sysdev_remove,
769 .resume = mc_sysdev_resume,
770};
771
772static __cpuinit int
773mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
774{
775 unsigned int cpu = (unsigned long)hcpu;
776 struct sys_device *sys_dev;
777
778 sys_dev = get_cpu_sysdev(cpu);
779 switch (action) {
780 case CPU_UP_CANCELED_FROZEN:
781 /* The CPU refused to come up during a system resume */
782 microcode_fini_cpu(cpu);
783 break;
784 case CPU_ONLINE:
785 case CPU_DOWN_FAILED:
786 mc_sysdev_add(sys_dev);
787 break;
788 case CPU_ONLINE_FROZEN:
789 /* System-wide resume is in progress, try to apply microcode */
790 if (apply_microcode_check_cpu(cpu)) {
791 /* The application of microcode failed */
792 microcode_fini_cpu(cpu);
793 __mc_sysdev_add(sys_dev, 1);
794 break;
795 }
796 case CPU_DOWN_FAILED_FROZEN:
797 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
798 printk(KERN_ERR "microcode: Failed to create the sysfs "
799 "group for CPU%d\n", cpu);
800 break;
801 case CPU_DOWN_PREPARE:
802 mc_sysdev_remove(sys_dev);
803 break;
804 case CPU_DOWN_PREPARE_FROZEN:
805 /* Suspend is in progress, only remove the interface */
806 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
807 break;
808 }
809 return NOTIFY_OK;
810}
811
812static struct notifier_block __refdata mc_cpu_notifier = {
813 .notifier_call = mc_cpu_callback,
814};
815
816static int __init microcode_init (void)
817{
818 int error;
819
820 printk(KERN_INFO
821 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
822
823 error = microcode_dev_init();
824 if (error)
825 return error;
826 microcode_pdev = platform_device_register_simple("microcode", -1,
827 NULL, 0);
828 if (IS_ERR(microcode_pdev)) {
829 microcode_dev_exit();
830 return PTR_ERR(microcode_pdev);
831 }
832
833 get_online_cpus();
834 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
835 put_online_cpus();
836 if (error) {
837 microcode_dev_exit();
838 platform_device_unregister(microcode_pdev);
839 return error;
840 }
841
842 register_hotcpu_notifier(&mc_cpu_notifier);
843 return 0;
844}
845
846static void __exit microcode_exit (void)
847{
848 microcode_dev_exit();
849
850 unregister_hotcpu_notifier(&mc_cpu_notifier);
851
852 get_online_cpus();
853 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
854 put_online_cpus();
855
856 platform_device_unregister(microcode_pdev);
857}
858
859module_init(microcode_init)
860module_exit(microcode_exit)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
new file mode 100644
index 000000000000..7a1f8eeac2c7
--- /dev/null
+++ b/arch/x86/kernel/microcode_amd.c
@@ -0,0 +1,435 @@
1/*
2 * AMD CPU Microcode Update Driver for Linux
3 * Copyright (C) 2008 Advanced Micro Devices Inc.
4 *
5 * Author: Peter Oruba <peter.oruba@amd.com>
6 *
7 * Based on work by:
8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
9 *
 10 * This driver allows upgrading microcode on AMD
11 * family 0x10 and 0x11 processors.
12 *
 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details.
15*/
16
17#include <linux/capability.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/sched.h>
21#include <linux/cpumask.h>
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/vmalloc.h>
25#include <linux/miscdevice.h>
26#include <linux/spinlock.h>
27#include <linux/mm.h>
28#include <linux/fs.h>
29#include <linux/mutex.h>
30#include <linux/cpu.h>
31#include <linux/firmware.h>
32#include <linux/platform_device.h>
33#include <linux/pci.h>
34#include <linux/pci_ids.h>
35
36#include <asm/msr.h>
37#include <asm/uaccess.h>
38#include <asm/processor.h>
39#include <asm/microcode.h>
40
41MODULE_DESCRIPTION("AMD Microcode Update Driver");
42MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>");
43MODULE_LICENSE("GPL v2");
44
45#define UCODE_MAGIC 0x00414d44
46#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
47#define UCODE_UCODE_TYPE 0x00000001
48
49struct equiv_cpu_entry {
50 unsigned int installed_cpu;
51 unsigned int fixed_errata_mask;
52 unsigned int fixed_errata_compare;
53 unsigned int equiv_cpu;
54};
55
56struct microcode_header_amd {
57 unsigned int data_code;
58 unsigned int patch_id;
59 unsigned char mc_patch_data_id[2];
60 unsigned char mc_patch_data_len;
61 unsigned char init_flag;
62 unsigned int mc_patch_data_checksum;
63 unsigned int nb_dev_id;
64 unsigned int sb_dev_id;
65 unsigned char processor_rev_id[2];
66 unsigned char nb_rev_id;
67 unsigned char sb_rev_id;
68 unsigned char bios_api_rev;
69 unsigned char reserved1[3];
70 unsigned int match_reg[8];
71};
72
73struct microcode_amd {
74 struct microcode_header_amd hdr;
75 unsigned int mpb[0];
76};
77
78#define UCODE_MAX_SIZE (2048)
79#define DEFAULT_UCODE_DATASIZE (896)
80#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd))
81#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
82#define DWSIZE (sizeof(u32))
83/* For now we support a fixed ucode total size only */
84#define get_totalsize(mc) \
85 ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
86 + MC_HEADER_SIZE)
87
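
Since the AMD header stores only mc_patch_data_len, get_totalsize() reconstructs the patch size as mc_patch_data_len * 28 plus the header. With struct microcode_header_amd above packing to 64 bytes, mc_patch_data_len == 32 gives the 896-byte DEFAULT_UCODE_DATASIZE and a 960-byte total, comfortably below UCODE_MAX_SIZE. A tiny sketch of that arithmetic (the length value is hypothetical):

    /* Illustrative sketch of the AMD ucode size arithmetic (not kernel code). */
    #include <stdio.h>

    int main(void)
    {
    	unsigned int mc_header_size = 64;	/* sizeof(struct microcode_header_amd), no padding on x86 */
    	unsigned int mc_patch_data_len = 32;	/* hypothetical value from the patch header */
    	unsigned int total = mc_patch_data_len * 28 + mc_header_size;

    	printf("total patch size: %u bytes\n", total);	/* 896 + 64 = 960 */
    	return 0;
    }
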
88/* serialize access to the physical write */
89static DEFINE_SPINLOCK(microcode_update_lock);
90
91static struct equiv_cpu_entry *equiv_cpu_table;
92
93static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
94{
95 struct cpuinfo_x86 *c = &cpu_data(cpu);
96
97 memset(csig, 0, sizeof(*csig));
98
99 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
100 printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
101 cpu);
102 return -1;
103 }
104
105 asm volatile("movl %1, %%ecx; rdmsr"
106 : "=a" (csig->rev)
107 : "i" (0x0000008B) : "ecx");
108
109 printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
110 csig->rev);
111
112 return 0;
113}
114
115static int get_matching_microcode(int cpu, void *mc, int rev)
116{
117 struct microcode_header_amd *mc_header = mc;
118 struct pci_dev *nb_pci_dev, *sb_pci_dev;
119 unsigned int current_cpu_id;
120 unsigned int equiv_cpu_id = 0x00;
121 unsigned int i = 0;
122
123 BUG_ON(equiv_cpu_table == NULL);
124 current_cpu_id = cpuid_eax(0x00000001);
125
126 while (equiv_cpu_table[i].installed_cpu != 0) {
127 if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
128 equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
129 break;
130 }
131 i++;
132 }
133
134 if (!equiv_cpu_id) {
135 printk(KERN_ERR "microcode: CPU%d cpu_id "
136 "not found in equivalent cpu table \n", cpu);
137 return 0;
138 }
139
140 if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) {
141 printk(KERN_ERR
142 "microcode: CPU%d patch does not match "
143 "(patch is %x, cpu extended is %x) \n",
144 cpu, mc_header->processor_rev_id[0],
145 (equiv_cpu_id & 0xff));
146 return 0;
147 }
148
149 if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) {
150 printk(KERN_ERR "microcode: CPU%d patch does not match "
151 "(patch is %x, cpu base id is %x) \n",
152 cpu, mc_header->processor_rev_id[1],
153 ((equiv_cpu_id >> 16) & 0xff));
154
155 return 0;
156 }
157
158 /* ucode may be northbridge specific */
159 if (mc_header->nb_dev_id) {
160 nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
161 (mc_header->nb_dev_id & 0xff),
162 NULL);
163 if ((!nb_pci_dev) ||
164 (mc_header->nb_rev_id != nb_pci_dev->revision)) {
165 printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
166 pci_dev_put(nb_pci_dev);
167 return 0;
168 }
169 pci_dev_put(nb_pci_dev);
170 }
171
172 /* ucode may be southbridge specific */
173 if (mc_header->sb_dev_id) {
174 sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
175 (mc_header->sb_dev_id & 0xff),
176 NULL);
177 if ((!sb_pci_dev) ||
178 (mc_header->sb_rev_id != sb_pci_dev->revision)) {
179 printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
180 pci_dev_put(sb_pci_dev);
181 return 0;
182 }
183 pci_dev_put(sb_pci_dev);
184 }
185
186 if (mc_header->patch_id <= rev)
187 return 0;
188
189 return 1;
190}
191
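
get_matching_microcode() first maps the running CPU's CPUID(1).EAX value to an equivalent CPU id through the container's equivalence table, then compares the low byte and the byte at bits 23:16 of that id against processor_rev_id[0] and [1] in the patch header. A minimal user-space sketch of that lookup; the table contents and CPU id are made up:

    /* Illustrative sketch of the AMD equivalence-table lookup (not kernel code). */
    #include <stdio.h>

    struct equiv_cpu_entry {
    	unsigned int installed_cpu;
    	unsigned int fixed_errata_mask;
    	unsigned int fixed_errata_compare;
    	unsigned int equiv_cpu;
    };

    int main(void)
    {
    	/* hypothetical table: one family 0x10 CPU, terminated by installed_cpu == 0 */
    	struct equiv_cpu_entry table[] = {
    		{ 0x00100f22, 0, 0, 0x00001022 },
    		{ 0, 0, 0, 0 },
    	};
    	unsigned int current_cpu_id = 0x00100f22;	/* would come from cpuid_eax(1) */
    	unsigned int equiv_cpu_id = 0;

    	for (int i = 0; table[i].installed_cpu != 0; i++) {
    		if (table[i].installed_cpu == current_cpu_id) {
    			equiv_cpu_id = table[i].equiv_cpu;
    			break;
    		}
    	}
    	if (!equiv_cpu_id)
    		return 1;	/* CPU not listed: no patch in this container applies */

    	/* the patch header stores the two halves the same way the driver checks them */
    	printf("patch must have processor_rev_id[0]=0x%02x and [1]=0x%02x\n",
    	       equiv_cpu_id & 0xff, (equiv_cpu_id >> 16) & 0xff);
    	return 0;
    }
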
192static void apply_microcode_amd(int cpu)
193{
194 unsigned long flags;
195 unsigned int eax, edx;
196 unsigned int rev;
197 int cpu_num = raw_smp_processor_id();
198 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
199 struct microcode_amd *mc_amd = uci->mc;
200 unsigned long addr;
201
202 /* We should bind the task to the CPU */
203 BUG_ON(cpu_num != cpu);
204
205 if (mc_amd == NULL)
206 return;
207
208 spin_lock_irqsave(&microcode_update_lock, flags);
209
210 addr = (unsigned long)&mc_amd->hdr.data_code;
211 edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
212 eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
213
214 asm volatile("movl %0, %%ecx; wrmsr" :
215 : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
216
217 /* get patch id after patching */
218 asm volatile("movl %1, %%ecx; rdmsr"
219 : "=a" (rev)
220 : "i" (0x0000008B) : "ecx");
221
222 spin_unlock_irqrestore(&microcode_update_lock, flags);
223
224 /* check current patch id and patch's id for match */
225 if (rev != mc_amd->hdr.patch_id) {
226 printk(KERN_ERR "microcode: CPU%d update from revision "
227 "0x%x to 0x%x failed\n", cpu_num,
228 mc_amd->hdr.patch_id, rev);
229 return;
230 }
231
232 printk(KERN_INFO "microcode: CPU%d updated from revision "
233 "0x%x to 0x%x \n",
234 cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
235
236 uci->cpu_sig.rev = rev;
237}
238
239static void * get_next_ucode(u8 *buf, unsigned int size,
240 int (*get_ucode_data)(void *, const void *, size_t),
241 unsigned int *mc_size)
242{
243 unsigned int total_size;
244#define UCODE_CONTAINER_SECTION_HDR 8
245 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
246 void *mc;
247
248 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
249 return NULL;
250
251 if (section_hdr[0] != UCODE_UCODE_TYPE) {
252 printk(KERN_ERR "microcode: error! "
253 "Wrong microcode payload type field\n");
254 return NULL;
255 }
256
257 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
258
259 printk(KERN_INFO "microcode: size %u, total_size %u\n",
260 size, total_size);
261
262 if (total_size > size || total_size > UCODE_MAX_SIZE) {
263 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
264 return NULL;
265 }
266
267 mc = vmalloc(UCODE_MAX_SIZE);
268 if (mc) {
269 memset(mc, 0, UCODE_MAX_SIZE);
270 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
271 vfree(mc);
272 mc = NULL;
273 } else
274 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
275 }
276#undef UCODE_CONTAINER_SECTION_HDR
277 return mc;
278}
279
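
get_next_ucode() reads an 8-byte section header in front of each patch: the first byte it checks is the section type, and the payload size is taken from bytes 4 and 5, little-endian, exactly as in the total_size expression above. A small sketch with a hypothetical header encoding a 960-byte payload:

    /* Illustrative sketch of the 8-byte container section header layout (not kernel code). */
    #include <stdio.h>

    int main(void)
    {
    	/* hypothetical header: type 0x01 (UCODE_UCODE_TYPE), size 960 bytes */
    	unsigned char section_hdr[8] = { 0x01, 0x00, 0x00, 0x00, 0xc0, 0x03, 0x00, 0x00 };
    	unsigned int type = section_hdr[0];	/* the driver only inspects the low byte */
    	unsigned int total_size = section_hdr[4] + (section_hdr[5] << 8);

    	printf("section type %u, payload %u bytes\n", type, total_size);	/* 1, 960 */
    	return 0;
    }
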
280
281static int install_equiv_cpu_table(u8 *buf,
282 int (*get_ucode_data)(void *, const void *, size_t))
283{
284#define UCODE_CONTAINER_HEADER_SIZE 12
285 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
286 unsigned int *buf_pos = (unsigned int *)container_hdr;
287 unsigned long size;
288
289 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
290 return 0;
291
292 size = buf_pos[2];
293
294 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
295 printk(KERN_ERR "microcode: error! "
 296 "Wrong microcode equivalent cpu table\n");
297 return 0;
298 }
299
300 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
301 if (!equiv_cpu_table) {
302 printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
303 return 0;
304 }
305
306 buf += UCODE_CONTAINER_HEADER_SIZE;
307 if (get_ucode_data(equiv_cpu_table, buf, size)) {
308 vfree(equiv_cpu_table);
309 return 0;
310 }
311
312 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
313#undef UCODE_CONTAINER_HEADER_SIZE
314}
315
316static void free_equiv_cpu_table(void)
317{
318 if (equiv_cpu_table) {
319 vfree(equiv_cpu_table);
320 equiv_cpu_table = NULL;
321 }
322}
323
324static int generic_load_microcode(int cpu, void *data, size_t size,
325 int (*get_ucode_data)(void *, const void *, size_t))
326{
327 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
328 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
329 int new_rev = uci->cpu_sig.rev;
330 unsigned int leftover;
331 unsigned long offset;
332
333 offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
334 if (!offset) {
335 printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
336 return -EINVAL;
337 }
338
339 ucode_ptr += offset;
340 leftover = size - offset;
341
342 while (leftover) {
343 unsigned int uninitialized_var(mc_size);
344 struct microcode_header_amd *mc_header;
345
346 mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
347 if (!mc)
348 break;
349
350 mc_header = (struct microcode_header_amd *)mc;
351 if (get_matching_microcode(cpu, mc, new_rev)) {
352 if (new_mc)
353 vfree(new_mc);
354 new_rev = mc_header->patch_id;
355 new_mc = mc;
356 } else
357 vfree(mc);
358
359 ucode_ptr += mc_size;
360 leftover -= mc_size;
361 }
362
363 if (new_mc) {
364 if (!leftover) {
365 if (uci->mc)
366 vfree(uci->mc);
367 uci->mc = new_mc;
368 pr_debug("microcode: CPU%d found a matching microcode update with"
369 " version 0x%x (current=0x%x)\n",
370 cpu, new_rev, uci->cpu_sig.rev);
371 } else
372 vfree(new_mc);
373 }
374
375 free_equiv_cpu_table();
376
377 return (int)leftover;
378}
379
380static int get_ucode_fw(void *to, const void *from, size_t n)
381{
382 memcpy(to, from, n);
383 return 0;
384}
385
386static int request_microcode_fw(int cpu, struct device *device)
387{
388 const char *fw_name = "amd-ucode/microcode_amd.bin";
389 const struct firmware *firmware;
390 int ret;
391
392 /* We should bind the task to the CPU */
393 BUG_ON(cpu != raw_smp_processor_id());
394
395 ret = request_firmware(&firmware, fw_name, device);
396 if (ret) {
397 printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name);
398 return ret;
399 }
400
401 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
402 &get_ucode_fw);
403
404 release_firmware(firmware);
405
406 return ret;
407}
408
409static int request_microcode_user(int cpu, const void __user *buf, size_t size)
410{
411 printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode"
 412 " is not supported\n");
413 return -1;
414}
415
416static void microcode_fini_cpu_amd(int cpu)
417{
418 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
419
420 vfree(uci->mc);
421 uci->mc = NULL;
422}
423
424static struct microcode_ops microcode_amd_ops = {
425 .request_microcode_user = request_microcode_user,
426 .request_microcode_fw = request_microcode_fw,
427 .collect_cpu_info = collect_cpu_info_amd,
428 .apply_microcode = apply_microcode_amd,
429 .microcode_fini_cpu = microcode_fini_cpu_amd,
430};
431
432struct microcode_ops * __init init_amd_microcode(void)
433{
434 return &microcode_amd_ops;
435}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
new file mode 100644
index 000000000000..936d8d55f230
--- /dev/null
+++ b/arch/x86/kernel/microcode_core.c
@@ -0,0 +1,508 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
 11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100#define MICROCODE_VERSION "2.00"
101
102struct microcode_ops *microcode_ops;
103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex);
106
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info);
109
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size)
112{
113 cpumask_t old;
114 int error = 0;
115 int cpu;
116
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
121
122 if (!uci->valid)
123 continue;
124
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
126 error = microcode_ops->request_microcode_user(cpu, buf, size);
127 if (error < 0)
128 goto out;
129 if (!error)
130 microcode_ops->apply_microcode(cpu);
131 }
132out:
133 set_cpus_allowed_ptr(current, &old);
134 return error;
135}
136
137static int microcode_open(struct inode *unused1, struct file *unused2)
138{
139 cycle_kernel_lock();
140 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
141}
142
143static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos)
145{
146 ssize_t ret;
147
148 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
150 num_physpages);
151 return -EINVAL;
152 }
153
154 get_online_cpus();
155 mutex_lock(&microcode_mutex);
156
157 ret = do_microcode_update(buf, len);
158 if (!ret)
159 ret = (ssize_t)len;
160
161 mutex_unlock(&microcode_mutex);
162 put_online_cpus();
163
164 return ret;
165}
166
167static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE,
169 .write = microcode_write,
170 .open = microcode_open,
171};
172
173static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR,
175 .name = "microcode",
176 .fops = &microcode_fops,
177};
178
179static int __init microcode_dev_init(void)
180{
181 int error;
182
183 error = misc_register(&microcode_dev);
184 if (error) {
185 printk(KERN_ERR
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error;
189 }
190
191 return 0;
192}
193
194static void microcode_dev_exit(void)
195{
196 misc_deregister(&microcode_dev);
197}
198
199MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
200#else
201#define microcode_dev_init() 0
202#define microcode_dev_exit() do { } while (0)
203#endif
204
205/* fake device for request_firmware */
206struct platform_device *microcode_pdev;
207
208static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr,
210 const char *buf, size_t sz)
211{
212 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
213 char *end;
214 unsigned long val = simple_strtoul(buf, &end, 0);
215 int err = 0;
216 int cpu = dev->id;
217
218 if (end == buf)
219 return -EINVAL;
220 if (val == 1) {
221 cpumask_t old = current->cpus_allowed;
222
223 get_online_cpus();
224 if (cpu_online(cpu)) {
225 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
226 mutex_lock(&microcode_mutex);
227 if (uci->valid) {
228 err = microcode_ops->request_microcode_fw(cpu,
229 &microcode_pdev->dev);
230 if (!err)
231 microcode_ops->apply_microcode(cpu);
232 }
233 mutex_unlock(&microcode_mutex);
234 set_cpus_allowed_ptr(current, &old);
235 }
236 put_online_cpus();
237 }
238 if (err)
239 return err;
240 return sz;
241}
242
243static ssize_t version_show(struct sys_device *dev,
244 struct sysdev_attribute *attr, char *buf)
245{
246 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
247
248 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
249}
250
251static ssize_t pf_show(struct sys_device *dev,
252 struct sysdev_attribute *attr, char *buf)
253{
254 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
255
256 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
257}
258
259static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
260static SYSDEV_ATTR(version, 0400, version_show, NULL);
261static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
262
263static struct attribute *mc_default_attrs[] = {
264 &attr_reload.attr,
265 &attr_version.attr,
266 &attr_processor_flags.attr,
267 NULL
268};
269
270static struct attribute_group mc_attr_group = {
271 .attrs = mc_default_attrs,
272 .name = "microcode",
273};
274
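
With the attribute group named "microcode", these files appear under each CPU's sysdev directory; on a kernel of this vintage that means, for example, writing 1 to /sys/devices/system/cpu/cpu0/microcode/reload (write-only, 0200) to request and re-apply the firmware, while version and processor_flags (read-only, 0400) expose the signature values cached in ucode_cpu_info.
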
275static void microcode_fini_cpu(int cpu)
276{
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278
279 mutex_lock(&microcode_mutex);
280 microcode_ops->microcode_fini_cpu(cpu);
281 uci->valid = 0;
282 mutex_unlock(&microcode_mutex);
283}
284
285static void collect_cpu_info(int cpu)
286{
287 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
288
289 memset(uci, 0, sizeof(*uci));
290 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
291 uci->valid = 1;
292}
293
294static int microcode_resume_cpu(int cpu)
295{
296 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
297 struct cpu_signature nsig;
298
299 pr_debug("microcode: CPU%d resumed\n", cpu);
300
301 if (!uci->mc)
302 return 1;
303
304 /*
305 * Let's verify that the 'cached' ucode does belong
306 * to this cpu (a bit of paranoia):
307 */
308 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
309 microcode_fini_cpu(cpu);
310 return -1;
311 }
312
313 if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) {
314 microcode_fini_cpu(cpu);
315 /* Should we look for a new ucode here? */
316 return 1;
317 }
318
319 return 0;
320}
321
322void microcode_update_cpu(int cpu)
323{
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
325 int err = 0;
326
327 /*
 328 * Check if a system resume is in progress (uci->valid is still set),
 329 * otherwise just request the firmware:
330 */
331 if (uci->valid) {
332 err = microcode_resume_cpu(cpu);
333 } else {
334 collect_cpu_info(cpu);
335 if (uci->valid && system_state == SYSTEM_RUNNING)
336 err = microcode_ops->request_microcode_fw(cpu,
337 &microcode_pdev->dev);
338 }
339 if (!err)
340 microcode_ops->apply_microcode(cpu);
341}
342
343static void microcode_init_cpu(int cpu)
344{
345 cpumask_t old = current->cpus_allowed;
346
347 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
348 /* We should bind the task to the CPU */
349 BUG_ON(raw_smp_processor_id() != cpu);
350
351 mutex_lock(&microcode_mutex);
352 microcode_update_cpu(cpu);
353 mutex_unlock(&microcode_mutex);
354
355 set_cpus_allowed_ptr(current, &old);
356}
357
358static int mc_sysdev_add(struct sys_device *sys_dev)
359{
360 int err, cpu = sys_dev->id;
361 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
362
363 if (!cpu_online(cpu))
364 return 0;
365
366 pr_debug("microcode: CPU%d added\n", cpu);
367 memset(uci, 0, sizeof(*uci));
368
369 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
370 if (err)
371 return err;
372
373 microcode_init_cpu(cpu);
374 return 0;
375}
376
377static int mc_sysdev_remove(struct sys_device *sys_dev)
378{
379 int cpu = sys_dev->id;
380
381 if (!cpu_online(cpu))
382 return 0;
383
384 pr_debug("microcode: CPU%d removed\n", cpu);
385 microcode_fini_cpu(cpu);
386 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
387 return 0;
388}
389
390static int mc_sysdev_resume(struct sys_device *dev)
391{
392 int cpu = dev->id;
393
394 if (!cpu_online(cpu))
395 return 0;
396
397 /* only CPU 0 will apply ucode here */
398 microcode_update_cpu(0);
399 return 0;
400}
401
402static struct sysdev_driver mc_sysdev_driver = {
403 .add = mc_sysdev_add,
404 .remove = mc_sysdev_remove,
405 .resume = mc_sysdev_resume,
406};
407
408static __cpuinit int
409mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
410{
411 unsigned int cpu = (unsigned long)hcpu;
412 struct sys_device *sys_dev;
413
414 sys_dev = get_cpu_sysdev(cpu);
415 switch (action) {
416 case CPU_ONLINE:
417 case CPU_ONLINE_FROZEN:
418 microcode_init_cpu(cpu);
419 case CPU_DOWN_FAILED:
420 case CPU_DOWN_FAILED_FROZEN:
421 pr_debug("microcode: CPU%d added\n", cpu);
422 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
423 printk(KERN_ERR "microcode: Failed to create the sysfs "
424 "group for CPU%d\n", cpu);
425 break;
426 case CPU_DOWN_PREPARE:
427 case CPU_DOWN_PREPARE_FROZEN:
428 /* Suspend is in progress, only remove the interface */
429 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
430 pr_debug("microcode: CPU%d removed\n", cpu);
431 break;
432 case CPU_DEAD:
433 case CPU_UP_CANCELED_FROZEN:
434 /* The CPU refused to come up during a system resume */
435 microcode_fini_cpu(cpu);
436 break;
437 }
438 return NOTIFY_OK;
439}
440
441static struct notifier_block __refdata mc_cpu_notifier = {
442 .notifier_call = mc_cpu_callback,
443};
444
445static int __init microcode_init(void)
446{
447 struct cpuinfo_x86 *c = &cpu_data(0);
448 int error;
449
450 if (c->x86_vendor == X86_VENDOR_INTEL)
451 microcode_ops = init_intel_microcode();
452 else if (c->x86_vendor == X86_VENDOR_AMD)
453 microcode_ops = init_amd_microcode();
454
455 if (!microcode_ops) {
456 printk(KERN_ERR "microcode: no support for this CPU vendor\n");
457 return -ENODEV;
458 }
459
460 error = microcode_dev_init();
461 if (error)
462 return error;
463 microcode_pdev = platform_device_register_simple("microcode", -1,
464 NULL, 0);
465 if (IS_ERR(microcode_pdev)) {
466 microcode_dev_exit();
467 return PTR_ERR(microcode_pdev);
468 }
469
470 get_online_cpus();
471 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
472 put_online_cpus();
473 if (error) {
474 microcode_dev_exit();
475 platform_device_unregister(microcode_pdev);
476 return error;
477 }
478
479 register_hotcpu_notifier(&mc_cpu_notifier);
480
481 printk(KERN_INFO
482 "Microcode Update Driver: v" MICROCODE_VERSION
483 " <tigran@aivazian.fsnet.co.uk>"
484 " <peter.oruba@amd.com>\n");
485
486 return 0;
487}
488
489static void __exit microcode_exit(void)
490{
491 microcode_dev_exit();
492
493 unregister_hotcpu_notifier(&mc_cpu_notifier);
494
495 get_online_cpus();
496 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
497 put_online_cpus();
498
499 platform_device_unregister(microcode_pdev);
500
501 microcode_ops = NULL;
502
503 printk(KERN_INFO
504 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
505}
506
507module_init(microcode_init);
508module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
new file mode 100644
index 000000000000..622dc4a21784
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel.c
@@ -0,0 +1,480 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
 7 * This driver allows upgrading microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
 11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100struct microcode_header_intel {
101 unsigned int hdrver;
102 unsigned int rev;
103 unsigned int date;
104 unsigned int sig;
105 unsigned int cksum;
106 unsigned int ldrver;
107 unsigned int pf;
108 unsigned int datasize;
109 unsigned int totalsize;
110 unsigned int reserved[3];
111};
112
113struct microcode_intel {
114 struct microcode_header_intel hdr;
115 unsigned int bits[0];
116};
117
118/* microcode format is extended from prescott processors */
119struct extended_signature {
120 unsigned int sig;
121 unsigned int pf;
122 unsigned int cksum;
123};
124
125struct extended_sigtable {
126 unsigned int count;
127 unsigned int cksum;
128 unsigned int reserved[3];
129 struct extended_signature sigs[0];
130};
131
132#define DEFAULT_UCODE_DATASIZE (2000)
133#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
134#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
135#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
136#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
137#define DWSIZE (sizeof(u32))
138#define get_totalsize(mc) \
139 (((struct microcode_intel *)mc)->hdr.totalsize ? \
140 ((struct microcode_intel *)mc)->hdr.totalsize : \
141 DEFAULT_UCODE_TOTALSIZE)
142
143#define get_datasize(mc) \
144 (((struct microcode_intel *)mc)->hdr.datasize ? \
145 ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
146
147#define sigmatch(s1, s2, p1, p2) \
148 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
149
150#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
151
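/*
 * An update image is, roughly: the 48-byte header above, then 'datasize'
 * bytes of encrypted update data, then (optionally, from Prescott on) an
 * extended signature table for updates shared by several processor
 * signatures.  Old-format images leave datasize/totalsize zero, hence the
 * fallbacks to DEFAULT_UCODE_DATASIZE/DEFAULT_UCODE_TOTALSIZE in the macros
 * above.  sigmatch() requires an exact signature match plus overlapping
 * platform flags; pf == 0 on both sides (very old CPUs) also counts as a
 * match.
 */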
152/* serialize access to the physical write to MSR 0x79 */
153static DEFINE_SPINLOCK(microcode_update_lock);
154
155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
156{
157 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
158 unsigned int val[2];
159
160 memset(csig, 0, sizeof(*csig));
161
162 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
163 cpu_has(c, X86_FEATURE_IA64)) {
164 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
165 "processor\n", cpu_num);
166 return -1;
167 }
168
169 csig->sig = cpuid_eax(0x00000001);
170
171 if ((c->x86_model >= 5) || (c->x86 > 6)) {
172 /* get processor flags from MSR 0x17 */
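		/*
		 * Bits 52:50 of IA32_PLATFORM_ID select which of the eight
		 * platform-flag bits this part corresponds to (see the SDM
		 * section referenced in the header comment).
		 */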
173 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
174 csig->pf = 1 << ((val[1] >> 18) & 7);
175 }
176
177 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
178 /* see notes above for revision 1.07. Apparent chip bug */
179 sync_core();
180 /* get the current revision from MSR 0x8B */
181 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
182 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
183 csig->sig, csig->pf, csig->rev);
184
185 return 0;
186}
187
188static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
189{
190 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
191}
192
193static inline int
194update_match_revision(struct microcode_header_intel *mc_header, int rev)
195{
196 return (mc_header->rev <= rev) ? 0 : 1;
197}
198
199static int microcode_sanity_check(void *mc)
200{
201 struct microcode_header_intel *mc_header = mc;
202 struct extended_sigtable *ext_header = NULL;
203 struct extended_signature *ext_sig;
204 unsigned long total_size, data_size, ext_table_size;
205 int sum, orig_sum, ext_sigcount = 0, i;
206
207 total_size = get_totalsize(mc_header);
208 data_size = get_datasize(mc_header);
209 if (data_size + MC_HEADER_SIZE > total_size) {
210 printk(KERN_ERR "microcode: error! "
211 "Bad data size in microcode data file\n");
212 return -EINVAL;
213 }
214
215 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
216 printk(KERN_ERR "microcode: error! "
217 "Unknown microcode update format\n");
218 return -EINVAL;
219 }
220 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
221 if (ext_table_size) {
222 if ((ext_table_size < EXT_HEADER_SIZE)
223 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
224 printk(KERN_ERR "microcode: error! "
225 "Small exttable size in microcode data file\n");
226 return -EINVAL;
227 }
228 ext_header = mc + MC_HEADER_SIZE + data_size;
229 if (ext_table_size != exttable_size(ext_header)) {
230 printk(KERN_ERR "microcode: error! "
231 "Bad exttable size in microcode data file\n");
232 return -EFAULT;
233 }
234 ext_sigcount = ext_header->count;
235 }
236
237 /* check extended table checksum */
238 if (ext_table_size) {
239 int ext_table_sum = 0;
240 int *ext_tablep = (int *)ext_header;
241
242 i = ext_table_size / DWSIZE;
243 while (i--)
244 ext_table_sum += ext_tablep[i];
245 if (ext_table_sum) {
246 printk(KERN_WARNING "microcode: aborting, "
247 "bad extended signature table checksum\n");
248 return -EINVAL;
249 }
250 }
251
252 /* calculate the checksum */
253 orig_sum = 0;
254 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
255 while (i--)
256 orig_sum += ((int *)mc)[i];
257 if (orig_sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n");
259 return -EINVAL;
260 }
261 if (!ext_table_size)
262 return 0;
263 /* check extended signature checksum */
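	/*
	 * Each extended entry carries its own checksum, chosen so that the
	 * whole image still sums to zero when that entry's sig/pf/cksum are
	 * substituted for the primary ones in the header; hence the
	 * subtract-and-add below.
	 */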
264 for (i = 0; i < ext_sigcount; i++) {
265 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
266 EXT_SIGNATURE_SIZE * i;
267 sum = orig_sum
268 - (mc_header->sig + mc_header->pf + mc_header->cksum)
269 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
270 if (sum) {
271 printk(KERN_ERR "microcode: aborting, bad checksum\n");
272 return -EINVAL;
273 }
274 }
275 return 0;
276}
277
278/*
279 * return 0 - no update found
280 * return 1 - found update
281 */
282static int
283get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
284{
285 struct microcode_header_intel *mc_header = mc;
286 struct extended_sigtable *ext_header;
287 unsigned long total_size = get_totalsize(mc_header);
288 int ext_sigcount, i;
289 struct extended_signature *ext_sig;
290
291 if (!update_match_revision(mc_header, rev))
292 return 0;
293
294 if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
295 return 1;
296
297 /* Look for ext. headers: */
298 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
299 return 0;
300
301 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
302 ext_sigcount = ext_header->count;
303 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
304
305 for (i = 0; i < ext_sigcount; i++) {
306 if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
307 return 1;
308 ext_sig++;
309 }
310 return 0;
311}
312
313static void apply_microcode(int cpu)
314{
315 unsigned long flags;
316 unsigned int val[2];
317 int cpu_num = raw_smp_processor_id();
318 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
319 struct microcode_intel *mc_intel = uci->mc;
320
321 /* We should bind the task to the CPU */
322 BUG_ON(cpu_num != cpu);
323
324 if (mc_intel == NULL)
325 return;
326
327 /* serialize access to the physical write to MSR 0x79 */
328 spin_lock_irqsave(&microcode_update_lock, flags);
329
330 /* write microcode via MSR 0x79 */
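	/*
	 * MSR_IA32_UCODE_WRITE takes the 64-bit linear address of the update
	 * data in edx:eax; the double ">> 16 >> 16" below extracts the upper
	 * half without shifting a 32-bit long by 32 on 32-bit kernels, where
	 * it simply yields 0.
	 */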
331 wrmsr(MSR_IA32_UCODE_WRITE,
332 (unsigned long) mc_intel->bits,
333 (unsigned long) mc_intel->bits >> 16 >> 16);
334 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
335
336 /* see notes above for revision 1.07. Apparent chip bug */
337 sync_core();
338
339 /* get the current revision from MSR 0x8B */
340 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
341
342 spin_unlock_irqrestore(&microcode_update_lock, flags);
343 if (val[1] != mc_intel->hdr.rev) {
344 printk(KERN_ERR "microcode: CPU%d update from revision "
345 "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]);
346 return;
347 }
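	/*
	 * The header date field is BCD mmddyyyy: month in bits 31:24, day in
	 * bits 23:16, year in bits 15:0, which is what the format string
	 * below unpacks as yyyy-mm-dd.
	 */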
348 printk(KERN_INFO "microcode: CPU%d updated from revision "
349 "0x%x to 0x%x, date = %04x-%02x-%02x \n",
350 cpu_num, uci->cpu_sig.rev, val[1],
351 mc_intel->hdr.date & 0xffff,
352 mc_intel->hdr.date >> 24,
353 (mc_intel->hdr.date >> 16) & 0xff);
354 uci->cpu_sig.rev = val[1];
355}
356
357static int generic_load_microcode(int cpu, void *data, size_t size,
358 int (*get_ucode_data)(void *, const void *, size_t))
359{
360 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
361 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
362 int new_rev = uci->cpu_sig.rev;
363 unsigned int leftover = size;
364
365 while (leftover) {
366 struct microcode_header_intel mc_header;
367 unsigned int mc_size;
368
369 if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
370 break;
371
372 mc_size = get_totalsize(&mc_header);
373 if (!mc_size || mc_size > leftover) {
374			printk(KERN_ERR "microcode: error! "
375 "Bad data in microcode data file\n");
376 break;
377 }
378
379 mc = vmalloc(mc_size);
380 if (!mc)
381 break;
382
383 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
384 microcode_sanity_check(mc) < 0) {
385 vfree(mc);
386 break;
387 }
388
389 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
390 if (new_mc)
391 vfree(new_mc);
392 new_rev = mc_header.rev;
393 new_mc = mc;
394 } else
395 vfree(mc);
396
397 ucode_ptr += mc_size;
398 leftover -= mc_size;
399 }
400
401 if (new_mc) {
402 if (!leftover) {
403 if (uci->mc)
404 vfree(uci->mc);
405 uci->mc = (struct microcode_intel *)new_mc;
406 pr_debug("microcode: CPU%d found a matching microcode update with"
407 " version 0x%x (current=0x%x)\n",
408 cpu, new_rev, uci->cpu_sig.rev);
409 } else
410 vfree(new_mc);
411 }
412
413 return (int)leftover;
414}
415
416static int get_ucode_fw(void *to, const void *from, size_t n)
417{
418 memcpy(to, from, n);
419 return 0;
420}
421
422static int request_microcode_fw(int cpu, struct device *device)
423{
424 char name[30];
425 struct cpuinfo_x86 *c = &cpu_data(cpu);
426 const struct firmware *firmware;
427 int ret;
428
429 /* We should bind the task to the CPU */
430 BUG_ON(cpu != raw_smp_processor_id());
431 sprintf(name, "intel-ucode/%02x-%02x-%02x",
432 c->x86, c->x86_model, c->x86_mask);
433 ret = request_firmware(&firmware, name, device);
434 if (ret) {
435 pr_debug("microcode: data file %s load failed\n", name);
436 return ret;
437 }
438
439 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
440 &get_ucode_fw);
441
442 release_firmware(firmware);
443
444 return ret;
445}
446
447static int get_ucode_user(void *to, const void *from, size_t n)
448{
449 return copy_from_user(to, from, n);
450}
451
452static int request_microcode_user(int cpu, const void __user *buf, size_t size)
453{
454 /* We should bind the task to the CPU */
455 BUG_ON(cpu != raw_smp_processor_id());
456
457 return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
458}
459
460static void microcode_fini_cpu(int cpu)
461{
462 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
463
464 vfree(uci->mc);
465 uci->mc = NULL;
466}
467
468struct microcode_ops microcode_intel_ops = {
469 .request_microcode_user = request_microcode_user,
470 .request_microcode_fw = request_microcode_fw,
471 .collect_cpu_info = collect_cpu_info,
472 .apply_microcode = apply_microcode,
473 .microcode_fini_cpu = microcode_fini_cpu,
474};
475
476struct microcode_ops * __init init_intel_microcode(void)
477{
478 return &microcode_intel_ops;
479}
480
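request_microcode_fw() above builds the firmware file name from the CPU's
family, model and stepping (c->x86, c->x86_model, c->x86_mask).  A minimal
userspace sketch of that naming scheme — not part of this patch, assuming
GCC's <cpuid.h> — might look like this:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int fam, model, stepping;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	stepping = eax & 0xf;
	model    = (eax >> 4) & 0xf;
	fam      = (eax >> 8) & 0xf;
	if (fam == 0xf)
		fam += (eax >> 20) & 0xff;		/* extended family */
	if (fam >= 0x6)
		model += ((eax >> 16) & 0xf) << 4;	/* extended model */

	printf("intel-ucode/%02x-%02x-%02x\n", fam, model, stepping);
	return 0;
}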
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index fdfdc550b366..efc2f361fe85 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -238,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
238 {} 238 {}
239}; 239};
240 240
241void __init check_enable_amd_mmconf_dmi(void) 241void __cpuinit check_enable_amd_mmconf_dmi(void)
242{ 242{
243 dmi_check_system(mmconf_dmi_table); 243 dmi_check_system(mmconf_dmi_table);
244} 244}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 6ae005ccaed8..f98f4e1dba09 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -49,7 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
49 return sum & 0xFF; 49 return sum & 0xFF;
50} 50}
51 51
52static void __cpuinit MP_processor_info(struct mpc_config_processor *m) 52static void __init MP_processor_info(struct mpc_config_processor *m)
53{ 53{
54 int apicid; 54 int apicid;
55 char *bootup_cpu = ""; 55 char *bootup_cpu = "";
@@ -83,7 +83,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
83 if (x86_quirks->mpc_oem_bus_info) 83 if (x86_quirks->mpc_oem_bus_info)
84 x86_quirks->mpc_oem_bus_info(m, str); 84 x86_quirks->mpc_oem_bus_info(m, str);
85 else 85 else
86 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); 86 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
87 87
88#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
89 if (m->mpc_busid >= MAX_MP_BUSSES) { 89 if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -154,7 +154,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
154 154
155static void print_MP_intsrc_info(struct mpc_config_intsrc *m) 155static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
156{ 156{
157 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," 157 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
158 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 158 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
159 m->mpc_irqtype, m->mpc_irqflag & 3, 159 m->mpc_irqtype, m->mpc_irqflag & 3,
160 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, 160 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
@@ -163,7 +163,7 @@ static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
163 163
164static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) 164static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
165{ 165{
166 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," 166 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
167 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 167 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
168 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, 168 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
169 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, 169 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
@@ -235,7 +235,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
235 235
236static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) 236static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
237{ 237{
238 printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," 238 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
239 " IRQ %02x, APIC ID %x, APIC LINT %02x\n", 239 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
240 m->mpc_irqtype, m->mpc_irqflag & 3, 240 m->mpc_irqtype, m->mpc_irqflag & 3,
241 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, 241 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
@@ -397,7 +397,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
397 generic_bigsmp_probe(); 397 generic_bigsmp_probe();
398#endif 398#endif
399 399
400#ifdef CONFIG_X86_32
400 setup_apic_routing(); 401 setup_apic_routing();
402#endif
401 if (!num_processors) 403 if (!num_processors)
402 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 404 printk(KERN_ERR "MPTABLE: no processors registered!\n");
403 return num_processors; 405 return num_processors;
@@ -484,7 +486,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
484} 486}
485 487
486 488
487static void construct_ioapic_table(int mpc_default_type) 489static void __init construct_ioapic_table(int mpc_default_type)
488{ 490{
489 struct mpc_config_ioapic ioapic; 491 struct mpc_config_ioapic ioapic;
490 struct mpc_config_bus bus; 492 struct mpc_config_bus bus;
@@ -529,7 +531,7 @@ static void construct_ioapic_table(int mpc_default_type)
529 construct_default_ioirq_mptable(mpc_default_type); 531 construct_default_ioirq_mptable(mpc_default_type);
530} 532}
531#else 533#else
532static inline void construct_ioapic_table(int mpc_default_type) { } 534static inline void __init construct_ioapic_table(int mpc_default_type) { }
533#endif 535#endif
534 536
535static inline void __init construct_default_ISA_mptable(int mpc_default_type) 537static inline void __init construct_default_ISA_mptable(int mpc_default_type)
@@ -695,7 +697,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
695 unsigned int *bp = phys_to_virt(base); 697 unsigned int *bp = phys_to_virt(base);
696 struct intel_mp_floating *mpf; 698 struct intel_mp_floating *mpf;
697 699
698 printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); 700 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
701 bp, length);
699 BUILD_BUG_ON(sizeof(*mpf) != 16); 702 BUILD_BUG_ON(sizeof(*mpf) != 16);
700 703
701 while (length > 0) { 704 while (length > 0) {
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 9fd809552447..2e2af5d18191 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -72,21 +72,28 @@ static ssize_t msr_read(struct file *file, char __user *buf,
72 u32 data[2]; 72 u32 data[2];
73 u32 reg = *ppos; 73 u32 reg = *ppos;
74 int cpu = iminor(file->f_path.dentry->d_inode); 74 int cpu = iminor(file->f_path.dentry->d_inode);
75 int err; 75 int err = 0;
76 ssize_t bytes = 0;
76 77
77 if (count % 8) 78 if (count % 8)
78 return -EINVAL; /* Invalid chunk size */ 79 return -EINVAL; /* Invalid chunk size */
79 80
80 for (; count; count -= 8) { 81 for (; count; count -= 8) {
81 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); 82 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
82 if (err) 83 if (err) {
83 return -EIO; 84 if (err == -EFAULT) /* Fix idiotic error code */
84 if (copy_to_user(tmp, &data, 8)) 85 err = -EIO;
85 return -EFAULT; 86 break;
87 }
88 if (copy_to_user(tmp, &data, 8)) {
89 err = -EFAULT;
90 break;
91 }
86 tmp += 2; 92 tmp += 2;
93 bytes += 8;
87 } 94 }
88 95
89 return ((char __user *)tmp) - buf; 96 return bytes ? bytes : err;
90} 97}
91 98
92static ssize_t msr_write(struct file *file, const char __user *buf, 99static ssize_t msr_write(struct file *file, const char __user *buf,
@@ -96,21 +103,28 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
96 u32 data[2]; 103 u32 data[2];
97 u32 reg = *ppos; 104 u32 reg = *ppos;
98 int cpu = iminor(file->f_path.dentry->d_inode); 105 int cpu = iminor(file->f_path.dentry->d_inode);
99 int err; 106 int err = 0;
107 ssize_t bytes = 0;
100 108
101 if (count % 8) 109 if (count % 8)
102 return -EINVAL; /* Invalid chunk size */ 110 return -EINVAL; /* Invalid chunk size */
103 111
104 for (; count; count -= 8) { 112 for (; count; count -= 8) {
105 if (copy_from_user(&data, tmp, 8)) 113 if (copy_from_user(&data, tmp, 8)) {
106 return -EFAULT; 114 err = -EFAULT;
115 break;
116 }
107 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); 117 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
108 if (err) 118 if (err) {
109 return -EIO; 119 if (err == -EFAULT) /* Fix idiotic error code */
120 err = -EIO;
121 break;
122 }
110 tmp += 2; 123 tmp += 2;
124 bytes += 8;
111 } 125 }
112 126
113 return ((char __user *)tmp) - buf; 127 return bytes ? bytes : err;
114} 128}
115 129
116static int msr_open(struct inode *inode, struct file *file) 130static int msr_open(struct inode *inode, struct file *file)
@@ -131,7 +145,7 @@ static int msr_open(struct inode *inode, struct file *file)
131 ret = -EIO; /* MSR not supported */ 145 ret = -EIO; /* MSR not supported */
132out: 146out:
133 unlock_kernel(); 147 unlock_kernel();
134 return 0; 148 return ret;
135} 149}
136 150
137/* 151/*
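The hunks above make msr_read()/msr_write() return the bytes actually
transferred instead of bailing out on the first error.  A userspace sketch of
the read path (not from this patch): reads must be a multiple of 8 bytes and
the MSR number is the file offset; the device node is assumed present and is
normally root-only.

#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	/* 0x8B is IA32_UCODE_REV, the register the microcode driver reads. */
	if (pread(fd, &val, sizeof(val), 0x8B) != sizeof(val)) {
		perror("pread");
		close(fd);
		return 1;
	}
	printf("MSR 0x8B = 0x%016" PRIx64 "\n", val);
	close(fd);
	return 0;
}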
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ac6d51222e7d..2c97f07f1c2c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -114,6 +114,23 @@ static __init void nmi_cpu_busy(void *data)
114} 114}
115#endif 115#endif
116 116
117static void report_broken_nmi(int cpu, int *prev_nmi_count)
118{
119 printk(KERN_CONT "\n");
120
121 printk(KERN_WARNING
122 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
123 cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
124
125 printk(KERN_WARNING
126 "Please report this to bugzilla.kernel.org,\n");
127 printk(KERN_WARNING
128 "and attach the output of the 'dmesg' command.\n");
129
130 per_cpu(wd_enabled, cpu) = 0;
131 atomic_dec(&nmi_active);
132}
133
117int __init check_nmi_watchdog(void) 134int __init check_nmi_watchdog(void)
118{ 135{
119 unsigned int *prev_nmi_count; 136 unsigned int *prev_nmi_count;
@@ -141,15 +158,8 @@ int __init check_nmi_watchdog(void)
141 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
142 if (!per_cpu(wd_enabled, cpu)) 159 if (!per_cpu(wd_enabled, cpu))
143 continue; 160 continue;
144 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { 161 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
145 printk(KERN_WARNING "WARNING: CPU#%d: NMI " 162 report_broken_nmi(cpu, prev_nmi_count);
146 "appears to be stuck (%d->%d)!\n",
147 cpu,
148 prev_nmi_count[cpu],
149 get_nmi_count(cpu));
150 per_cpu(wd_enabled, cpu) = 0;
151 atomic_dec(&nmi_active);
152 }
153 } 163 }
154 endflag = 1; 164 endflag = 1;
155 if (!atomic_read(&nmi_active)) { 165 if (!atomic_read(&nmi_active)) {
@@ -289,6 +299,15 @@ void acpi_nmi_disable(void)
289 on_each_cpu(__acpi_nmi_disable, NULL, 1); 299 on_each_cpu(__acpi_nmi_disable, NULL, 1);
290} 300}
291 301
302/*
303 * This function is called as soon the LAPIC NMI watchdog driver has everything
304 * in place and it's ready to check if the NMIs belong to the NMI watchdog
305 */
306void cpu_nmi_set_wd_enabled(void)
307{
308 __get_cpu_var(wd_enabled) = 1;
309}
310
292void setup_apic_nmi_watchdog(void *unused) 311void setup_apic_nmi_watchdog(void *unused)
293{ 312{
294 if (__get_cpu_var(wd_enabled)) 313 if (__get_cpu_var(wd_enabled))
@@ -301,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
301 320
302 switch (nmi_watchdog) { 321 switch (nmi_watchdog) {
303 case NMI_LOCAL_APIC: 322 case NMI_LOCAL_APIC:
304 /* enable it before to avoid race with handler */
305 __get_cpu_var(wd_enabled) = 1;
306 if (lapic_watchdog_init(nmi_hz) < 0) { 323 if (lapic_watchdog_init(nmi_hz) < 0) {
307 __get_cpu_var(wd_enabled) = 0; 324 __get_cpu_var(wd_enabled) = 0;
308 return; 325 return;
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index b8c45610b20a..4caff39078e0 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -73,7 +73,7 @@ static void __init smp_dump_qct(void)
73} 73}
74 74
75 75
76void __init numaq_tsc_disable(void) 76void __cpuinit numaq_tsc_disable(void)
77{ 77{
78 if (!found_numaq) 78 if (!found_numaq)
79 return; 79 return;
@@ -229,6 +229,12 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
229 } 229 }
230} 230}
231 231
232static int __init numaq_setup_ioapic_ids(void)
233{
234 /* so can skip it */
235 return 1;
236}
237
232static struct x86_quirks numaq_x86_quirks __initdata = { 238static struct x86_quirks numaq_x86_quirks __initdata = {
233 .arch_pre_time_init = numaq_pre_time_init, 239 .arch_pre_time_init = numaq_pre_time_init,
234 .arch_time_init = NULL, 240 .arch_time_init = NULL,
@@ -243,6 +249,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
243 .mpc_oem_bus_info = mpc_oem_bus_info, 249 .mpc_oem_bus_info = mpc_oem_bus_info,
244 .mpc_oem_pci_bus = mpc_oem_pci_bus, 250 .mpc_oem_pci_bus = mpc_oem_pci_bus,
245 .smp_read_mpc_oem = smp_read_mpc_oem, 251 .smp_read_mpc_oem = smp_read_mpc_oem,
252 .setup_ioapic_ids = numaq_setup_ioapic_ids,
246}; 253};
247 254
248void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, 255void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e6672274807..7a13fac63a1f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190static void __init platform_detect(void) 190static void __init platform_detect(void)
191{ 191{
192 size_t propsize; 192 size_t propsize;
193 u32 rev; 193 __be32 rev;
194 194
195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
196 &propsize) || propsize != 4) { 196 &propsize) || propsize != 4) {
197 printk(KERN_ERR "ofw: getprop call failed!\n"); 197 printk(KERN_ERR "ofw: getprop call failed!\n");
198 rev = 0; 198 rev = cpu_to_be32(0);
199 } 199 }
200 olpc_platform_info.boardrev = be32_to_cpu(rev); 200 olpc_platform_info.boardrev = be32_to_cpu(rev);
201} 201}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
203static void __init platform_detect(void) 203static void __init platform_detect(void)
204{ 204{
205 /* stopgap until OFW support is added to the kernel */ 205 /* stopgap until OFW support is added to the kernel */
206 olpc_platform_info.boardrev = be32_to_cpu(0xc2); 206 olpc_platform_info.boardrev = 0xc2;
207} 207}
208#endif 208#endif
209 209
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 000000000000..0e9f1982b1dd
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,37 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/spinlock.h>
6#include <linux/module.h>
7
8#include <asm/paravirt.h>
9
10static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
11{
12 __raw_spin_lock(lock);
13}
14
15struct pv_lock_ops pv_lock_ops = {
16#ifdef CONFIG_SMP
17 .spin_is_locked = __ticket_spin_is_locked,
18 .spin_is_contended = __ticket_spin_is_contended,
19
20 .spin_lock = __ticket_spin_lock,
21 .spin_lock_flags = default_spin_lock_flags,
22 .spin_trylock = __ticket_spin_trylock,
23 .spin_unlock = __ticket_spin_unlock,
24#endif
25};
26EXPORT_SYMBOL(pv_lock_ops);
27
28void __init paravirt_use_bytelocks(void)
29{
30#ifdef CONFIG_SMP
31 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
32 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
33 pv_lock_ops.spin_lock = __byte_spin_lock;
34 pv_lock_ops.spin_trylock = __byte_spin_trylock;
35 pv_lock_ops.spin_unlock = __byte_spin_unlock;
36#endif
37}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 94da4d52d798..e4c8fb608873 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
268 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
269} 269}
270 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
282struct pv_info pv_info = { 271struct pv_info pv_info = {
283 .name = "bare hardware", 272 .name = "bare hardware",
284 .paravirt_enabled = 0, 273 .paravirt_enabled = 0,
@@ -330,6 +319,7 @@ struct pv_cpu_ops pv_cpu_ops = {
330#endif 319#endif
331 .wbinvd = native_wbinvd, 320 .wbinvd = native_wbinvd,
332 .read_msr = native_read_msr_safe, 321 .read_msr = native_read_msr_safe,
322 .read_msr_amd = native_read_msr_amd_safe,
333 .write_msr = native_write_msr_safe, 323 .write_msr = native_write_msr_safe,
334 .read_tsc = native_read_tsc, 324 .read_tsc = native_read_tsc,
335 .read_pmc = native_read_pmc, 325 .read_pmc = native_read_pmc,
@@ -348,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = {
348 .write_ldt_entry = native_write_ldt_entry, 338 .write_ldt_entry = native_write_ldt_entry,
349 .write_gdt_entry = native_write_gdt_entry, 339 .write_gdt_entry = native_write_gdt_entry,
350 .write_idt_entry = native_write_idt_entry, 340 .write_idt_entry = native_write_idt_entry,
341
342 .alloc_ldt = paravirt_nop,
343 .free_ldt = paravirt_nop,
344
351 .load_sp0 = native_load_sp0, 345 .load_sp0 = native_load_sp0,
352 346
353#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 347#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -373,8 +367,6 @@ struct pv_cpu_ops pv_cpu_ops = {
373 367
374struct pv_apic_ops pv_apic_ops = { 368struct pv_apic_ops pv_apic_ops = {
375#ifdef CONFIG_X86_LOCAL_APIC 369#ifdef CONFIG_X86_LOCAL_APIC
376 .apic_write = native_apic_write,
377 .apic_read = native_apic_read,
378 .setup_boot_clock = setup_boot_APIC_clock, 370 .setup_boot_clock = setup_boot_APIC_clock,
379 .setup_secondary_clock = setup_secondary_APIC_clock, 371 .setup_secondary_clock = setup_secondary_APIC_clock,
380 .startup_ipi_hook = paravirt_nop, 372 .startup_ipi_hook = paravirt_nop,
@@ -461,18 +453,6 @@ struct pv_mmu_ops pv_mmu_ops = {
461 .set_fixmap = native_set_fixmap, 453 .set_fixmap = native_set_fixmap,
462}; 454};
463 455
464struct pv_lock_ops pv_lock_ops = {
465#ifdef CONFIG_SMP
466 .spin_is_locked = __ticket_spin_is_locked,
467 .spin_is_contended = __ticket_spin_is_contended,
468
469 .spin_lock = __ticket_spin_lock,
470 .spin_trylock = __ticket_spin_trylock,
471 .spin_unlock = __ticket_spin_unlock,
472#endif
473};
474EXPORT_SYMBOL_GPL(pv_lock_ops);
475
476EXPORT_SYMBOL_GPL(pv_time_ops); 456EXPORT_SYMBOL_GPL(pv_time_ops);
477EXPORT_SYMBOL (pv_cpu_ops); 457EXPORT_SYMBOL (pv_cpu_ops);
478EXPORT_SYMBOL (pv_mmu_ops); 458EXPORT_SYMBOL (pv_mmu_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 58262218781b..9fe644f4861d 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 start = start_##ops##_##x; \ 23 start = start_##ops##_##x; \
24 end = end_##ops##_##x; \ 24 end = end_##ops##_##x; \
25 goto patch_site 25 goto patch_site
26 switch(type) { 26 switch (type) {
27 PATCH_SITE(pv_irq_ops, irq_disable); 27 PATCH_SITE(pv_irq_ops, irq_disable);
28 PATCH_SITE(pv_irq_ops, irq_enable); 28 PATCH_SITE(pv_irq_ops, irq_enable);
29 PATCH_SITE(pv_irq_ops, restore_fl); 29 PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 19e7fc7c2c4f..080d1d27f37a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -37,6 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/scatterlist.h> 38#include <linux/scatterlist.h>
39#include <linux/iommu-helper.h> 39#include <linux/iommu-helper.h>
40
40#include <asm/iommu.h> 41#include <asm/iommu.h>
41#include <asm/calgary.h> 42#include <asm/calgary.h>
42#include <asm/tce.h> 43#include <asm/tce.h>
@@ -260,7 +261,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
260 badbit, tbl, start_addr, npages); 261 badbit, tbl, start_addr, npages);
261 } 262 }
262 263
263 set_bit_string(tbl->it_map, index, npages); 264 iommu_area_reserve(tbl->it_map, index, npages);
264 265
265 spin_unlock_irqrestore(&tbl->it_lock, flags); 266 spin_unlock_irqrestore(&tbl->it_lock, flags);
266} 267}
@@ -342,9 +343,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
342 /* were we called with bad_dma_address? */ 343 /* were we called with bad_dma_address? */
343 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 344 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
344 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 345 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
345 printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " 346 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
346 "address 0x%Lx\n", dma_addr); 347 "address 0x%Lx\n", dma_addr);
347 WARN_ON(1);
348 return; 348 return;
349 } 349 }
350 350
@@ -413,22 +413,6 @@ static void calgary_unmap_sg(struct device *dev,
413 } 413 }
414} 414}
415 415
416static int calgary_nontranslate_map_sg(struct device* dev,
417 struct scatterlist *sg, int nelems, int direction)
418{
419 struct scatterlist *s;
420 int i;
421
422 for_each_sg(sg, s, nelems, i) {
423 struct page *p = sg_page(s);
424
425 BUG_ON(!p);
426 s->dma_address = virt_to_bus(sg_virt(s));
427 s->dma_length = s->length;
428 }
429 return nelems;
430}
431
432static int calgary_map_sg(struct device *dev, struct scatterlist *sg, 416static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
433 int nelems, int direction) 417 int nelems, int direction)
434{ 418{
@@ -439,9 +423,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
439 unsigned long entry; 423 unsigned long entry;
440 int i; 424 int i;
441 425
442 if (!translation_enabled(tbl))
443 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
444
445 for_each_sg(sg, s, nelems, i) { 426 for_each_sg(sg, s, nelems, i) {
446 BUG_ON(!sg_page(s)); 427 BUG_ON(!sg_page(s));
447 428
@@ -477,7 +458,6 @@ error:
477static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, 458static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
478 size_t size, int direction) 459 size_t size, int direction)
479{ 460{
480 dma_addr_t dma_handle = bad_dma_address;
481 void *vaddr = phys_to_virt(paddr); 461 void *vaddr = phys_to_virt(paddr);
482 unsigned long uaddr; 462 unsigned long uaddr;
483 unsigned int npages; 463 unsigned int npages;
@@ -486,12 +466,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
486 uaddr = (unsigned long)vaddr; 466 uaddr = (unsigned long)vaddr;
487 npages = num_dma_pages(uaddr, size); 467 npages = num_dma_pages(uaddr, size);
488 468
489 if (translation_enabled(tbl)) 469 return iommu_alloc(dev, tbl, vaddr, npages, direction);
490 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
491 else
492 dma_handle = virt_to_bus(vaddr);
493
494 return dma_handle;
495} 470}
496 471
497static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, 472static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -500,9 +475,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
500 struct iommu_table *tbl = find_iommu_table(dev); 475 struct iommu_table *tbl = find_iommu_table(dev);
501 unsigned int npages; 476 unsigned int npages;
502 477
503 if (!translation_enabled(tbl))
504 return;
505
506 npages = num_dma_pages(dma_handle, size); 478 npages = num_dma_pages(dma_handle, size);
507 iommu_free(tbl, dma_handle, npages); 479 iommu_free(tbl, dma_handle, npages);
508} 480}
@@ -519,24 +491,20 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
519 npages = size >> PAGE_SHIFT; 491 npages = size >> PAGE_SHIFT;
520 order = get_order(size); 492 order = get_order(size);
521 493
494 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
495
522 /* alloc enough pages (and possibly more) */ 496 /* alloc enough pages (and possibly more) */
523 ret = (void *)__get_free_pages(flag, order); 497 ret = (void *)__get_free_pages(flag, order);
524 if (!ret) 498 if (!ret)
525 goto error; 499 goto error;
526 memset(ret, 0, size); 500 memset(ret, 0, size);
527 501
528 if (translation_enabled(tbl)) { 502 /* set up tces to cover the allocated range */
529 /* set up tces to cover the allocated range */ 503 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
530 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 504 if (mapping == bad_dma_address)
531 if (mapping == bad_dma_address) 505 goto free;
532 goto free; 506 *dma_handle = mapping;
533
534 *dma_handle = mapping;
535 } else /* non translated slot */
536 *dma_handle = virt_to_bus(ret);
537
538 return ret; 507 return ret;
539
540free: 508free:
541 free_pages((unsigned long)ret, get_order(size)); 509 free_pages((unsigned long)ret, get_order(size));
542 ret = NULL; 510 ret = NULL;
@@ -544,8 +512,22 @@ error:
544 return ret; 512 return ret;
545} 513}
546 514
547static const struct dma_mapping_ops calgary_dma_ops = { 515static void calgary_free_coherent(struct device *dev, size_t size,
516 void *vaddr, dma_addr_t dma_handle)
517{
518 unsigned int npages;
519 struct iommu_table *tbl = find_iommu_table(dev);
520
521 size = PAGE_ALIGN(size);
522 npages = size >> PAGE_SHIFT;
523
524 iommu_free(tbl, dma_handle, npages);
525 free_pages((unsigned long)vaddr, get_order(size));
526}
527
528static struct dma_mapping_ops calgary_dma_ops = {
548 .alloc_coherent = calgary_alloc_coherent, 529 .alloc_coherent = calgary_alloc_coherent,
530 .free_coherent = calgary_free_coherent,
549 .map_single = calgary_map_single, 531 .map_single = calgary_map_single,
550 .unmap_single = calgary_unmap_single, 532 .unmap_single = calgary_unmap_single,
551 .map_sg = calgary_map_sg, 533 .map_sg = calgary_map_sg,
@@ -1241,6 +1223,16 @@ static int __init calgary_init(void)
1241 goto error; 1223 goto error;
1242 } while (1); 1224 } while (1);
1243 1225
1226 dev = NULL;
1227 for_each_pci_dev(dev) {
1228 struct iommu_table *tbl;
1229
1230 tbl = find_iommu_table(&dev->dev);
1231
1232 if (translation_enabled(tbl))
1233 dev->dev.archdata.dma_ops = &calgary_dma_ops;
1234 }
1235
1244 return ret; 1236 return ret;
1245 1237
1246error: 1238error:
@@ -1262,6 +1254,7 @@ error:
1262 calgary_disable_translation(dev); 1254 calgary_disable_translation(dev);
1263 calgary_free_bus(dev); 1255 calgary_free_bus(dev);
1264 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ 1256 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1257 dev->dev.archdata.dma_ops = NULL;
1265 } while (1); 1258 } while (1);
1266 1259
1267 return ret; 1260 return ret;
@@ -1291,13 +1284,15 @@ static inline int __init determine_tce_table_size(u64 ram)
1291static int __init build_detail_arrays(void) 1284static int __init build_detail_arrays(void)
1292{ 1285{
1293 unsigned long ptr; 1286 unsigned long ptr;
1294 int i, scal_detail_size, rio_detail_size; 1287 unsigned numnodes, i;
1288 int scal_detail_size, rio_detail_size;
1295 1289
1296 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ 1290 numnodes = rio_table_hdr->num_scal_dev;
1291 if (numnodes > MAX_NUMNODES){
1297 printk(KERN_WARNING 1292 printk(KERN_WARNING
1298 "Calgary: MAX_NUMNODES too low! Defined as %d, " 1293 "Calgary: MAX_NUMNODES too low! Defined as %d, "
1299 "but system has %d nodes.\n", 1294 "but system has %d nodes.\n",
1300 MAX_NUMNODES, rio_table_hdr->num_scal_dev); 1295 MAX_NUMNODES, numnodes);
1301 return -ENODEV; 1296 return -ENODEV;
1302 } 1297 }
1303 1298
@@ -1318,8 +1313,7 @@ static int __init build_detail_arrays(void)
1318 } 1313 }
1319 1314
1320 ptr = ((unsigned long)rio_table_hdr) + 3; 1315 ptr = ((unsigned long)rio_table_hdr) + 3;
1321 for (i = 0; i < rio_table_hdr->num_scal_dev; 1316 for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
1322 i++, ptr += scal_detail_size)
1323 scal_devs[i] = (struct scal_detail *)ptr; 1317 scal_devs[i] = (struct scal_detail *)ptr;
1324 1318
1325 for (i = 0; i < rio_table_hdr->num_rio_dev; 1319 for (i = 0; i < rio_table_hdr->num_rio_dev;
@@ -1372,7 +1366,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1372 * Function for kdump case. Get the tce tables from first kernel 1366 * Function for kdump case. Get the tce tables from first kernel
1373 * by reading the contents of the base address register of calgary iommu    1367
1374 */ 1368 */
1375static void get_tce_space_from_tar() 1369static void __init get_tce_space_from_tar(void)
1376{ 1370{
1377 int bus; 1371 int bus;
1378 void __iomem *target; 1372 void __iomem *target;
@@ -1503,6 +1497,10 @@ void __init detect_calgary(void)
1503 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1497 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1504 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1498 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1505 debugging ? "enabled" : "disabled"); 1499 debugging ? "enabled" : "disabled");
1500
1501 /* swiotlb for devices that aren't behind the Calgary. */
1502 if (max_pfn > MAX_DMA32_PFN)
1503 swiotlb = 1;
1506 } 1504 }
1507 return; 1505 return;
1508 1506
@@ -1519,7 +1517,7 @@ int __init calgary_iommu_init(void)
1519{ 1517{
1520 int ret; 1518 int ret;
1521 1519
1522 if (no_iommu || swiotlb) 1520 if (no_iommu || (swiotlb && !calgary_detected))
1523 return -ENODEV; 1521 return -ENODEV;
1524 1522
1525 if (!calgary_detected) 1523 if (!calgary_detected)
@@ -1532,15 +1530,14 @@ int __init calgary_iommu_init(void)
1532 if (ret) { 1530 if (ret) {
1533 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " 1531 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1534 "falling back to no_iommu\n", ret); 1532 "falling back to no_iommu\n", ret);
1535 if (max_pfn > MAX_DMA32_PFN)
1536 printk(KERN_ERR "WARNING more than 4GB of memory, "
1537 "32bit PCI may malfunction.\n");
1538 return ret; 1533 return ret;
1539 } 1534 }
1540 1535
1541 force_iommu = 1; 1536 force_iommu = 1;
1542 bad_dma_address = 0x0; 1537 bad_dma_address = 0x0;
1543 dma_ops = &calgary_dma_ops; 1538 /* dma_ops is set to swiotlb or nommu */
1539 if (!dma_ops)
1540 dma_ops = &nommu_dma_ops;
1544 1541
1545 return 0; 1542 return 0;
1546} 1543}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index cbecb05551bb..0a3824e837b4 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,7 +11,7 @@
11 11
12static int forbid_dac __read_mostly; 12static int forbid_dac __read_mostly;
13 13
14const struct dma_mapping_ops *dma_ops; 14struct dma_mapping_ops *dma_ops;
15EXPORT_SYMBOL(dma_ops); 15EXPORT_SYMBOL(dma_ops);
16 16
17static int iommu_sac_force __read_mostly; 17static int iommu_sac_force __read_mostly;
@@ -41,11 +41,12 @@ EXPORT_SYMBOL(bad_dma_address);
41/* Dummy device used for NULL arguments (normally ISA). Better would 41/* Dummy device used for NULL arguments (normally ISA). Better would
42 be probably a smaller DMA mask, but this is bug-to-bug compatible 42 be probably a smaller DMA mask, but this is bug-to-bug compatible
43 to older i386. */ 43 to older i386. */
44struct device fallback_dev = { 44struct device x86_dma_fallback_dev = {
45 .bus_id = "fallback device", 45 .bus_id = "fallback device",
46 .coherent_dma_mask = DMA_32BIT_MASK, 46 .coherent_dma_mask = DMA_32BIT_MASK,
47 .dma_mask = &fallback_dev.coherent_dma_mask, 47 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
48}; 48};
49EXPORT_SYMBOL(x86_dma_fallback_dev);
49 50
50int dma_set_mask(struct device *dev, u64 mask) 51int dma_set_mask(struct device *dev, u64 mask)
51{ 52{
@@ -82,7 +83,7 @@ void __init dma32_reserve_bootmem(void)
82 * using 512M as goal 83 * using 512M as goal
83 */ 84 */
84 align = 64ULL<<20; 85 align = 64ULL<<20;
85 size = round_up(dma32_bootmem_size, align); 86 size = roundup(dma32_bootmem_size, align);
86 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 87 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
87 512ULL<<20); 88 512ULL<<20);
88 if (dma32_bootmem_ptr) 89 if (dma32_bootmem_ptr)
@@ -123,8 +124,47 @@ void __init pci_iommu_alloc(void)
123 124
124 pci_swiotlb_init(); 125 pci_swiotlb_init();
125} 126}
127
128unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
129{
130 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
131
132 return size >> PAGE_SHIFT;
133}
134EXPORT_SYMBOL(iommu_num_pages);
126#endif 135#endif
127 136
137void *dma_generic_alloc_coherent(struct device *dev, size_t size,
138 dma_addr_t *dma_addr, gfp_t flag)
139{
140 unsigned long dma_mask;
141 struct page *page;
142 dma_addr_t addr;
143
144 dma_mask = dma_alloc_coherent_mask(dev, flag);
145
146 flag |= __GFP_ZERO;
147again:
148 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
149 if (!page)
150 return NULL;
151
152 addr = page_to_phys(page);
153 if (!is_buffer_dma_capable(dma_mask, addr, size)) {
154 __free_pages(page, get_order(size));
155
156 if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
157 flag = (flag & ~GFP_DMA32) | GFP_DMA;
158 goto again;
159 }
160
161 return NULL;
162 }
163
164 *dma_addr = addr;
165 return page_address(page);
166}
167
128/* 168/*
129 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 169 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
130 * documentation. 170 * documentation.
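The iommu_num_pages() helper added in the hunk above rounds a possibly
unaligned (addr, len) range up to whole pages.  A standalone sketch of the
same arithmetic, with PAGE_SIZE assumed to be 4 KiB:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define roundup(x, y)	((((x) + (y) - 1) / (y)) * (y))

static unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
{
	unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);

	return size >> PAGE_SHIFT;
}

int main(void)
{
	/* 0x20 bytes starting 0xff0 bytes into a page straddle two pages. */
	printf("%lu\n", iommu_num_pages(0x1ff0, 0x20));	/* prints 2 */
	return 0;
}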
@@ -192,126 +232,10 @@ static __init int iommu_setup(char *p)
192} 232}
193early_param("iommu", iommu_setup); 233early_param("iommu", iommu_setup);
194 234
195#ifdef CONFIG_X86_32
196int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
197 dma_addr_t device_addr, size_t size, int flags)
198{
199 void __iomem *mem_base = NULL;
200 int pages = size >> PAGE_SHIFT;
201 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
202
203 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
204 goto out;
205 if (!size)
206 goto out;
207 if (dev->dma_mem)
208 goto out;
209
210 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
211
212 mem_base = ioremap(bus_addr, size);
213 if (!mem_base)
214 goto out;
215
216 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
217 if (!dev->dma_mem)
218 goto out;
219 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
220 if (!dev->dma_mem->bitmap)
221 goto free1_out;
222
223 dev->dma_mem->virt_base = mem_base;
224 dev->dma_mem->device_base = device_addr;
225 dev->dma_mem->size = pages;
226 dev->dma_mem->flags = flags;
227
228 if (flags & DMA_MEMORY_MAP)
229 return DMA_MEMORY_MAP;
230
231 return DMA_MEMORY_IO;
232
233 free1_out:
234 kfree(dev->dma_mem);
235 out:
236 if (mem_base)
237 iounmap(mem_base);
238 return 0;
239}
240EXPORT_SYMBOL(dma_declare_coherent_memory);
241
242void dma_release_declared_memory(struct device *dev)
243{
244 struct dma_coherent_mem *mem = dev->dma_mem;
245
246 if (!mem)
247 return;
248 dev->dma_mem = NULL;
249 iounmap(mem->virt_base);
250 kfree(mem->bitmap);
251 kfree(mem);
252}
253EXPORT_SYMBOL(dma_release_declared_memory);
254
255void *dma_mark_declared_memory_occupied(struct device *dev,
256 dma_addr_t device_addr, size_t size)
257{
258 struct dma_coherent_mem *mem = dev->dma_mem;
259 int pos, err;
260 int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
261
262 pages >>= PAGE_SHIFT;
263
264 if (!mem)
265 return ERR_PTR(-EINVAL);
266
267 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
268 err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
269 if (err != 0)
270 return ERR_PTR(err);
271 return mem->virt_base + (pos << PAGE_SHIFT);
272}
273EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
274
275static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
276 dma_addr_t *dma_handle, void **ret)
277{
278 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
279 int order = get_order(size);
280
281 if (mem) {
282 int page = bitmap_find_free_region(mem->bitmap, mem->size,
283 order);
284 if (page >= 0) {
285 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
286 *ret = mem->virt_base + (page << PAGE_SHIFT);
287 memset(*ret, 0, size);
288 }
289 if (mem->flags & DMA_MEMORY_EXCLUSIVE)
290 *ret = NULL;
291 }
292 return (mem != NULL);
293}
294
295static int dma_release_coherent(struct device *dev, int order, void *vaddr)
296{
297 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
298
299 if (mem && vaddr >= mem->virt_base && vaddr <
300 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
301 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
302
303 bitmap_release_region(mem->bitmap, page, order);
304 return 1;
305 }
306 return 0;
307}
308#else
309#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
310#define dma_release_coherent(dev, order, vaddr) (0)
311#endif /* CONFIG_X86_32 */
312
313int dma_supported(struct device *dev, u64 mask) 235int dma_supported(struct device *dev, u64 mask)
314{ 236{
237 struct dma_mapping_ops *ops = get_dma_ops(dev);
238
315#ifdef CONFIG_PCI 239#ifdef CONFIG_PCI
316 if (mask > 0xffffffff && forbid_dac > 0) { 240 if (mask > 0xffffffff && forbid_dac > 0) {
317 dev_info(dev, "PCI: Disallowing DAC for device\n"); 241 dev_info(dev, "PCI: Disallowing DAC for device\n");
@@ -319,8 +243,8 @@ int dma_supported(struct device *dev, u64 mask)
319 } 243 }
320#endif 244#endif
321 245
322 if (dma_ops->dma_supported) 246 if (ops->dma_supported)
323 return dma_ops->dma_supported(dev, mask); 247 return ops->dma_supported(dev, mask);
324 248
325 /* Copied from i386. Doesn't make much sense, because it will 249 /* Copied from i386. Doesn't make much sense, because it will
326 only work for pci_alloc_coherent. 250 only work for pci_alloc_coherent.
@@ -349,144 +273,6 @@ int dma_supported(struct device *dev, u64 mask)
349} 273}
350EXPORT_SYMBOL(dma_supported); 274EXPORT_SYMBOL(dma_supported);
351 275
352/* Allocate DMA memory on node near device */
353static noinline struct page *
354dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
355{
356 int node;
357
358 node = dev_to_node(dev);
359
360 return alloc_pages_node(node, gfp, order);
361}
362
363/*
364 * Allocate memory for a coherent mapping.
365 */
366void *
367dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
368 gfp_t gfp)
369{
370 void *memory = NULL;
371 struct page *page;
372 unsigned long dma_mask = 0;
373 dma_addr_t bus;
374 int noretry = 0;
375
376 /* ignore region specifiers */
377 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
378
379 if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
380 return memory;
381
382 if (!dev) {
383 dev = &fallback_dev;
384 gfp |= GFP_DMA;
385 }
386 dma_mask = dev->coherent_dma_mask;
387 if (dma_mask == 0)
388 dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
389
390 /* Device not DMA able */
391 if (dev->dma_mask == NULL)
392 return NULL;
393
394 /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
395 if (gfp & __GFP_DMA)
396 noretry = 1;
397
398#ifdef CONFIG_X86_64
399 /* Why <=? Even when the mask is smaller than 4GB it is often
400 larger than 16MB and in this case we have a chance of
401 finding fitting memory in the next higher zone first. If
402 not retry with true GFP_DMA. -AK */
403 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
404 gfp |= GFP_DMA32;
405 if (dma_mask < DMA_32BIT_MASK)
406 noretry = 1;
407 }
408#endif
409
410 again:
411 page = dma_alloc_pages(dev,
412 noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
413 if (page == NULL)
414 return NULL;
415
416 {
417 int high, mmu;
418 bus = page_to_phys(page);
419 memory = page_address(page);
420 high = (bus + size) >= dma_mask;
421 mmu = high;
422 if (force_iommu && !(gfp & GFP_DMA))
423 mmu = 1;
424 else if (high) {
425 free_pages((unsigned long)memory,
426 get_order(size));
427
428 /* Don't use the 16MB ZONE_DMA unless absolutely
429 needed. It's better to use remapping first. */
430 if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
431 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
432 goto again;
433 }
434
435 /* Let low level make its own zone decisions */
436 gfp &= ~(GFP_DMA32|GFP_DMA);
437
438 if (dma_ops->alloc_coherent)
439 return dma_ops->alloc_coherent(dev, size,
440 dma_handle, gfp);
441 return NULL;
442 }
443
444 memset(memory, 0, size);
445 if (!mmu) {
446 *dma_handle = bus;
447 return memory;
448 }
449 }
450
451 if (dma_ops->alloc_coherent) {
452 free_pages((unsigned long)memory, get_order(size));
453 gfp &= ~(GFP_DMA|GFP_DMA32);
454 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
455 }
456
457 if (dma_ops->map_simple) {
458 *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory),
459 size,
460 PCI_DMA_BIDIRECTIONAL);
461 if (*dma_handle != bad_dma_address)
462 return memory;
463 }
464
465 if (panic_on_overflow)
466 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
467 (unsigned long)size);
468 free_pages((unsigned long)memory, get_order(size));
469 return NULL;
470}
471EXPORT_SYMBOL(dma_alloc_coherent);
472
473/*
474 * Unmap coherent memory.
475 * The caller must ensure that the device has finished accessing the mapping.
476 */
477void dma_free_coherent(struct device *dev, size_t size,
478 void *vaddr, dma_addr_t bus)
479{
480 int order = get_order(size);
481 WARN_ON(irqs_disabled()); /* for portability */
482 if (dma_release_coherent(dev, order, vaddr))
483 return;
484 if (dma_ops->unmap_single)
485 dma_ops->unmap_single(dev, bus, size, 0);
486 free_pages((unsigned long)vaddr, order);
487}
488EXPORT_SYMBOL(dma_free_coherent);
489
490static int __init pci_iommu_init(void) 276static int __init pci_iommu_init(void)
491{ 277{
492 calgary_iommu_init(); 278 calgary_iommu_init();
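For reference, a stand-alone sketch of the page-order calculation that get_order(size) performs in the coherent allocations above: the smallest order n such that 2^n pages cover size bytes. PAGE_SHIFT of 12 (4 KiB pages) is an illustrative assumption, and the size == 0 corner case of the real kernel helper is not modelled here.

#include <stddef.h>

#define EX_PAGE_SHIFT 12                        /* assumption: 4 KiB pages */
#define EX_PAGE_SIZE  (1UL << EX_PAGE_SHIFT)

/* Smallest order n such that (1UL << n) pages of EX_PAGE_SIZE cover 'size'. */
static int example_get_order(size_t size)
{
        size_t pages = (size + EX_PAGE_SIZE - 1) >> EX_PAGE_SHIFT;
        int order = 0;

        while ((1UL << order) < pages)
                order++;
        return order;
}
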
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index df5f142657d2..145f1c83369f 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,8 +27,8 @@
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 28#include <linux/iommu-helper.h>
29#include <linux/sysdev.h> 29#include <linux/sysdev.h>
30#include <linux/io.h>
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/io.h>
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
@@ -67,9 +67,6 @@ static u32 gart_unmapped_entry;
67 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) 67 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
68#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) 68#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
69 69
70#define to_pages(addr, size) \
71 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
72
73#define EMERGENCY_PAGES 32 /* = 128KB */ 70#define EMERGENCY_PAGES 32 /* = 128KB */
74 71
75#ifdef CONFIG_AGP 72#ifdef CONFIG_AGP
@@ -83,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
83AGPEXTERN __u32 *agp_gatt_table; 80AGPEXTERN __u32 *agp_gatt_table;
84 81
85static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 82static unsigned long next_bit; /* protected by iommu_bitmap_lock */
86static int need_flush; /* global flush state. set for each gart wrap */ 83static bool need_flush; /* global flush state. set for each gart wrap */
87 84
88static unsigned long alloc_iommu(struct device *dev, int size) 85static unsigned long alloc_iommu(struct device *dev, int size,
86 unsigned long align_mask)
89{ 87{
90 unsigned long offset, flags; 88 unsigned long offset, flags;
91 unsigned long boundary_size; 89 unsigned long boundary_size;
@@ -93,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
93 91
94 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
95 PAGE_SIZE) >> PAGE_SHIFT; 93 PAGE_SIZE) >> PAGE_SHIFT;
96 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 94 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
97 PAGE_SIZE) >> PAGE_SHIFT; 95 PAGE_SIZE) >> PAGE_SHIFT;
98 96
99 spin_lock_irqsave(&iommu_bitmap_lock, flags); 97 spin_lock_irqsave(&iommu_bitmap_lock, flags);
100 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, 98 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
101 size, base_index, boundary_size, 0); 99 size, base_index, boundary_size, align_mask);
102 if (offset == -1) { 100 if (offset == -1) {
103 need_flush = 1; 101 need_flush = true;
104 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, 102 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
105 size, base_index, boundary_size, 0); 103 size, base_index, boundary_size,
104 align_mask);
106 } 105 }
107 if (offset != -1) { 106 if (offset != -1) {
108 next_bit = offset+size; 107 next_bit = offset+size;
109 if (next_bit >= iommu_pages) { 108 if (next_bit >= iommu_pages) {
110 next_bit = 0; 109 next_bit = 0;
111 need_flush = 1; 110 need_flush = true;
112 } 111 }
113 } 112 }
114 if (iommu_fullflush) 113 if (iommu_fullflush)
115 need_flush = 1; 114 need_flush = true;
116 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
117 116
118 return offset; 117 return offset;
@@ -137,7 +136,7 @@ static void flush_gart(void)
137 spin_lock_irqsave(&iommu_bitmap_lock, flags); 136 spin_lock_irqsave(&iommu_bitmap_lock, flags);
138 if (need_flush) { 137 if (need_flush) {
139 k8_flush_garts(); 138 k8_flush_garts();
140 need_flush = 0; 139 need_flush = false;
141 } 140 }
142 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
143} 142}
@@ -176,7 +175,8 @@ static void dump_leak(void)
176 iommu_leak_pages); 175 iommu_leak_pages);
177 for (i = 0; i < iommu_leak_pages; i += 2) { 176 for (i = 0; i < iommu_leak_pages; i += 2) {
178 printk(KERN_DEBUG "%lu: ", iommu_pages-i); 177 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
179 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); 178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
179 0);
180 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); 180 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
181 } 181 }
182 printk(KERN_DEBUG "\n"); 182 printk(KERN_DEBUG "\n");
@@ -215,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
215static inline int 215static inline int
216need_iommu(struct device *dev, unsigned long addr, size_t size) 216need_iommu(struct device *dev, unsigned long addr, size_t size)
217{ 217{
218 u64 mask = *dev->dma_mask; 218 return force_iommu ||
219 int high = addr + size > mask; 219 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
220 int mmu = high;
221
222 if (force_iommu)
223 mmu = 1;
224
225 return mmu;
226} 220}
227 221
228static inline int 222static inline int
229nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 223nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
230{ 224{
231 u64 mask = *dev->dma_mask; 225 return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
232 int high = addr + size > mask;
233 int mmu = high;
234
235 return mmu;
236} 226}
237 227
238/* Map a single continuous physical area into the IOMMU. 228/* Map a single continuous physical area into the IOMMU.
239 * Caller needs to check if the iommu is needed and flush. 229 * Caller needs to check if the iommu is needed and flush.
240 */ 230 */
241static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 231static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
242 size_t size, int dir) 232 size_t size, int dir, unsigned long align_mask)
243{ 233{
244 unsigned long npages = to_pages(phys_mem, size); 234 unsigned long npages = iommu_num_pages(phys_mem, size);
245 unsigned long iommu_page = alloc_iommu(dev, npages); 235 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
246 int i; 236 int i;
247 237
248 if (iommu_page == -1) { 238 if (iommu_page == -1) {
@@ -262,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
262 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 252 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
263} 253}
264 254
265static dma_addr_t
266gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
267{
268 dma_addr_t map = dma_map_area(dev, paddr, size, dir);
269
270 flush_gart();
271
272 return map;
273}
274
275/* Map a single area into the IOMMU */ 255/* Map a single area into the IOMMU */
276static dma_addr_t 256static dma_addr_t
277gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) 257gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -279,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
279 unsigned long bus; 259 unsigned long bus;
280 260
281 if (!dev) 261 if (!dev)
282 dev = &fallback_dev; 262 dev = &x86_dma_fallback_dev;
283 263
284 if (!need_iommu(dev, paddr, size)) 264 if (!need_iommu(dev, paddr, size))
285 return paddr; 265 return paddr;
286 266
287 bus = gart_map_simple(dev, paddr, size, dir); 267 bus = dma_map_area(dev, paddr, size, dir, 0);
268 flush_gart();
288 269
289 return bus; 270 return bus;
290} 271}
@@ -304,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
304 return; 285 return;
305 286
306 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; 287 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
307 npages = to_pages(dma_addr, size); 288 npages = iommu_num_pages(dma_addr, size);
308 for (i = 0; i < npages; i++) { 289 for (i = 0; i < npages; i++) {
309 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 290 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
310 CLEAR_LEAK(iommu_page + i); 291 CLEAR_LEAK(iommu_page + i);
@@ -343,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
343 unsigned long addr = sg_phys(s); 324 unsigned long addr = sg_phys(s);
344 325
345 if (nonforced_iommu(dev, addr, s->length)) { 326 if (nonforced_iommu(dev, addr, s->length)) {
346 addr = dma_map_area(dev, addr, s->length, dir); 327 addr = dma_map_area(dev, addr, s->length, dir, 0);
347 if (addr == bad_dma_address) { 328 if (addr == bad_dma_address) {
348 if (i > 0) 329 if (i > 0)
349 gart_unmap_sg(dev, sg, i, dir); 330 gart_unmap_sg(dev, sg, i, dir);
@@ -365,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
365 int nelems, struct scatterlist *sout, 346 int nelems, struct scatterlist *sout,
366 unsigned long pages) 347 unsigned long pages)
367{ 348{
368 unsigned long iommu_start = alloc_iommu(dev, pages); 349 unsigned long iommu_start = alloc_iommu(dev, pages, 0);
369 unsigned long iommu_page = iommu_start; 350 unsigned long iommu_page = iommu_start;
370 struct scatterlist *s; 351 struct scatterlist *s;
371 int i; 352 int i;
@@ -387,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
387 } 368 }
388 369
389 addr = phys_addr; 370 addr = phys_addr;
390 pages = to_pages(s->offset, s->length); 371 pages = iommu_num_pages(s->offset, s->length);
391 while (pages--) { 372 while (pages--) {
392 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 373 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
393 SET_LEAK(iommu_page); 374 SET_LEAK(iommu_page);
@@ -430,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
430 return 0; 411 return 0;
431 412
432 if (!dev) 413 if (!dev)
433 dev = &fallback_dev; 414 dev = &x86_dma_fallback_dev;
434 415
435 out = 0; 416 out = 0;
436 start = 0; 417 start = 0;
@@ -470,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
470 451
471 seg_size += s->length; 452 seg_size += s->length;
472 need = nextneed; 453 need = nextneed;
473 pages += to_pages(s->offset, s->length); 454 pages += iommu_num_pages(s->offset, s->length);
474 ps = s; 455 ps = s;
475 } 456 }
476 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) 457 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
@@ -502,6 +483,46 @@ error:
502 return 0; 483 return 0;
503} 484}
504 485
486/* allocate and map a coherent mapping */
487static void *
488gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
489 gfp_t flag)
490{
491 dma_addr_t paddr;
492 unsigned long align_mask;
493 struct page *page;
494
495 if (force_iommu && !(flag & GFP_DMA)) {
496 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
497 page = alloc_pages(flag | __GFP_ZERO, get_order(size));
498 if (!page)
499 return NULL;
500
501 align_mask = (1UL << get_order(size)) - 1;
502 paddr = dma_map_area(dev, page_to_phys(page), size,
503 DMA_BIDIRECTIONAL, align_mask);
504
505 flush_gart();
506 if (paddr != bad_dma_address) {
507 *dma_addr = paddr;
508 return page_address(page);
509 }
510 __free_pages(page, get_order(size));
511 } else
512 return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
513
514 return NULL;
515}
516
517/* free a coherent mapping */
518static void
519gart_free_coherent(struct device *dev, size_t size, void *vaddr,
520 dma_addr_t dma_addr)
521{
522 gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
523 free_pages((unsigned long)vaddr, get_order(size));
524}
525
505static int no_agp; 526static int no_agp;
506 527
507static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 528static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -629,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
629 struct pci_dev *dev; 650 struct pci_dev *dev;
630 void *gatt; 651 void *gatt;
631 int i, error; 652 int i, error;
632 unsigned long start_pfn, end_pfn;
633 653
634 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 654 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
635 aper_size = aper_base = info->aper_size = 0; 655 aper_size = aper_base = info->aper_size = 0;
@@ -653,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
653 info->aper_size = aper_size >> 20; 673 info->aper_size = aper_size >> 20;
654 674
655 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 675 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
656 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 676 gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
677 get_order(gatt_size));
657 if (!gatt) 678 if (!gatt)
658 panic("Cannot allocate GATT table"); 679 panic("Cannot allocate GATT table");
659 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) 680 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
660 panic("Could not set GART PTEs to uncacheable pages"); 681 panic("Could not set GART PTEs to uncacheable pages");
661 682
662 memset(gatt, 0, gatt_size);
663 agp_gatt_table = gatt; 683 agp_gatt_table = gatt;
664 684
665 enable_gart_translations(); 685 enable_gart_translations();
@@ -668,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
668 if (!error) 688 if (!error)
669 error = sysdev_register(&device_gart); 689 error = sysdev_register(&device_gart);
670 if (error) 690 if (error)
671 panic("Could not register gart_sysdev -- would corrupt data on next suspend"); 691 panic("Could not register gart_sysdev -- "
692 "would corrupt data on next suspend");
672 693
673 flush_gart(); 694 flush_gart();
674 695
675 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 696 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
676 aper_base, aper_size>>10); 697 aper_base, aper_size>>10);
677 698
678 /* need to map that range */
679 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
680 if (end_pfn > max_low_pfn_mapped) {
681 start_pfn = (aper_base>>PAGE_SHIFT);
682 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
683 }
684 return 0; 699 return 0;
685 700
686 nommu: 701 nommu:
@@ -690,21 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
690 return -1; 705 return -1;
691} 706}
692 707
693extern int agp_amd64_init(void); 708static struct dma_mapping_ops gart_dma_ops = {
694
695static const struct dma_mapping_ops gart_dma_ops = {
696 .mapping_error = NULL,
697 .map_single = gart_map_single, 709 .map_single = gart_map_single,
698 .map_simple = gart_map_simple,
699 .unmap_single = gart_unmap_single, 710 .unmap_single = gart_unmap_single,
700 .sync_single_for_cpu = NULL,
701 .sync_single_for_device = NULL,
702 .sync_single_range_for_cpu = NULL,
703 .sync_single_range_for_device = NULL,
704 .sync_sg_for_cpu = NULL,
705 .sync_sg_for_device = NULL,
706 .map_sg = gart_map_sg, 711 .map_sg = gart_map_sg,
707 .unmap_sg = gart_unmap_sg, 712 .unmap_sg = gart_unmap_sg,
713 .alloc_coherent = gart_alloc_coherent,
714 .free_coherent = gart_free_coherent,
708}; 715};
709 716
710void gart_iommu_shutdown(void) 717void gart_iommu_shutdown(void)
@@ -731,7 +738,8 @@ void __init gart_iommu_init(void)
731{ 738{
732 struct agp_kern_info info; 739 struct agp_kern_info info;
733 unsigned long iommu_start; 740 unsigned long iommu_start;
734 unsigned long aper_size; 741 unsigned long aper_base, aper_size;
742 unsigned long start_pfn, end_pfn;
735 unsigned long scratch; 743 unsigned long scratch;
736 long i; 744 long i;
737 745
@@ -763,30 +771,35 @@ void __init gart_iommu_init(void)
763 (no_agp && init_k8_gatt(&info) < 0)) { 771 (no_agp && init_k8_gatt(&info) < 0)) {
764 if (max_pfn > MAX_DMA32_PFN) { 772 if (max_pfn > MAX_DMA32_PFN) {
765 printk(KERN_WARNING "More than 4GB of memory " 773 printk(KERN_WARNING "More than 4GB of memory "
766 "but GART IOMMU not available.\n" 774 "but GART IOMMU not available.\n");
767 KERN_WARNING "falling back to iommu=soft.\n"); 775 printk(KERN_WARNING "falling back to iommu=soft.\n");
768 } 776 }
769 return; 777 return;
770 } 778 }
771 779
780 /* need to map that range */
781 aper_size = info.aper_size << 20;
782 aper_base = info.aper_base;
783 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
784 if (end_pfn > max_low_pfn_mapped) {
785 start_pfn = (aper_base>>PAGE_SHIFT);
786 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
787 }
788
772 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 789 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
773 aper_size = info.aper_size * 1024 * 1024;
774 iommu_size = check_iommu_size(info.aper_base, aper_size); 790 iommu_size = check_iommu_size(info.aper_base, aper_size);
775 iommu_pages = iommu_size >> PAGE_SHIFT; 791 iommu_pages = iommu_size >> PAGE_SHIFT;
776 792
777 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, 793 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
778 get_order(iommu_pages/8)); 794 get_order(iommu_pages/8));
779 if (!iommu_gart_bitmap) 795 if (!iommu_gart_bitmap)
780 panic("Cannot allocate iommu bitmap\n"); 796 panic("Cannot allocate iommu bitmap\n");
781 memset(iommu_gart_bitmap, 0, iommu_pages/8);
782 797
783#ifdef CONFIG_IOMMU_LEAK 798#ifdef CONFIG_IOMMU_LEAK
784 if (leak_trace) { 799 if (leak_trace) {
785 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 800 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
786 get_order(iommu_pages*sizeof(void *))); 801 get_order(iommu_pages*sizeof(void *)));
787 if (iommu_leak_tab) 802 if (!iommu_leak_tab)
788 memset(iommu_leak_tab, 0, iommu_pages * 8);
789 else
790 printk(KERN_DEBUG 803 printk(KERN_DEBUG
791 "PCI-DMA: Cannot allocate leak trace area\n"); 804 "PCI-DMA: Cannot allocate leak trace area\n");
792 } 805 }
@@ -796,7 +809,7 @@ void __init gart_iommu_init(void)
796 * Out of IOMMU space handling. 809 * Out of IOMMU space handling.
797 * Reserve some invalid pages at the beginning of the GART. 810 * Reserve some invalid pages at the beginning of the GART.
798 */ 811 */
799 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 812 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
800 813
801 agp_memory_reserved = iommu_size; 814 agp_memory_reserved = iommu_size;
802 printk(KERN_INFO 815 printk(KERN_INFO
@@ -854,7 +867,8 @@ void __init gart_parse_options(char *p)
854 if (!strncmp(p, "leak", 4)) { 867 if (!strncmp(p, "leak", 4)) {
855 leak_trace = 1; 868 leak_trace = 1;
856 p += 4; 869 p += 4;
857 if (*p == '=') ++p; 870 if (*p == '=')
871 ++p;
858 if (isdigit(*p) && get_option(&p, &arg)) 872 if (isdigit(*p) && get_option(&p, &arg))
859 iommu_leak_pages = arg; 873 iommu_leak_pages = arg;
860 } 874 }
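A note on the to_pages() -> iommu_num_pages() conversion in the hunks above: the removed macro counted how many GART pages are needed to cover a buffer that starts at an arbitrary offset within a page. A stand-alone sketch of that calculation follows; iommu_num_pages() from the iommu-helper library is assumed to compute the same value.

#include <stddef.h>

#define EX_PAGE_SHIFT 12                        /* assumption: 4 KiB pages */
#define EX_PAGE_SIZE  (1UL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK  (~(EX_PAGE_SIZE - 1))

/* Pages needed to map [addr, addr + size): add the in-page offset of the
 * start address before rounding up, as the removed to_pages() macro did. */
static unsigned long example_iommu_num_pages(unsigned long addr, size_t size)
{
        unsigned long offset = addr & ~EX_PAGE_MASK;

        return (offset + size + EX_PAGE_SIZE - 1) >> EX_PAGE_SHIFT;
}

The align_mask argument threaded through alloc_iommu() and dma_map_area() serves the new gart_alloc_coherent(): it passes (1UL << get_order(size)) - 1 so that the bitmap allocator returns a GART region aligned to the allocation's own size.
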
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 792b9179eff3..c70ab5a5d4c8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && bus + size > *hwdev->dma_mask) { 17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_32BIT_MASK) 18 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -72,21 +72,17 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75/* Make sure we keep the same behaviour */ 75static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
76static int nommu_mapping_error(dma_addr_t dma_addr) 76 dma_addr_t dma_addr)
77{ 77{
78#ifdef CONFIG_X86_32 78 free_pages((unsigned long)vaddr, get_order(size));
79 return 0;
80#else
81 return (dma_addr == bad_dma_address);
82#endif
83} 79}
84 80
85 81struct dma_mapping_ops nommu_dma_ops = {
86const struct dma_mapping_ops nommu_dma_ops = { 82 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent,
87 .map_single = nommu_map_single, 84 .map_single = nommu_map_single,
88 .map_sg = nommu_map_sg, 85 .map_sg = nommu_map_sg,
89 .mapping_error = nommu_mapping_error,
90 .is_phys = 1, 86 .is_phys = 1,
91}; 87};
92 88
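The open-coded mask comparisons removed from check_addr(), need_iommu() and nonforced_iommu() are folded into is_buffer_dma_capable(). Judging from the removed lines, the helper expresses the following test; this is a stand-alone sketch rather than the kernel's actual definition.

#include <stddef.h>

typedef unsigned long long u64;

/* A buffer is DMA-capable for a device when every byte of [addr, addr + size)
 * falls at or below the device's DMA mask; the removed code tested the
 * inverse (addr + size > mask) to decide that the IOMMU is needed. */
static int example_is_buffer_dma_capable(u64 mask, u64 addr, size_t size)
{
        return addr + size <= mask;
}
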
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 20df839b9c20..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); 18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
19} 19}
20 20
21const struct dma_mapping_ops swiotlb_dma_ops = { 21struct dma_mapping_ops swiotlb_dma_ops = {
22 .mapping_error = swiotlb_dma_mapping_error, 22 .mapping_error = swiotlb_dma_mapping_error,
23 .alloc_coherent = swiotlb_alloc_coherent, 23 .alloc_coherent = swiotlb_alloc_coherent,
24 .free_coherent = swiotlb_free_coherent, 24 .free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea277..a311ffcaad16 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
1#include <linux/platform_device.h> 1#include <linux/platform_device.h>
2#include <linux/errno.h> 2#include <linux/err.h>
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5static __init int add_pcspkr(void) 5static __init int add_pcspkr(void)
6{ 6{
7 struct platform_device *pd; 7 struct platform_device *pd;
8 int ret;
9 8
10 pd = platform_device_alloc("pcspkr", -1); 9 pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
11 if (!pd)
12 return -ENOMEM;
13 10
14 ret = platform_device_add(pd); 11 return IS_ERR(pd) ? PTR_ERR(pd) : 0;
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19} 12}
20device_initcall(add_pcspkr); 13device_initcall(add_pcspkr);
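The pcspeaker conversion leans on the kernel's error-pointer convention: platform_device_register_simple() returns either a valid device pointer or an errno encoded in the pointer itself, so IS_ERR(pd) ? PTR_ERR(pd) : 0 replaces the whole alloc/add/put sequence. A simplified user-space illustration of that convention (the real definitions live in <linux/err.h>):

/* Error codes occupy the last page of the address space, so any pointer
 * whose unsigned value is >= -MAX_ERRNO encodes an errno, not an object. */
#define EX_MAX_ERRNO 4095UL

static inline void *ex_err_ptr(long error)      { return (void *)error; }
static inline long  ex_ptr_err(const void *ptr) { return (long)ptr; }
static inline int   ex_is_err(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-EX_MAX_ERRNO;
}
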
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7fc4d5b0a6a0..c622772744d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,7 +15,6 @@ unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait); 15EXPORT_SYMBOL(idle_nomwait);
16 16
17struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
18static int force_mwait __cpuinitdata;
19 18
20int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 19int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
21{ 20{
@@ -185,7 +184,8 @@ static void mwait_idle(void)
185static void poll_idle(void) 184static void poll_idle(void)
186{ 185{
187 local_irq_enable(); 186 local_irq_enable();
188 cpu_relax(); 187 while (!need_resched())
188 cpu_relax();
189} 189}
190 190
191/* 191/*
@@ -246,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
246 return 1; 246 return 1;
247} 247}
248 248
249static cpumask_t c1e_mask = CPU_MASK_NONE;
250static int c1e_detected;
251
252void c1e_remove_cpu(int cpu)
253{
254 cpu_clear(cpu, c1e_mask);
255}
256
249/* 257/*
250 * C1E aware idle routine. We check for C1E active in the interrupt 258 * C1E aware idle routine. We check for C1E active in the interrupt
251 * pending message MSR. If we detect C1E, then we handle it the same 259 * pending message MSR. If we detect C1E, then we handle it the same
@@ -253,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
253 */ 261 */
254static void c1e_idle(void) 262static void c1e_idle(void)
255{ 263{
256 static cpumask_t c1e_mask = CPU_MASK_NONE;
257 static int c1e_detected;
258
259 if (need_resched()) 264 if (need_resched())
260 return; 265 return;
261 266
@@ -265,8 +270,10 @@ static void c1e_idle(void)
265 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 270 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
266 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 271 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
267 c1e_detected = 1; 272 c1e_detected = 1;
268 mark_tsc_unstable("TSC halt in C1E"); 273 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
269 printk(KERN_INFO "System has C1E enabled\n"); 274 mark_tsc_unstable("TSC halt in AMD C1E");
275 printk(KERN_INFO "System has AMD C1E enabled\n");
276 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
270 } 277 }
271 } 278 }
272 279
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 53bc653ed5ca..922c14058f97 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
37#include <linux/tick.h> 37#include <linux/tick.h>
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
@@ -55,6 +56,9 @@
55#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
56#include <asm/cpu.h> 57#include <asm/cpu.h>
57#include <asm/kdebug.h> 58#include <asm/kdebug.h>
59#include <asm/idle.h>
60#include <asm/syscalls.h>
61#include <asm/smp.h>
58 62
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 63asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60 64
@@ -72,47 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
72 return ((unsigned long *)tsk->thread.sp)[3]; 76 return ((unsigned long *)tsk->thread.sp)[3];
73} 77}
74 78
75#ifdef CONFIG_HOTPLUG_CPU 79#ifndef CONFIG_SMP
76#include <asm/nmi.h>
77
78static void cpu_exit_clear(void)
79{
80 int cpu = raw_smp_processor_id();
81
82 idle_task_exit();
83
84 cpu_uninit();
85 irq_ctx_exit(cpu);
86
87 cpu_clear(cpu, cpu_callout_map);
88 cpu_clear(cpu, cpu_callin_map);
89
90 numa_remove_cpu(cpu);
91}
92
93/* We don't actually take CPU down, just spin without interrupts. */
94static inline void play_dead(void)
95{
96 /* This must be done before dead CPU ack */
97 cpu_exit_clear();
98 wbinvd();
99 mb();
100 /* Ack it */
101 __get_cpu_var(cpu_state) = CPU_DEAD;
102
103 /*
104 * With physical CPU hotplug, we should halt the cpu
105 */
106 local_irq_disable();
107 while (1)
108 halt();
109}
110#else
111static inline void play_dead(void) 80static inline void play_dead(void)
112{ 81{
113 BUG(); 82 BUG();
114} 83}
115#endif /* CONFIG_HOTPLUG_CPU */ 84#endif
116 85
117/* 86/*
118 * The idle thread. There's no useful work to be 87 * The idle thread. There's no useful work to be
@@ -160,6 +129,7 @@ void __show_registers(struct pt_regs *regs, int all)
160 unsigned long d0, d1, d2, d3, d6, d7; 129 unsigned long d0, d1, d2, d3, d6, d7;
161 unsigned long sp; 130 unsigned long sp;
162 unsigned short ss, gs; 131 unsigned short ss, gs;
132 const char *board;
163 133
164 if (user_mode_vm(regs)) { 134 if (user_mode_vm(regs)) {
165 sp = regs->sp; 135 sp = regs->sp;
@@ -172,11 +142,15 @@ void __show_registers(struct pt_regs *regs, int all)
172 } 142 }
173 143
174 printk("\n"); 144 printk("\n");
175 printk("Pid: %d, comm: %s %s (%s %.*s)\n", 145
146 board = dmi_get_system_info(DMI_PRODUCT_NAME);
147 if (!board)
148 board = "";
149 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
176 task_pid_nr(current), current->comm, 150 task_pid_nr(current), current->comm,
177 print_tainted(), init_utsname()->release, 151 print_tainted(), init_utsname()->release,
178 (int)strcspn(init_utsname()->version, " "), 152 (int)strcspn(init_utsname()->version, " "),
179 init_utsname()->version); 153 init_utsname()->version, board);
180 154
181 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 155 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
182 (u16)regs->cs, regs->ip, regs->flags, 156 (u16)regs->cs, regs->ip, regs->flags,
@@ -276,6 +250,14 @@ void exit_thread(void)
276 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 250 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
277 put_cpu(); 251 put_cpu();
278 } 252 }
253#ifdef CONFIG_X86_DS
254 /* Free any DS contexts that have not been properly released. */
255 if (unlikely(current->thread.ds_ctx)) {
256 /* we clear debugctl to make sure DS is not used. */
257 update_debugctlmsr(0);
258 ds_free(current->thread.ds_ctx);
259 }
260#endif /* CONFIG_X86_DS */
279} 261}
280 262
281void flush_thread(void) 263void flush_thread(void)
@@ -437,6 +419,35 @@ int set_tsc_mode(unsigned int val)
437 return 0; 419 return 0;
438} 420}
439 421
422#ifdef CONFIG_X86_DS
423static int update_debugctl(struct thread_struct *prev,
424 struct thread_struct *next, unsigned long debugctl)
425{
426 unsigned long ds_prev = 0;
427 unsigned long ds_next = 0;
428
429 if (prev->ds_ctx)
430 ds_prev = (unsigned long)prev->ds_ctx->ds;
431 if (next->ds_ctx)
432 ds_next = (unsigned long)next->ds_ctx->ds;
433
434 if (ds_next != ds_prev) {
435 /* we clear debugctl to make sure DS
436 * is not in use when we change it */
437 debugctl = 0;
438 update_debugctlmsr(0);
439 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
440 }
441 return debugctl;
442}
443#else
444static int update_debugctl(struct thread_struct *prev,
445 struct thread_struct *next, unsigned long debugctl)
446{
447 return debugctl;
448}
449#endif /* CONFIG_X86_DS */
450
440static noinline void 451static noinline void
441__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 452__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
442 struct tss_struct *tss) 453 struct tss_struct *tss)
@@ -447,14 +458,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
447 prev = &prev_p->thread; 458 prev = &prev_p->thread;
448 next = &next_p->thread; 459 next = &next_p->thread;
449 460
450 debugctl = prev->debugctlmsr; 461 debugctl = update_debugctl(prev, next, prev->debugctlmsr);
451 if (next->ds_area_msr != prev->ds_area_msr) {
452 /* we clear debugctl to make sure DS
453 * is not in use when we change it */
454 debugctl = 0;
455 update_debugctlmsr(0);
456 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
457 }
458 462
459 if (next->debugctlmsr != debugctl) 463 if (next->debugctlmsr != debugctl)
460 update_debugctlmsr(next->debugctlmsr); 464 update_debugctlmsr(next->debugctlmsr);
@@ -478,13 +482,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
478 hard_enable_TSC(); 482 hard_enable_TSC();
479 } 483 }
480 484
481#ifdef X86_BTS 485#ifdef CONFIG_X86_PTRACE_BTS
482 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 486 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
483 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 487 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
484 488
485 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 489 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
486 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 490 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
487#endif 491#endif /* CONFIG_X86_PTRACE_BTS */
488 492
489 493
490 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 494 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3fb62a7d9a16..ca80394ef5b8 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
37#include <linux/kdebug.h> 37#include <linux/kdebug.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h>
41#include <linux/io.h>
40 42
41#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
43#include <asm/system.h> 44#include <asm/system.h>
44#include <asm/io.h>
45#include <asm/processor.h> 45#include <asm/processor.h>
46#include <asm/i387.h> 46#include <asm/i387.h>
47#include <asm/mmu_context.h> 47#include <asm/mmu_context.h>
@@ -51,6 +51,7 @@
51#include <asm/proto.h> 51#include <asm/proto.h>
52#include <asm/ia32.h> 52#include <asm/ia32.h>
53#include <asm/idle.h> 53#include <asm/idle.h>
54#include <asm/syscalls.h>
54 55
55asmlinkage extern void ret_from_fork(void); 56asmlinkage extern void ret_from_fork(void);
56 57
@@ -85,29 +86,12 @@ void exit_idle(void)
85 __exit_idle(); 86 __exit_idle();
86} 87}
87 88
88#ifdef CONFIG_HOTPLUG_CPU 89#ifndef CONFIG_SMP
89DECLARE_PER_CPU(int, cpu_state);
90
91#include <asm/nmi.h>
92/* We halt the CPU with physical CPU hotplug */
93static inline void play_dead(void)
94{
95 idle_task_exit();
96 wbinvd();
97 mb();
98 /* Ack it */
99 __get_cpu_var(cpu_state) = CPU_DEAD;
100
101 local_irq_disable();
102 while (1)
103 halt();
104}
105#else
106static inline void play_dead(void) 90static inline void play_dead(void)
107{ 91{
108 BUG(); 92 BUG();
109} 93}
110#endif /* CONFIG_HOTPLUG_CPU */ 94#endif
111 95
112/* 96/*
113 * The idle thread. There's no useful work to be 97 * The idle thread. There's no useful work to be
@@ -152,7 +136,7 @@ void cpu_idle(void)
152} 136}
153 137
154/* Prints also some state that isn't saved in the pt_regs */ 138/* Prints also some state that isn't saved in the pt_regs */
155void __show_regs(struct pt_regs * regs) 139void __show_regs(struct pt_regs *regs)
156{ 140{
157 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 141 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
158 unsigned long d0, d1, d2, d3, d6, d7; 142 unsigned long d0, d1, d2, d3, d6, d7;
@@ -161,59 +145,61 @@ void __show_regs(struct pt_regs * regs)
161 145
162 printk("\n"); 146 printk("\n");
163 print_modules(); 147 print_modules();
164 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 148 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
165 current->pid, current->comm, print_tainted(), 149 current->pid, current->comm, print_tainted(),
166 init_utsname()->release, 150 init_utsname()->release,
167 (int)strcspn(init_utsname()->version, " "), 151 (int)strcspn(init_utsname()->version, " "),
168 init_utsname()->version); 152 init_utsname()->version);
169 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 153 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
170 printk_address(regs->ip, 1); 154 printk_address(regs->ip, 1);
171 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, 155 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
172 regs->flags); 156 regs->sp, regs->flags);
173 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 157 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
174 regs->ax, regs->bx, regs->cx); 158 regs->ax, regs->bx, regs->cx);
175 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 159 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
176 regs->dx, regs->si, regs->di); 160 regs->dx, regs->si, regs->di);
177 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 161 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
178 regs->bp, regs->r8, regs->r9); 162 regs->bp, regs->r8, regs->r9);
179 printk("R10: %016lx R11: %016lx R12: %016lx\n", 163 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
180 regs->r10, regs->r11, regs->r12); 164 regs->r10, regs->r11, regs->r12);
181 printk("R13: %016lx R14: %016lx R15: %016lx\n", 165 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
182 regs->r13, regs->r14, regs->r15); 166 regs->r13, regs->r14, regs->r15);
183 167
184 asm("movl %%ds,%0" : "=r" (ds)); 168 asm("movl %%ds,%0" : "=r" (ds));
185 asm("movl %%cs,%0" : "=r" (cs)); 169 asm("movl %%cs,%0" : "=r" (cs));
186 asm("movl %%es,%0" : "=r" (es)); 170 asm("movl %%es,%0" : "=r" (es));
187 asm("movl %%fs,%0" : "=r" (fsindex)); 171 asm("movl %%fs,%0" : "=r" (fsindex));
188 asm("movl %%gs,%0" : "=r" (gsindex)); 172 asm("movl %%gs,%0" : "=r" (gsindex));
189 173
190 rdmsrl(MSR_FS_BASE, fs); 174 rdmsrl(MSR_FS_BASE, fs);
191 rdmsrl(MSR_GS_BASE, gs); 175 rdmsrl(MSR_GS_BASE, gs);
192 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 176 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
193 177
194 cr0 = read_cr0(); 178 cr0 = read_cr0();
195 cr2 = read_cr2(); 179 cr2 = read_cr2();
196 cr3 = read_cr3(); 180 cr3 = read_cr3();
197 cr4 = read_cr4(); 181 cr4 = read_cr4();
198 182
199 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 183 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
200 fs,fsindex,gs,gsindex,shadowgs); 184 fs, fsindex, gs, gsindex, shadowgs);
201 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 185 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
202 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); 186 es, cr0);
187 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
188 cr4);
203 189
204 get_debugreg(d0, 0); 190 get_debugreg(d0, 0);
205 get_debugreg(d1, 1); 191 get_debugreg(d1, 1);
206 get_debugreg(d2, 2); 192 get_debugreg(d2, 2);
207 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 193 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
208 get_debugreg(d3, 3); 194 get_debugreg(d3, 3);
209 get_debugreg(d6, 6); 195 get_debugreg(d6, 6);
210 get_debugreg(d7, 7); 196 get_debugreg(d7, 7);
211 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 197 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
212} 198}
213 199
214void show_regs(struct pt_regs *regs) 200void show_regs(struct pt_regs *regs)
215{ 201{
216 printk("CPU %d:", smp_processor_id()); 202 printk(KERN_INFO "CPU %d:", smp_processor_id());
217 __show_regs(regs); 203 __show_regs(regs);
218 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 204 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
219} 205}
@@ -239,6 +225,14 @@ void exit_thread(void)
239 t->io_bitmap_max = 0; 225 t->io_bitmap_max = 0;
240 put_cpu(); 226 put_cpu();
241 } 227 }
228#ifdef CONFIG_X86_DS
229 /* Free any DS contexts that have not been properly released. */
230 if (unlikely(t->ds_ctx)) {
231 /* we clear debugctl to make sure DS is not used. */
232 update_debugctlmsr(0);
233 ds_free(t->ds_ctx);
234 }
235#endif /* CONFIG_X86_DS */
242} 236}
243 237
244void flush_thread(void) 238void flush_thread(void)
@@ -314,10 +308,10 @@ void prepare_to_copy(struct task_struct *tsk)
314 308
315int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 309int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
316 unsigned long unused, 310 unsigned long unused,
317 struct task_struct * p, struct pt_regs * regs) 311 struct task_struct *p, struct pt_regs *regs)
318{ 312{
319 int err; 313 int err;
320 struct pt_regs * childregs; 314 struct pt_regs *childregs;
321 struct task_struct *me = current; 315 struct task_struct *me = current;
322 316
323 childregs = ((struct pt_regs *) 317 childregs = ((struct pt_regs *)
@@ -362,10 +356,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
362 if (test_thread_flag(TIF_IA32)) 356 if (test_thread_flag(TIF_IA32))
363 err = do_set_thread_area(p, -1, 357 err = do_set_thread_area(p, -1,
364 (struct user_desc __user *)childregs->si, 0); 358 (struct user_desc __user *)childregs->si, 0);
365 else 359 else
366#endif 360#endif
367 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 361 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
368 if (err) 362 if (err)
369 goto out; 363 goto out;
370 } 364 }
371 err = 0; 365 err = 0;
@@ -472,13 +466,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
472 next = &next_p->thread; 466 next = &next_p->thread;
473 467
474 debugctl = prev->debugctlmsr; 468 debugctl = prev->debugctlmsr;
475 if (next->ds_area_msr != prev->ds_area_msr) { 469
476 /* we clear debugctl to make sure DS 470#ifdef CONFIG_X86_DS
477 * is not in use when we change it */ 471 {
478 debugctl = 0; 472 unsigned long ds_prev = 0, ds_next = 0;
479 update_debugctlmsr(0); 473
480 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); 474 if (prev->ds_ctx)
475 ds_prev = (unsigned long)prev->ds_ctx->ds;
476 if (next->ds_ctx)
477 ds_next = (unsigned long)next->ds_ctx->ds;
478
479 if (ds_next != ds_prev) {
480 /*
481 * We clear debugctl to make sure DS
482 * is not in use when we change it:
483 */
484 debugctl = 0;
485 update_debugctlmsr(0);
486 wrmsrl(MSR_IA32_DS_AREA, ds_next);
487 }
481 } 488 }
489#endif /* CONFIG_X86_DS */
482 490
483 if (next->debugctlmsr != debugctl) 491 if (next->debugctlmsr != debugctl)
484 update_debugctlmsr(next->debugctlmsr); 492 update_debugctlmsr(next->debugctlmsr);
@@ -516,13 +524,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
516 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 524 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
517 } 525 }
518 526
519#ifdef X86_BTS 527#ifdef CONFIG_X86_PTRACE_BTS
520 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 528 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
521 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 529 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
522 530
523 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 531 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
524 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 532 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
525#endif 533#endif /* CONFIG_X86_PTRACE_BTS */
526} 534}
527 535
528/* 536/*
@@ -544,7 +552,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
544 unsigned fsindex, gsindex; 552 unsigned fsindex, gsindex;
545 553
546 /* we're going to use this soon, after a few expensive things */ 554 /* we're going to use this soon, after a few expensive things */
547 if (next_p->fpu_counter>5) 555 if (next_p->fpu_counter > 5)
548 prefetch(next->xstate); 556 prefetch(next->xstate);
549 557
550 /* 558 /*
@@ -552,13 +560,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552 */ 560 */
553 load_sp0(tss, next); 561 load_sp0(tss, next);
554 562
555 /* 563 /*
556 * Switch DS and ES. 564 * Switch DS and ES.
557 * This won't pick up thread selector changes, but I guess that is ok. 565 * This won't pick up thread selector changes, but I guess that is ok.
558 */ 566 */
559 savesegment(es, prev->es); 567 savesegment(es, prev->es);
560 if (unlikely(next->es | prev->es)) 568 if (unlikely(next->es | prev->es))
561 loadsegment(es, next->es); 569 loadsegment(es, next->es);
562 570
563 savesegment(ds, prev->ds); 571 savesegment(ds, prev->ds);
564 if (unlikely(next->ds | prev->ds)) 572 if (unlikely(next->ds | prev->ds))
@@ -584,7 +592,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
584 */ 592 */
585 arch_leave_lazy_cpu_mode(); 593 arch_leave_lazy_cpu_mode();
586 594
587 /* 595 /*
588 * Switch FS and GS. 596 * Switch FS and GS.
589 * 597 *
590 * Segment register != 0 always requires a reload. Also 598 * Segment register != 0 always requires a reload. Also
@@ -593,13 +601,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
593 */ 601 */
594 if (unlikely(fsindex | next->fsindex | prev->fs)) { 602 if (unlikely(fsindex | next->fsindex | prev->fs)) {
595 loadsegment(fs, next->fsindex); 603 loadsegment(fs, next->fsindex);
596 /* 604 /*
597 * Check if the user used a selector != 0; if yes 605 * Check if the user used a selector != 0; if yes
598 * clear 64bit base, since overloaded base is always 606 * clear 64bit base, since overloaded base is always
599 * mapped to the Null selector 607 * mapped to the Null selector
600 */ 608 */
601 if (fsindex) 609 if (fsindex)
602 prev->fs = 0; 610 prev->fs = 0;
603 } 611 }
604 /* when next process has a 64bit base use it */ 612 /* when next process has a 64bit base use it */
605 if (next->fs) 613 if (next->fs)
@@ -609,7 +617,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
609 if (unlikely(gsindex | next->gsindex | prev->gs)) { 617 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex); 618 load_gs_index(next->gsindex);
611 if (gsindex) 619 if (gsindex)
612 prev->gs = 0; 620 prev->gs = 0;
613 } 621 }
614 if (next->gs) 622 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 623 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
@@ -618,12 +626,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
618 /* Must be after DS reload */ 626 /* Must be after DS reload */
619 unlazy_fpu(prev_p); 627 unlazy_fpu(prev_p);
620 628
621 /* 629 /*
622 * Switch the PDA and FPU contexts. 630 * Switch the PDA and FPU contexts.
623 */ 631 */
624 prev->usersp = read_pda(oldrsp); 632 prev->usersp = read_pda(oldrsp);
625 write_pda(oldrsp, next->usersp); 633 write_pda(oldrsp, next->usersp);
626 write_pda(pcurrent, next_p); 634 write_pda(pcurrent, next_p);
627 635
628 write_pda(kernelstack, 636 write_pda(kernelstack,
629 (unsigned long)task_stack_page(next_p) + 637 (unsigned long)task_stack_page(next_p) +
@@ -664,7 +672,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
664 char __user * __user *envp, struct pt_regs *regs) 672 char __user * __user *envp, struct pt_regs *regs)
665{ 673{
666 long error; 674 long error;
667 char * filename; 675 char *filename;
668 676
669 filename = getname(name); 677 filename = getname(name);
670 error = PTR_ERR(filename); 678 error = PTR_ERR(filename);
@@ -722,55 +730,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
722unsigned long get_wchan(struct task_struct *p) 730unsigned long get_wchan(struct task_struct *p)
723{ 731{
724 unsigned long stack; 732 unsigned long stack;
725 u64 fp,ip; 733 u64 fp, ip;
726 int count = 0; 734 int count = 0;
727 735
728 if (!p || p == current || p->state==TASK_RUNNING) 736 if (!p || p == current || p->state == TASK_RUNNING)
729 return 0; 737 return 0;
730 stack = (unsigned long)task_stack_page(p); 738 stack = (unsigned long)task_stack_page(p);
731 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) 739 if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
732 return 0; 740 return 0;
733 fp = *(u64 *)(p->thread.sp); 741 fp = *(u64 *)(p->thread.sp);
734 do { 742 do {
735 if (fp < (unsigned long)stack || 743 if (fp < (unsigned long)stack ||
736 fp > (unsigned long)stack+THREAD_SIZE) 744 fp >= (unsigned long)stack+THREAD_SIZE)
737 return 0; 745 return 0;
738 ip = *(u64 *)(fp+8); 746 ip = *(u64 *)(fp+8);
739 if (!in_sched_functions(ip)) 747 if (!in_sched_functions(ip))
740 return ip; 748 return ip;
741 fp = *(u64 *)fp; 749 fp = *(u64 *)fp;
742 } while (count++ < 16); 750 } while (count++ < 16);
743 return 0; 751 return 0;
744} 752}
745 753
746long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) 754long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
747{ 755{
748 int ret = 0; 756 int ret = 0;
749 int doit = task == current; 757 int doit = task == current;
750 int cpu; 758 int cpu;
751 759
752 switch (code) { 760 switch (code) {
753 case ARCH_SET_GS: 761 case ARCH_SET_GS:
754 if (addr >= TASK_SIZE_OF(task)) 762 if (addr >= TASK_SIZE_OF(task))
755 return -EPERM; 763 return -EPERM;
756 cpu = get_cpu(); 764 cpu = get_cpu();
757 /* handle small bases via the GDT because that's faster to 765 /* handle small bases via the GDT because that's faster to
758 switch. */ 766 switch. */
759 if (addr <= 0xffffffff) { 767 if (addr <= 0xffffffff) {
760 set_32bit_tls(task, GS_TLS, addr); 768 set_32bit_tls(task, GS_TLS, addr);
761 if (doit) { 769 if (doit) {
762 load_TLS(&task->thread, cpu); 770 load_TLS(&task->thread, cpu);
763 load_gs_index(GS_TLS_SEL); 771 load_gs_index(GS_TLS_SEL);
764 } 772 }
765 task->thread.gsindex = GS_TLS_SEL; 773 task->thread.gsindex = GS_TLS_SEL;
766 task->thread.gs = 0; 774 task->thread.gs = 0;
767 } else { 775 } else {
768 task->thread.gsindex = 0; 776 task->thread.gsindex = 0;
769 task->thread.gs = addr; 777 task->thread.gs = addr;
770 if (doit) { 778 if (doit) {
771 load_gs_index(0); 779 load_gs_index(0);
772 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 780 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
773 } 781 }
774 } 782 }
775 put_cpu(); 783 put_cpu();
776 break; 784 break;
@@ -824,8 +832,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
824 rdmsrl(MSR_KERNEL_GS_BASE, base); 832 rdmsrl(MSR_KERNEL_GS_BASE, base);
825 else 833 else
826 base = task->thread.gs; 834 base = task->thread.gs;
827 } 835 } else
828 else
829 base = task->thread.gs; 836 base = task->thread.gs;
830 ret = put_user(base, (unsigned long __user *)addr); 837 ret = put_user(base, (unsigned long __user *)addr);
831 break; 838 break;
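One subtle fix in the get_wchan() hunk above: the bounds checks against stack + THREAD_SIZE changed from '>' to '>='. The task's kernel stack occupies the half-open range [stack, stack + THREAD_SIZE), so a saved stack or frame pointer equal to the upper bound is already one past the end and must be rejected. A sketch of the corrected predicate:

/* True when 'sp' points inside the stack area (half-open interval). */
static int example_sp_on_stack(unsigned long sp, unsigned long stack,
                               unsigned long thread_size)
{
        return sp >= stack && sp < stack + thread_size;
}
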
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e37dccce85db..0a6d8c12e10d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/ptrace.h> 15#include <linux/ptrace.h>
16#include <linux/regset.h> 16#include <linux/regset.h>
17#include <linux/tracehook.h>
17#include <linux/user.h> 18#include <linux/user.h>
18#include <linux/elf.h> 19#include <linux/elf.h>
19#include <linux/security.h> 20#include <linux/security.h>
@@ -39,7 +40,9 @@ enum x86_regset {
39 REGSET_GENERAL, 40 REGSET_GENERAL,
40 REGSET_FP, 41 REGSET_FP,
41 REGSET_XFP, 42 REGSET_XFP,
43 REGSET_IOPERM64 = REGSET_XFP,
42 REGSET_TLS, 44 REGSET_TLS,
45 REGSET_IOPERM32,
43}; 46};
44 47
45/* 48/*
@@ -69,7 +72,7 @@ static inline bool invalid_selector(u16 value)
69 72
70#define FLAG_MASK FLAG_MASK_32 73#define FLAG_MASK FLAG_MASK_32
71 74
72static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) 75static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
73{ 76{
74 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); 77 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
75 regno >>= 2; 78 regno >>= 2;
@@ -554,45 +557,138 @@ static int ptrace_set_debugreg(struct task_struct *child,
554 return 0; 557 return 0;
555} 558}
556 559
557#ifdef X86_BTS 560/*
561 * These access the current or another (stopped) task's io permission
562 * bitmap for debugging or core dump.
563 */
564static int ioperm_active(struct task_struct *target,
565 const struct user_regset *regset)
566{
567 return target->thread.io_bitmap_max / regset->size;
568}
558 569
559static int ptrace_bts_get_size(struct task_struct *child) 570static int ioperm_get(struct task_struct *target,
571 const struct user_regset *regset,
572 unsigned int pos, unsigned int count,
573 void *kbuf, void __user *ubuf)
560{ 574{
561 if (!child->thread.ds_area_msr) 575 if (!target->thread.io_bitmap_ptr)
562 return -ENXIO; 576 return -ENXIO;
563 577
564 return ds_get_bts_index((void *)child->thread.ds_area_msr); 578 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
579 target->thread.io_bitmap_ptr,
580 0, IO_BITMAP_BYTES);
581}
582
583#ifdef CONFIG_X86_PTRACE_BTS
584/*
585 * The configuration for a particular BTS hardware implementation.
586 */
587struct bts_configuration {
588 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
589 unsigned char sizeof_bts;
590 /* the size of a field in the BTS record in bytes */
591 unsigned char sizeof_field;
592 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
593 unsigned long debugctl_mask;
594};
595static struct bts_configuration bts_cfg;
596
597#define BTS_MAX_RECORD_SIZE (8 * 3)
598
599
600/*
601 * Branch Trace Store (BTS) uses the following format. Different
602 * architectures vary in the size of those fields.
603 * - source linear address
604 * - destination linear address
605 * - flags
606 *
607 * Later architectures use 64bit pointers throughout, whereas earlier
608 * architectures use 32bit pointers in 32bit mode.
609 *
610 * We compute the base address for the first 8 fields based on:
611 * - the field size stored in the DS configuration
612 * - the relative field position
613 *
614 * In order to store additional information in the BTS buffer, we use
615 * a special source address to indicate that the record requires
616 * special interpretation.
617 *
618 * Netburst indicated via a bit in the flags field whether the branch
619 * was predicted; this is ignored.
620 */
621
622enum bts_field {
623 bts_from = 0,
624 bts_to,
625 bts_flags,
626
627 bts_escape = (unsigned long)-1,
628 bts_qual = bts_to,
629 bts_jiffies = bts_flags
630};
631
632static inline unsigned long bts_get(const char *base, enum bts_field field)
633{
634 base += (bts_cfg.sizeof_field * field);
635 return *(unsigned long *)base;
636}
637
638static inline void bts_set(char *base, enum bts_field field, unsigned long val)
639{
 640	base += (bts_cfg.sizeof_field * field);
641 (*(unsigned long *)base) = val;
642}
643
644/*
645 * Translate a BTS record from the raw format into the bts_struct format
646 *
647 * out (out): bts_struct interpretation
648 * raw: raw BTS record
649 */
650static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
651{
652 memset(out, 0, sizeof(*out));
653 if (bts_get(raw, bts_from) == bts_escape) {
654 out->qualifier = bts_get(raw, bts_qual);
655 out->variant.jiffies = bts_get(raw, bts_jiffies);
656 } else {
657 out->qualifier = BTS_BRANCH;
658 out->variant.lbr.from_ip = bts_get(raw, bts_from);
659 out->variant.lbr.to_ip = bts_get(raw, bts_to);
660 }
565} 661}
566 662
567static int ptrace_bts_read_record(struct task_struct *child, 663static int ptrace_bts_read_record(struct task_struct *child, size_t index,
568 long index,
569 struct bts_struct __user *out) 664 struct bts_struct __user *out)
570{ 665{
571 struct bts_struct ret; 666 struct bts_struct ret;
572 int retval; 667 const void *bts_record;
573 int bts_end; 668 size_t bts_index, bts_end;
574 int bts_index; 669 int error;
575
576 if (!child->thread.ds_area_msr)
577 return -ENXIO;
578 670
579 if (index < 0) 671 error = ds_get_bts_end(child, &bts_end);
580 return -EINVAL; 672 if (error < 0)
673 return error;
581 674
582 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
583 if (bts_end <= index) 675 if (bts_end <= index)
584 return -EINVAL; 676 return -EINVAL;
585 677
678 error = ds_get_bts_index(child, &bts_index);
679 if (error < 0)
680 return error;
681
586 /* translate the ptrace bts index into the ds bts index */ 682 /* translate the ptrace bts index into the ds bts index */
587 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); 683 bts_index += bts_end - (index + 1);
588 bts_index -= (index + 1); 684 if (bts_end <= bts_index)
589 if (bts_index < 0) 685 bts_index -= bts_end;
590 bts_index += bts_end; 686
687 error = ds_access_bts(child, bts_index, &bts_record);
688 if (error < 0)
689 return error;
591 690
592 retval = ds_read_bts((void *)child->thread.ds_area_msr, 691 ptrace_bts_translate_record(&ret, bts_record);
593 bts_index, &ret);
594 if (retval < 0)
595 return retval;
596 692
597 if (copy_to_user(out, &ret, sizeof(ret))) 693 if (copy_to_user(out, &ret, sizeof(ret)))
598 return -EFAULT; 694 return -EFAULT;
@@ -600,101 +696,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
600 return sizeof(ret); 696 return sizeof(ret);
601} 697}
602 698
603static int ptrace_bts_clear(struct task_struct *child)
604{
605 if (!child->thread.ds_area_msr)
606 return -ENXIO;
607
608 return ds_clear((void *)child->thread.ds_area_msr);
609}
610
611static int ptrace_bts_drain(struct task_struct *child, 699static int ptrace_bts_drain(struct task_struct *child,
612 long size, 700 long size,
613 struct bts_struct __user *out) 701 struct bts_struct __user *out)
614{ 702{
615 int end, i; 703 struct bts_struct ret;
616 void *ds = (void *)child->thread.ds_area_msr; 704 const unsigned char *raw;
617 705 size_t end, i;
618 if (!ds) 706 int error;
619 return -ENXIO;
620 707
621 end = ds_get_bts_index(ds); 708 error = ds_get_bts_index(child, &end);
622 if (end <= 0) 709 if (error < 0)
623 return end; 710 return error;
624 711
625 if (size < (end * sizeof(struct bts_struct))) 712 if (size < (end * sizeof(struct bts_struct)))
626 return -EIO; 713 return -EIO;
627 714
628 for (i = 0; i < end; i++, out++) { 715 error = ds_access_bts(child, 0, (const void **)&raw);
629 struct bts_struct ret; 716 if (error < 0)
630 int retval; 717 return error;
631 718
632 retval = ds_read_bts(ds, i, &ret); 719 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
633 if (retval < 0) 720 ptrace_bts_translate_record(&ret, raw);
634 return retval;
635 721
636 if (copy_to_user(out, &ret, sizeof(ret))) 722 if (copy_to_user(out, &ret, sizeof(ret)))
637 return -EFAULT; 723 return -EFAULT;
638 } 724 }
639 725
640 ds_clear(ds); 726 error = ds_clear_bts(child);
727 if (error < 0)
728 return error;
641 729
642 return end; 730 return end;
643} 731}
644 732
733static void ptrace_bts_ovfl(struct task_struct *child)
734{
735 send_sig(child->thread.bts_ovfl_signal, child, 0);
736}
737
645static int ptrace_bts_config(struct task_struct *child, 738static int ptrace_bts_config(struct task_struct *child,
646 long cfg_size, 739 long cfg_size,
647 const struct ptrace_bts_config __user *ucfg) 740 const struct ptrace_bts_config __user *ucfg)
648{ 741{
649 struct ptrace_bts_config cfg; 742 struct ptrace_bts_config cfg;
650 int bts_size, ret = 0; 743 int error = 0;
651 void *ds;
652 744
745 error = -EOPNOTSUPP;
746 if (!bts_cfg.sizeof_bts)
747 goto errout;
748
749 error = -EIO;
653 if (cfg_size < sizeof(cfg)) 750 if (cfg_size < sizeof(cfg))
654 return -EIO; 751 goto errout;
655 752
753 error = -EFAULT;
656 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 754 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
657 return -EFAULT; 755 goto errout;
658 756
659 if ((int)cfg.size < 0) 757 error = -EINVAL;
660 return -EINVAL; 758 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
759 !(cfg.flags & PTRACE_BTS_O_ALLOC))
760 goto errout;
661 761
662 bts_size = 0; 762 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
663 ds = (void *)child->thread.ds_area_msr; 763 ds_ovfl_callback_t ovfl = NULL;
664 if (ds) { 764 unsigned int sig = 0;
665 bts_size = ds_get_bts_size(ds); 765
666 if (bts_size < 0) 766 /* we ignore the error in case we were not tracing child */
667 return bts_size; 767 (void)ds_release_bts(child);
668 } 768
669 cfg.size = PAGE_ALIGN(cfg.size); 769 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
770 if (!cfg.signal)
771 goto errout;
772
773 sig = cfg.signal;
774 ovfl = ptrace_bts_ovfl;
775 }
670 776
671 if (bts_size != cfg.size) { 777 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
672 ret = ptrace_bts_realloc(child, cfg.size, 778 if (error < 0)
673 cfg.flags & PTRACE_BTS_O_CUT_SIZE);
674 if (ret < 0)
675 goto errout; 779 goto errout;
676 780
677 ds = (void *)child->thread.ds_area_msr; 781 child->thread.bts_ovfl_signal = sig;
678 } 782 }
679 783
680 if (cfg.flags & PTRACE_BTS_O_SIGNAL) 784 error = -EINVAL;
681 ret = ds_set_overflow(ds, DS_O_SIGNAL); 785 if (!child->thread.ds_ctx && cfg.flags)
682 else
683 ret = ds_set_overflow(ds, DS_O_WRAP);
684 if (ret < 0)
685 goto errout; 786 goto errout;
686 787
687 if (cfg.flags & PTRACE_BTS_O_TRACE) 788 if (cfg.flags & PTRACE_BTS_O_TRACE)
688 child->thread.debugctlmsr |= ds_debugctl_mask(); 789 child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
689 else 790 else
690 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 791 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
691 792
692 if (cfg.flags & PTRACE_BTS_O_SCHED) 793 if (cfg.flags & PTRACE_BTS_O_SCHED)
693 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 794 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
694 else 795 else
695 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 796 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
696 797
697 ret = sizeof(cfg); 798 error = sizeof(cfg);
698 799
699out: 800out:
700 if (child->thread.debugctlmsr) 801 if (child->thread.debugctlmsr)
@@ -702,10 +803,10 @@ out:
702 else 803 else
703 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 804 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
704 805
705 return ret; 806 return error;
706 807
707errout: 808errout:
708 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 809 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
709 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 810 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
710 goto out; 811 goto out;
711} 812}
@@ -714,29 +815,40 @@ static int ptrace_bts_status(struct task_struct *child,
714 long cfg_size, 815 long cfg_size,
715 struct ptrace_bts_config __user *ucfg) 816 struct ptrace_bts_config __user *ucfg)
716{ 817{
717 void *ds = (void *)child->thread.ds_area_msr;
718 struct ptrace_bts_config cfg; 818 struct ptrace_bts_config cfg;
819 size_t end;
820 const void *base, *max;
821 int error;
719 822
720 if (cfg_size < sizeof(cfg)) 823 if (cfg_size < sizeof(cfg))
721 return -EIO; 824 return -EIO;
722 825
723 memset(&cfg, 0, sizeof(cfg)); 826 error = ds_get_bts_end(child, &end);
827 if (error < 0)
828 return error;
724 829
725 if (ds) { 830 error = ds_access_bts(child, /* index = */ 0, &base);
726 cfg.size = ds_get_bts_size(ds); 831 if (error < 0)
832 return error;
727 833
728 if (ds_get_overflow(ds) == DS_O_SIGNAL) 834 error = ds_access_bts(child, /* index = */ end, &max);
729 cfg.flags |= PTRACE_BTS_O_SIGNAL; 835 if (error < 0)
836 return error;
730 837
731 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 838 memset(&cfg, 0, sizeof(cfg));
732 child->thread.debugctlmsr & ds_debugctl_mask()) 839 cfg.size = (max - base);
733 cfg.flags |= PTRACE_BTS_O_TRACE; 840 cfg.signal = child->thread.bts_ovfl_signal;
841 cfg.bts_size = sizeof(struct bts_struct);
734 842
735 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 843 if (cfg.signal)
736 cfg.flags |= PTRACE_BTS_O_SCHED; 844 cfg.flags |= PTRACE_BTS_O_SIGNAL;
737 }
738 845
739 cfg.bts_size = sizeof(struct bts_struct); 846 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
847 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
848 cfg.flags |= PTRACE_BTS_O_TRACE;
849
850 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
851 cfg.flags |= PTRACE_BTS_O_SCHED;
740 852
741 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 853 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
742 return -EFAULT; 854 return -EFAULT;
@@ -744,89 +856,38 @@ static int ptrace_bts_status(struct task_struct *child,
744 return sizeof(cfg); 856 return sizeof(cfg);
745} 857}
746 858
747
748static int ptrace_bts_write_record(struct task_struct *child, 859static int ptrace_bts_write_record(struct task_struct *child,
749 const struct bts_struct *in) 860 const struct bts_struct *in)
750{ 861{
751 int retval; 862 unsigned char bts_record[BTS_MAX_RECORD_SIZE];
752 863
753 if (!child->thread.ds_area_msr) 864 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
754 return -ENXIO;
755 865
756 retval = ds_write_bts((void *)child->thread.ds_area_msr, in); 866 memset(bts_record, 0, bts_cfg.sizeof_bts);
757 if (retval) 867 switch (in->qualifier) {
758 return retval; 868 case BTS_INVALID:
869 break;
759 870
760 return sizeof(*in); 871 case BTS_BRANCH:
761} 872 bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
873 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
874 break;
762 875
763static int ptrace_bts_realloc(struct task_struct *child, 876 case BTS_TASK_ARRIVES:
764 int size, int reduce_size) 877 case BTS_TASK_DEPARTS:
765{ 878 bts_set(bts_record, bts_from, bts_escape);
766 unsigned long rlim, vm; 879 bts_set(bts_record, bts_qual, in->qualifier);
767 int ret, old_size; 880 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
881 break;
768 882
769 if (size < 0) 883 default:
770 return -EINVAL; 884 return -EINVAL;
771
772 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
773 if (old_size < 0)
774 return old_size;
775
776 ret = ds_free((void **)&child->thread.ds_area_msr);
777 if (ret < 0)
778 goto out;
779
780 size >>= PAGE_SHIFT;
781 old_size >>= PAGE_SHIFT;
782
783 current->mm->total_vm -= old_size;
784 current->mm->locked_vm -= old_size;
785
786 if (size == 0)
787 goto out;
788
789 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
790 vm = current->mm->total_vm + size;
791 if (rlim < vm) {
792 ret = -ENOMEM;
793
794 if (!reduce_size)
795 goto out;
796
797 size = rlim - current->mm->total_vm;
798 if (size <= 0)
799 goto out;
800 }
801
802 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
803 vm = current->mm->locked_vm + size;
804 if (rlim < vm) {
805 ret = -ENOMEM;
806
807 if (!reduce_size)
808 goto out;
809
810 size = rlim - current->mm->locked_vm;
811 if (size <= 0)
812 goto out;
813 } 885 }
814 886
815 ret = ds_allocate((void **)&child->thread.ds_area_msr, 887 /* The writing task will be the switched-to task on a context
816 size << PAGE_SHIFT); 888 * switch. It needs to write into the switched-from task's BTS
817 if (ret < 0) 889 * buffer. */
818 goto out; 890 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
819
820 current->mm->total_vm += size;
821 current->mm->locked_vm += size;
822
823out:
824 if (child->thread.ds_area_msr)
825 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
826 else
827 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
828
829 return ret;
830} 891}
831 892
832void ptrace_bts_take_timestamp(struct task_struct *tsk, 893void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +900,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
839 900
840 ptrace_bts_write_record(tsk, &rec); 901 ptrace_bts_write_record(tsk, &rec);
841} 902}
842#endif /* X86_BTS */ 903
904static const struct bts_configuration bts_cfg_netburst = {
905 .sizeof_bts = sizeof(long) * 3,
906 .sizeof_field = sizeof(long),
907 .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
908};
909
910static const struct bts_configuration bts_cfg_pentium_m = {
911 .sizeof_bts = sizeof(long) * 3,
912 .sizeof_field = sizeof(long),
913 .debugctl_mask = (1<<6)|(1<<7)
914};
915
916static const struct bts_configuration bts_cfg_core2 = {
917 .sizeof_bts = 8 * 3,
918 .sizeof_field = 8,
919 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
920};
921
922static inline void bts_configure(const struct bts_configuration *cfg)
923{
924 bts_cfg = *cfg;
925}
926
927void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
928{
929 switch (c->x86) {
930 case 0x6:
931 switch (c->x86_model) {
932 case 0xD:
933 case 0xE: /* Pentium M */
934 bts_configure(&bts_cfg_pentium_m);
935 break;
936 case 0xF: /* Core2 */
937 case 0x1C: /* Atom */
938 bts_configure(&bts_cfg_core2);
939 break;
940 default:
941 /* sorry, don't know about them */
942 break;
943 }
944 break;
945 case 0xF:
946 switch (c->x86_model) {
947 case 0x0:
948 case 0x1:
949 case 0x2: /* Netburst */
950 bts_configure(&bts_cfg_netburst);
951 break;
952 default:
953 /* sorry, don't know about them */
954 break;
955 }
956 break;
957 default:
958 /* sorry, don't know about them */
959 break;
960 }
961}
962#endif /* CONFIG_X86_PTRACE_BTS */
843 963
844/* 964/*
845 * Called by kernel/ptrace.c when detaching.. 965 * Called by kernel/ptrace.c when detaching..
@@ -852,15 +972,15 @@ void ptrace_disable(struct task_struct *child)
852#ifdef TIF_SYSCALL_EMU 972#ifdef TIF_SYSCALL_EMU
853 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 973 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
854#endif 974#endif
855 if (child->thread.ds_area_msr) { 975#ifdef CONFIG_X86_PTRACE_BTS
856#ifdef X86_BTS 976 (void)ds_release_bts(child);
857 ptrace_bts_realloc(child, 0, 0); 977
858#endif 978 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
859 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 979 if (!child->thread.debugctlmsr)
860 if (!child->thread.debugctlmsr) 980 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
861 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 981
862 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 982 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
863 } 983#endif /* CONFIG_X86_PTRACE_BTS */
864} 984}
865 985
866#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 986#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1100,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
980 /* 1100 /*
981 * These bits need more cooking - not enabled yet: 1101 * These bits need more cooking - not enabled yet:
982 */ 1102 */
983#ifdef X86_BTS 1103#ifdef CONFIG_X86_PTRACE_BTS
984 case PTRACE_BTS_CONFIG: 1104 case PTRACE_BTS_CONFIG:
985 ret = ptrace_bts_config 1105 ret = ptrace_bts_config
986 (child, data, (struct ptrace_bts_config __user *)addr); 1106 (child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1112,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
992 break; 1112 break;
993 1113
994 case PTRACE_BTS_SIZE: 1114 case PTRACE_BTS_SIZE:
995 ret = ptrace_bts_get_size(child); 1115 ret = ds_get_bts_index(child, /* pos = */ NULL);
996 break; 1116 break;
997 1117
998 case PTRACE_BTS_GET: 1118 case PTRACE_BTS_GET:
@@ -1001,14 +1121,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1001 break; 1121 break;
1002 1122
1003 case PTRACE_BTS_CLEAR: 1123 case PTRACE_BTS_CLEAR:
1004 ret = ptrace_bts_clear(child); 1124 ret = ds_clear_bts(child);
1005 break; 1125 break;
1006 1126
1007 case PTRACE_BTS_DRAIN: 1127 case PTRACE_BTS_DRAIN:
1008 ret = ptrace_bts_drain 1128 ret = ptrace_bts_drain
1009 (child, data, (struct bts_struct __user *) addr); 1129 (child, data, (struct bts_struct __user *) addr);
1010 break; 1130 break;
1011#endif 1131#endif /* CONFIG_X86_PTRACE_BTS */
1012 1132
1013 default: 1133 default:
1014 ret = ptrace_request(child, request, addr, data); 1134 ret = ptrace_request(child, request, addr, data);
@@ -1290,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = {
1290 .size = sizeof(long), .align = sizeof(long), 1410 .size = sizeof(long), .align = sizeof(long),
1291 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1411 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1292 }, 1412 },
1413 [REGSET_IOPERM64] = {
1414 .core_note_type = NT_386_IOPERM,
1415 .n = IO_BITMAP_LONGS,
1416 .size = sizeof(long), .align = sizeof(long),
1417 .active = ioperm_active, .get = ioperm_get
1418 },
1293}; 1419};
1294 1420
1295static const struct user_regset_view user_x86_64_view = { 1421static const struct user_regset_view user_x86_64_view = {
@@ -1336,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = {
1336 .active = regset_tls_active, 1462 .active = regset_tls_active,
1337 .get = regset_tls_get, .set = regset_tls_set 1463 .get = regset_tls_get, .set = regset_tls_set
1338 }, 1464 },
1465 [REGSET_IOPERM32] = {
1466 .core_note_type = NT_386_IOPERM,
1467 .n = IO_BITMAP_BYTES / sizeof(u32),
1468 .size = sizeof(u32), .align = sizeof(u32),
1469 .active = ioperm_active, .get = ioperm_get
1470 },
1339}; 1471};
1340 1472
1341static const struct user_regset_view user_x86_32_view = { 1473static const struct user_regset_view user_x86_32_view = {
@@ -1357,7 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1357#endif 1489#endif
1358} 1490}
1359 1491
1360void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) 1492void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1493 int error_code, int si_code)
1361{ 1494{
1362 struct siginfo info; 1495 struct siginfo info;
1363 1496
@@ -1366,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1366 1499
1367 memset(&info, 0, sizeof(info)); 1500 memset(&info, 0, sizeof(info));
1368 info.si_signo = SIGTRAP; 1501 info.si_signo = SIGTRAP;
1369 info.si_code = TRAP_BRKPT; 1502 info.si_code = si_code;
1370 1503
1371 /* User-mode ip? */ 1504 /* User-mode ip? */
1372 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1505 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
@@ -1375,30 +1508,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1375 force_sig_info(SIGTRAP, &info, tsk); 1508 force_sig_info(SIGTRAP, &info, tsk);
1376} 1509}
1377 1510
1378static void syscall_trace(struct pt_regs *regs)
1379{
1380 if (!(current->ptrace & PT_PTRACED))
1381 return;
1382
1383#if 0
1384 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
1385 current->comm,
1386 regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
1387 current_thread_info()->flags, current->ptrace);
1388#endif
1389
1390 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
1391 ? 0x80 : 0));
1392 /*
1393 * this isn't the same as continuing with a signal, but it will do
1394 * for normal use. strace only continues with a signal if the
1395 * stopping signal is not SIGTRAP. -brl
1396 */
1397 if (current->exit_code) {
1398 send_sig(current->exit_code, current, 1);
1399 current->exit_code = 0;
1400 }
1401}
1402 1511
1403#ifdef CONFIG_X86_32 1512#ifdef CONFIG_X86_32
1404# define IS_IA32 1 1513# define IS_IA32 1
@@ -1432,8 +1541,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1432 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) 1541 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1433 ret = -1L; 1542 ret = -1L;
1434 1543
1435 if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) 1544 if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
1436 syscall_trace(regs); 1545 tracehook_report_syscall_entry(regs))
1546 ret = -1L;
1437 1547
1438 if (unlikely(current->audit_context)) { 1548 if (unlikely(current->audit_context)) {
1439 if (IS_IA32) 1549 if (IS_IA32)
@@ -1459,7 +1569,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1459 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1569 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1460 1570
1461 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1571 if (test_thread_flag(TIF_SYSCALL_TRACE))
1462 syscall_trace(regs); 1572 tracehook_report_syscall_exit(regs, 0);
1463 1573
1464 /* 1574 /*
1465 * If TIF_SYSCALL_EMU is set, we only get here because of 1575 * If TIF_SYSCALL_EMU is set, we only get here because of
@@ -1475,6 +1585,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1475 * system call instruction. 1585 * system call instruction.
1476 */ 1586 */
1477 if (test_thread_flag(TIF_SINGLESTEP) && 1587 if (test_thread_flag(TIF_SINGLESTEP) &&
1478 (current->ptrace & PT_PTRACED)) 1588 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
1479 send_sigtrap(current, regs, 0); 1589 send_sigtrap(current, regs, 0, TRAP_BRKPT);
1480} 1590}
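
The ptrace BTS hunks above route PTRACE_BTS_CONFIG/STATUS/GET/CLEAR/DRAIN through the new ds_* layer. A minimal user-space sketch of driving that interface from a tracer is shown below; the PTRACE_BTS_* requests and PTRACE_BTS_O_* flags are assumed to come from this kernel's exported ptrace-abi header, and the config struct here is only an illustrative mirror, not the kernel's definition.

/* Sketch: enable branch tracing on an already-attached, stopped tracee.
 * Assumes the PTRACE_BTS_* requests and PTRACE_BTS_O_* flags exported
 * by this kernel's headers are visible to user space. */
#include <stdio.h>
#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>

/* Illustrative mirror of the kernel's struct ptrace_bts_config. */
struct bts_config {
	unsigned int size;	/* requested buffer size in bytes       */
	unsigned int flags;	/* PTRACE_BTS_O_ALLOC | _TRACE | ...    */
	unsigned int signal;	/* overflow signal, used with _O_SIGNAL */
	unsigned int bts_size;	/* filled in by PTRACE_BTS_STATUS       */
};

static int bts_enable(pid_t child)
{
	struct bts_config cfg;

	memset(&cfg, 0, sizeof(cfg));
	cfg.size  = 4096;				/* one page of records */
	cfg.flags = PTRACE_BTS_O_ALLOC | PTRACE_BTS_O_TRACE;

	/* arch_ptrace() takes the config via addr and its size via data. */
	if (ptrace(PTRACE_BTS_CONFIG, child, &cfg, (void *)sizeof(cfg)) < 0) {
		perror("PTRACE_BTS_CONFIG");
		return -1;
	}
	return 0;
}
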
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 06a9f643817e..f4c93f1cfc19 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
29 29
30static const struct desc_ptr no_idt = {}; 30static const struct desc_ptr no_idt = {};
31static int reboot_mode; 31static int reboot_mode;
32enum reboot_type reboot_type = BOOT_KBD; 32/*
33 * Keyboard reset and triple fault may result in INIT, not RESET, which
34 * doesn't work when we're in vmx root mode. Try ACPI first.
35 */
36enum reboot_type reboot_type = BOOT_ACPI;
33int reboot_force; 37int reboot_force;
34 38
35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 39#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -414,25 +418,20 @@ void native_machine_shutdown(void)
414 418
415 /* The boot cpu is always logical cpu 0 */ 419 /* The boot cpu is always logical cpu 0 */
416 int reboot_cpu_id = 0; 420 int reboot_cpu_id = 0;
417 cpumask_of_cpu_ptr(newmask, reboot_cpu_id);
418 421
419#ifdef CONFIG_X86_32 422#ifdef CONFIG_X86_32
420 /* See if there has been given a command line override */ 423 /* See if there has been given a command line override */
421 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && 424 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
422 cpu_online(reboot_cpu)) { 425 cpu_online(reboot_cpu))
423 reboot_cpu_id = reboot_cpu; 426 reboot_cpu_id = reboot_cpu;
424 cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
425 }
426#endif 427#endif
427 428
428 /* Make certain the cpu I'm about to reboot on is online */ 429 /* Make certain the cpu I'm about to reboot on is online */
429 if (!cpu_online(reboot_cpu_id)) { 430 if (!cpu_online(reboot_cpu_id))
430 reboot_cpu_id = smp_processor_id(); 431 reboot_cpu_id = smp_processor_id();
431 cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
432 }
433 432
434 /* Make certain I only run on the appropriate processor */ 433 /* Make certain I only run on the appropriate processor */
435 set_cpus_allowed_ptr(current, newmask); 434 set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id));
436 435
437 /* O.K Now that I'm on the appropriate processor, 436 /* O.K Now that I'm on the appropriate processor,
438 * stop all of the others. 437 * stop all of the others.
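
The shutdown hunk above drops the cpumask_of_cpu_ptr bookkeeping and pins the task with a direct cpumask_of_cpu() lookup. The same pattern, reduced to a sketch for a kernel of this vintage where cpumask_of_cpu() yields a cpumask_t whose address can be taken:

#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/smp.h>

/* Sketch: migrate the current task onto @cpu, falling back to the CPU
 * we are already running on if @cpu is offline.  Mirrors the reboot
 * path above; like it, the caller is assumed not to care that the
 * processor-id read is racy.  Not a drop-in kernel function. */
static void pin_current_to_cpu(int cpu)
{
	if (!cpu_online(cpu))
		cpu = smp_processor_id();

	/* cpumask_of_cpu() builds the single-bit mask in place, so no
	 * separately allocated map (the removed cpumask_of_cpu_map) is
	 * needed any more. */
	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
}
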
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..6f50664b2ba5 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,45 @@
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21#define PAE_PGD_ATTR (_PAGE_PRESENT) 21#define PAE_PGD_ATTR (_PAGE_PRESENT)
22 22
23/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * ~ control_page + PAGE_SIZE are used as data storage and stack for
25 * jumping back
26 */
27#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
28
29/* Minimal CPU state */
30#define ESP DATA(0x0)
31#define CR0 DATA(0x4)
32#define CR3 DATA(0x8)
33#define CR4 DATA(0xc)
34
35/* other data */
36#define CP_VA_CONTROL_PAGE DATA(0x10)
37#define CP_PA_PGD DATA(0x14)
38#define CP_PA_SWAP_PAGE DATA(0x18)
39#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
40
23 .text 41 .text
24 .align PAGE_SIZE 42 .align PAGE_SIZE
25 .globl relocate_kernel 43 .globl relocate_kernel
26relocate_kernel: 44relocate_kernel:
27 movl 8(%esp), %ebp /* list of pages */ 45 /* Save the CPU context, used for jumping back */
46
47 pushl %ebx
48 pushl %esi
49 pushl %edi
50 pushl %ebp
51 pushf
52
53 movl 20+8(%esp), %ebp /* list of pages */
54 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
55 movl %esp, ESP(%edi)
56 movl %cr0, %eax
57 movl %eax, CR0(%edi)
58 movl %cr3, %eax
59 movl %eax, CR3(%edi)
60 movl %cr4, %eax
61 movl %eax, CR4(%edi)
28 62
29#ifdef CONFIG_X86_PAE 63#ifdef CONFIG_X86_PAE
30 /* map the control page at its virtual address */ 64 /* map the control page at its virtual address */
@@ -138,15 +172,25 @@ relocate_kernel:
138 172
139relocate_new_kernel: 173relocate_new_kernel:
140 /* read the arguments and say goodbye to the stack */ 174 /* read the arguments and say goodbye to the stack */
141 movl 4(%esp), %ebx /* page_list */ 175 movl 20+4(%esp), %ebx /* page_list */
142 movl 8(%esp), %ebp /* list of pages */ 176 movl 20+8(%esp), %ebp /* list of pages */
143 movl 12(%esp), %edx /* start address */ 177 movl 20+12(%esp), %edx /* start address */
144 movl 16(%esp), %ecx /* cpu_has_pae */ 178 movl 20+16(%esp), %ecx /* cpu_has_pae */
179 movl 20+20(%esp), %esi /* preserve_context */
145 180
146 /* zero out flags, and disable interrupts */ 181 /* zero out flags, and disable interrupts */
147 pushl $0 182 pushl $0
148 popfl 183 popfl
149 184
185 /* save some information for jumping back */
186 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
187 movl %edi, CP_VA_CONTROL_PAGE(%edi)
188 movl PTR(PA_PGD)(%ebp), %eax
189 movl %eax, CP_PA_PGD(%edi)
190 movl PTR(PA_SWAP_PAGE)(%ebp), %eax
191 movl %eax, CP_PA_SWAP_PAGE(%edi)
192 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
193
150 /* get physical address of control page now */ 194 /* get physical address of control page now */
151 /* this is impossible after page table switch */ 195 /* this is impossible after page table switch */
152 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 196 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +241,90 @@ identity_mapped:
197 xorl %eax, %eax 241 xorl %eax, %eax
198 movl %eax, %cr3 242 movl %eax, %cr3
199 243
244 movl CP_PA_SWAP_PAGE(%edi), %eax
245 pushl %eax
246 pushl %ebx
247 call swap_pages
248 addl $8, %esp
249
250 /* To be certain of avoiding problems with self-modifying code
251 * I need to execute a serializing instruction here.
252 * So I flush the TLB, it's handy, and not processor dependent.
253 */
254 xorl %eax, %eax
255 movl %eax, %cr3
256
257 /* set all of the registers to known values */
258 /* leave %esp alone */
259
260 testl %esi, %esi
261 jnz 1f
262 xorl %edi, %edi
263 xorl %eax, %eax
264 xorl %ebx, %ebx
265 xorl %ecx, %ecx
266 xorl %edx, %edx
267 xorl %esi, %esi
268 xorl %ebp, %ebp
269 ret
2701:
271 popl %edx
272 movl CP_PA_SWAP_PAGE(%edi), %esp
273 addl $PAGE_SIZE, %esp
2742:
275 call *%edx
276
277 /* get the re-entry point of the peer system */
278 movl 0(%esp), %ebp
279 call 1f
2801:
281 popl %ebx
282 subl $(1b - relocate_kernel), %ebx
283 movl CP_VA_CONTROL_PAGE(%ebx), %edi
284 lea PAGE_SIZE(%ebx), %esp
285 movl CP_PA_SWAP_PAGE(%ebx), %eax
286 movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
287 pushl %eax
288 pushl %edx
289 call swap_pages
290 addl $8, %esp
291 movl CP_PA_PGD(%ebx), %eax
292 movl %eax, %cr3
293 movl %cr0, %eax
294 orl $(1<<31), %eax
295 movl %eax, %cr0
296 lea PAGE_SIZE(%edi), %esp
297 movl %edi, %eax
298 addl $(virtual_mapped - relocate_kernel), %eax
299 pushl %eax
300 ret
301
302virtual_mapped:
303 movl CR4(%edi), %eax
304 movl %eax, %cr4
305 movl CR3(%edi), %eax
306 movl %eax, %cr3
307 movl CR0(%edi), %eax
308 movl %eax, %cr0
309 movl ESP(%edi), %esp
310 movl %ebp, %eax
311
312 popf
313 popl %ebp
314 popl %edi
315 popl %esi
316 popl %ebx
317 ret
318
200 /* Do the copies */ 319 /* Do the copies */
201 movl %ebx, %ecx 320swap_pages:
321 movl 8(%esp), %edx
322 movl 4(%esp), %ecx
323 pushl %ebp
324 pushl %ebx
325 pushl %edi
326 pushl %esi
327 movl %ecx, %ebx
202 jmp 1f 328 jmp 1f
203 329
2040: /* top, read another word from the indirection page */ 3300: /* top, read another word from the indirection page */
@@ -226,27 +352,31 @@ identity_mapped:
226 movl %ecx, %esi /* For every source page do a copy */ 352 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi 353 andl $0xfffff000, %esi
228 354
355 movl %edi, %eax
356 movl %esi, %ebp
357
358 movl %edx, %edi
229 movl $1024, %ecx 359 movl $1024, %ecx
230 rep ; movsl 360 rep ; movsl
231 jmp 0b
232
2333:
234 361
235 /* To be certain of avoiding problems with self-modifying code 362 movl %ebp, %edi
236 * I need to execute a serializing instruction here. 363 movl %eax, %esi
237 * So I flush the TLB, it's handy, and not processor dependent. 364 movl $1024, %ecx
238 */ 365 rep ; movsl
239 xorl %eax, %eax
240 movl %eax, %cr3
241 366
242 /* set all of the registers to known values */ 367 movl %eax, %edi
243 /* leave %esp alone */ 368 movl %edx, %esi
369 movl $1024, %ecx
370 rep ; movsl
244 371
245 xorl %eax, %eax 372 lea PAGE_SIZE(%ebp), %esi
246 xorl %ebx, %ebx 373 jmp 0b
247 xorl %ecx, %ecx 3743:
248 xorl %edx, %edx 375 popl %esi
249 xorl %esi, %esi 376 popl %edi
250 xorl %edi, %edi 377 popl %ebx
251 xorl %ebp, %ebp 378 popl %ebp
252 ret 379 ret
380
381 .globl kexec_control_code_size
382.set kexec_control_code_size, . - relocate_kernel
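
The new DATA() offsets reserve a small save area at control_page + KEXEC_CONTROL_CODE_MAX_SIZE so the relocated code can jump back into the old kernel. Seen from C on 32-bit, that area corresponds to the layout below; the struct is purely illustrative and does not exist in the source.

#include <linux/types.h>

/* Illustrative C view of the jump-back save area kept by
 * relocate_kernel_32.S (field offsets match the DATA() macros). */
struct kexec_jump_back_area {
	u32 esp;			/* DATA(0x0)  saved %esp           */
	u32 cr0;			/* DATA(0x4)  saved %cr0           */
	u32 cr3;			/* DATA(0x8)  saved %cr3           */
	u32 cr4;			/* DATA(0xc)  saved %cr4           */
	u32 va_control_page;		/* DATA(0x10) control page VA     */
	u32 pa_pgd;			/* DATA(0x14) identity-map PGD PA */
	u32 pa_swap_page;		/* DATA(0x18) scratch page PA     */
	u32 pa_backup_pages_map;	/* DATA(0x1c) page_list pointer   */
};
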
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b520dae02bf4..21b8e0a59780 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -223,6 +223,9 @@ unsigned long saved_video_mode;
223#define RAMDISK_LOAD_FLAG 0x4000 223#define RAMDISK_LOAD_FLAG 0x4000
224 224
225static char __initdata command_line[COMMAND_LINE_SIZE]; 225static char __initdata command_line[COMMAND_LINE_SIZE];
226#ifdef CONFIG_CMDLINE_BOOL
227static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
228#endif
226 229
227#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 230#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
228struct edd edd; 231struct edd edd;
@@ -445,7 +448,7 @@ static void __init reserve_early_setup_data(void)
445 * @size: Size of the crashkernel memory to reserve. 448 * @size: Size of the crashkernel memory to reserve.
446 * Returns the base address on success, and -1ULL on failure. 449 * Returns the base address on success, and -1ULL on failure.
447 */ 450 */
448unsigned long long find_and_reserve_crashkernel(unsigned long long size) 451unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
449{ 452{
450 const unsigned long long alignment = 16<<20; /* 16M */ 453 const unsigned long long alignment = 16<<20; /* 16M */
451 unsigned long long start = 0LL; 454 unsigned long long start = 0LL;
@@ -579,6 +582,190 @@ static struct x86_quirks default_x86_quirks __initdata;
579struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; 582struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
580 583
581/* 584/*
585 * Some BIOSes seem to corrupt the low 64k of memory during events
586 * like suspend/resume and unplugging an HDMI cable. Reserve all
587 * remaining free memory in that area and fill it with a distinct
588 * pattern.
589 */
590#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
591#define MAX_SCAN_AREAS 8
592
593static int __read_mostly memory_corruption_check = -1;
594
595static unsigned __read_mostly corruption_check_size = 64*1024;
596static unsigned __read_mostly corruption_check_period = 60; /* seconds */
597
598static struct e820entry scan_areas[MAX_SCAN_AREAS];
599static int num_scan_areas;
600
601
602static int set_corruption_check(char *arg)
603{
604 char *end;
605
606 memory_corruption_check = simple_strtol(arg, &end, 10);
607
608 return (*end == 0) ? 0 : -EINVAL;
609}
610early_param("memory_corruption_check", set_corruption_check);
611
612static int set_corruption_check_period(char *arg)
613{
614 char *end;
615
616 corruption_check_period = simple_strtoul(arg, &end, 10);
617
618 return (*end == 0) ? 0 : -EINVAL;
619}
620early_param("memory_corruption_check_period", set_corruption_check_period);
621
622static int set_corruption_check_size(char *arg)
623{
624 char *end;
625 unsigned size;
626
627 size = memparse(arg, &end);
628
629 if (*end == '\0')
630 corruption_check_size = size;
631
632 return (size == corruption_check_size) ? 0 : -EINVAL;
633}
634early_param("memory_corruption_check_size", set_corruption_check_size);
635
636
637static void __init setup_bios_corruption_check(void)
638{
639 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
640
641 if (memory_corruption_check == -1) {
642 memory_corruption_check =
643#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
644 1
645#else
646 0
647#endif
648 ;
649 }
650
651 if (corruption_check_size == 0)
652 memory_corruption_check = 0;
653
654 if (!memory_corruption_check)
655 return;
656
657 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
658
659 while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
660 u64 size;
661 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
662
663 if (addr == 0)
664 break;
665
666 if ((addr + size) > corruption_check_size)
667 size = corruption_check_size - addr;
668
669 if (size == 0)
670 break;
671
672 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
673 scan_areas[num_scan_areas].addr = addr;
674 scan_areas[num_scan_areas].size = size;
675 num_scan_areas++;
676
677 /* Assume we've already mapped this early memory */
678 memset(__va(addr), 0, size);
679
680 addr += size;
681 }
682
683 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
684 num_scan_areas);
685 update_e820();
686}
687
688static struct timer_list periodic_check_timer;
689
690void check_for_bios_corruption(void)
691{
692 int i;
693 int corruption = 0;
694
695 if (!memory_corruption_check)
696 return;
697
698 for(i = 0; i < num_scan_areas; i++) {
699 unsigned long *addr = __va(scan_areas[i].addr);
700 unsigned long size = scan_areas[i].size;
701
702 for(; size; addr++, size -= sizeof(unsigned long)) {
703 if (!*addr)
704 continue;
705 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
706 addr, __pa(addr), *addr);
707 corruption = 1;
708 *addr = 0;
709 }
710 }
711
712 WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
713}
714
715static void periodic_check_for_corruption(unsigned long data)
716{
717 check_for_bios_corruption();
718 mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
719}
720
721void start_periodic_check_for_corruption(void)
722{
723 if (!memory_corruption_check || corruption_check_period == 0)
724 return;
725
726 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
727 corruption_check_period);
728
729 init_timer(&periodic_check_timer);
730 periodic_check_timer.function = &periodic_check_for_corruption;
731 periodic_check_for_corruption(0);
732}
733#endif
734
735static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
736{
737 printk(KERN_NOTICE
738 "%s detected: BIOS may corrupt low RAM, working it around.\n",
739 d->ident);
740
741 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
742 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
743
744 return 0;
745}
746
747/* List of systems that have known low memory corruption BIOS problems */
748static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
749#ifdef CONFIG_X86_RESERVE_LOW_64K
750 {
751 .callback = dmi_low_memory_corruption,
752 .ident = "AMI BIOS",
753 .matches = {
754 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
755 },
756 },
757 {
758 .callback = dmi_low_memory_corruption,
759 .ident = "Phoenix BIOS",
760 .matches = {
761 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
762 },
763 },
764#endif
765 {}
766};
767
768/*
582 * Determine if we were loaded by an EFI loader. If so, then we have also been 769 * Determine if we were loaded by an EFI loader. If so, then we have also been
583 * passed the efi memmap, systab, etc., so we should use these data structures 770 * passed the efi memmap, systab, etc., so we should use these data structures
584 * for initialization. Note, the efi init code path is determined by the 771 * for initialization. Note, the efi init code path is determined by the
@@ -665,11 +852,36 @@ void __init setup_arch(char **cmdline_p)
665 bss_resource.start = virt_to_phys(&__bss_start); 852 bss_resource.start = virt_to_phys(&__bss_start);
666 bss_resource.end = virt_to_phys(&__bss_stop)-1; 853 bss_resource.end = virt_to_phys(&__bss_stop)-1;
667 854
855#ifdef CONFIG_CMDLINE_BOOL
856#ifdef CONFIG_CMDLINE_OVERRIDE
857 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
858#else
859 if (builtin_cmdline[0]) {
860 /* append boot loader cmdline to builtin */
861 strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
862 strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
863 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
864 }
865#endif
866#endif
867
668 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 868 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
669 *cmdline_p = command_line; 869 *cmdline_p = command_line;
670 870
671 parse_early_param(); 871 parse_early_param();
672 872
873#ifdef CONFIG_X86_64
874 check_efer();
875#endif
876
877#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
878 /*
879 * Must be before kernel pagetables are setup
880 * or fixmap area is touched.
881 */
882 vmi_init();
883#endif
884
673 /* after early param, so could get panic from serial */ 885 /* after early param, so could get panic from serial */
674 reserve_early_setup_data(); 886 reserve_early_setup_data();
675 887
@@ -687,6 +899,10 @@ void __init setup_arch(char **cmdline_p)
687 899
688 finish_e820_parsing(); 900 finish_e820_parsing();
689 901
902 dmi_scan_machine();
903
904 dmi_check_system(bad_bios_dmi_table);
905
690#ifdef CONFIG_X86_32 906#ifdef CONFIG_X86_32
691 probe_roms(); 907 probe_roms();
692#endif 908#endif
@@ -730,7 +946,8 @@ void __init setup_arch(char **cmdline_p)
730#else 946#else
731 num_physpages = max_pfn; 947 num_physpages = max_pfn;
732 948
733 check_efer(); 949 if (cpu_has_x2apic)
950 check_x2apic();
734 951
735 /* How many end-of-memory variables you have, grandma! */ 952 /* How many end-of-memory variables you have, grandma! */
736 /* need this before calling reserve_initrd */ 953 /* need this before calling reserve_initrd */
@@ -742,6 +959,10 @@ void __init setup_arch(char **cmdline_p)
742 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 959 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
743#endif 960#endif
744 961
962#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
963 setup_bios_corruption_check();
964#endif
965
745 /* max_pfn_mapped is updated here */ 966 /* max_pfn_mapped is updated here */
746 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 967 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
747 max_pfn_mapped = max_low_pfn_mapped; 968 max_pfn_mapped = max_low_pfn_mapped;
@@ -770,8 +991,6 @@ void __init setup_arch(char **cmdline_p)
770 vsmp_init(); 991 vsmp_init();
771#endif 992#endif
772 993
773 dmi_scan_machine();
774
775 io_delay_init(); 994 io_delay_init();
776 995
777 /* 996 /*
@@ -788,10 +1007,6 @@ void __init setup_arch(char **cmdline_p)
788 1007
789 initmem_init(0, max_pfn); 1008 initmem_init(0, max_pfn);
790 1009
791#ifdef CONFIG_X86_64
792 dma32_reserve_bootmem();
793#endif
794
795#ifdef CONFIG_ACPI_SLEEP 1010#ifdef CONFIG_ACPI_SLEEP
796 /* 1011 /*
797 * Reserve low memory region for sleep support. 1012 * Reserve low memory region for sleep support.
@@ -806,20 +1021,21 @@ void __init setup_arch(char **cmdline_p)
806#endif 1021#endif
807 reserve_crashkernel(); 1022 reserve_crashkernel();
808 1023
1024#ifdef CONFIG_X86_64
1025 /*
1026 * dma32_reserve_bootmem() allocates bootmem which may conflict
1027 * with the crashkernel command line, so do that after
1028 * reserve_crashkernel()
1029 */
1030 dma32_reserve_bootmem();
1031#endif
1032
809 reserve_ibft_region(); 1033 reserve_ibft_region();
810 1034
811#ifdef CONFIG_KVM_CLOCK 1035#ifdef CONFIG_KVM_CLOCK
812 kvmclock_init(); 1036 kvmclock_init();
813#endif 1037#endif
814 1038
815#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
816 /*
817 * Must be after max_low_pfn is determined, and before kernel
818 * pagetables are setup.
819 */
820 vmi_init();
821#endif
822
823 paravirt_pagetable_setup_start(swapper_pg_dir); 1039 paravirt_pagetable_setup_start(swapper_pg_dir);
824 paging_init(); 1040 paging_init();
825 paravirt_pagetable_setup_done(swapper_pg_dir); 1041 paravirt_pagetable_setup_done(swapper_pg_dir);
@@ -856,12 +1072,6 @@ void __init setup_arch(char **cmdline_p)
856 init_apic_mappings(); 1072 init_apic_mappings();
857 ioapic_init_mappings(); 1073 ioapic_init_mappings();
858 1074
859#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
860 if (def_to_bigsmp)
861 printk(KERN_WARNING "More than 8 CPUs detected and "
862 "CONFIG_X86_PC cannot handle it.\nUse "
863 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
864#endif
865 kvm_guest_init(); 1075 kvm_guest_init();
866 1076
867 e820_reserve_resources(); 1077 e820_reserve_resources();
@@ -883,3 +1093,5 @@ void __init setup_arch(char **cmdline_p)
883#endif 1093#endif
884#endif 1094#endif
885} 1095}
1096
1097
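
The memory_corruption_check knobs added above all use the early_param() hook so they are parsed before ordinary __setup() handlers run. The same pattern, reduced to a sketch with an invented parameter name:

#include <linux/init.h>
#include <linux/kernel.h>

static int __read_mostly example_check_enabled;	/* hypothetical knob */

/* Parses "example_check=<int>" from the kernel command line during
 * early boot; trailing non-numeric characters are rejected, matching
 * the set_corruption_check() helpers above. */
static int __init set_example_check(char *arg)
{
	char *end;

	example_check_enabled = simple_strtol(arg, &end, 10);

	return (*end == 0) ? 0 : -EINVAL;
}
early_param("example_check", set_example_check);
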
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index f7745f94c006..0e67f72d9316 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,24 +80,6 @@ static void __init setup_per_cpu_maps(void)
80#endif 80#endif
81} 81}
82 82
83#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
84cpumask_t *cpumask_of_cpu_map __read_mostly;
85EXPORT_SYMBOL(cpumask_of_cpu_map);
86
87/* requires nr_cpu_ids to be initialized */
88static void __init setup_cpumask_of_cpu(void)
89{
90 int i;
91
92 /* alloc_bootmem zeroes memory */
93 cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
94 for (i = 0; i < nr_cpu_ids; i++)
95 cpu_set(i, cpumask_of_cpu_map[i]);
96}
97#else
98static inline void setup_cpumask_of_cpu(void) { }
99#endif
100
101#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
102/* 84/*
103 * Great future not-so-futuristic plan: make i386 and x86_64 do it 85 * Great future not-so-futuristic plan: make i386 and x86_64 do it
@@ -180,9 +162,16 @@ void __init setup_per_cpu_areas(void)
180 printk(KERN_INFO 162 printk(KERN_INFO
181 "cpu %d has no node %d or node-local memory\n", 163 "cpu %d has no node %d or node-local memory\n",
182 cpu, node); 164 cpu, node);
165 if (ptr)
166 printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
167 cpu, __pa(ptr));
183 } 168 }
184 else 169 else {
185 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); 170 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
171 if (ptr)
172 printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
173 cpu, node, __pa(ptr));
174 }
186#endif 175#endif
187 per_cpu_offset(cpu) = ptr - __per_cpu_start; 176 per_cpu_offset(cpu) = ptr - __per_cpu_start;
188 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 177 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
@@ -197,9 +186,6 @@ void __init setup_per_cpu_areas(void)
197 186
198 /* Setup node to cpumask map */ 187 /* Setup node to cpumask map */
199 setup_node_to_cpumask_map(); 188 setup_node_to_cpumask_map();
200
201 /* Setup cpumask_of_cpu map */
202 setup_cpumask_of_cpu();
203} 189}
204 190
205#endif 191#endif
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 72bbb519d2dc..cc673aa55ce4 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -3,9 +3,18 @@ struct sigframe {
3 char __user *pretcode; 3 char __user *pretcode;
4 int sig; 4 int sig;
5 struct sigcontext sc; 5 struct sigcontext sc;
6 struct _fpstate fpstate; 6 /*
7 * fpstate is unused. fpstate is moved/allocated after
8 * retcode[] below. This movement allows to have the FP state and the
9 * future state extensions (xsave) stay together.
10 * And at the same time retaining the unused fpstate, prevents changing
11 * the offset of extramask[] in the sigframe and thus prevent any
12 * legacy application accessing/modifying it.
13 */
14 struct _fpstate fpstate_unused;
7 unsigned long extramask[_NSIG_WORDS-1]; 15 unsigned long extramask[_NSIG_WORDS-1];
8 char retcode[8]; 16 char retcode[8];
17 /* fp state follows here */
9}; 18};
10 19
11struct rt_sigframe { 20struct rt_sigframe {
@@ -15,13 +24,19 @@ struct rt_sigframe {
15 void __user *puc; 24 void __user *puc;
16 struct siginfo info; 25 struct siginfo info;
17 struct ucontext uc; 26 struct ucontext uc;
18 struct _fpstate fpstate;
19 char retcode[8]; 27 char retcode[8];
28 /* fp state follows here */
20}; 29};
21#else 30#else
22struct rt_sigframe { 31struct rt_sigframe {
23 char __user *pretcode; 32 char __user *pretcode;
24 struct ucontext uc; 33 struct ucontext uc;
25 struct siginfo info; 34 struct siginfo info;
35 /* fp state follows here */
26}; 36};
37
38int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
39 sigset_t *set, struct pt_regs *regs);
40int ia32_setup_frame(int sig, struct k_sigaction *ka,
41 sigset_t *set, struct pt_regs *regs);
27#endif 42#endif
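
The comment above keeps the dead _fpstate member purely so the offset of extramask[] stays where legacy applications expect it, while the live FP/xstate data now follows retcode[] on the stack. A small user-space toy that demonstrates the offset-preservation argument with stand-in types follows; the member sizes are placeholders, not the real kernel sizes.

#include <stddef.h>
#include <stdio.h>

/* Stand-ins for the kernel types; sizes are arbitrary placeholders. */
struct _fpstate_stub   { char space[624]; };
struct sigcontext_stub { char space[88];  };

struct sigframe_stub {
	void *pretcode;
	int sig;
	struct sigcontext_stub sc;
	struct _fpstate_stub fpstate_unused;	/* kept only for the ABI */
	unsigned long extramask[1];		/* _NSIG_WORDS - 1       */
	char retcode[8];
	/* the real fp/xstate area is allocated after the frame */
};

int main(void)
{
	/* As long as fpstate_unused keeps its size and position, the
	 * offset used to reach extramask[] does not move, which is the
	 * point made by the comment in sigframe.h. */
	printf("extramask[] offset: %zu\n",
	       offsetof(struct sigframe_stub, extramask));
	return 0;
}
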
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 6fb5bcdd8933..d6dd057d0f22 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/wait.h> 19#include <linux/wait.h>
20#include <linux/tracehook.h>
20#include <linux/elf.h> 21#include <linux/elf.h>
21#include <linux/smp.h> 22#include <linux/smp.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
@@ -26,6 +27,8 @@
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/i387.h> 28#include <asm/i387.h>
28#include <asm/vdso.h> 29#include <asm/vdso.h>
30#include <asm/syscall.h>
31#include <asm/syscalls.h>
29 32
30#include "sigframe.h" 33#include "sigframe.h"
31 34
@@ -110,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
110 return do_sigaltstack(uss, uoss, regs->sp); 113 return do_sigaltstack(uss, uoss, regs->sp);
111} 114}
112 115
116#define COPY(x) { \
117 err |= __get_user(regs->x, &sc->x); \
118}
119
120#define COPY_SEG(seg) { \
121 unsigned short tmp; \
122 err |= __get_user(tmp, &sc->seg); \
123 regs->seg = tmp; \
124}
125
126#define COPY_SEG_STRICT(seg) { \
127 unsigned short tmp; \
128 err |= __get_user(tmp, &sc->seg); \
129 regs->seg = tmp | 3; \
130}
131
132#define GET_SEG(seg) { \
133 unsigned short tmp; \
134 err |= __get_user(tmp, &sc->seg); \
135 loadsegment(seg, tmp); \
136}
113 137
114/* 138/*
115 * Do a signal return; undo the signal stack. 139 * Do a signal return; undo the signal stack.
@@ -118,28 +142,13 @@ static int
118restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 142restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
119 unsigned long *pax) 143 unsigned long *pax)
120{ 144{
145 void __user *buf;
146 unsigned int tmpflags;
121 unsigned int err = 0; 147 unsigned int err = 0;
122 148
123 /* Always make any pending restarted system calls return -EINTR */ 149 /* Always make any pending restarted system calls return -EINTR */
124 current_thread_info()->restart_block.fn = do_no_restart_syscall; 150 current_thread_info()->restart_block.fn = do_no_restart_syscall;
125 151
126#define COPY(x) err |= __get_user(regs->x, &sc->x)
127
128#define COPY_SEG(seg) \
129 { unsigned short tmp; \
130 err |= __get_user(tmp, &sc->seg); \
131 regs->seg = tmp; }
132
133#define COPY_SEG_STRICT(seg) \
134 { unsigned short tmp; \
135 err |= __get_user(tmp, &sc->seg); \
136 regs->seg = tmp|3; }
137
138#define GET_SEG(seg) \
139 { unsigned short tmp; \
140 err |= __get_user(tmp, &sc->seg); \
141 loadsegment(seg, tmp); }
142
143 GET_SEG(gs); 152 GET_SEG(gs);
144 COPY_SEG(fs); 153 COPY_SEG(fs);
145 COPY_SEG(es); 154 COPY_SEG(es);
@@ -149,38 +158,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
149 COPY_SEG_STRICT(cs); 158 COPY_SEG_STRICT(cs);
150 COPY_SEG_STRICT(ss); 159 COPY_SEG_STRICT(ss);
151 160
152 { 161 err |= __get_user(tmpflags, &sc->flags);
153 unsigned int tmpflags; 162 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
154 163 regs->orig_ax = -1; /* disable syscall checks */
155 err |= __get_user(tmpflags, &sc->flags);
156 regs->flags = (regs->flags & ~FIX_EFLAGS) |
157 (tmpflags & FIX_EFLAGS);
158 regs->orig_ax = -1; /* disable syscall checks */
159 }
160 164
161 { 165 err |= __get_user(buf, &sc->fpstate);
162 struct _fpstate __user *buf; 166 err |= restore_i387_xstate(buf);
163
164 err |= __get_user(buf, &sc->fpstate);
165 if (buf) {
166 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
167 goto badframe;
168 err |= restore_i387(buf);
169 } else {
170 struct task_struct *me = current;
171
172 if (used_math()) {
173 clear_fpu(me);
174 clear_used_math();
175 }
176 }
177 }
178 167
179 err |= __get_user(*pax, &sc->ax); 168 err |= __get_user(*pax, &sc->ax);
180 return err; 169 return err;
181
182badframe:
183 return 1;
184} 170}
185 171
186asmlinkage unsigned long sys_sigreturn(unsigned long __unused) 172asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
@@ -226,9 +212,8 @@ badframe:
226 return 0; 212 return 0;
227} 213}
228 214
229asmlinkage int sys_rt_sigreturn(unsigned long __unused) 215static long do_rt_sigreturn(struct pt_regs *regs)
230{ 216{
231 struct pt_regs *regs = (struct pt_regs *)&__unused;
232 struct rt_sigframe __user *frame; 217 struct rt_sigframe __user *frame;
233 unsigned long ax; 218 unsigned long ax;
234 sigset_t set; 219 sigset_t set;
@@ -254,15 +239,22 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
254 return ax; 239 return ax;
255 240
256badframe: 241badframe:
257 force_sig(SIGSEGV, current); 242 signal_fault(regs, frame, "rt_sigreturn");
258 return 0; 243 return 0;
259} 244}
260 245
246asmlinkage int sys_rt_sigreturn(unsigned long __unused)
247{
248 struct pt_regs *regs = (struct pt_regs *)&__unused;
249
250 return do_rt_sigreturn(regs);
251}
252
261/* 253/*
262 * Set up a signal frame. 254 * Set up a signal frame.
263 */ 255 */
264static int 256static int
265setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, 257setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
266 struct pt_regs *regs, unsigned long mask) 258 struct pt_regs *regs, unsigned long mask)
267{ 259{
268 int tmp, err = 0; 260 int tmp, err = 0;
@@ -289,7 +281,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
289 err |= __put_user(regs->sp, &sc->sp_at_signal); 281 err |= __put_user(regs->sp, &sc->sp_at_signal);
290 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); 282 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
291 283
292 tmp = save_i387(fpstate); 284 tmp = save_i387_xstate(fpstate);
293 if (tmp < 0) 285 if (tmp < 0)
294 err = 1; 286 err = 1;
295 else 287 else
@@ -306,7 +298,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
306 * Determine which stack to use.. 298 * Determine which stack to use..
307 */ 299 */
308static inline void __user * 300static inline void __user *
309get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) 301get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
302 void **fpstate)
310{ 303{
311 unsigned long sp; 304 unsigned long sp;
312 305
@@ -332,6 +325,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
332 sp = (unsigned long) ka->sa.sa_restorer; 325 sp = (unsigned long) ka->sa.sa_restorer;
333 } 326 }
334 327
328 if (used_math()) {
329 sp = sp - sig_xstate_size;
330 *fpstate = (struct _fpstate *) sp;
331 }
332
335 sp -= frame_size; 333 sp -= frame_size;
336 /* 334 /*
337 * Align the stack pointer according to the i386 ABI, 335 * Align the stack pointer according to the i386 ABI,
@@ -343,38 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
343} 341}
344 342
345static int 343static int
346setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, 344__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
347 struct pt_regs *regs) 345 struct pt_regs *regs)
348{ 346{
349 struct sigframe __user *frame; 347 struct sigframe __user *frame;
350 void __user *restorer; 348 void __user *restorer;
351 int err = 0; 349 int err = 0;
352 int usig; 350 void __user *fpstate = NULL;
353 351
354 frame = get_sigframe(ka, regs, sizeof(*frame)); 352 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
355 353
356 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 354 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
357 goto give_sigsegv; 355 return -EFAULT;
358 356
359 usig = current_thread_info()->exec_domain 357 if (__put_user(sig, &frame->sig))
360 && current_thread_info()->exec_domain->signal_invmap 358 return -EFAULT;
361 && sig < 32
362 ? current_thread_info()->exec_domain->signal_invmap[sig]
363 : sig;
364 359
365 err = __put_user(usig, &frame->sig); 360 if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
366 if (err) 361 return -EFAULT;
367 goto give_sigsegv;
368
369 err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
370 if (err)
371 goto give_sigsegv;
372 362
373 if (_NSIG_WORDS > 1) { 363 if (_NSIG_WORDS > 1) {
374 err = __copy_to_user(&frame->extramask, &set->sig[1], 364 if (__copy_to_user(&frame->extramask, &set->sig[1],
375 sizeof(frame->extramask)); 365 sizeof(frame->extramask)))
376 if (err) 366 return -EFAULT;
377 goto give_sigsegv;
378 } 367 }
379 368
380 if (current->mm->context.vdso) 369 if (current->mm->context.vdso)
@@ -399,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
399 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); 388 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
400 389
401 if (err) 390 if (err)
402 goto give_sigsegv; 391 return -EFAULT;
403 392
404 /* Set up registers for signal handler */ 393 /* Set up registers for signal handler */
405 regs->sp = (unsigned long)frame; 394 regs->sp = (unsigned long)frame;
@@ -414,50 +403,43 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
414 regs->cs = __USER_CS; 403 regs->cs = __USER_CS;
415 404
416 return 0; 405 return 0;
417
418give_sigsegv:
419 force_sigsegv(sig, current);
420 return -EFAULT;
421} 406}
422 407
423static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 408static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
424 sigset_t *set, struct pt_regs *regs) 409 sigset_t *set, struct pt_regs *regs)
425{ 410{
426 struct rt_sigframe __user *frame; 411 struct rt_sigframe __user *frame;
427 void __user *restorer; 412 void __user *restorer;
428 int err = 0; 413 int err = 0;
429 int usig; 414 void __user *fpstate = NULL;
430 415
431 frame = get_sigframe(ka, regs, sizeof(*frame)); 416 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
432 417
433 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 418 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
434 goto give_sigsegv; 419 return -EFAULT;
435 420
436 usig = current_thread_info()->exec_domain 421 err |= __put_user(sig, &frame->sig);
437 && current_thread_info()->exec_domain->signal_invmap
438 && sig < 32
439 ? current_thread_info()->exec_domain->signal_invmap[sig]
440 : sig;
441
442 err |= __put_user(usig, &frame->sig);
443 err |= __put_user(&frame->info, &frame->pinfo); 422 err |= __put_user(&frame->info, &frame->pinfo);
444 err |= __put_user(&frame->uc, &frame->puc); 423 err |= __put_user(&frame->uc, &frame->puc);
445 err |= copy_siginfo_to_user(&frame->info, info); 424 err |= copy_siginfo_to_user(&frame->info, info);
446 if (err) 425 if (err)
447 goto give_sigsegv; 426 return -EFAULT;
448 427
449 /* Create the ucontext. */ 428 /* Create the ucontext. */
450 err |= __put_user(0, &frame->uc.uc_flags); 429 if (cpu_has_xsave)
430 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
431 else
432 err |= __put_user(0, &frame->uc.uc_flags);
451 err |= __put_user(0, &frame->uc.uc_link); 433 err |= __put_user(0, &frame->uc.uc_link);
452 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 434 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
453 err |= __put_user(sas_ss_flags(regs->sp), 435 err |= __put_user(sas_ss_flags(regs->sp),
454 &frame->uc.uc_stack.ss_flags); 436 &frame->uc.uc_stack.ss_flags);
455 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 437 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
456 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 438 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
457 regs, set->sig[0]); 439 regs, set->sig[0]);
458 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 440 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
459 if (err) 441 if (err)
460 goto give_sigsegv; 442 return -EFAULT;
461 443
462 /* Set up to return from userspace. */ 444 /* Set up to return from userspace. */
463 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 445 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -477,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
477 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); 459 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
478 460
479 if (err) 461 if (err)
480 goto give_sigsegv; 462 return -EFAULT;
481 463
482 /* Set up registers for signal handler */ 464 /* Set up registers for signal handler */
483 regs->sp = (unsigned long)frame; 465 regs->sp = (unsigned long)frame;
484 regs->ip = (unsigned long)ka->sa.sa_handler; 466 regs->ip = (unsigned long)ka->sa.sa_handler;
485 regs->ax = (unsigned long)usig; 467 regs->ax = (unsigned long)sig;
486 regs->dx = (unsigned long)&frame->info; 468 regs->dx = (unsigned long)&frame->info;
487 regs->cx = (unsigned long)&frame->uc; 469 regs->cx = (unsigned long)&frame->uc;
488 470
@@ -492,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
492 regs->cs = __USER_CS; 474 regs->cs = __USER_CS;
493 475
494 return 0; 476 return 0;
495
496give_sigsegv:
497 force_sigsegv(sig, current);
498 return -EFAULT;
499} 477}
500 478
501/* 479/*
502 * OK, we're invoking a handler: 480 * OK, we're invoking a handler:
503 */ 481 */
482static int signr_convert(int sig)
483{
484 struct thread_info *info = current_thread_info();
485
486 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
487 return info->exec_domain->signal_invmap[sig];
488 return sig;
489}
490
491#define is_ia32 1
492#define ia32_setup_frame __setup_frame
493#define ia32_setup_rt_frame __setup_rt_frame
494
495static int
496setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
497 sigset_t *set, struct pt_regs *regs)
498{
499 int usig = signr_convert(sig);
500 int ret;
501
502 /* Set up the stack frame */
503 if (is_ia32) {
504 if (ka->sa.sa_flags & SA_SIGINFO)
505 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
506 else
507 ret = ia32_setup_frame(usig, ka, set, regs);
508 } else
509 ret = __setup_rt_frame(sig, ka, info, set, regs);
510
511 if (ret) {
512 force_sigsegv(sig, current);
513 return -EFAULT;
514 }
515
516 return ret;
517}
518
504static int 519static int
505handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 520handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
506 sigset_t *oldset, struct pt_regs *regs) 521 sigset_t *oldset, struct pt_regs *regs)
@@ -508,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
508 int ret; 523 int ret;
509 524
510 /* Are we from a system call? */ 525 /* Are we from a system call? */
511 if ((long)regs->orig_ax >= 0) { 526 if (syscall_get_nr(current, regs) >= 0) {
512 /* If so, check system call restarting.. */ 527 /* If so, check system call restarting.. */
513 switch (regs->ax) { 528 switch (syscall_get_error(current, regs)) {
514 case -ERESTART_RESTARTBLOCK: 529 case -ERESTART_RESTARTBLOCK:
515 case -ERESTARTNOHAND: 530 case -ERESTARTNOHAND:
516 regs->ax = -EINTR; 531 regs->ax = -EINTR;
@@ -537,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
537 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 552 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
538 regs->flags &= ~X86_EFLAGS_TF; 553 regs->flags &= ~X86_EFLAGS_TF;
539 554
540 /* Set up the stack frame */ 555 ret = setup_rt_frame(sig, ka, info, oldset, regs);
541 if (ka->sa.sa_flags & SA_SIGINFO)
542 ret = setup_rt_frame(sig, ka, info, oldset, regs);
543 else
544 ret = setup_frame(sig, ka, oldset, regs);
545 556
546 if (ret) 557 if (ret)
547 return ret; 558 return ret;
548 559
560#ifdef CONFIG_X86_64
561 /*
562 * This has nothing to do with segment registers,
563 * despite the name. This magic affects uaccess.h
564 * macros' behavior. Reset it to the normal setting.
565 */
566 set_fs(USER_DS);
567#endif
568
549 /* 569 /*
550 * Clear the direction flag as per the ABI for function entry. 570 * Clear the direction flag as per the ABI for function entry.
551 */ 571 */
@@ -558,8 +578,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
558 * handler too. 578 * handler too.
559 */ 579 */
560 regs->flags &= ~X86_EFLAGS_TF; 580 regs->flags &= ~X86_EFLAGS_TF;
561 if (test_thread_flag(TIF_SINGLESTEP))
562 ptrace_notify(SIGTRAP);
563 581
564 spin_lock_irq(&current->sighand->siglock); 582 spin_lock_irq(&current->sighand->siglock);
565 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); 583 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -568,9 +586,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
568 recalc_sigpending(); 586 recalc_sigpending();
569 spin_unlock_irq(&current->sighand->siglock); 587 spin_unlock_irq(&current->sighand->siglock);
570 588
589 tracehook_signal_handler(sig, info, ka, regs,
590 test_thread_flag(TIF_SINGLESTEP));
591
571 return 0; 592 return 0;
572} 593}
573 594
595#define NR_restart_syscall __NR_restart_syscall
574/* 596/*
575 * Note that 'init' is a special process: it doesn't get signals it doesn't 597 * Note that 'init' is a special process: it doesn't get signals it doesn't
576 * want to handle. Thus you cannot kill init even with a SIGKILL even by 598 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs)
623 } 645 }
624 646
625 /* Did we come from a system call? */ 647 /* Did we come from a system call? */
626 if ((long)regs->orig_ax >= 0) { 648 if (syscall_get_nr(current, regs) >= 0) {
627 /* Restart the system call - no handlers present */ 649 /* Restart the system call - no handlers present */
628 switch (regs->ax) { 650 switch (syscall_get_error(current, regs)) {
629 case -ERESTARTNOHAND: 651 case -ERESTARTNOHAND:
630 case -ERESTARTSYS: 652 case -ERESTARTSYS:
631 case -ERESTARTNOINTR: 653 case -ERESTARTNOINTR:
@@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs)
634 break; 656 break;
635 657
636 case -ERESTART_RESTARTBLOCK: 658 case -ERESTART_RESTARTBLOCK:
637 regs->ax = __NR_restart_syscall; 659 regs->ax = NR_restart_syscall;
638 regs->ip -= 2; 660 regs->ip -= 2;
639 break; 661 break;
640 } 662 }
@@ -657,9 +679,38 @@ static void do_signal(struct pt_regs *regs)
657void 679void
658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 680do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659{ 681{
682#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
683 /* notify userspace of pending MCEs */
684 if (thread_info_flags & _TIF_MCE_NOTIFY)
685 mce_notify_user();
686#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
687
660 /* deal with pending signal delivery */ 688 /* deal with pending signal delivery */
661 if (thread_info_flags & _TIF_SIGPENDING) 689 if (thread_info_flags & _TIF_SIGPENDING)
662 do_signal(regs); 690 do_signal(regs);
663 691
692 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
693 clear_thread_flag(TIF_NOTIFY_RESUME);
694 tracehook_notify_resume(regs);
695 }
696
697#ifdef CONFIG_X86_32
664 clear_thread_flag(TIF_IRET); 698 clear_thread_flag(TIF_IRET);
699#endif /* CONFIG_X86_32 */
700}
701
702void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
703{
704 struct task_struct *me = current;
705
706 if (show_unhandled_signals && printk_ratelimit()) {
707 printk(KERN_INFO
708 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
709 me->comm, me->pid, where, frame,
710 regs->ip, regs->sp, regs->orig_ax);
711 print_vma_addr(" in ", regs->ip);
712 printk(KERN_CONT "\n");
713 }
714
715 force_sig(SIGSEGV, me);
665} 716}
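
The -ERESTARTSYS/-ERESTARTNOHAND switches in handle_signal() and do_signal() above are the kernel side of what userspace selects with SA_RESTART: without the flag an interrupted system call returns -EINTR, with it regs->ax is restored and regs->ip rewound by two bytes so the call is issued again. A minimal userspace sketch of the EINTR case (plain POSIX, for illustration only):

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void handler(int sig)
{
	(void)sig;			/* only needed to interrupt the read */
}

int main(void)
{
	struct sigaction sa;
	int fds[2];
	char c;
	ssize_t n;

	if (pipe(fds))			/* nothing is ever written to this pipe */
		return 1;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = handler;
	sa.sa_flags = 0;		/* with SA_RESTART the read would resume instead */
	sigaction(SIGALRM, &sa, NULL);

	alarm(1);			/* deliver SIGALRM while read() is blocked */
	n = read(fds[0], &c, 1);
	printf("read returned %zd (%s)\n", n,
	       n < 0 ? strerror(errno) : "data");
	return 0;
}

With sa_flags set to SA_RESTART the read() above would simply keep blocking after the handler returns, which corresponds to the regs->ip -= 2 restart path shown in the hunks above.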
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index b45ef8ddd651..a5c9627f4db9 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,21 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/ptrace.h> 17#include <linux/ptrace.h>
18#include <linux/tracehook.h>
18#include <linux/unistd.h> 19#include <linux/unistd.h>
19#include <linux/stddef.h> 20#include <linux/stddef.h>
20#include <linux/personality.h> 21#include <linux/personality.h>
21#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/uaccess.h>
24
22#include <asm/processor.h> 25#include <asm/processor.h>
23#include <asm/ucontext.h> 26#include <asm/ucontext.h>
24#include <asm/uaccess.h>
25#include <asm/i387.h> 27#include <asm/i387.h>
26#include <asm/proto.h> 28#include <asm/proto.h>
27#include <asm/ia32_unistd.h> 29#include <asm/ia32_unistd.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
31#include <asm/syscall.h>
32#include <asm/syscalls.h>
29#include "sigframe.h" 33#include "sigframe.h"
30 34
31#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -41,11 +45,6 @@
41# define FIX_EFLAGS __FIX_EFLAGS 45# define FIX_EFLAGS __FIX_EFLAGS
42#endif 46#endif
43 47
44int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
45 sigset_t *set, struct pt_regs * regs);
46int ia32_setup_frame(int sig, struct k_sigaction *ka,
47 sigset_t *set, struct pt_regs * regs);
48
49asmlinkage long 48asmlinkage long
50sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 49sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
51 struct pt_regs *regs) 50 struct pt_regs *regs)
@@ -53,58 +52,14 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
53 return do_sigaltstack(uss, uoss, regs->sp); 52 return do_sigaltstack(uss, uoss, regs->sp);
54} 53}
55 54
56/* 55#define COPY(x) { \
57 * Signal frame handlers. 56 err |= __get_user(regs->x, &sc->x); \
58 */
59
60static inline int save_i387(struct _fpstate __user *buf)
61{
62 struct task_struct *tsk = current;
63 int err = 0;
64
65 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
66 sizeof(tsk->thread.xstate->fxsave));
67
68 if ((unsigned long)buf % 16)
69 printk("save_i387: bad fpstate %p\n", buf);
70
71 if (!used_math())
72 return 0;
73 clear_used_math(); /* trigger finit */
74 if (task_thread_info(tsk)->status & TS_USEDFPU) {
75 err = save_i387_checking((struct i387_fxsave_struct __user *)
76 buf);
77 if (err)
78 return err;
79 task_thread_info(tsk)->status &= ~TS_USEDFPU;
80 stts();
81 } else {
82 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
83 sizeof(struct i387_fxsave_struct)))
84 return -1;
85 }
86 return 1;
87} 57}
88 58
89/* 59#define COPY_SEG_STRICT(seg) { \
90 * This restores directly out of user space. Exceptions are handled. 60 unsigned short tmp; \
91 */ 61 err |= __get_user(tmp, &sc->seg); \
92static inline int restore_i387(struct _fpstate __user *buf) 62 regs->seg = tmp | 3; \
93{
94 struct task_struct *tsk = current;
95 int err;
96
97 if (!used_math()) {
98 err = init_fpu(tsk);
99 if (err)
100 return err;
101 }
102
103 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
104 clts();
105 task_thread_info(current)->status |= TS_USEDFPU;
106 }
107 return restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
108} 63}
109 64
110/* 65/*
@@ -114,13 +69,13 @@ static int
114restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 69restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
115 unsigned long *pax) 70 unsigned long *pax)
116{ 71{
72 void __user *buf;
73 unsigned int tmpflags;
117 unsigned int err = 0; 74 unsigned int err = 0;
118 75
119 /* Always make any pending restarted system calls return -EINTR */ 76 /* Always make any pending restarted system calls return -EINTR */
120 current_thread_info()->restart_block.fn = do_no_restart_syscall; 77 current_thread_info()->restart_block.fn = do_no_restart_syscall;
121 78
122#define COPY(x) err |= __get_user(regs->x, &sc->x)
123
124 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 79 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
125 COPY(dx); COPY(cx); COPY(ip); 80 COPY(dx); COPY(cx); COPY(ip);
126 COPY(r8); 81 COPY(r8);
@@ -135,48 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
135 /* Kernel saves and restores only the CS segment register on signals, 90 /* Kernel saves and restores only the CS segment register on signals,
136 * which is the bare minimum needed to allow mixed 32/64-bit code. 91 * which is the bare minimum needed to allow mixed 32/64-bit code.
137 * App's signal handler can save/restore other segments if needed. */ 92 * App's signal handler can save/restore other segments if needed. */
138 { 93 COPY_SEG_STRICT(cs);
139 unsigned cs;
140 err |= __get_user(cs, &sc->cs);
141 regs->cs = cs | 3; /* Force into user mode */
142 }
143 94
144 { 95 err |= __get_user(tmpflags, &sc->flags);
145 unsigned int tmpflags; 96 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
146 err |= __get_user(tmpflags, &sc->flags); 97 regs->orig_ax = -1; /* disable syscall checks */
147 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
148 regs->orig_ax = -1; /* disable syscall checks */
149 }
150 98
151 { 99 err |= __get_user(buf, &sc->fpstate);
152 struct _fpstate __user * buf; 100 err |= restore_i387_xstate(buf);
153 err |= __get_user(buf, &sc->fpstate);
154
155 if (buf) {
156 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
157 goto badframe;
158 err |= restore_i387(buf);
159 } else {
160 struct task_struct *me = current;
161 if (used_math()) {
162 clear_fpu(me);
163 clear_used_math();
164 }
165 }
166 }
167 101
168 err |= __get_user(*pax, &sc->ax); 102 err |= __get_user(*pax, &sc->ax);
169 return err; 103 return err;
170
171badframe:
172 return 1;
173} 104}
174 105
175asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) 106static long do_rt_sigreturn(struct pt_regs *regs)
176{ 107{
177 struct rt_sigframe __user *frame; 108 struct rt_sigframe __user *frame;
178 sigset_t set;
179 unsigned long ax; 109 unsigned long ax;
110 sigset_t set;
180 111
181 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); 112 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
182 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 113 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -189,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
189 current->blocked = set; 120 current->blocked = set;
190 recalc_sigpending(); 121 recalc_sigpending();
191 spin_unlock_irq(&current->sighand->siglock); 122 spin_unlock_irq(&current->sighand->siglock);
192 123
193 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 124 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
194 goto badframe; 125 goto badframe;
195 126
@@ -199,16 +130,22 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
199 return ax; 130 return ax;
200 131
201badframe: 132badframe:
202 signal_fault(regs,frame,"sigreturn"); 133 signal_fault(regs, frame, "rt_sigreturn");
203 return 0; 134 return 0;
204} 135}
136
137asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
138{
139 return do_rt_sigreturn(regs);
140}
205 141
206/* 142/*
207 * Set up a signal frame. 143 * Set up a signal frame.
208 */ 144 */
209 145
210static inline int 146static inline int
211setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) 147setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
148 unsigned long mask, struct task_struct *me)
212{ 149{
213 int err = 0; 150 int err = 0;
214 151
@@ -260,41 +197,40 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
260 sp = current->sas_ss_sp + current->sas_ss_size; 197 sp = current->sas_ss_sp + current->sas_ss_size;
261 } 198 }
262 199
263 return (void __user *)round_down(sp - size, 16); 200 return (void __user *)round_down(sp - size, 64);
264} 201}
265 202
266static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 203static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
267 sigset_t *set, struct pt_regs * regs) 204 sigset_t *set, struct pt_regs *regs)
268{ 205{
269 struct rt_sigframe __user *frame; 206 struct rt_sigframe __user *frame;
270 struct _fpstate __user *fp = NULL; 207 void __user *fp = NULL;
271 int err = 0; 208 int err = 0;
272 struct task_struct *me = current; 209 struct task_struct *me = current;
273 210
274 if (used_math()) { 211 if (used_math()) {
275 fp = get_stack(ka, regs, sizeof(struct _fpstate)); 212 fp = get_stack(ka, regs, sig_xstate_size);
276 frame = (void __user *)round_down( 213 frame = (void __user *)round_down(
277 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; 214 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
278 215
279 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) 216 if (save_i387_xstate(fp) < 0)
280 goto give_sigsegv; 217 return -EFAULT;
281
282 if (save_i387(fp) < 0)
283 err |= -1;
284 } else 218 } else
285 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; 219 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
286 220
287 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 221 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
288 goto give_sigsegv; 222 return -EFAULT;
289 223
290 if (ka->sa.sa_flags & SA_SIGINFO) { 224 if (ka->sa.sa_flags & SA_SIGINFO) {
291 err |= copy_siginfo_to_user(&frame->info, info); 225 if (copy_siginfo_to_user(&frame->info, info))
292 if (err) 226 return -EFAULT;
293 goto give_sigsegv;
294 } 227 }
295 228
296 /* Create the ucontext. */ 229 /* Create the ucontext. */
297 err |= __put_user(0, &frame->uc.uc_flags); 230 if (cpu_has_xsave)
231 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
232 else
233 err |= __put_user(0, &frame->uc.uc_flags);
298 err |= __put_user(0, &frame->uc.uc_link); 234 err |= __put_user(0, &frame->uc.uc_link);
299 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 235 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
300 err |= __put_user(sas_ss_flags(regs->sp), 236 err |= __put_user(sas_ss_flags(regs->sp),
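
The get_stack() change above rounds the FP/xstate save area down to a 64-byte boundary instead of 16 and sizes it with sig_xstate_size rather than sizeof(struct _fpstate); XSAVE requires a 64-byte aligned save area. A standalone illustration of that placement (round_down is re-derived here and the numbers are placeholders, not the real sizes):

#include <stdint.h>
#include <stdio.h>

/* works for power-of-two alignments only */
#define round_down(x, a)	((x) & ~((uintptr_t)(a) - 1))

int main(void)
{
	uintptr_t sp = 0x7ffd1234abcdULL;	/* example user stack pointer */
	size_t xstate_size = 832;		/* placeholder for sig_xstate_size */
	uintptr_t fp = round_down(sp - xstate_size, 64);

	printf("xstate area at %#lx, fp %% 64 = %lu\n",
	       (unsigned long)fp, (unsigned long)(fp % 64));
	return 0;
}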
@@ -302,9 +238,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
302 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 238 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
303 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); 239 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
304 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); 240 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
305 if (sizeof(*set) == 16) { 241 if (sizeof(*set) == 16) {
306 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); 242 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
307 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 243 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
308 } else 244 } else
309 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 245 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
310 246
@@ -315,15 +251,15 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
315 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); 251 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
316 } else { 252 } else {
317 /* could use a vstub here */ 253 /* could use a vstub here */
318 goto give_sigsegv; 254 return -EFAULT;
319 } 255 }
320 256
321 if (err) 257 if (err)
322 goto give_sigsegv; 258 return -EFAULT;
323 259
324 /* Set up registers for signal handler */ 260 /* Set up registers for signal handler */
325 regs->di = sig; 261 regs->di = sig;
326 /* In case the signal handler was declared without prototypes */ 262 /* In case the signal handler was declared without prototypes */
327 regs->ax = 0; 263 regs->ax = 0;
328 264
329 /* This also works for non SA_SIGINFO handlers because they expect the 265 /* This also works for non SA_SIGINFO handlers because they expect the
@@ -339,44 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
339 regs->cs = __USER_CS; 275 regs->cs = __USER_CS;
340 276
341 return 0; 277 return 0;
342
343give_sigsegv:
344 force_sigsegv(sig, current);
345 return -EFAULT;
346} 278}
347 279
348/* 280/*
349 * Return -1L or the syscall number that @regs is executing. 281 * OK, we're invoking a handler
350 */ 282 */
351static long current_syscall(struct pt_regs *regs) 283static int signr_convert(int sig)
352{ 284{
353 /* 285 return sig;
354 * We always sign-extend a -1 value being set here,
355 * so this is always either -1L or a syscall number.
356 */
357 return regs->orig_ax;
358} 286}
359 287
360/*
361 * Return a value that is -EFOO if the system call in @regs->orig_ax
362 * returned an error. This only works for @regs from @current.
363 */
364static long current_syscall_ret(struct pt_regs *regs)
365{
366#ifdef CONFIG_IA32_EMULATION 288#ifdef CONFIG_IA32_EMULATION
367 if (test_thread_flag(TIF_IA32)) 289#define is_ia32 test_thread_flag(TIF_IA32)
368 /* 290#else
369 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO 291#define is_ia32 0
370 * and will match correctly in comparisons.
371 */
372 return (int) regs->ax;
373#endif 292#endif
374 return regs->ax;
375}
376 293
377/* 294static int
378 * OK, we're invoking a handler 295setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
379 */ 296 sigset_t *set, struct pt_regs *regs)
297{
298 int usig = signr_convert(sig);
299 int ret;
300
301 /* Set up the stack frame */
302 if (is_ia32) {
303 if (ka->sa.sa_flags & SA_SIGINFO)
304 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
305 else
306 ret = ia32_setup_frame(usig, ka, set, regs);
307 } else
308 ret = __setup_rt_frame(sig, ka, info, set, regs);
309
310 if (ret) {
311 force_sigsegv(sig, current);
312 return -EFAULT;
313 }
314
315 return ret;
316}
380 317
381static int 318static int
382handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 319handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -385,9 +322,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
385 int ret; 322 int ret;
386 323
387 /* Are we from a system call? */ 324 /* Are we from a system call? */
388 if (current_syscall(regs) >= 0) { 325 if (syscall_get_nr(current, regs) >= 0) {
389 /* If so, check system call restarting.. */ 326 /* If so, check system call restarting.. */
390 switch (current_syscall_ret(regs)) { 327 switch (syscall_get_error(current, regs)) {
391 case -ERESTART_RESTARTBLOCK: 328 case -ERESTART_RESTARTBLOCK:
392 case -ERESTARTNOHAND: 329 case -ERESTARTNOHAND:
393 regs->ax = -EINTR; 330 regs->ax = -EINTR;
@@ -414,50 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
414 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 351 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
415 regs->flags &= ~X86_EFLAGS_TF; 352 regs->flags &= ~X86_EFLAGS_TF;
416 353
417#ifdef CONFIG_IA32_EMULATION
418 if (test_thread_flag(TIF_IA32)) {
419 if (ka->sa.sa_flags & SA_SIGINFO)
420 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
421 else
422 ret = ia32_setup_frame(sig, ka, oldset, regs);
423 } else
424#endif
425 ret = setup_rt_frame(sig, ka, info, oldset, regs); 354 ret = setup_rt_frame(sig, ka, info, oldset, regs);
426 355
427 if (ret == 0) { 356 if (ret)
428 /* 357 return ret;
429 * This has nothing to do with segment registers,
430 * despite the name. This magic affects uaccess.h
431 * macros' behavior. Reset it to the normal setting.
432 */
433 set_fs(USER_DS);
434 358
435 /* 359#ifdef CONFIG_X86_64
436 * Clear the direction flag as per the ABI for function entry. 360 /*
437 */ 361 * This has nothing to do with segment registers,
438 regs->flags &= ~X86_EFLAGS_DF; 362 * despite the name. This magic affects uaccess.h
363 * macros' behavior. Reset it to the normal setting.
364 */
365 set_fs(USER_DS);
366#endif
439 367
440 /* 368 /*
441 * Clear TF when entering the signal handler, but 369 * Clear the direction flag as per the ABI for function entry.
442 * notify any tracer that was single-stepping it. 370 */
443 * The tracer may want to single-step inside the 371 regs->flags &= ~X86_EFLAGS_DF;
444 * handler too.
445 */
446 regs->flags &= ~X86_EFLAGS_TF;
447 if (test_thread_flag(TIF_SINGLESTEP))
448 ptrace_notify(SIGTRAP);
449
450 spin_lock_irq(&current->sighand->siglock);
451 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
452 if (!(ka->sa.sa_flags & SA_NODEFER))
453 sigaddset(&current->blocked,sig);
454 recalc_sigpending();
455 spin_unlock_irq(&current->sighand->siglock);
456 }
457 372
458 return ret; 373 /*
374 * Clear TF when entering the signal handler, but
375 * notify any tracer that was single-stepping it.
376 * The tracer may want to single-step inside the
377 * handler too.
378 */
379 regs->flags &= ~X86_EFLAGS_TF;
380
381 spin_lock_irq(&current->sighand->siglock);
382 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
383 if (!(ka->sa.sa_flags & SA_NODEFER))
384 sigaddset(&current->blocked, sig);
385 recalc_sigpending();
386 spin_unlock_irq(&current->sighand->siglock);
387
388 tracehook_signal_handler(sig, info, ka, regs,
389 test_thread_flag(TIF_SINGLESTEP));
390
391 return 0;
459} 392}
460 393
394#define NR_restart_syscall \
395 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
461/* 396/*
462 * Note that 'init' is a special process: it doesn't get signals it doesn't 397 * Note that 'init' is a special process: it doesn't get signals it doesn't
463 * want to handle. Thus you cannot kill init even with a SIGKILL even by 398 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -487,7 +422,8 @@ static void do_signal(struct pt_regs *regs)
487 422
488 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 423 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
489 if (signr > 0) { 424 if (signr > 0) {
490 /* Re-enable any watchpoints before delivering the 425 /*
426 * Re-enable any watchpoints before delivering the
491 * signal to user space. The processor register will 427 * signal to user space. The processor register will
492 * have been cleared if the watchpoint triggered 428 * have been cleared if the watchpoint triggered
493 * inside the kernel. 429 * inside the kernel.
@@ -495,7 +431,7 @@ static void do_signal(struct pt_regs *regs)
495 if (current->thread.debugreg7) 431 if (current->thread.debugreg7)
496 set_debugreg(current->thread.debugreg7, 7); 432 set_debugreg(current->thread.debugreg7, 7);
497 433
498 /* Whee! Actually deliver the signal. */ 434 /* Whee! Actually deliver the signal. */
499 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 435 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
500 /* 436 /*
501 * A signal was successfully delivered; the saved 437 * A signal was successfully delivered; the saved
@@ -509,19 +445,18 @@ static void do_signal(struct pt_regs *regs)
509 } 445 }
510 446
511 /* Did we come from a system call? */ 447 /* Did we come from a system call? */
512 if (current_syscall(regs) >= 0) { 448 if (syscall_get_nr(current, regs) >= 0) {
513 /* Restart the system call - no handlers present */ 449 /* Restart the system call - no handlers present */
514 switch (current_syscall_ret(regs)) { 450 switch (syscall_get_error(current, regs)) {
515 case -ERESTARTNOHAND: 451 case -ERESTARTNOHAND:
516 case -ERESTARTSYS: 452 case -ERESTARTSYS:
517 case -ERESTARTNOINTR: 453 case -ERESTARTNOINTR:
518 regs->ax = regs->orig_ax; 454 regs->ax = regs->orig_ax;
519 regs->ip -= 2; 455 regs->ip -= 2;
520 break; 456 break;
457
521 case -ERESTART_RESTARTBLOCK: 458 case -ERESTART_RESTARTBLOCK:
522 regs->ax = test_thread_flag(TIF_IA32) ? 459 regs->ax = NR_restart_syscall;
523 __NR_ia32_restart_syscall :
524 __NR_restart_syscall;
525 regs->ip -= 2; 460 regs->ip -= 2;
526 break; 461 break;
527 } 462 }
@@ -537,29 +472,45 @@ static void do_signal(struct pt_regs *regs)
537 } 472 }
538} 473}
539 474
540void do_notify_resume(struct pt_regs *regs, void *unused, 475/*
541 __u32 thread_info_flags) 476 * notification of userspace execution resumption
477 * - triggered by the TIF_WORK_MASK flags
478 */
479void
480do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
542{ 481{
543#ifdef CONFIG_X86_MCE 482#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
544 /* notify userspace of pending MCEs */ 483 /* notify userspace of pending MCEs */
545 if (thread_info_flags & _TIF_MCE_NOTIFY) 484 if (thread_info_flags & _TIF_MCE_NOTIFY)
546 mce_notify_user(); 485 mce_notify_user();
547#endif /* CONFIG_X86_MCE */ 486#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
548 487
549 /* deal with pending signal delivery */ 488 /* deal with pending signal delivery */
550 if (thread_info_flags & _TIF_SIGPENDING) 489 if (thread_info_flags & _TIF_SIGPENDING)
551 do_signal(regs); 490 do_signal(regs);
491
492 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
493 clear_thread_flag(TIF_NOTIFY_RESUME);
494 tracehook_notify_resume(regs);
495 }
496
497#ifdef CONFIG_X86_32
498 clear_thread_flag(TIF_IRET);
499#endif /* CONFIG_X86_32 */
552} 500}
553 501
554void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 502void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
555{ 503{
556 struct task_struct *me = current; 504 struct task_struct *me = current;
505
557 if (show_unhandled_signals && printk_ratelimit()) { 506 if (show_unhandled_signals && printk_ratelimit()) {
558 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 507 printk(KERN_INFO
559 me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); 508 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
509 me->comm, me->pid, where, frame,
510 regs->ip, regs->sp, regs->orig_ax);
560 print_vma_addr(" in ", regs->ip); 511 print_vma_addr(" in ", regs->ip);
561 printk("\n"); 512 printk(KERN_CONT "\n");
562 } 513 }
563 514
564 force_sig(SIGSEGV, me); 515 force_sig(SIGSEGV, me);
565} 516}
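
The uc_mcontext that __setup_rt_frame() fills through setup_sigcontext() is what an SA_SIGINFO handler receives as its third argument. A small userspace check, assuming Linux/x86-64 with glibc (REG_RIP and the gregs layout are glibc definitions):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <ucontext.h>

static void handler(int sig, siginfo_t *info, void *ctx)
{
	ucontext_t *uc = ctx;

	/* printf is not async-signal-safe; acceptable for a demo */
	printf("sig %d from pid %d, interrupted rip = %#llx\n",
	       sig, (int)info->si_pid,
	       (unsigned long long)uc->uc_mcontext.gregs[REG_RIP]);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGUSR1, &sa, NULL);

	raise(SIGUSR1);
	return 0;
}

Running it prints the instruction pointer the kernel saved into the frame, i.e. the value that sigreturn later restores through restore_sigcontext().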
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 361b7a4c640c..18f9b19f5f8f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
214struct smp_ops smp_ops = { 214struct smp_ops smp_ops = {
215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
216 .smp_prepare_cpus = native_smp_prepare_cpus, 216 .smp_prepare_cpus = native_smp_prepare_cpus,
217 .cpu_up = native_cpu_up,
218 .smp_cpus_done = native_smp_cpus_done, 217 .smp_cpus_done = native_smp_cpus_done,
219 218
220 .smp_send_stop = native_smp_send_stop, 219 .smp_send_stop = native_smp_send_stop,
221 .smp_send_reschedule = native_smp_send_reschedule, 220 .smp_send_reschedule = native_smp_send_reschedule,
222 221
222 .cpu_up = native_cpu_up,
223 .cpu_die = native_cpu_die,
224 .cpu_disable = native_cpu_disable,
225 .play_dead = native_play_dead,
226
223 .send_call_func_ipi = native_send_call_func_ipi, 227 .send_call_func_ipi = native_send_call_func_ipi,
224 .send_call_func_single_ipi = native_send_call_func_single_ipi, 228 .send_call_func_single_ipi = native_send_call_func_single_ipi,
225}; 229};
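
smp_ops now also carries cpu_die, cpu_disable and play_dead, so the CPU-hotplug entry points dispatch through the same ops table that an alternate backend (for example a hypervisor port) can fill with its own callbacks. A reduced model of the pattern, with illustrative names only:

#include <stdio.h>

struct cpu_ops {
	int  (*cpu_up)(unsigned int cpu);
	void (*cpu_die)(unsigned int cpu);
	void (*play_dead)(void);
};

static int  native_cpu_up(unsigned int cpu)  { printf("bring up CPU %u\n", cpu); return 0; }
static void native_cpu_die(unsigned int cpu) { printf("reap CPU %u\n", cpu); }
static void native_play_dead(void)           { puts("halt this CPU"); }

/* a different backend would install its own functions here */
static struct cpu_ops cpu_ops = {
	.cpu_up		= native_cpu_up,
	.cpu_die	= native_cpu_die,
	.play_dead	= native_play_dead,
};

int main(void)
{
	cpu_ops.cpu_up(1);
	cpu_ops.cpu_die(1);
	return 0;
}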
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 332512767f4f..76b6f50978f7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h>
55#include <asm/smp.h> 56#include <asm/smp.h>
56#include <asm/trampoline.h> 57#include <asm/trampoline.h>
57#include <asm/cpu.h> 58#include <asm/cpu.h>
@@ -88,7 +89,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
88#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) 89#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
89#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) 90#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
90#else 91#else
91struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; 92static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
92#define get_idle_for_cpu(x) (idle_thread_array[(x)]) 93#define get_idle_for_cpu(x) (idle_thread_array[(x)])
93#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) 94#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
94#endif 95#endif
@@ -123,13 +124,12 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
123 124
124static atomic_t init_deasserted; 125static atomic_t init_deasserted;
125 126
126static int boot_cpu_logical_apicid;
127 127
128/* representing cpus for which sibling maps can be computed */ 128/* representing cpus for which sibling maps can be computed */
129static cpumask_t cpu_sibling_setup_map; 129static cpumask_t cpu_sibling_setup_map;
130 130
131/* Set if we find a B stepping CPU */ 131/* Set if we find a B stepping CPU */
132int __cpuinitdata smp_b_stepping; 132static int __cpuinitdata smp_b_stepping;
133 133
134#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) 134#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
135 135
@@ -165,6 +165,8 @@ static void unmap_cpu_to_node(int cpu)
165#endif 165#endif
166 166
167#ifdef CONFIG_X86_32 167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
168u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = 170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
169 { [0 ... NR_CPUS-1] = BAD_APICID }; 171 { [0 ... NR_CPUS-1] = BAD_APICID };
170 172
@@ -210,7 +212,7 @@ static void __cpuinit smp_callin(void)
210 /* 212 /*
211 * (This works even if the APIC is not enabled.) 213 * (This works even if the APIC is not enabled.)
212 */ 214 */
213 phys_id = GET_APIC_ID(read_apic_id()); 215 phys_id = read_apic_id();
214 cpuid = smp_processor_id(); 216 cpuid = smp_processor_id();
215 if (cpu_isset(cpuid, cpu_callin_map)) { 217 if (cpu_isset(cpuid, cpu_callin_map)) {
216 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 218 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
@@ -257,6 +259,7 @@ static void __cpuinit smp_callin(void)
257 end_local_APIC_setup(); 259 end_local_APIC_setup();
258 map_cpu_to_logical_apicid(); 260 map_cpu_to_logical_apicid();
259 261
262 notify_cpu_starting(cpuid);
260 /* 263 /*
261 * Get our bogomips. 264 * Get our bogomips.
262 * 265 *
@@ -326,12 +329,16 @@ static void __cpuinit start_secondary(void *unused)
326 * for which cpus receive the IPI. Holding this 329 * for which cpus receive the IPI. Holding this
327 * lock helps us to not include this cpu in a currently in progress 330 * lock helps us to not include this cpu in a currently in progress
328 * smp_call_function(). 331 * smp_call_function().
332 *
 333	 * We need to hold vector_lock so that the set of online cpus
334 * does not change while we are assigning vectors to cpus. Holding
335 * this lock ensures we don't half assign or remove an irq from a cpu.
329 */ 336 */
330 ipi_call_lock_irq(); 337 ipi_call_lock_irq();
331#ifdef CONFIG_X86_IO_APIC 338 lock_vector_lock();
332 setup_vector_irq(smp_processor_id()); 339 __setup_vector_irq(smp_processor_id());
333#endif
334 cpu_set(smp_processor_id(), cpu_online_map); 340 cpu_set(smp_processor_id(), cpu_online_map);
341 unlock_vector_lock();
335 ipi_call_unlock_irq(); 342 ipi_call_unlock_irq();
336 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 343 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
337 344
@@ -546,8 +553,7 @@ static inline void __inquire_remote_apic(int apicid)
546 printk(KERN_CONT 553 printk(KERN_CONT
547 "a previous APIC delivery may have failed\n"); 554 "a previous APIC delivery may have failed\n");
548 555
549 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 556 apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
550 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
551 557
552 timeout = 0; 558 timeout = 0;
553 do { 559 do {
@@ -579,11 +585,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
579 int maxlvt; 585 int maxlvt;
580 586
581 /* Target chip */ 587 /* Target chip */
582 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
583
584 /* Boot on the stack */ 588 /* Boot on the stack */
585 /* Kick the second */ 589 /* Kick the second */
586 apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 590 apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
587 591
588 pr_debug("Waiting for send to finish...\n"); 592 pr_debug("Waiting for send to finish...\n");
589 send_status = safe_apic_wait_icr_idle(); 593 send_status = safe_apic_wait_icr_idle();
@@ -636,13 +640,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
636 /* 640 /*
637 * Turn INIT on target chip 641 * Turn INIT on target chip
638 */ 642 */
639 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
640
641 /* 643 /*
642 * Send IPI 644 * Send IPI
643 */ 645 */
644 apic_write(APIC_ICR, 646 apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
645 APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); 647 phys_apicid);
646 648
647 pr_debug("Waiting for send to finish...\n"); 649 pr_debug("Waiting for send to finish...\n");
648 send_status = safe_apic_wait_icr_idle(); 650 send_status = safe_apic_wait_icr_idle();
@@ -652,10 +654,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
652 pr_debug("Deasserting INIT.\n"); 654 pr_debug("Deasserting INIT.\n");
653 655
654 /* Target chip */ 656 /* Target chip */
655 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
656
657 /* Send IPI */ 657 /* Send IPI */
658 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 658 apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
659 659
660 pr_debug("Waiting for send to finish...\n"); 660 pr_debug("Waiting for send to finish...\n");
661 send_status = safe_apic_wait_icr_idle(); 661 send_status = safe_apic_wait_icr_idle();
@@ -698,11 +698,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
698 */ 698 */
699 699
700 /* Target chip */ 700 /* Target chip */
701 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
702
703 /* Boot on the stack */ 701 /* Boot on the stack */
704 /* Kick the second */ 702 /* Kick the second */
705 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12)); 703 apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
704 phys_apicid);
706 705
707 /* 706 /*
708 * Give the other CPU some time to accept the IPI. 707 * Give the other CPU some time to accept the IPI.
@@ -752,6 +751,14 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
752} 751}
753 752
754#ifdef CONFIG_X86_64 753#ifdef CONFIG_X86_64
754
755/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
756static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
757{
758 if (!after_bootmem)
759 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
760}
761
755/* 762/*
756 * Allocate node local memory for the AP pda. 763 * Allocate node local memory for the AP pda.
757 * 764 *
@@ -780,8 +787,7 @@ int __cpuinit get_local_pda(int cpu)
780 787
781 if (oldpda) { 788 if (oldpda) {
782 memcpy(newpda, oldpda, size); 789 memcpy(newpda, oldpda, size);
783 if (!after_bootmem) 790 free_bootmem_pda(oldpda);
784 free_bootmem((unsigned long)oldpda, size);
785 } 791 }
786 792
787 newpda->in_bootmem = 0; 793 newpda->in_bootmem = 0;
@@ -1044,6 +1050,34 @@ static __init void disable_smp(void)
1044static int __init smp_sanity_check(unsigned max_cpus) 1050static int __init smp_sanity_check(unsigned max_cpus)
1045{ 1051{
1046 preempt_disable(); 1052 preempt_disable();
1053
1054#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
1055 if (def_to_bigsmp && nr_cpu_ids > 8) {
1056 unsigned int cpu;
1057 unsigned nr;
1058
1059 printk(KERN_WARNING
1060 "More than 8 CPUs detected - skipping them.\n"
1061 "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
1062
1063 nr = 0;
1064 for_each_present_cpu(cpu) {
1065 if (nr >= 8)
1066 cpu_clear(cpu, cpu_present_map);
1067 nr++;
1068 }
1069
1070 nr = 0;
1071 for_each_possible_cpu(cpu) {
1072 if (nr >= 8)
1073 cpu_clear(cpu, cpu_possible_map);
1074 nr++;
1075 }
1076
1077 nr_cpu_ids = 8;
1078 }
1079#endif
1080
1047 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 1081 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1048 printk(KERN_WARNING "weird, boot CPU (#%d) not listed" 1082 printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
1049 "by the BIOS.\n", hard_smp_processor_id()); 1083 "by the BIOS.\n", hard_smp_processor_id());
@@ -1136,10 +1170,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1136 * Setup boot CPU information 1170 * Setup boot CPU information
1137 */ 1171 */
1138 smp_store_cpu_info(0); /* Final full version of the data */ 1172 smp_store_cpu_info(0); /* Final full version of the data */
1173#ifdef CONFIG_X86_32
1139 boot_cpu_logical_apicid = logical_smp_processor_id(); 1174 boot_cpu_logical_apicid = logical_smp_processor_id();
1175#endif
1140 current_thread_info()->cpu = 0; /* needed? */ 1176 current_thread_info()->cpu = 0; /* needed? */
1141 set_cpu_sibling_map(0); 1177 set_cpu_sibling_map(0);
1142 1178
1179#ifdef CONFIG_X86_64
1180 enable_IR_x2apic();
1181 setup_apic_routing();
1182#endif
1183
1143 if (smp_sanity_check(max_cpus) < 0) { 1184 if (smp_sanity_check(max_cpus) < 0) {
1144 printk(KERN_INFO "SMP disabled\n"); 1185 printk(KERN_INFO "SMP disabled\n");
1145 disable_smp(); 1186 disable_smp();
@@ -1147,9 +1188,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1147 } 1188 }
1148 1189
1149 preempt_disable(); 1190 preempt_disable();
1150 if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) { 1191 if (read_apic_id() != boot_cpu_physical_apicid) {
1151 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 1192 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1152 GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid); 1193 read_apic_id(), boot_cpu_physical_apicid);
1153 /* Or can we switch back to PIC here? */ 1194 /* Or can we switch back to PIC here? */
1154 } 1195 }
1155 preempt_enable(); 1196 preempt_enable();
@@ -1182,6 +1223,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1182 printk(KERN_INFO "CPU%d: ", 0); 1223 printk(KERN_INFO "CPU%d: ", 0);
1183 print_cpu_info(&cpu_data(0)); 1224 print_cpu_info(&cpu_data(0));
1184 setup_boot_clock(); 1225 setup_boot_clock();
1226
1227 if (is_uv_system())
1228 uv_system_init();
1185out: 1229out:
1186 preempt_enable(); 1230 preempt_enable();
1187} 1231}
@@ -1271,16 +1315,13 @@ __init void prefill_possible_map(void)
1271 if (!num_processors) 1315 if (!num_processors)
1272 num_processors = 1; 1316 num_processors = 1;
1273 1317
1274#ifdef CONFIG_HOTPLUG_CPU
1275 if (additional_cpus == -1) { 1318 if (additional_cpus == -1) {
1276 if (disabled_cpus > 0) 1319 if (disabled_cpus > 0)
1277 additional_cpus = disabled_cpus; 1320 additional_cpus = disabled_cpus;
1278 else 1321 else
1279 additional_cpus = 0; 1322 additional_cpus = 0;
1280 } 1323 }
1281#else 1324
1282 additional_cpus = 0;
1283#endif
1284 possible = num_processors + additional_cpus; 1325 possible = num_processors + additional_cpus;
1285 if (possible > NR_CPUS) 1326 if (possible > NR_CPUS)
1286 possible = NR_CPUS; 1327 possible = NR_CPUS;
@@ -1304,7 +1345,29 @@ static void __ref remove_cpu_from_maps(int cpu)
1304 numa_remove_cpu(cpu); 1345 numa_remove_cpu(cpu);
1305} 1346}
1306 1347
1307int __cpu_disable(void) 1348void cpu_disable_common(void)
1349{
1350 int cpu = smp_processor_id();
1351 /*
1352 * HACK:
1353 * Allow any queued timer interrupts to get serviced
1354 * This is only a temporary solution until we cleanup
1355 * fixup_irqs as we do for IA64.
1356 */
1357 local_irq_enable();
1358 mdelay(1);
1359
1360 local_irq_disable();
1361 remove_siblinginfo(cpu);
1362
1363 /* It's now safe to remove this processor from the online map */
1364 lock_vector_lock();
1365 remove_cpu_from_maps(cpu);
1366 unlock_vector_lock();
1367 fixup_irqs(cpu_online_map);
1368}
1369
1370int native_cpu_disable(void)
1308{ 1371{
1309 int cpu = smp_processor_id(); 1372 int cpu = smp_processor_id();
1310 1373
@@ -1323,25 +1386,11 @@ int __cpu_disable(void)
1323 stop_apic_nmi_watchdog(NULL); 1386 stop_apic_nmi_watchdog(NULL);
1324 clear_local_APIC(); 1387 clear_local_APIC();
1325 1388
1326 /* 1389 cpu_disable_common();
1327 * HACK:
1328 * Allow any queued timer interrupts to get serviced
1329 * This is only a temporary solution until we cleanup
1330 * fixup_irqs as we do for IA64.
1331 */
1332 local_irq_enable();
1333 mdelay(1);
1334
1335 local_irq_disable();
1336 remove_siblinginfo(cpu);
1337
1338 /* It's now safe to remove this processor from the online map */
1339 remove_cpu_from_maps(cpu);
1340 fixup_irqs(cpu_online_map);
1341 return 0; 1390 return 0;
1342} 1391}
1343 1392
1344void __cpu_die(unsigned int cpu) 1393void native_cpu_die(unsigned int cpu)
1345{ 1394{
1346 /* We don't do anything here: idle task is faking death itself. */ 1395 /* We don't do anything here: idle task is faking death itself. */
1347 unsigned int i; 1396 unsigned int i;
@@ -1358,29 +1407,45 @@ void __cpu_die(unsigned int cpu)
1358 } 1407 }
1359 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1408 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1360} 1409}
1410
1411void play_dead_common(void)
1412{
1413 idle_task_exit();
1414 reset_lazy_tlbstate();
1415 irq_ctx_exit(raw_smp_processor_id());
1416 c1e_remove_cpu(raw_smp_processor_id());
1417
1418 mb();
1419 /* Ack it */
1420 __get_cpu_var(cpu_state) = CPU_DEAD;
1421
1422 /*
1423 * With physical CPU hotplug, we should halt the cpu
1424 */
1425 local_irq_disable();
1426}
1427
1428void native_play_dead(void)
1429{
1430 play_dead_common();
1431 wbinvd_halt();
1432}
1433
1361#else /* ... !CONFIG_HOTPLUG_CPU */ 1434#else /* ... !CONFIG_HOTPLUG_CPU */
1362int __cpu_disable(void) 1435int native_cpu_disable(void)
1363{ 1436{
1364 return -ENOSYS; 1437 return -ENOSYS;
1365} 1438}
1366 1439
1367void __cpu_die(unsigned int cpu) 1440void native_cpu_die(unsigned int cpu)
1368{ 1441{
1369 /* We said "no" in __cpu_disable */ 1442 /* We said "no" in __cpu_disable */
1370 BUG(); 1443 BUG();
1371} 1444}
1372#endif
1373 1445
1374/* 1446void native_play_dead(void)
1375 * If the BIOS enumerates physical processors before logical,
1376 * maxcpus=N at enumeration-time can be used to disable HT.
1377 */
1378static int __init parse_maxcpus(char *arg)
1379{ 1447{
1380 extern unsigned int maxcpus; 1448 BUG();
1381
1382 if (arg)
1383 maxcpus = simple_strtoul(arg, NULL, 0);
1384 return 0;
1385} 1449}
1386early_param("maxcpus", parse_maxcpus); 1450
1451#endif
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 99941b37eca0..397e309839dd 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -8,18 +8,21 @@
8DEFINE_PER_CPU(unsigned long, this_cpu_off); 8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off); 9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10 10
11/* Initialize the CPU's GDT. This is either the boot CPU doing itself 11/*
12 (still using the master per-cpu area), or a CPU doing it for a 12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 secondary which will soon come up. */ 13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
14__cpuinit void init_gdt(int cpu) 16__cpuinit void init_gdt(int cpu)
15{ 17{
16 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 18 struct desc_struct gdt;
17 19
18 pack_descriptor(&gdt[GDT_ENTRY_PERCPU], 20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
19 __per_cpu_offset[cpu], 0xFFFFF,
20 0x2 | DESCTYPE_S, 0x8); 21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
21 23
22 gdt[GDT_ENTRY_PERCPU].s = 1; 24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
23 26
24 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; 27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
25 per_cpu(cpu_number, cpu) = cpu; 28 per_cpu(cpu_number, cpu) = cpu;
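
init_gdt() now builds the per-CPU segment in a local descriptor and installs it with write_gdt_entry() instead of writing into the live GDT directly. The 8-byte encoding that pack_descriptor() produces can be spelled out by hand; a standalone sketch following the SDM layout, mirroring the shape of the call above (type 0x2 | DESCTYPE_S, flags 0x8) rather than the kernel's implementation:

#include <stdint.h>
#include <stdio.h>

static uint64_t pack_descriptor(uint32_t base, uint32_t limit,
				uint8_t type, uint8_t flags)
{
	uint64_t d = 0;

	d |= limit & 0xffff;				/* limit 15:0  */
	d |= (uint64_t)(base & 0xffffff) << 16;		/* base  23:0  */
	d |= (uint64_t)(type | 0x80) << 40;		/* S/type, P=1 */
	d |= (uint64_t)((limit >> 16) & 0xf) << 48;	/* limit 19:16 */
	d |= (uint64_t)(flags & 0xf) << 52;		/* G/DB/L/AVL  */
	d |= (uint64_t)(base >> 24) << 56;		/* base  31:24 */
	return d;
}

int main(void)
{
	/* writable data segment, limit 0xFFFFF, granularity bit set */
	uint64_t d = pack_descriptor(0x12345678, 0xFFFFF, 0x12, 0x8);

	printf("descriptor = %#018llx\n", (unsigned long long)d);
	/* prints 0x128f92345678ffff */
	return 0;
}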
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index d67ce5f044ba..7b987852e876 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -30,7 +30,7 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <asm/io.h> 31#include <asm/io.h>
32#include <asm/bios_ebda.h> 32#include <asm/bios_ebda.h>
33#include <asm/mach-summit/mach_mpparse.h> 33#include <asm/summit/mpparse.h>
34 34
35static struct rio_table_hdr *rio_table_hdr __initdata; 35static struct rio_table_hdr *rio_table_hdr __initdata;
36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; 36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 7066cb855a60..1884a8d12bfa 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -22,6 +22,8 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/unistd.h> 23#include <linux/unistd.h>
24 24
25#include <asm/syscalls.h>
26
25asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, 27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
26 unsigned long prot, unsigned long flags, 28 unsigned long prot, unsigned long flags,
27 unsigned long fd, unsigned long pgoff) 29 unsigned long fd, unsigned long pgoff)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 3b360ef33817..6bc211accf08 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,15 +13,17 @@
13#include <linux/utsname.h> 13#include <linux/utsname.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/uaccess.h>
16 17
17#include <asm/uaccess.h>
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h>
19 20
20asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, 21asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
21 unsigned long fd, unsigned long off) 22 unsigned long prot, unsigned long flags,
23 unsigned long fd, unsigned long off)
22{ 24{
23 long error; 25 long error;
24 struct file * file; 26 struct file *file;
25 27
26 error = -EINVAL; 28 error = -EINVAL;
27 if (off & ~PAGE_MASK) 29 if (off & ~PAGE_MASK)
@@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
56 unmapped base down for this case. This can give 58 unmapped base down for this case. This can give
57 conflicts with the heap, but we assume that glibc 59 conflicts with the heap, but we assume that glibc
58 malloc knows how to fall back to mmap. Give it 1GB 60 malloc knows how to fall back to mmap. Give it 1GB
59 of playground for now. -AK */ 61 of playground for now. -AK */
60 *begin = 0x40000000; 62 *begin = 0x40000000;
61 *end = 0x80000000; 63 *end = 0x80000000;
62 if (current->flags & PF_RANDOMIZE) { 64 if (current->flags & PF_RANDOMIZE) {
63 new_begin = randomize_range(*begin, *begin + 0x02000000, 0); 65 new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
64 if (new_begin) 66 if (new_begin)
@@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
66 } 68 }
67 } else { 69 } else {
68 *begin = TASK_UNMAPPED_BASE; 70 *begin = TASK_UNMAPPED_BASE;
69 *end = TASK_SIZE; 71 *end = TASK_SIZE;
70 } 72 }
71} 73}
72 74
73unsigned long 75unsigned long
74arch_get_unmapped_area(struct file *filp, unsigned long addr, 76arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
78 struct vm_area_struct *vma; 80 struct vm_area_struct *vma;
79 unsigned long start_addr; 81 unsigned long start_addr;
80 unsigned long begin, end; 82 unsigned long begin, end;
81 83
82 if (flags & MAP_FIXED) 84 if (flags & MAP_FIXED)
83 return addr; 85 return addr;
84 86
85 find_start_end(flags, &begin, &end); 87 find_start_end(flags, &begin, &end);
86 88
87 if (len > end) 89 if (len > end)
88 return -ENOMEM; 90 return -ENOMEM;
@@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
96 } 98 }
97 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) 99 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
98 && len <= mm->cached_hole_size) { 100 && len <= mm->cached_hole_size) {
99 mm->cached_hole_size = 0; 101 mm->cached_hole_size = 0;
100 mm->free_area_cache = begin; 102 mm->free_area_cache = begin;
101 } 103 }
102 addr = mm->free_area_cache; 104 addr = mm->free_area_cache;
103 if (addr < begin) 105 if (addr < begin)
104 addr = begin; 106 addr = begin;
105 start_addr = addr; 107 start_addr = addr;
106 108
107full_search: 109full_search:
@@ -127,7 +129,7 @@ full_search:
127 return addr; 129 return addr;
128 } 130 }
129 if (addr + mm->cached_hole_size < vma->vm_start) 131 if (addr + mm->cached_hole_size < vma->vm_start)
130 mm->cached_hole_size = vma->vm_start - addr; 132 mm->cached_hole_size = vma->vm_start - addr;
131 133
132 addr = vma->vm_end; 134 addr = vma->vm_end;
133 } 135 }
@@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
177 vma = find_vma(mm, addr-len); 179 vma = find_vma(mm, addr-len);
178 if (!vma || addr <= vma->vm_start) 180 if (!vma || addr <= vma->vm_start)
179 /* remember the address as a hint for next time */ 181 /* remember the address as a hint for next time */
180 return (mm->free_area_cache = addr-len); 182 return mm->free_area_cache = addr-len;
181 } 183 }
182 184
183 if (mm->mmap_base < len) 185 if (mm->mmap_base < len)
@@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
194 vma = find_vma(mm, addr); 196 vma = find_vma(mm, addr);
195 if (!vma || addr+len <= vma->vm_start) 197 if (!vma || addr+len <= vma->vm_start)
196 /* remember the address as a hint for next time */ 198 /* remember the address as a hint for next time */
197 return (mm->free_area_cache = addr); 199 return mm->free_area_cache = addr;
198 200
199 /* remember the largest hole we saw so far */ 201 /* remember the largest hole we saw so far */
200 if (addr + mm->cached_hole_size < vma->vm_start) 202 if (addr + mm->cached_hole_size < vma->vm_start)
@@ -224,13 +226,13 @@ bottomup:
224} 226}
225 227
226 228
227asmlinkage long sys_uname(struct new_utsname __user * name) 229asmlinkage long sys_uname(struct new_utsname __user *name)
228{ 230{
229 int err; 231 int err;
230 down_read(&uts_sem); 232 down_read(&uts_sem);
231 err = copy_to_user(name, utsname(), sizeof (*name)); 233 err = copy_to_user(name, utsname(), sizeof(*name));
232 up_read(&uts_sem); 234 up_read(&uts_sem);
233 if (personality(current->personality) == PER_LINUX32) 235 if (personality(current->personality) == PER_LINUX32)
234 err |= copy_to_user(&name->machine, "i686", 5); 236 err |= copy_to_user(&name->machine, "i686", 5);
235 return err ? -EFAULT : 0; 237 return err ? -EFAULT : 0;
236} 238}
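
find_start_end() above keeps MAP_32BIT anonymous mappings inside the 0x40000000..0x80000000 window. That placement is observable from userspace on x86-64 Linux (illustrative only, error handling omitted):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	void *lo = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
	void *hi = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	printf("MAP_32BIT mapping at %p, default mapping at %p\n", lo, hi);
	return 0;
}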
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 170d43c17487..3d1be4f0fac5 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -8,12 +8,12 @@
8#define __NO_STUBS 8#define __NO_STUBS
9 9
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11#undef _ASM_X86_64_UNISTD_H_ 11#undef ASM_X86__UNISTD_64_H
12#include <asm/unistd_64.h> 12#include <asm/unistd_64.h>
13 13
14#undef __SYSCALL 14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [nr] = sym, 15#define __SYSCALL(nr, sym) [nr] = sym,
16#undef _ASM_X86_64_UNISTD_H_ 16#undef ASM_X86__UNISTD_64_H
17 17
18typedef void (*sys_call_ptr_t)(void); 18typedef void (*sys_call_ptr_t)(void);
19 19
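
The include-guard rename above matters because this file deliberately includes unistd_64.h twice: first with __SYSCALL defined to emit extern declarations for every handler, then, after undefining the guard, with __SYSCALL redefined to emit designated initializers for the dispatch table. A single-file model of that X-macro trick (CALLS stands in for the header and the handlers are dummies, not real syscall symbols):

#include <stdio.h>

/* imagine this list living in a shared header */
#define CALLS			\
	__SYSCALL(0, do_read)	\
	__SYSCALL(1, do_write)

/* first pass: emit a declaration per entry */
#define __SYSCALL(nr, sym)	static void sym(void);
CALLS
#undef __SYSCALL

static void do_read(void)  { puts("read");  }
static void do_write(void) { puts("write"); }

typedef void (*call_ptr_t)(void);

/* second pass: emit a designated initializer per entry */
#define __SYSCALL(nr, sym)	[nr] = sym,

static call_ptr_t call_table[] = {
	CALLS
};

int main(void)
{
	call_table[1]();		/* prints "write" */
	return 0;
}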
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index ffe3c664afc0..bbecf8b6bf96 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -36,6 +36,7 @@
36#include <asm/arch_hooks.h> 36#include <asm/arch_hooks.h>
37#include <asm/hpet.h> 37#include <asm/hpet.h>
38#include <asm/time.h> 38#include <asm/time.h>
39#include <asm/timer.h>
39 40
40#include "do_timer.h" 41#include "do_timer.h"
41 42
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index fec1ecedc9b7..e00534b33534 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -241,3 +241,11 @@ void flush_tlb_all(void)
241 on_each_cpu(do_flush_tlb_all, NULL, 1); 241 on_each_cpu(do_flush_tlb_all, NULL, 1);
242} 242}
243 243
244void reset_lazy_tlbstate(void)
245{
246 int cpu = raw_smp_processor_id();
247
248 per_cpu(cpu_tlbstate, cpu).state = 0;
249 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
250}
251
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d0fbb7712ab0..8b8c0d6640fa 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -17,6 +17,7 @@
17#include <asm/genapic.h> 17#include <asm/genapic.h>
18#include <asm/idle.h> 18#include <asm/idle.h>
19#include <asm/tsc.h> 19#include <asm/tsc.h>
20#include <asm/irq_vectors.h>
20 21
21#include <mach_apic.h> 22#include <mach_apic.h>
22 23
@@ -783,7 +784,7 @@ static int __init uv_bau_init(void)
783 uv_init_blade(blade, node, cur_cpu); 784 uv_init_blade(blade, node, cur_cpu);
784 cur_cpu += uv_blade_nr_possible_cpus(blade); 785 cur_cpu += uv_blade_nr_possible_cpus(blade);
785 } 786 }
786 set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); 787 alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
787 uv_enable_timeouts(); 788 uv_enable_timeouts();
788 789
789 return 0; 790 return 0;
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index ab6bf375a307..6bb7b8579e70 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -10,6 +10,7 @@
10#include <asm/ldt.h> 10#include <asm/ldt.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/proto.h> 12#include <asm/proto.h>
13#include <asm/syscalls.h>
13 14
14#include "tls.h" 15#include "tls.h"
15 16
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 03df8e45e5a1..0429c5de5ea9 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -891,6 +891,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
891{ 891{
892 struct task_struct *tsk = current; 892 struct task_struct *tsk = current;
893 unsigned int condition; 893 unsigned int condition;
894 int si_code;
894 895
895 trace_hardirqs_fixup(); 896 trace_hardirqs_fixup();
896 897
@@ -935,8 +936,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
935 goto clear_TF_reenable; 936 goto clear_TF_reenable;
936 } 937 }
937 938
939 si_code = get_si_code((unsigned long)condition);
938 /* Ok, finally something we can handle */ 940 /* Ok, finally something we can handle */
939 send_sigtrap(tsk, regs, error_code); 941 send_sigtrap(tsk, regs, error_code, si_code);
940 942
941 /* 943 /*
942 * Disable additional traps. They'll be re-enabled when 944 * Disable additional traps. They'll be re-enabled when
@@ -1228,7 +1230,6 @@ void __init trap_init(void)
1228 1230
1229 set_bit(SYSCALL_VECTOR, used_vectors); 1231 set_bit(SYSCALL_VECTOR, used_vectors);
1230 1232
1231 init_thread_xstate();
1232 /* 1233 /*
1233 * Should be a barrier for any external CPU state: 1234 * Should be a barrier for any external CPU state:
1234 */ 1235 */
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 3f18d73f420c..9c0ac0cab013 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -32,6 +32,8 @@
32#include <linux/bug.h> 32#include <linux/bug.h>
33#include <linux/nmi.h> 33#include <linux/nmi.h>
34#include <linux/mm.h> 34#include <linux/mm.h>
35#include <linux/smp.h>
36#include <linux/io.h>
35 37
36#if defined(CONFIG_EDAC) 38#if defined(CONFIG_EDAC)
37#include <linux/edac.h> 39#include <linux/edac.h>
@@ -45,9 +47,6 @@
45#include <asm/unwind.h> 47#include <asm/unwind.h>
46#include <asm/desc.h> 48#include <asm/desc.h>
47#include <asm/i387.h> 49#include <asm/i387.h>
48#include <asm/nmi.h>
49#include <asm/smp.h>
50#include <asm/io.h>
51#include <asm/pgalloc.h> 50#include <asm/pgalloc.h>
52#include <asm/proto.h> 51#include <asm/proto.h>
53#include <asm/pda.h> 52#include <asm/pda.h>
@@ -85,7 +84,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
85 84
86void printk_address(unsigned long address, int reliable) 85void printk_address(unsigned long address, int reliable)
87{ 86{
88 printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); 87 printk(" [<%016lx>] %s%pS\n",
88 address, reliable ? "" : "? ", (void *) address);
89} 89}
90 90
91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -98,7 +98,8 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
98 [STACKFAULT_STACK - 1] = "#SS", 98 [STACKFAULT_STACK - 1] = "#SS",
99 [MCE_STACK - 1] = "#MC", 99 [MCE_STACK - 1] = "#MC",
100#if DEBUG_STKSZ > EXCEPTION_STKSZ 100#if DEBUG_STKSZ > EXCEPTION_STKSZ
101 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 101 [N_EXCEPTION_STACKS ...
102 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
102#endif 103#endif
103 }; 104 };
104 unsigned k; 105 unsigned k;
@@ -163,7 +164,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
163} 164}
164 165
165/* 166/*
166 * x86-64 can have up to three kernel stacks: 167 * x86-64 can have up to three kernel stacks:
167 * process stack 168 * process stack
168 * interrupt stack 169 * interrupt stack
169 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 170 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
@@ -219,7 +220,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
219 const struct stacktrace_ops *ops, void *data) 220 const struct stacktrace_ops *ops, void *data)
220{ 221{
221 const unsigned cpu = get_cpu(); 222 const unsigned cpu = get_cpu();
222 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; 223 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
223 unsigned used = 0; 224 unsigned used = 0;
224 struct thread_info *tinfo; 225 struct thread_info *tinfo;
225 226
@@ -237,7 +238,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
237 if (!bp) { 238 if (!bp) {
238 if (task == current) { 239 if (task == current) {
239 /* Grab bp right from our regs */ 240 /* Grab bp right from our regs */
240 asm("movq %%rbp, %0" : "=r" (bp) :); 241 asm("movq %%rbp, %0" : "=r" (bp) : );
241 } else { 242 } else {
242 /* bp is the last reg pushed by switch_to */ 243 /* bp is the last reg pushed by switch_to */
243 bp = *(unsigned long *) task->thread.sp; 244 bp = *(unsigned long *) task->thread.sp;
@@ -339,9 +340,8 @@ static void
339show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 340show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
340 unsigned long *stack, unsigned long bp, char *log_lvl) 341 unsigned long *stack, unsigned long bp, char *log_lvl)
341{ 342{
342 printk("\nCall Trace:\n"); 343 printk("Call Trace:\n");
343 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 344 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
344 printk("\n");
345} 345}
346 346
347void show_trace(struct task_struct *task, struct pt_regs *regs, 347void show_trace(struct task_struct *task, struct pt_regs *regs,
@@ -357,11 +357,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
357 unsigned long *stack; 357 unsigned long *stack;
358 int i; 358 int i;
359 const int cpu = smp_processor_id(); 359 const int cpu = smp_processor_id();
360 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); 360 unsigned long *irqstack_end =
361 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); 361 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
362 unsigned long *irqstack =
363 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
362 364
363 // debugging aid: "show_stack(NULL, NULL);" prints the 365 /*
364 // back trace for this cpu. 366 * debugging aid: "show_stack(NULL, NULL);" prints the
367 * back trace for this cpu.
368 */
365 369
366 if (sp == NULL) { 370 if (sp == NULL) {
367 if (task) 371 if (task)
@@ -386,6 +390,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
386 printk(" %016lx", *stack++); 390 printk(" %016lx", *stack++);
387 touch_nmi_watchdog(); 391 touch_nmi_watchdog();
388 } 392 }
393 printk("\n");
389 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 394 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
390} 395}
391 396
@@ -404,7 +409,7 @@ void dump_stack(void)
404 409
405#ifdef CONFIG_FRAME_POINTER 410#ifdef CONFIG_FRAME_POINTER
406 if (!bp) 411 if (!bp)
407 asm("movq %%rbp, %0" : "=r" (bp):); 412 asm("movq %%rbp, %0" : "=r" (bp) : );
408#endif 413#endif
409 414
410 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 415 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
@@ -414,7 +419,6 @@ void dump_stack(void)
414 init_utsname()->version); 419 init_utsname()->version);
415 show_trace(NULL, NULL, &stack, bp); 420 show_trace(NULL, NULL, &stack, bp);
416} 421}
417
418EXPORT_SYMBOL(dump_stack); 422EXPORT_SYMBOL(dump_stack);
419 423
420void show_registers(struct pt_regs *regs) 424void show_registers(struct pt_regs *regs)
@@ -443,7 +447,6 @@ void show_registers(struct pt_regs *regs)
443 printk("Stack: "); 447 printk("Stack: ");
444 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 448 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
445 regs->bp, ""); 449 regs->bp, "");
446 printk("\n");
447 450
448 printk(KERN_EMERG "Code: "); 451 printk(KERN_EMERG "Code: ");
449 452
@@ -493,7 +496,7 @@ unsigned __kprobes long oops_begin(void)
493 raw_local_irq_save(flags); 496 raw_local_irq_save(flags);
494 cpu = smp_processor_id(); 497 cpu = smp_processor_id();
495 if (!__raw_spin_trylock(&die_lock)) { 498 if (!__raw_spin_trylock(&die_lock)) {
496 if (cpu == die_owner) 499 if (cpu == die_owner)
497 /* nested oops. should stop eventually */; 500 /* nested oops. should stop eventually */;
498 else 501 else
499 __raw_spin_lock(&die_lock); 502 __raw_spin_lock(&die_lock);
@@ -638,7 +641,7 @@ kernel_trap:
638} 641}
639 642
640#define DO_ERROR(trapnr, signr, str, name) \ 643#define DO_ERROR(trapnr, signr, str, name) \
641asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 644asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
642{ \ 645{ \
643 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 646 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
644 == NOTIFY_STOP) \ 647 == NOTIFY_STOP) \
@@ -648,7 +651,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
648} 651}
649 652
650#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 653#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
651asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 654asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
652{ \ 655{ \
653 siginfo_t info; \ 656 siginfo_t info; \
654 info.si_signo = signr; \ 657 info.si_signo = signr; \
@@ -683,7 +686,7 @@ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
683 preempt_conditional_cli(regs); 686 preempt_conditional_cli(regs);
684} 687}
685 688
686asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) 689asmlinkage void do_double_fault(struct pt_regs *regs, long error_code)
687{ 690{
688 static const char str[] = "double fault"; 691 static const char str[] = "double fault";
689 struct task_struct *tsk = current; 692 struct task_struct *tsk = current;
@@ -778,9 +781,10 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
778} 781}
779 782
780static notrace __kprobes void 783static notrace __kprobes void
781unknown_nmi_error(unsigned char reason, struct pt_regs * regs) 784unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
782{ 785{
783 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 786 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
787 NOTIFY_STOP)
784 return; 788 return;
785 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", 789 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
786 reason); 790 reason);
@@ -882,7 +886,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
882 else if (user_mode(eregs)) 886 else if (user_mode(eregs))
883 regs = task_pt_regs(current); 887 regs = task_pt_regs(current);
884 /* Exception from kernel and interrupts are enabled. Move to 888 /* Exception from kernel and interrupts are enabled. Move to
885 kernel process stack. */ 889 kernel process stack. */
886 else if (eregs->flags & X86_EFLAGS_IF) 890 else if (eregs->flags & X86_EFLAGS_IF)
887 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); 891 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
888 if (eregs != regs) 892 if (eregs != regs)
@@ -891,7 +895,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
891} 895}
892 896
893/* runs on IST stack. */ 897/* runs on IST stack. */
894asmlinkage void __kprobes do_debug(struct pt_regs * regs, 898asmlinkage void __kprobes do_debug(struct pt_regs *regs,
895 unsigned long error_code) 899 unsigned long error_code)
896{ 900{
897 struct task_struct *tsk = current; 901 struct task_struct *tsk = current;
@@ -936,7 +940,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
936 tsk->thread.error_code = error_code; 940 tsk->thread.error_code = error_code;
937 info.si_signo = SIGTRAP; 941 info.si_signo = SIGTRAP;
938 info.si_errno = 0; 942 info.si_errno = 0;
939 info.si_code = TRAP_BRKPT; 943 info.si_code = get_si_code(condition);
940 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; 944 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
941 force_sig_info(SIGTRAP, &info, tsk); 945 force_sig_info(SIGTRAP, &info, tsk);
942 946
@@ -1035,7 +1039,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
1035 1039
1036asmlinkage void bad_intr(void) 1040asmlinkage void bad_intr(void)
1037{ 1041{
1038 printk("bad interrupt"); 1042 printk("bad interrupt");
1039} 1043}
1040 1044
1041asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) 1045asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
@@ -1047,7 +1051,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1047 1051
1048 conditional_sti(regs); 1052 conditional_sti(regs);
1049 if (!user_mode(regs) && 1053 if (!user_mode(regs) &&
1050 kernel_math_error(regs, "kernel simd math error", 19)) 1054 kernel_math_error(regs, "kernel simd math error", 19))
1051 return; 1055 return;
1052 1056
1053 /* 1057 /*
@@ -1092,7 +1096,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1092 force_sig_info(SIGFPE, &info, task); 1096 force_sig_info(SIGFPE, &info, task);
1093} 1097}
1094 1098
1095asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) 1099asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs)
1096{ 1100{
1097} 1101}
1098 1102
@@ -1131,7 +1135,14 @@ asmlinkage void math_state_restore(void)
1131 } 1135 }
1132 1136
1133 clts(); /* Allow maths ops (or we recurse) */ 1137 clts(); /* Allow maths ops (or we recurse) */
1134 restore_fpu_checking(&me->thread.xstate->fxsave); 1138 /*
 1139	 * Paranoid restore. Send a SIGSEGV if we fail to restore the state.
1140 */
1141 if (unlikely(restore_fpu_checking(me))) {
1142 stts();
1143 force_sig(SIGSEGV, me);
1144 return;
1145 }
1135 task_thread_info(me)->status |= TS_USEDFPU; 1146 task_thread_info(me)->status |= TS_USEDFPU;
1136 me->fpu_counter++; 1147 me->fpu_counter++;
1137} 1148}
@@ -1142,8 +1153,10 @@ void __init trap_init(void)
1142 set_intr_gate(0, &divide_error); 1153 set_intr_gate(0, &divide_error);
1143 set_intr_gate_ist(1, &debug, DEBUG_STACK); 1154 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1144 set_intr_gate_ist(2, &nmi, NMI_STACK); 1155 set_intr_gate_ist(2, &nmi, NMI_STACK);
1145 set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ 1156 /* int3 can be called from all */
1146 set_system_gate(4, &overflow); /* int4 can be called from all */ 1157 set_system_gate_ist(3, &int3, DEBUG_STACK);
1158 /* int4 can be called from all */
1159 set_system_gate(4, &overflow);
1147 set_intr_gate(5, &bounds); 1160 set_intr_gate(5, &bounds);
1148 set_intr_gate(6, &invalid_op); 1161 set_intr_gate(6, &invalid_op);
1149 set_intr_gate(7, &device_not_available); 1162 set_intr_gate(7, &device_not_available);
@@ -1166,10 +1179,6 @@ void __init trap_init(void)
1166 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 1179 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1167#endif 1180#endif
1168 /* 1181 /*
1169 * initialize the per thread extended state:
1170 */
1171 init_thread_xstate();
1172 /*
1173 * Should be a barrier for any external CPU state: 1182 * Should be a barrier for any external CPU state:
1174 */ 1183 */
1175 cpu_init(); 1184 cpu_init();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7603c0553909..161bb850fc47 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
104/* 104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance 105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */ 106 */
107static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) 107static u64 tsc_read_refs(u64 *p, int hpet)
108{ 108{
109 u64 t1, t2; 109 u64 t1, t2;
110 int i; 110 int i;
@@ -112,9 +112,9 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
112 for (i = 0; i < MAX_RETRIES; i++) { 112 for (i = 0; i < MAX_RETRIES; i++) {
113 t1 = get_cycles(); 113 t1 = get_cycles();
114 if (hpet) 114 if (hpet)
115 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; 115 *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
116 else 116 else
117 *pm = acpi_pm_read_early(); 117 *p = acpi_pm_read_early();
118 t2 = get_cycles(); 118 t2 = get_cycles();
119 if ((t2 - t1) < SMI_TRESHOLD) 119 if ((t2 - t1) < SMI_TRESHOLD)
120 return t2; 120 return t2;
@@ -122,80 +122,390 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
122 return ULLONG_MAX; 122 return ULLONG_MAX;
123} 123}
124 124
125/** 125/*
126 * native_calibrate_tsc - calibrate the tsc on boot 126 * Calculate the TSC frequency from HPET reference
127 */ 127 */
128unsigned long native_calibrate_tsc(void) 128static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
129{ 129{
130 unsigned long flags; 130 u64 tmp;
131 u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2;
132 int hpet = is_hpet_enabled();
133 unsigned int tsc_khz_val = 0;
134 131
135 local_irq_save(flags); 132 if (hpet2 < hpet1)
133 hpet2 += 0x100000000ULL;
134 hpet2 -= hpet1;
135 tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
136 do_div(tmp, 1000000);
137 do_div(deltatsc, tmp);
138
139 return (unsigned long) deltatsc;
140}
141
142/*
143 * Calculate the TSC frequency from PMTimer reference
144 */
145static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
146{
147 u64 tmp;
136 148
137 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); 149 if (!pm1 && !pm2)
150 return ULONG_MAX;
151
152 if (pm2 < pm1)
153 pm2 += (u64)ACPI_PM_OVRRUN;
154 pm2 -= pm1;
155 tmp = pm2 * 1000000000LL;
156 do_div(tmp, PMTMR_TICKS_PER_SEC);
157 do_div(deltatsc, tmp);
158
159 return (unsigned long) deltatsc;
160}
161
162#define CAL_MS 10
163#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
164#define CAL_PIT_LOOPS 1000
165
166#define CAL2_MS 50
167#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
168#define CAL2_PIT_LOOPS 5000
169
170
171/*
172 * Try to calibrate the TSC against the Programmable
173 * Interrupt Timer and return the frequency of the TSC
174 * in kHz.
175 *
176 * Return ULONG_MAX on failure to calibrate.
177 */
178static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
179{
180 u64 tsc, t1, t2, delta;
181 unsigned long tscmin, tscmax;
182 int pitcnt;
138 183
184 /* Set the Gate high, disable speaker */
139 outb((inb(0x61) & ~0x02) | 0x01, 0x61); 185 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
140 186
187 /*
 188	 * Setup CTC channel 2 for mode 0 (interrupt on terminal
189 * count mode), binary count. Set the latch register to 50ms
190 * (LSB then MSB) to begin countdown.
191 */
141 outb(0xb0, 0x43); 192 outb(0xb0, 0x43);
142 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 193 outb(latch & 0xff, 0x42);
143 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 194 outb(latch >> 8, 0x42);
144 tr1 = get_cycles(); 195
145 while ((inb(0x61) & 0x20) == 0); 196 tsc = t1 = t2 = get_cycles();
146 tr2 = get_cycles(); 197
198 pitcnt = 0;
199 tscmax = 0;
200 tscmin = ULONG_MAX;
201 while ((inb(0x61) & 0x20) == 0) {
202 t2 = get_cycles();
203 delta = t2 - tsc;
204 tsc = t2;
205 if ((unsigned long) delta < tscmin)
206 tscmin = (unsigned int) delta;
207 if ((unsigned long) delta > tscmax)
208 tscmax = (unsigned int) delta;
209 pitcnt++;
210 }
211
212 /*
213 * Sanity checks:
214 *
215 * If we were not able to read the PIT more than loopmin
216 * times, then we have been hit by a massive SMI
217 *
218 * If the maximum is 10 times larger than the minimum,
219 * then we got hit by an SMI as well.
220 */
221 if (pitcnt < loopmin || tscmax > 10 * tscmin)
222 return ULONG_MAX;
223
224 /* Calculate the PIT value */
225 delta = t2 - t1;
226 do_div(delta, ms);
227 return delta;
228}
147 229
148 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 230/*
231 * This reads the current MSB of the PIT counter, and
232 * checks if we are running on sufficiently fast and
233 * non-virtualized hardware.
234 *
235 * Our expectations are:
236 *
237 * - the PIT is running at roughly 1.19MHz
238 *
239 * - each IO is going to take about 1us on real hardware,
240 * but we allow it to be much faster (by a factor of 10) or
241 * _slightly_ slower (ie we allow up to a 2us read+counter
242 * update - anything else implies a unacceptably slow CPU
243 * or PIT for the fast calibration to work.
244 *
245 * - with 256 PIT ticks to read the value, we have 214us to
246 * see the same MSB (and overhead like doing a single TSC
247 * read per MSB value etc).
248 *
249 * - We're doing 2 reads per loop (LSB, MSB), and we expect
250 * them each to take about a microsecond on real hardware.
251 * So we expect a count value of around 100. But we'll be
252 * generous, and accept anything over 50.
253 *
254 * - if the PIT is stuck, and we see *many* more reads, we
255 * return early (and the next caller of pit_expect_msb()
 256 * will then consider it a failure when they don't see the
257 * next expected value).
258 *
259 * These expectations mean that we know that we have seen the
260 * transition from one expected value to another with a fairly
261 * high accuracy, and we didn't miss any events. We can thus
262 * use the TSC value at the transitions to calculate a pretty
 263 * good value for the TSC frequency.
264 */
265static inline int pit_expect_msb(unsigned char val)
266{
267 int count = 0;
149 268
269 for (count = 0; count < 50000; count++) {
270 /* Ignore LSB */
271 inb(0x42);
272 if (inb(0x42) != val)
273 break;
274 }
275 return count > 50;
276}
277
278/*
279 * How many MSB values do we want to see? We aim for a
280 * 15ms calibration, which assuming a 2us counter read
281 * error should give us roughly 150 ppm precision for
282 * the calibration.
283 */
284#define QUICK_PIT_MS 15
285#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
286
287static unsigned long quick_pit_calibrate(void)
288{
289 /* Set the Gate high, disable speaker */
290 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
291
292 /*
293 * Counter 2, mode 0 (one-shot), binary count
294 *
295 * NOTE! Mode 2 decrements by two (and then the
296 * output is flipped each time, giving the same
297 * final output frequency as a decrement-by-one),
298 * so mode 0 is much better when looking at the
299 * individual counts.
300 */
301 outb(0xb0, 0x43);
302
303 /* Start at 0xffff */
304 outb(0xff, 0x42);
305 outb(0xff, 0x42);
306
307 if (pit_expect_msb(0xff)) {
308 int i;
309 u64 t1, t2, delta;
310 unsigned char expect = 0xfe;
311
312 t1 = get_cycles();
313 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
314 if (!pit_expect_msb(expect))
315 goto failed;
316 }
317 t2 = get_cycles();
318
319 /*
320 * Make sure we can rely on the second TSC timestamp:
321 */
322 if (!pit_expect_msb(expect))
323 goto failed;
324
325 /*
326 * Ok, if we get here, then we've seen the
327 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
328 * times, and each MSB had many hits, so we never
329 * had any sudden jumps.
330 *
331 * As a result, we can depend on there not being
332 * any odd delays anywhere, and the TSC reads are
333 * reliable.
334 *
335 * kHz = ticks / time-in-seconds / 1000;
336 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
337 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
338 */
339 delta = (t2 - t1)*PIT_TICK_RATE;
340 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
341 printk("Fast TSC calibration using PIT\n");
342 return delta;
343 }
344failed:
345 return 0;
346}
347
348/**
349 * native_calibrate_tsc - calibrate the tsc on boot
350 */
351unsigned long native_calibrate_tsc(void)
352{
353 u64 tsc1, tsc2, delta, ref1, ref2;
354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
355 unsigned long flags, latch, ms, fast_calibrate;
356 int hpet = is_hpet_enabled(), i, loopmin;
357
358 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate();
150 local_irq_restore(flags); 360 local_irq_restore(flags);
361 if (fast_calibrate)
362 return fast_calibrate;
151 363
152 /* 364 /*
153 * Preset the result with the raw and inaccurate PIT 365 * Run 5 calibration loops to get the lowest frequency value
154 * calibration value 366 * (the best estimate). We use two different calibration modes
367 * here:
368 *
369 * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
370 * load a timeout of 50ms. We read the time right after we
371 * started the timer and wait until the PIT count down reaches
372 * zero. In each wait loop iteration we read the TSC and check
373 * the delta to the previous read. We keep track of the min
374 * and max values of that delta. The delta is mostly defined
 375	 * by the IO time of the PIT access, so we can detect when an
 376	 * SMI/SMM disturbance happened between the two reads. If the
377 * maximum time is significantly larger than the minimum time,
378 * then we discard the result and have another try.
379 *
380 * 2) Reference counter. If available we use the HPET or the
381 * PMTIMER as a reference to check the sanity of that value.
382 * We use separate TSC readouts and check inside of the
 383	 * reference read for an SMI/SMM disturbance. We discard
384 * disturbed values here as well. We do that around the PIT
385 * calibration delay loop as we have to wait for a certain
386 * amount of time anyway.
155 */ 387 */
156 delta = (tr2 - tr1); 388
157 do_div(delta, 50); 389 /* Preset PIT loop values */
158 tsc_khz_val = delta; 390 latch = CAL_LATCH;
159 391 ms = CAL_MS;
160 /* hpet or pmtimer available ? */ 392 loopmin = CAL_PIT_LOOPS;
161 if (!hpet && !pm1 && !pm2) { 393
162 printk(KERN_INFO "TSC calibrated against PIT\n"); 394 for (i = 0; i < 3; i++) {
163 goto out; 395 unsigned long tsc_pit_khz;
396
397 /*
398 * Read the start value and the reference count of
399 * hpet/pmtimer when available. Then do the PIT
400 * calibration, which will take at least 50ms, and
401 * read the end value.
402 */
403 local_irq_save(flags);
404 tsc1 = tsc_read_refs(&ref1, hpet);
405 tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
406 tsc2 = tsc_read_refs(&ref2, hpet);
407 local_irq_restore(flags);
408
409 /* Pick the lowest PIT TSC calibration so far */
410 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
411
412 /* hpet or pmtimer available ? */
413 if (!hpet && !ref1 && !ref2)
414 continue;
415
416 /* Check, whether the sampling was disturbed by an SMI */
417 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
418 continue;
419
420 tsc2 = (tsc2 - tsc1) * 1000000LL;
421 if (hpet)
422 tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
423 else
424 tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
425
426 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
427
428 /* Check the reference deviation */
429 delta = ((u64) tsc_pit_min) * 100;
430 do_div(delta, tsc_ref_min);
431
432 /*
433 * If both calibration results are inside a 10% window
434 * then we can be sure, that the calibration
435 * succeeded. We break out of the loop right away. We
436 * use the reference value, as it is more precise.
437 */
438 if (delta >= 90 && delta <= 110) {
439 printk(KERN_INFO
440 "TSC: PIT calibration matches %s. %d loops\n",
441 hpet ? "HPET" : "PMTIMER", i + 1);
442 return tsc_ref_min;
443 }
444
445 /*
446 * Check whether PIT failed more than once. This
447 * happens in virtualized environments. We need to
448 * give the virtual PC a slightly longer timeframe for
449 * the HPET/PMTIMER to make the result precise.
450 */
451 if (i == 1 && tsc_pit_min == ULONG_MAX) {
452 latch = CAL2_LATCH;
453 ms = CAL2_MS;
454 loopmin = CAL2_PIT_LOOPS;
455 }
164 } 456 }
165 457
166 /* Check, whether the sampling was disturbed by an SMI */ 458 /*
167 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { 459 * Now check the results.
168 printk(KERN_WARNING "TSC calibration disturbed by SMI, " 460 */
169 "using PIT calibration result\n"); 461 if (tsc_pit_min == ULONG_MAX) {
170 goto out; 462 /* PIT gave no useful value */
463 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
464
465 /* We don't have an alternative source, disable TSC */
466 if (!hpet && !ref1 && !ref2) {
467 printk("TSC: No reference (HPET/PMTIMER) available\n");
468 return 0;
469 }
470
471 /* The alternative source failed as well, disable TSC */
472 if (tsc_ref_min == ULONG_MAX) {
473 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
474 "failed.\n");
475 return 0;
476 }
477
478 /* Use the alternative source */
479 printk(KERN_INFO "TSC: using %s reference calibration\n",
480 hpet ? "HPET" : "PMTIMER");
481
482 return tsc_ref_min;
171 } 483 }
172 484
173 tsc2 = (tsc2 - tsc1) * 1000000LL; 485 /* We don't have an alternative source, use the PIT calibration value */
174 486 if (!hpet && !ref1 && !ref2) {
175 if (hpet) { 487 printk(KERN_INFO "TSC: Using PIT calibration value\n");
176 printk(KERN_INFO "TSC calibrated against HPET\n"); 488 return tsc_pit_min;
177 if (hpet2 < hpet1)
178 hpet2 += 0x100000000ULL;
179 hpet2 -= hpet1;
180 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
181 do_div(tsc1, 1000000);
182 } else {
183 printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
184 if (pm2 < pm1)
185 pm2 += (u64)ACPI_PM_OVRRUN;
186 pm2 -= pm1;
187 tsc1 = pm2 * 1000000000LL;
188 do_div(tsc1, PMTMR_TICKS_PER_SEC);
189 } 489 }
190 490
191 do_div(tsc2, tsc1); 491 /* The alternative source failed, use the PIT calibration value */
192 tsc_khz_val = tsc2; 492 if (tsc_ref_min == ULONG_MAX) {
493 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
494 "Using PIT calibration\n");
495 return tsc_pit_min;
496 }
193 497
194out: 498 /*
195 return tsc_khz_val; 499 * The calibration values differ too much. In doubt, we use
500 * the PIT value as we know that there are PMTIMERs around
501 * running at double speed. At least we let the user know:
502 */
503 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
504 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
505 printk(KERN_INFO "TSC: Using PIT calibration value\n");
506 return tsc_pit_min;
196} 507}
197 508
198
199#ifdef CONFIG_X86_32 509#ifdef CONFIG_X86_32
200/* Only called from the Powernow K7 cpu freq driver */ 510/* Only called from the Powernow K7 cpu freq driver */
201int recalibrate_cpu_khz(void) 511int recalibrate_cpu_khz(void)
@@ -314,7 +624,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
314 mark_tsc_unstable("cpufreq changes"); 624 mark_tsc_unstable("cpufreq changes");
315 } 625 }
316 626
317 set_cyc2ns_scale(tsc_khz_ref, freq->cpu); 627 set_cyc2ns_scale(tsc_khz, freq->cpu);
318 628
319 return 0; 629 return 0;
320} 630}
@@ -325,6 +635,10 @@ static struct notifier_block time_cpufreq_notifier_block = {
325 635
326static int __init cpufreq_tsc(void) 636static int __init cpufreq_tsc(void)
327{ 637{
638 if (!cpu_has_tsc)
639 return 0;
640 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
641 return 0;
328 cpufreq_register_notifier(&time_cpufreq_notifier_block, 642 cpufreq_register_notifier(&time_cpufreq_notifier_block,
329 CPUFREQ_TRANSITION_NOTIFIER); 643 CPUFREQ_TRANSITION_NOTIFIER);
330 return 0; 644 return 0;
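
The rewritten native_calibrate_tsc() above cross-checks three independent sources: a PIT countdown (10ms, or 50ms in the retry case), plus an HPET or ACPI PM timer reference read around it, and it only trusts the reference result when the two agree within 10%. The quick path aims for 15ms of measurement, so a ~2us counter read error works out to about 2us/15ms ≈ 133 ppm, in line with the "roughly 150 ppm" comment. The small user-space C sketch below redoes the same arithmetic on made-up samples; the helper names and the numbers in main() are illustrative only and are not part of the patch.

#include <stdio.h>
#include <stdint.h>

/* Rate as used by the kernel; the sampled values in main() are made up. */
#define PMTMR_TICKS_PER_SEC	3579545ULL	/* ACPI PM timer rate, Hz */

/* PIT loop: 'cycles' TSC cycles elapsed while the PIT counted down 'ms' milliseconds. */
static unsigned long pit_khz(uint64_t cycles, unsigned long ms)
{
	return cycles / ms;			/* cycles per millisecond == kHz */
}

/* HPET reference: 'period_fs' is the HPET period in femtoseconds (HPET_PERIOD). */
static unsigned long hpet_khz(uint64_t cycles, uint64_t hpet_ticks, uint64_t period_fs)
{
	uint64_t ns = hpet_ticks * period_fs / 1000000;	/* elapsed time in ns */
	return cycles * 1000000ULL / ns;	/* same scaling as calc_hpet_ref() */
}

/* ACPI PM timer reference, mirroring calc_pmtimer_ref(). */
static unsigned long pmtimer_khz(uint64_t cycles, uint64_t pm_ticks)
{
	uint64_t ns = pm_ticks * 1000000000ULL / PMTMR_TICKS_PER_SEC;
	return cycles * 1000000ULL / ns;
}

int main(void)
{
	/* Pretend a 2.4 GHz TSC was sampled across a 50 ms PIT countdown... */
	uint64_t cycles = 120000000ULL;		/* 2.4e9 cycles/s * 0.050 s */
	unsigned long pit = pit_khz(cycles, 50);

	/* ...while an HPET with a 69.84 ns period advanced ~715909 ticks
	   and the PM timer advanced ~178977 ticks over the same window. */
	unsigned long hpet = hpet_khz(cycles, 715909ULL, 69841279ULL);
	unsigned long pmtmr = pmtimer_khz(cycles, 178977ULL);

	/* The kernel keeps the reference value if PIT/reference is within 90..110%. */
	unsigned long deviation = (unsigned long)((uint64_t)pit * 100 / hpet);

	printf("PIT %lu kHz, HPET %lu kHz, PMTIMER %lu kHz, deviation %lu%%\n",
	       pit, hpet, pmtmr, deviation);
	return (deviation >= 90 && deviation <= 110) ? 0 : 1;
}

Scaling the TSC delta by 10^6 before dividing by the elapsed nanoseconds keeps everything in 64-bit integer math (do_div in the kernel), so no floating point is needed and the result lands directly in kHz.
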
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0577825cf89b..9ffb01c31c40 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -88,11 +88,9 @@ static __cpuinit void check_tsc_warp(void)
88 __raw_spin_unlock(&sync_lock); 88 __raw_spin_unlock(&sync_lock);
89 } 89 }
90 } 90 }
91 if (!(now-start)) { 91 WARN(!(now-start),
92 printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", 92 "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
93 now-start, end-start); 93 now-start, end-start);
94 WARN_ON(1);
95 }
96} 94}
97 95
98/* 96/*
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 41e01b145c48..61a97e616f70 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
25#include <asm/visws/cobalt.h> 25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h> 26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h> 27#include <asm/arch_hooks.h>
28#include <asm/io_apic.h>
28#include <asm/fixmap.h> 29#include <asm/fixmap.h>
29#include <asm/reboot.h> 30#include <asm/reboot.h>
30#include <asm/setup.h> 31#include <asm/setup.h>
31#include <asm/e820.h> 32#include <asm/e820.h>
32#include <asm/smp.h>
33#include <asm/io.h> 33#include <asm/io.h>
34 34
35#include <mach_ipi.h> 35#include <mach_ipi.h>
36 36
37#include "mach_apic.h" 37#include "mach_apic.h"
38 38
39#include <linux/init.h>
40#include <linux/smp.h>
41
42#include <linux/kernel_stat.h> 39#include <linux/kernel_stat.h>
43#include <linux/interrupt.h>
44#include <linux/init.h>
45 40
46#include <asm/io.h>
47#include <asm/apic.h>
48#include <asm/i8259.h> 41#include <asm/i8259.h>
49#include <asm/irq_vectors.h> 42#include <asm/irq_vectors.h>
50#include <asm/visws/cobalt.h>
51#include <asm/visws/lithium.h> 43#include <asm/visws/lithium.h>
52#include <asm/visws/piix4.h>
53 44
54#include <linux/sched.h> 45#include <linux/sched.h>
55#include <linux/kernel.h> 46#include <linux/kernel.h>
56#include <linux/init.h>
57#include <linux/pci.h> 47#include <linux/pci.h>
58#include <linux/pci_ids.h> 48#include <linux/pci_ids.h>
59 49
60extern int no_broadcast; 50extern int no_broadcast;
61 51
62#include <asm/io.h>
63#include <asm/apic.h> 52#include <asm/apic.h>
64#include <asm/arch_hooks.h>
65#include <asm/visws/cobalt.h>
66#include <asm/visws/lithium.h>
67 53
68char visws_board_type = -1; 54char visws_board_type = -1;
69char visws_board_rev = -1; 55char visws_board_rev = -1;
@@ -184,8 +170,6 @@ static int __init visws_get_smp_config(unsigned int early)
184 return 1; 170 return 1;
185} 171}
186 172
187extern unsigned int __cpuinitdata maxcpus;
188
189/* 173/*
190 * The Visual Workstation is Intel MP compliant in the hardware 174 * The Visual Workstation is Intel MP compliant in the hardware
191 * sense, but it doesn't have a BIOS(-configuration table). 175 * sense, but it doesn't have a BIOS(-configuration table).
@@ -244,8 +228,8 @@ static int __init visws_find_smp_config(unsigned int reserve)
244 ncpus = CO_CPU_MAX; 228 ncpus = CO_CPU_MAX;
245 } 229 }
246 230
247 if (ncpus > maxcpus) 231 if (ncpus > setup_max_cpus)
248 ncpus = maxcpus; 232 ncpus = setup_max_cpus;
249 233
250#ifdef CONFIG_X86_LOCAL_APIC 234#ifdef CONFIG_X86_LOCAL_APIC
251 smp_found_config = 1; 235 smp_found_config = 1;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 38f566fa27d2..4eeb5cf9720d 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -46,6 +46,7 @@
46#include <asm/io.h> 46#include <asm/io.h>
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/irq.h> 48#include <asm/irq.h>
49#include <asm/syscalls.h>
49 50
50/* 51/*
51 * Known problems: 52 * Known problems:
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 0a1b1a9d922d..8b6c393ab9fd 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -37,6 +37,7 @@
37#include <asm/timer.h> 37#include <asm/timer.h>
38#include <asm/vmi_time.h> 38#include <asm/vmi_time.h>
39#include <asm/kmap_types.h> 39#include <asm/kmap_types.h>
40#include <asm/setup.h>
40 41
41/* Convenient for calling VMI functions indirectly in the ROM */ 42/* Convenient for calling VMI functions indirectly in the ROM */
42typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); 43typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
@@ -234,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
234 const void *desc) 235 const void *desc)
235{ 236{
236 u32 *ldt_entry = (u32 *)desc; 237 u32 *ldt_entry = (u32 *)desc;
237 vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); 238 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
238} 239}
239 240
240static void vmi_load_sp0(struct tss_struct *tss, 241static void vmi_load_sp0(struct tss_struct *tss,
@@ -392,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
392} 393}
393#endif 394#endif
394 395
395static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) 396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
396{ 397{
397 vmi_set_page_type(pfn, VMI_PAGE_L1); 398 vmi_set_page_type(pfn, VMI_PAGE_L1);
398 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
399} 400}
400 401
401static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) 402static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
402{ 403{
403 /* 404 /*
404 * This call comes in very early, before mem_map is setup. 405 * This call comes in very early, before mem_map is setup.
@@ -409,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
409 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
410} 411}
411 412
412static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) 413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
413{ 414{
414 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); 415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
415 vmi_check_page_type(clonepfn, VMI_PAGE_L2); 416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
416 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
417} 418}
418 419
419static void vmi_release_pte(u32 pfn) 420static void vmi_release_pte(unsigned long pfn)
420{ 421{
421 vmi_ops.release_page(pfn, VMI_PAGE_L1); 422 vmi_ops.release_page(pfn, VMI_PAGE_L1);
422 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
423} 424}
424 425
425static void vmi_release_pmd(u32 pfn) 426static void vmi_release_pmd(unsigned long pfn)
426{ 427{
427 vmi_ops.release_page(pfn, VMI_PAGE_L2); 428 vmi_ops.release_page(pfn, VMI_PAGE_L2);
428 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -683,7 +684,7 @@ void vmi_bringup(void)
683{ 684{
684 /* We must establish the lowmem mapping for MMU ops to work */ 685 /* We must establish the lowmem mapping for MMU ops to work */
685 if (vmi_ops.set_linear_mapping) 686 if (vmi_ops.set_linear_mapping)
686 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); 687 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
687} 688}
688 689
689/* 690/*
@@ -904,8 +905,8 @@ static inline int __init activate_vmi(void)
904#endif 905#endif
905 906
906#ifdef CONFIG_X86_LOCAL_APIC 907#ifdef CONFIG_X86_LOCAL_APIC
907 para_fill(pv_apic_ops.apic_read, APICRead); 908 para_fill(apic_ops->read, APICRead);
908 para_fill(pv_apic_ops.apic_write, APICWrite); 909 para_fill(apic_ops->write, APICWrite);
909#endif 910#endif
910 911
911 /* 912 /*
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index cdb2363697d2..a9b8560adbc2 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -140,10 +140,10 @@ SECTIONS
140 *(.con_initcall.init) 140 *(.con_initcall.init)
141 __con_initcall_end = .; 141 __con_initcall_end = .;
142 } 142 }
143 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 143 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
144 __x86cpuvendor_start = .; 144 __x86_cpu_dev_start = .;
145 *(.x86cpuvendor.init) 145 *(.x86_cpu_dev.init)
146 __x86cpuvendor_end = .; 146 __x86_cpu_dev_end = .;
147 } 147 }
148 SECURITY_INIT 148 SECURITY_INIT
149 . = ALIGN(4); 149 . = ALIGN(4);
@@ -180,6 +180,7 @@ SECTIONS
180 . = ALIGN(PAGE_SIZE); 180 . = ALIGN(PAGE_SIZE);
181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
182 __per_cpu_start = .; 182 __per_cpu_start = .;
183 *(.data.percpu.page_aligned)
183 *(.data.percpu) 184 *(.data.percpu)
184 *(.data.percpu.shared_aligned) 185 *(.data.percpu.shared_aligned)
185 __per_cpu_end = .; 186 __per_cpu_end = .;
@@ -209,3 +210,11 @@ SECTIONS
209 210
210 DWARF_DEBUG 211 DWARF_DEBUG
211} 212}
213
214#ifdef CONFIG_KEXEC
215/* Link time checks */
216#include <asm/kexec.h>
217
218ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
219 "kexec control code size is too big")
220#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 63e5c1a22e88..46e05447405b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -168,12 +168,11 @@ SECTIONS
168 *(.con_initcall.init) 168 *(.con_initcall.init)
169 } 169 }
170 __con_initcall_end = .; 170 __con_initcall_end = .;
171 . = ALIGN(16); 171 __x86_cpu_dev_start = .;
172 __x86cpuvendor_start = .; 172 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
173 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 173 *(.x86_cpu_dev.init)
174 *(.x86cpuvendor.init)
175 } 174 }
176 __x86cpuvendor_end = .; 175 __x86_cpu_dev_end = .;
177 SECURITY_INIT 176 SECURITY_INIT
178 177
179 . = ALIGN(8); 178 . = ALIGN(8);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c7..7766d36983fc 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 62}
63 63
64static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, 64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 65 unsigned long addr, unsigned len)
66{ 66{
67 switch (type) { 67 switch (type) {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
new file mode 100644
index 000000000000..9abac8a9d823
--- /dev/null
+++ b/arch/x86/kernel/xsave.c
@@ -0,0 +1,345 @@
1/*
2 * xsave/xrstor support.
3 *
4 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
5 */
6#include <linux/bootmem.h>
7#include <linux/compat.h>
8#include <asm/i387.h>
9#ifdef CONFIG_IA32_EMULATION
10#include <asm/sigcontext32.h>
11#endif
12#include <asm/xcr.h>
13
14/*
15 * Supported feature mask by the CPU and the kernel.
16 */
17u64 pcntxt_mask;
18
19struct _fpx_sw_bytes fx_sw_reserved;
20#ifdef CONFIG_IA32_EMULATION
21struct _fpx_sw_bytes fx_sw_reserved_ia32;
22#endif
23
24/*
25 * Check for the presence of extended state information in the
26 * user fpstate pointer in the sigcontext.
27 */
28int check_for_xstate(struct i387_fxsave_struct __user *buf,
29 void __user *fpstate,
30 struct _fpx_sw_bytes *fx_sw_user)
31{
32 int min_xstate_size = sizeof(struct i387_fxsave_struct) +
33 sizeof(struct xsave_hdr_struct);
34 unsigned int magic2;
35 int err;
36
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes));
39
40 if (err)
41 return err;
42
43 /*
44 * First Magic check failed.
45 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1;
48
49 /*
50 * Check for error scenarios.
51 */
52 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1;
56
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE));
60 /*
61 * Check for the presence of second magic word at the end of memory
62 * layout. This detects the case where the user just copied the legacy
 63	 * fpstate layout without copying the extended state information
64 * in the memory layout.
65 */
66 if (err || magic2 != FP_XSTATE_MAGIC2)
67 return -1;
68
69 return 0;
70}
71
72#ifdef CONFIG_X86_64
73/*
74 * Signal frame handlers.
75 */
76
77int save_i387_xstate(void __user *buf)
78{
79 struct task_struct *tsk = current;
80 int err = 0;
81
82 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
83 return -EACCES;
84
85 BUG_ON(sig_xstate_size < xstate_size);
86
87 if ((unsigned long)buf % 64)
88 printk("save_i387_xstate: bad fpstate %p\n", buf);
89
90 if (!used_math())
91 return 0;
92 clear_used_math(); /* trigger finit */
93 if (task_thread_info(tsk)->status & TS_USEDFPU) {
94 /*
95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave.
97 */
98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
101
102 if (task_thread_info(tsk)->status & TS_XSAVE)
103 err = xsave_user(buf);
104 else
105 err = fxsave_user(buf);
106
107 if (err)
108 return err;
109 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts();
111 } else {
112 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
113 xstate_size))
114 return -1;
115 }
116
117 if (task_thread_info(tsk)->status & TS_XSAVE) {
118 struct _fpstate __user *fx = buf;
119 struct _xstate __user *x = buf;
120 u64 xstate_bv;
121
122 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
123 sizeof(struct _fpx_sw_bytes));
124
125 err |= __put_user(FP_XSTATE_MAGIC2,
126 (__u32 __user *) (buf + sig_xstate_size
127 - FP_XSTATE_MAGIC2_SIZE));
128
129 /*
130 * Read the xstate_bv which we copied (directly from the cpu or
131 * from the state in task struct) to the user buffers and
132 * set the FP/SSE bits.
133 */
134 err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
135
136 /*
 137	 * For legacy compatibility, we always set FP/SSE bits in the bit
138 * vector while saving the state to the user context. This will
 139	 * enable us to capture any changes (during sigreturn) to
140 * the FP/SSE bits by the legacy applications which don't touch
141 * xstate_bv in the xsave header.
142 *
143 * xsave aware apps can change the xstate_bv in the xsave
144 * header as well as change any contents in the memory layout.
145 * xrestore as part of sigreturn will capture all the changes.
146 */
147 xstate_bv |= XSTATE_FPSSE;
148
149 err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
150
151 if (err)
152 return err;
153 }
154
155 return 1;
156}
157
158/*
159 * Restore the extended state if present. Otherwise, restore the FP/SSE
160 * state.
161 */
162int restore_user_xstate(void __user *buf)
163{
164 struct _fpx_sw_bytes fx_sw_user;
165 u64 mask;
166 int err;
167
168 if (((unsigned long)buf % 64) ||
169 check_for_xstate(buf, buf, &fx_sw_user))
170 goto fx_only;
171
172 mask = fx_sw_user.xstate_bv;
173
174 /*
175 * restore the state passed by the user.
176 */
177 err = xrestore_user(buf, mask);
178 if (err)
179 return err;
180
181 /*
182 * init the state skipped by the user.
183 */
184 mask = pcntxt_mask & ~mask;
185
186 xrstor_state(init_xstate_buf, mask);
187
188 return 0;
189
190fx_only:
191 /*
192 * couldn't find the extended state information in the
193 * memory layout. Restore just the FP/SSE and init all
194 * the other extended state.
195 */
196 xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
197 return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
198}
199
200/*
201 * This restores directly out of user space. Exceptions are handled.
202 */
203int restore_i387_xstate(void __user *buf)
204{
205 struct task_struct *tsk = current;
206 int err = 0;
207
208 if (!buf) {
209 if (used_math())
210 goto clear;
211 return 0;
212 } else
213 if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
214 return -EACCES;
215
216 if (!used_math()) {
217 err = init_fpu(tsk);
218 if (err)
219 return err;
220 }
221
222 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
223 clts();
224 task_thread_info(current)->status |= TS_USEDFPU;
225 }
226 if (task_thread_info(tsk)->status & TS_XSAVE)
227 err = restore_user_xstate(buf);
228 else
229 err = fxrstor_checking((__force struct i387_fxsave_struct *)
230 buf);
231 if (unlikely(err)) {
232 /*
233 * Encountered an error while doing the restore from the
234 * user buffer, clear the fpu state.
235 */
236clear:
237 clear_fpu(tsk);
238 clear_used_math();
239 }
240 return err;
241}
242#endif
243
244/*
245 * Prepare the SW reserved portion of the fxsave memory layout, indicating
246 * the presence of the extended state information in the memory layout
 247 * pointed to by the fpstate pointer in the sigcontext.
 248 * This will be saved whenever the FP and extended state context is
249 * saved on the user stack during the signal handler delivery to the user.
250 */
251void prepare_fx_sw_frame(void)
252{
253 int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
254 FP_XSTATE_MAGIC2_SIZE;
255
256 sig_xstate_size = sizeof(struct _fpstate) + size_extended;
257
258#ifdef CONFIG_IA32_EMULATION
259 sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
260#endif
261
262 memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
263
264 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
265 fx_sw_reserved.extended_size = sig_xstate_size;
266 fx_sw_reserved.xstate_bv = pcntxt_mask;
267 fx_sw_reserved.xstate_size = xstate_size;
268#ifdef CONFIG_IA32_EMULATION
269 memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
270 sizeof(struct _fpx_sw_bytes));
271 fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
272#endif
273}
274
275/*
276 * Represents init state for the supported extended state.
277 */
278struct xsave_struct *init_xstate_buf;
279
280#ifdef CONFIG_X86_64
281unsigned int sig_xstate_size = sizeof(struct _fpstate);
282#endif
283
284/*
285 * Enable the extended processor state save/restore feature
286 */
287void __cpuinit xsave_init(void)
288{
289 if (!cpu_has_xsave)
290 return;
291
292 set_in_cr4(X86_CR4_OSXSAVE);
293
294 /*
295 * Enable all the features that the HW is capable of
296 * and the Linux kernel is aware of.
297 */
298 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
299}
300
301/*
302 * setup the xstate image representing the init state
303 */
304static void __init setup_xstate_init(void)
305{
306 init_xstate_buf = alloc_bootmem(xstate_size);
307 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
308}
309
310/*
311 * Enable and initialize the xsave feature.
312 */
313void __init xsave_cntxt_init(void)
314{
315 unsigned int eax, ebx, ecx, edx;
316
317 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
318 pcntxt_mask = eax + ((u64)edx << 32);
319
320 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
321 printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
322 pcntxt_mask);
323 BUG();
324 }
325
326 /*
 327	 * for now the OS knows only about FP/SSE
328 */
329 pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
330 xsave_init();
331
332 /*
333 * Recompute the context size for enabled features
334 */
335 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
336 xstate_size = ebx;
337
338 prepare_fx_sw_frame();
339
340 setup_xstate_init();
341
342 printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
343 "cntxt size 0x%x\n",
344 pcntxt_mask, xstate_size);
345}
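
The new xsave.c lays the extended state out behind the legacy fxsave image in the signal frame: the _fpx_sw_bytes block in sw_reserved carries FP_XSTATE_MAGIC1 plus the sizes, and FP_XSTATE_MAGIC2 sits in the last four bytes of the area, which is how check_for_xstate() tells a full xsave frame apart from a legacy-only copy. The sketch below just redoes the size bookkeeping from prepare_fx_sw_frame() in user space; the concrete sizes are illustrative assumptions (the real xstate_size comes from CPUID leaf 0xd), not values taken from the patch.

#include <stdio.h>

/* Illustrative sizes; the kernel derives xstate_size from CPUID.(EAX=0xd,ECX=0):EBX. */
#define FXSAVE_SIZE		512	/* sizeof(struct i387_fxsave_struct)      */
#define FPSTATE_SIZE		512	/* legacy struct _fpstate in the sigframe */
#define FP_XSTATE_MAGIC2_SIZE	4	/* trailing magic word                    */

int main(void)
{
	unsigned int xstate_size = 576;	/* e.g. fxsave area + 64-byte xsave header (FP/SSE only) */

	/* Mirrors prepare_fx_sw_frame(): bytes beyond fxsave, plus the end marker. */
	unsigned int size_extended = (xstate_size - FXSAVE_SIZE) + FP_XSTATE_MAGIC2_SIZE;
	unsigned int sig_xstate_size = FPSTATE_SIZE + size_extended;

	printf("fpstate area in the signal frame: %u bytes "
	       "(legacy %u + extended %u + MAGIC2 %u)\n",
	       sig_xstate_size, FPSTATE_SIZE,
	       xstate_size - FXSAVE_SIZE, FP_XSTATE_MAGIC2_SIZE);
	return 0;
}

save_i387_xstate() relies on this layout when it writes FP_XSTATE_MAGIC2 at buf + sig_xstate_size - FP_XSTATE_MAGIC2_SIZE, and restore_user_xstate() falls back to a plain fxrstor when either magic word is missing.
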
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fabc5f3b..ce3251ce5504 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM 22 depends on HAVE_KVM
23 select PREEMPT_NOTIFIERS 23 select PREEMPT_NOTIFIERS
24 select MMU_NOTIFIER
24 select ANON_INODES 25 select ANON_INODES
25 ---help--- 26 ---help---
26 Support hosting fully virtualized guest machines using hardware 27 Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b0e4ddca6c18..3da2508eb22a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -653,6 +653,88 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
653 account_shadowed(kvm, gfn); 653 account_shadowed(kvm, gfn);
654} 654}
655 655
656static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
657{
658 u64 *spte;
659 int need_tlb_flush = 0;
660
661 while ((spte = rmap_next(kvm, rmapp, NULL))) {
662 BUG_ON(!(*spte & PT_PRESENT_MASK));
663 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
664 rmap_remove(kvm, spte);
665 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
666 need_tlb_flush = 1;
667 }
668 return need_tlb_flush;
669}
670
671static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
672 int (*handler)(struct kvm *kvm, unsigned long *rmapp))
673{
674 int i;
675 int retval = 0;
676
677 /*
 678	 * If mmap_sem isn't taken, we can look up the memslots with only
679 * the mmu_lock by skipping over the slots with userspace_addr == 0.
680 */
681 for (i = 0; i < kvm->nmemslots; i++) {
682 struct kvm_memory_slot *memslot = &kvm->memslots[i];
683 unsigned long start = memslot->userspace_addr;
684 unsigned long end;
685
686 /* mmu_lock protects userspace_addr */
687 if (!start)
688 continue;
689
690 end = start + (memslot->npages << PAGE_SHIFT);
691 if (hva >= start && hva < end) {
692 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
693 retval |= handler(kvm, &memslot->rmap[gfn_offset]);
694 retval |= handler(kvm,
695 &memslot->lpage_info[
696 gfn_offset /
697 KVM_PAGES_PER_HPAGE].rmap_pde);
698 }
699 }
700
701 return retval;
702}
703
704int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
705{
706 return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
707}
708
709static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
710{
711 u64 *spte;
712 int young = 0;
713
714 /* always return old for EPT */
715 if (!shadow_accessed_mask)
716 return 0;
717
718 spte = rmap_next(kvm, rmapp, NULL);
719 while (spte) {
720 int _young;
721 u64 _spte = *spte;
722 BUG_ON(!(_spte & PT_PRESENT_MASK));
723 _young = _spte & PT_ACCESSED_MASK;
724 if (_young) {
725 young = 1;
726 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
727 }
728 spte = rmap_next(kvm, rmapp, spte);
729 }
730 return young;
731}
732
733int kvm_age_hva(struct kvm *kvm, unsigned long hva)
734{
735 return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
736}
737
656#ifdef MMU_DEBUG 738#ifdef MMU_DEBUG
657static int is_empty_shadow_page(u64 *spt) 739static int is_empty_shadow_page(u64 *spt)
658{ 740{
@@ -1203,6 +1285,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1203 int r; 1285 int r;
1204 int largepage = 0; 1286 int largepage = 0;
1205 pfn_t pfn; 1287 pfn_t pfn;
1288 unsigned long mmu_seq;
1206 1289
1207 down_read(&current->mm->mmap_sem); 1290 down_read(&current->mm->mmap_sem);
1208 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1291 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1210,6 +1293,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1210 largepage = 1; 1293 largepage = 1;
1211 } 1294 }
1212 1295
1296 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1297 /* implicit mb(), we'll read before PT lock is unlocked */
1213 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1298 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1214 up_read(&current->mm->mmap_sem); 1299 up_read(&current->mm->mmap_sem);
1215 1300
@@ -1220,6 +1305,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1220 } 1305 }
1221 1306
1222 spin_lock(&vcpu->kvm->mmu_lock); 1307 spin_lock(&vcpu->kvm->mmu_lock);
1308 if (mmu_notifier_retry(vcpu, mmu_seq))
1309 goto out_unlock;
1223 kvm_mmu_free_some_pages(vcpu); 1310 kvm_mmu_free_some_pages(vcpu);
1224 r = __direct_map(vcpu, v, write, largepage, gfn, pfn, 1311 r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
1225 PT32E_ROOT_LEVEL); 1312 PT32E_ROOT_LEVEL);
@@ -1227,6 +1314,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1227 1314
1228 1315
1229 return r; 1316 return r;
1317
1318out_unlock:
1319 spin_unlock(&vcpu->kvm->mmu_lock);
1320 kvm_release_pfn_clean(pfn);
1321 return 0;
1230} 1322}
1231 1323
1232 1324
@@ -1345,6 +1437,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1345 int r; 1437 int r;
1346 int largepage = 0; 1438 int largepage = 0;
1347 gfn_t gfn = gpa >> PAGE_SHIFT; 1439 gfn_t gfn = gpa >> PAGE_SHIFT;
1440 unsigned long mmu_seq;
1348 1441
1349 ASSERT(vcpu); 1442 ASSERT(vcpu);
1350 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 1443 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1358,6 +1451,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1358 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1451 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1359 largepage = 1; 1452 largepage = 1;
1360 } 1453 }
1454 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1455 /* implicit mb(), we'll read before PT lock is unlocked */
1361 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1456 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1362 up_read(&current->mm->mmap_sem); 1457 up_read(&current->mm->mmap_sem);
1363 if (is_error_pfn(pfn)) { 1458 if (is_error_pfn(pfn)) {
@@ -1365,12 +1460,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1365 return 1; 1460 return 1;
1366 } 1461 }
1367 spin_lock(&vcpu->kvm->mmu_lock); 1462 spin_lock(&vcpu->kvm->mmu_lock);
1463 if (mmu_notifier_retry(vcpu, mmu_seq))
1464 goto out_unlock;
1368 kvm_mmu_free_some_pages(vcpu); 1465 kvm_mmu_free_some_pages(vcpu);
1369 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 1466 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1370 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); 1467 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
1371 spin_unlock(&vcpu->kvm->mmu_lock); 1468 spin_unlock(&vcpu->kvm->mmu_lock);
1372 1469
1373 return r; 1470 return r;
1471
1472out_unlock:
1473 spin_unlock(&vcpu->kvm->mmu_lock);
1474 kvm_release_pfn_clean(pfn);
1475 return 0;
1374} 1476}
1375 1477
1376static void nonpaging_free(struct kvm_vcpu *vcpu) 1478static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1670,6 +1772,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1670 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1772 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1671 vcpu->arch.update_pte.largepage = 1; 1773 vcpu->arch.update_pte.largepage = 1;
1672 } 1774 }
1775 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
1776 /* implicit mb(), we'll read before PT lock is unlocked */
1673 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1777 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1674 up_read(&current->mm->mmap_sem); 1778 up_read(&current->mm->mmap_sem);
1675 1779
@@ -1814,6 +1918,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1814 spin_unlock(&vcpu->kvm->mmu_lock); 1918 spin_unlock(&vcpu->kvm->mmu_lock);
1815 return r; 1919 return r;
1816} 1920}
1921EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
1817 1922
1818void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 1923void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1819{ 1924{
@@ -1870,6 +1975,12 @@ void kvm_enable_tdp(void)
1870} 1975}
1871EXPORT_SYMBOL_GPL(kvm_enable_tdp); 1976EXPORT_SYMBOL_GPL(kvm_enable_tdp);
1872 1977
1978void kvm_disable_tdp(void)
1979{
1980 tdp_enabled = false;
1981}
1982EXPORT_SYMBOL_GPL(kvm_disable_tdp);
1983
1873static void free_mmu_pages(struct kvm_vcpu *vcpu) 1984static void free_mmu_pages(struct kvm_vcpu *vcpu)
1874{ 1985{
1875 struct kvm_mmu_page *sp; 1986 struct kvm_mmu_page *sp;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4d918220baeb..4a814bff21f2 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
263 pfn = vcpu->arch.update_pte.pfn; 263 pfn = vcpu->arch.update_pte.pfn;
264 if (is_error_pfn(pfn)) 264 if (is_error_pfn(pfn))
265 return; 265 return;
266 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
267 return;
266 kvm_get_pfn(pfn); 268 kvm_get_pfn(pfn);
267 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
268 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), 270 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -343,7 +345,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
343 shadow_addr = __pa(shadow_page->spt); 345 shadow_addr = __pa(shadow_page->spt);
344 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK 346 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
345 | PT_WRITABLE_MASK | PT_USER_MASK; 347 | PT_WRITABLE_MASK | PT_USER_MASK;
346 *shadow_ent = shadow_pte; 348 set_shadow_pte(shadow_ent, shadow_pte);
347 } 349 }
348 350
349 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, 351 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
@@ -380,6 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
380 int r; 382 int r;
381 pfn_t pfn; 383 pfn_t pfn;
382 int largepage = 0; 384 int largepage = 0;
385 unsigned long mmu_seq;
383 386
384 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 387 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
385 kvm_mmu_audit(vcpu, "pre page fault"); 388 kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,6 +416,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
413 largepage = 1; 416 largepage = 1;
414 } 417 }
415 } 418 }
419 mmu_seq = vcpu->kvm->mmu_notifier_seq;
420 /* implicit mb(), we'll read before PT lock is unlocked */
416 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 421 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
417 up_read(&current->mm->mmap_sem); 422 up_read(&current->mm->mmap_sem);
418 423
@@ -424,6 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
424 } 429 }
425 430
426 spin_lock(&vcpu->kvm->mmu_lock); 431 spin_lock(&vcpu->kvm->mmu_lock);
432 if (mmu_notifier_retry(vcpu, mmu_seq))
433 goto out_unlock;
427 kvm_mmu_free_some_pages(vcpu); 434 kvm_mmu_free_some_pages(vcpu);
428 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 435 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
429 largepage, &write_pt, pfn); 436 largepage, &write_pt, pfn);
@@ -439,6 +446,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
439 spin_unlock(&vcpu->kvm->mmu_lock); 446 spin_unlock(&vcpu->kvm->mmu_lock);
440 447
441 return write_pt; 448 return write_pt;
449
450out_unlock:
451 spin_unlock(&vcpu->kvm->mmu_lock);
452 kvm_release_pfn_clean(pfn);
453 return 0;
442} 454}
443 455
444static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 456static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b756e876dce3..8233b86c778c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -62,6 +62,7 @@ static int npt = 1;
62module_param(npt, int, S_IRUGO); 62module_param(npt, int, S_IRUGO);
63 63
64static void kvm_reput_irq(struct vcpu_svm *svm); 64static void kvm_reput_irq(struct vcpu_svm *svm);
65static void svm_flush_tlb(struct kvm_vcpu *vcpu);
65 66
66static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 67static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
67{ 68{
@@ -453,7 +454,8 @@ static __init int svm_hardware_setup(void)
453 if (npt_enabled) { 454 if (npt_enabled) {
454 printk(KERN_INFO "kvm: Nested Paging enabled\n"); 455 printk(KERN_INFO "kvm: Nested Paging enabled\n");
455 kvm_enable_tdp(); 456 kvm_enable_tdp();
456 } 457 } else
458 kvm_disable_tdp();
457 459
458 return 0; 460 return 0;
459 461
@@ -877,6 +879,10 @@ set:
877static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 879static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
878{ 880{
879 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; 881 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
882 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
883
884 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
885 force_new_asid(vcpu);
880 886
881 vcpu->arch.cr4 = cr4; 887 vcpu->arch.cr4 = cr4;
882 if (!npt_enabled) 888 if (!npt_enabled)
@@ -1007,10 +1013,13 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1007 struct kvm *kvm = svm->vcpu.kvm; 1013 struct kvm *kvm = svm->vcpu.kvm;
1008 u64 fault_address; 1014 u64 fault_address;
1009 u32 error_code; 1015 u32 error_code;
1016 bool event_injection = false;
1010 1017
1011 if (!irqchip_in_kernel(kvm) && 1018 if (!irqchip_in_kernel(kvm) &&
1012 is_external_interrupt(exit_int_info)) 1019 is_external_interrupt(exit_int_info)) {
1020 event_injection = true;
1013 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); 1021 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
1022 }
1014 1023
1015 fault_address = svm->vmcb->control.exit_info_2; 1024 fault_address = svm->vmcb->control.exit_info_2;
1016 error_code = svm->vmcb->control.exit_info_1; 1025 error_code = svm->vmcb->control.exit_info_1;
@@ -1023,7 +1032,16 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1023 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, 1032 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
1024 (u32)fault_address, (u32)(fault_address >> 32), 1033 (u32)fault_address, (u32)(fault_address >> 32),
1025 handler); 1034 handler);
1035 /*
1036 * FIXME: This shouldn't be necessary here, but there is a flush
1037 * missing in the MMU code. Until we find this bug, flush the
1038 * complete TLB here on an NPF
1039 */
1040 if (npt_enabled)
1041 svm_flush_tlb(&svm->vcpu);
1026 1042
1043 if (event_injection)
1044 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1027 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1045 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1028} 1046}
1029 1047
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0cac63701719..7041cc52b562 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2298,6 +2298,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2298 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2298 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2299 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2299 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2300 (u32)((u64)cr2 >> 32), handler); 2300 (u32)((u64)cr2 >> 32), handler);
2301 if (vect_info & VECTORING_INFO_VALID_MASK)
2302 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2301 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2303 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2302 } 2304 }
2303 2305
@@ -3116,15 +3118,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3116 return ERR_PTR(-ENOMEM); 3118 return ERR_PTR(-ENOMEM);
3117 3119
3118 allocate_vpid(vmx); 3120 allocate_vpid(vmx);
3119 if (id == 0 && vm_need_ept()) {
3120 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3121 VMX_EPT_WRITABLE_MASK |
3122 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3123 kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
3124 VMX_EPT_FAKE_DIRTY_MASK, 0ull,
3125 VMX_EPT_EXECUTABLE_MASK);
3126 kvm_enable_tdp();
3127 }
3128 3121
3129 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 3122 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
3130 if (err) 3123 if (err)
@@ -3303,8 +3296,16 @@ static int __init vmx_init(void)
3303 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); 3296 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
3304 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); 3297 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
3305 3298
3306 if (cpu_has_vmx_ept()) 3299 if (vm_need_ept()) {
3307 bypass_guest_pf = 0; 3300 bypass_guest_pf = 0;
3301 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3302 VMX_EPT_WRITABLE_MASK |
3303 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3304 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3305 VMX_EPT_EXECUTABLE_MASK);
3306 kvm_enable_tdp();
3307 } else
3308 kvm_disable_tdp();
3308 3309
3309 if (bypass_guest_pf) 3310 if (bypass_guest_pf)
3310 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 3311 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 425a13436b3f..17e25995b65b 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,21 +331,6 @@ enum vmcs_field {
331 331
332#define AR_RESERVD_MASK 0xfffe0f00 332#define AR_RESERVD_MASK 0xfffe0f00
333 333
334#define MSR_IA32_VMX_BASIC 0x480
335#define MSR_IA32_VMX_PINBASED_CTLS 0x481
336#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
337#define MSR_IA32_VMX_EXIT_CTLS 0x483
338#define MSR_IA32_VMX_ENTRY_CTLS 0x484
339#define MSR_IA32_VMX_MISC 0x485
340#define MSR_IA32_VMX_CR0_FIXED0 0x486
341#define MSR_IA32_VMX_CR0_FIXED1 0x487
342#define MSR_IA32_VMX_CR4_FIXED0 0x488
343#define MSR_IA32_VMX_CR4_FIXED1 0x489
344#define MSR_IA32_VMX_VMCS_ENUM 0x48a
345#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
346#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c
347
348#define MSR_IA32_FEATURE_CONTROL 0x3a
349#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 334#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
350#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 335#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
351 336
@@ -370,8 +355,6 @@ enum vmcs_field {
370#define VMX_EPT_READABLE_MASK 0x1ull 355#define VMX_EPT_READABLE_MASK 0x1ull
371#define VMX_EPT_WRITABLE_MASK 0x2ull 356#define VMX_EPT_WRITABLE_MASK 0x2ull
372#define VMX_EPT_EXECUTABLE_MASK 0x4ull 357#define VMX_EPT_EXECUTABLE_MASK 0x4ull
373#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62)
374#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63)
375 358
376#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 359#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
377 360
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9f1cdb011cff..0d682fc6aeb3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -883,6 +883,7 @@ int kvm_dev_ioctl_check_extension(long ext)
883 case KVM_CAP_PIT: 883 case KVM_CAP_PIT:
884 case KVM_CAP_NOP_IO_DELAY: 884 case KVM_CAP_NOP_IO_DELAY:
885 case KVM_CAP_MP_STATE: 885 case KVM_CAP_MP_STATE:
886 case KVM_CAP_SYNC_MMU:
886 r = 1; 887 r = 1;
887 break; 888 break;
888 case KVM_CAP_COALESCED_MMIO: 889 case KVM_CAP_COALESCED_MMIO:
@@ -1495,6 +1496,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1495 goto out; 1496 goto out;
1496 1497
1497 down_write(&kvm->slots_lock); 1498 down_write(&kvm->slots_lock);
1499 spin_lock(&kvm->mmu_lock);
1498 1500
1499 p = &kvm->arch.aliases[alias->slot]; 1501 p = &kvm->arch.aliases[alias->slot];
1500 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1502 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1506,6 +1508,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1506 break; 1508 break;
1507 kvm->arch.naliases = n; 1509 kvm->arch.naliases = n;
1508 1510
1511 spin_unlock(&kvm->mmu_lock);
1509 kvm_mmu_zap_all(kvm); 1512 kvm_mmu_zap_all(kvm);
1510 1513
1511 up_write(&kvm->slots_lock); 1514 up_write(&kvm->slots_lock);
@@ -3184,6 +3187,10 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3184 kvm_desct->base |= seg_desc->base2 << 24; 3187 kvm_desct->base |= seg_desc->base2 << 24;
3185 kvm_desct->limit = seg_desc->limit0; 3188 kvm_desct->limit = seg_desc->limit0;
3186 kvm_desct->limit |= seg_desc->limit << 16; 3189 kvm_desct->limit |= seg_desc->limit << 16;
3190 if (seg_desc->g) {
3191 kvm_desct->limit <<= 12;
3192 kvm_desct->limit |= 0xfff;
3193 }
3187 kvm_desct->selector = selector; 3194 kvm_desct->selector = selector;
3188 kvm_desct->type = seg_desc->type; 3195 kvm_desct->type = seg_desc->type;
3189 kvm_desct->present = seg_desc->p; 3196 kvm_desct->present = seg_desc->p;
@@ -3223,6 +3230,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3223static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3230static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3224 struct desc_struct *seg_desc) 3231 struct desc_struct *seg_desc)
3225{ 3232{
3233 gpa_t gpa;
3226 struct descriptor_table dtable; 3234 struct descriptor_table dtable;
3227 u16 index = selector >> 3; 3235 u16 index = selector >> 3;
3228 3236
@@ -3232,13 +3240,16 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3232 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3240 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3233 return 1; 3241 return 1;
3234 } 3242 }
3235 return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3243 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3244 gpa += index * 8;
3245 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3236} 3246}
3237 3247
3238/* allowed just for 8 bytes segments */ 3248/* allowed just for 8 bytes segments */
3239static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3249static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3240 struct desc_struct *seg_desc) 3250 struct desc_struct *seg_desc)
3241{ 3251{
3252 gpa_t gpa;
3242 struct descriptor_table dtable; 3253 struct descriptor_table dtable;
3243 u16 index = selector >> 3; 3254 u16 index = selector >> 3;
3244 3255
@@ -3246,7 +3257,9 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3246 3257
3247 if (dtable.limit < index * 8 + 7) 3258 if (dtable.limit < index * 8 + 7)
3248 return 1; 3259 return 1;
3249 return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3260 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3261 gpa += index * 8;
3262 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3250} 3263}
3251 3264
3252static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3265static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
@@ -3258,55 +3271,7 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3258 base_addr |= (seg_desc->base1 << 16); 3271 base_addr |= (seg_desc->base1 << 16);
3259 base_addr |= (seg_desc->base2 << 24); 3272 base_addr |= (seg_desc->base2 << 24);
3260 3273
3261 return base_addr; 3274 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3262}
3263
3264static int load_tss_segment32(struct kvm_vcpu *vcpu,
3265 struct desc_struct *seg_desc,
3266 struct tss_segment_32 *tss)
3267{
3268 u32 base_addr;
3269
3270 base_addr = get_tss_base_addr(vcpu, seg_desc);
3271
3272 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3273 sizeof(struct tss_segment_32));
3274}
3275
3276static int save_tss_segment32(struct kvm_vcpu *vcpu,
3277 struct desc_struct *seg_desc,
3278 struct tss_segment_32 *tss)
3279{
3280 u32 base_addr;
3281
3282 base_addr = get_tss_base_addr(vcpu, seg_desc);
3283
3284 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3285 sizeof(struct tss_segment_32));
3286}
3287
3288static int load_tss_segment16(struct kvm_vcpu *vcpu,
3289 struct desc_struct *seg_desc,
3290 struct tss_segment_16 *tss)
3291{
3292 u32 base_addr;
3293
3294 base_addr = get_tss_base_addr(vcpu, seg_desc);
3295
3296 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3297 sizeof(struct tss_segment_16));
3298}
3299
3300static int save_tss_segment16(struct kvm_vcpu *vcpu,
3301 struct desc_struct *seg_desc,
3302 struct tss_segment_16 *tss)
3303{
3304 u32 base_addr;
3305
3306 base_addr = get_tss_base_addr(vcpu, seg_desc);
3307
3308 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3309 sizeof(struct tss_segment_16));
3310} 3275}
3311 3276
3312static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3277static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -3466,20 +3431,26 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3466} 3431}
3467 3432
3468static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3433static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3469 struct desc_struct *cseg_desc, 3434 u32 old_tss_base,
3470 struct desc_struct *nseg_desc) 3435 struct desc_struct *nseg_desc)
3471{ 3436{
3472 struct tss_segment_16 tss_segment_16; 3437 struct tss_segment_16 tss_segment_16;
3473 int ret = 0; 3438 int ret = 0;
3474 3439
3475 if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) 3440 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3441 sizeof tss_segment_16))
3476 goto out; 3442 goto out;
3477 3443
3478 save_state_to_tss16(vcpu, &tss_segment_16); 3444 save_state_to_tss16(vcpu, &tss_segment_16);
3479 save_tss_segment16(vcpu, cseg_desc, &tss_segment_16);
3480 3445
3481 if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) 3446 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3447 sizeof tss_segment_16))
3448 goto out;
3449
3450 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3451 &tss_segment_16, sizeof tss_segment_16))
3482 goto out; 3452 goto out;
3453
3483 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3454 if (load_state_from_tss16(vcpu, &tss_segment_16))
3484 goto out; 3455 goto out;
3485 3456
@@ -3489,20 +3460,26 @@ out:
3489} 3460}
3490 3461
3491static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3462static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3492 struct desc_struct *cseg_desc, 3463 u32 old_tss_base,
3493 struct desc_struct *nseg_desc) 3464 struct desc_struct *nseg_desc)
3494{ 3465{
3495 struct tss_segment_32 tss_segment_32; 3466 struct tss_segment_32 tss_segment_32;
3496 int ret = 0; 3467 int ret = 0;
3497 3468
3498 if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) 3469 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3470 sizeof tss_segment_32))
3499 goto out; 3471 goto out;
3500 3472
3501 save_state_to_tss32(vcpu, &tss_segment_32); 3473 save_state_to_tss32(vcpu, &tss_segment_32);
3502 save_tss_segment32(vcpu, cseg_desc, &tss_segment_32);
3503 3474
3504 if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) 3475 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3476 sizeof tss_segment_32))
3505 goto out; 3477 goto out;
3478
3479 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3480 &tss_segment_32, sizeof tss_segment_32))
3481 goto out;
3482
3506 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3483 if (load_state_from_tss32(vcpu, &tss_segment_32))
3507 goto out; 3484 goto out;
3508 3485
@@ -3517,16 +3494,20 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3517 struct desc_struct cseg_desc; 3494 struct desc_struct cseg_desc;
3518 struct desc_struct nseg_desc; 3495 struct desc_struct nseg_desc;
3519 int ret = 0; 3496 int ret = 0;
3497 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3498 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
3520 3499
3521 kvm_get_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3500 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
3522 3501
3502 /* FIXME: Handle errors. Failure to read either TSS or their
3503 * descriptors should generate a pagefault.
3504 */
3523 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3505 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3524 goto out; 3506 goto out;
3525 3507
3526 if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) 3508 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
3527 goto out; 3509 goto out;
3528 3510
3529
3530 if (reason != TASK_SWITCH_IRET) { 3511 if (reason != TASK_SWITCH_IRET) {
3531 int cpl; 3512 int cpl;
3532 3513
@@ -3544,8 +3525,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3544 3525
3545 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 3526 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3546 cseg_desc.type &= ~(1 << 1); //clear the B flag 3527 cseg_desc.type &= ~(1 << 1); //clear the B flag
3547 save_guest_segment_descriptor(vcpu, tr_seg.selector, 3528 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
3548 &cseg_desc);
3549 } 3529 }
3550 3530
3551 if (reason == TASK_SWITCH_IRET) { 3531 if (reason == TASK_SWITCH_IRET) {
@@ -3557,10 +3537,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3557 kvm_x86_ops->cache_regs(vcpu); 3537 kvm_x86_ops->cache_regs(vcpu);
3558 3538
3559 if (nseg_desc.type & 8) 3539 if (nseg_desc.type & 8)
3560 ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, 3540 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
3561 &nseg_desc); 3541 &nseg_desc);
3562 else 3542 else
3563 ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, 3543 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
3564 &nseg_desc); 3544 &nseg_desc);
3565 3545
3566 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 3546 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
@@ -3995,16 +3975,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3995 */ 3975 */
3996 if (!user_alloc) { 3976 if (!user_alloc) {
3997 if (npages && !old.rmap) { 3977 if (npages && !old.rmap) {
3978 unsigned long userspace_addr;
3979
3998 down_write(&current->mm->mmap_sem); 3980 down_write(&current->mm->mmap_sem);
3999 memslot->userspace_addr = do_mmap(NULL, 0, 3981 userspace_addr = do_mmap(NULL, 0,
4000 npages * PAGE_SIZE, 3982 npages * PAGE_SIZE,
4001 PROT_READ | PROT_WRITE, 3983 PROT_READ | PROT_WRITE,
4002 MAP_SHARED | MAP_ANONYMOUS, 3984 MAP_SHARED | MAP_ANONYMOUS,
4003 0); 3985 0);
4004 up_write(&current->mm->mmap_sem); 3986 up_write(&current->mm->mmap_sem);
4005 3987
4006 if (IS_ERR((void *)memslot->userspace_addr)) 3988 if (IS_ERR((void *)userspace_addr))
4007 return PTR_ERR((void *)memslot->userspace_addr); 3989 return PTR_ERR((void *)userspace_addr);
3990
3991 /* set userspace_addr atomically for kvm_hva_to_rmapp */
3992 spin_lock(&kvm->mmu_lock);
3993 memslot->userspace_addr = userspace_addr;
3994 spin_unlock(&kvm->mmu_lock);
4008 } else { 3995 } else {
4009 if (!old.user_alloc && old.rmap) { 3996 if (!old.user_alloc && old.rmap) {
4010 int ret; 3997 int ret;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 0313a5eec412..65f0b8a47bed 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -55,6 +55,7 @@
55#include <linux/lguest_launcher.h> 55#include <linux/lguest_launcher.h>
56#include <linux/virtio_console.h> 56#include <linux/virtio_console.h>
57#include <linux/pm.h> 57#include <linux/pm.h>
58#include <asm/apic.h>
58#include <asm/lguest.h> 59#include <asm/lguest.h>
59#include <asm/paravirt.h> 60#include <asm/paravirt.h>
60#include <asm/param.h> 61#include <asm/param.h>
@@ -783,14 +784,44 @@ static void lguest_wbinvd(void)
783 * code qualifies for Advanced. It will also never interrupt anything. It 784 * code qualifies for Advanced. It will also never interrupt anything. It
784 * does, however, allow us to get through the Linux boot code. */ 785 * does, however, allow us to get through the Linux boot code. */
785#ifdef CONFIG_X86_LOCAL_APIC 786#ifdef CONFIG_X86_LOCAL_APIC
786static void lguest_apic_write(unsigned long reg, u32 v) 787static void lguest_apic_write(u32 reg, u32 v)
787{ 788{
788} 789}
789 790
790static u32 lguest_apic_read(unsigned long reg) 791static u32 lguest_apic_read(u32 reg)
791{ 792{
792 return 0; 793 return 0;
793} 794}
795
796static u64 lguest_apic_icr_read(void)
797{
798 return 0;
799}
800
801static void lguest_apic_icr_write(u32 low, u32 id)
802{
803 /* Warn to see if there are any stray references */
804 WARN_ON(1);
805}
806
807static void lguest_apic_wait_icr_idle(void)
808{
809 return;
810}
811
812static u32 lguest_apic_safe_wait_icr_idle(void)
813{
814 return 0;
815}
816
817static struct apic_ops lguest_basic_apic_ops = {
818 .read = lguest_apic_read,
819 .write = lguest_apic_write,
820 .icr_read = lguest_apic_icr_read,
821 .icr_write = lguest_apic_icr_write,
822 .wait_icr_idle = lguest_apic_wait_icr_idle,
823 .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle,
824};
794#endif 825#endif
795 826
796/* STOP! Until an interrupt comes in. */ 827/* STOP! Until an interrupt comes in. */
@@ -990,8 +1021,7 @@ __init void lguest_init(void)
990 1021
991#ifdef CONFIG_X86_LOCAL_APIC 1022#ifdef CONFIG_X86_LOCAL_APIC
992 /* apic read/write intercepts */ 1023 /* apic read/write intercepts */
993 pv_apic_ops.apic_write = lguest_apic_write; 1024 apic_ops = &lguest_basic_apic_ops;
994 pv_apic_ops.apic_read = lguest_apic_read;
995#endif 1025#endif
996 1026
997 /* time operations */ 1027 /* time operations */
@@ -1014,6 +1044,9 @@ __init void lguest_init(void)
1014 init_pg_tables_start = __pa(pg0); 1044 init_pg_tables_start = __pa(pg0);
1015 init_pg_tables_end = __pa(pg0); 1045 init_pg_tables_end = __pa(pg0);
1016 1046
1047 /* As described in head_32.S, we map the first 128M of memory. */
1048 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1049
1017 /* Load the %fs segment register (the per-cpu segment register) with 1050 /* Load the %fs segment register (the per-cpu segment register) with
1018 * the normal data segment to get through booting. */ 1051 * the normal data segment to get through booting. */
1019 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1052 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index aa3fa4119424..55e11aa6d66c 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -17,9 +17,6 @@ ifeq ($(CONFIG_X86_32),y)
17 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 17 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
18else 18else
19 obj-y += io_64.o iomap_copy_64.o 19 obj-y += io_64.o iomap_copy_64.o
20
21 CFLAGS_csum-partial_64.o := -funroll-loops
22
23 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o 20 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
24 lib-y += thunk_64.o clear_page_64.o copy_page_64.o 21 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
25 lib-y += memmove_64.o memset_64.o 22 lib-y += memmove_64.o memset_64.o
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dfdf428975c0..f118c110af32 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -52,7 +52,7 @@
52 jnz 100b 52 jnz 100b
53102: 53102:
54 .section .fixup,"ax" 54 .section .fixup,"ax"
55103: addl %r8d,%edx /* ecx is zerorest also */ 55103: addl %ecx,%edx /* ecx is zerorest also */
56 jmp copy_user_handle_tail 56 jmp copy_user_handle_tail
57 .previous 57 .previous
58 58
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 40e0e309d27e..cb0c112386fb 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -32,7 +32,7 @@
32 jnz 100b 32 jnz 100b
33102: 33102:
34 .section .fixup,"ax" 34 .section .fixup,"ax"
35103: addl %r8d,%edx /* ecx is zerorest also */ 35103: addl %ecx,%edx /* ecx is zerorest also */
36 jmp copy_user_handle_tail 36 jmp copy_user_handle_tail
37 .previous 37 .previous
38 38
@@ -108,7 +108,6 @@ ENTRY(__copy_user_nocache)
108 jmp 60f 108 jmp 60f
10950: movl %ecx,%edx 10950: movl %ecx,%edx
11060: sfence 11060: sfence
111 movl %r8d,%ecx
112 jmp copy_user_handle_tail 111 jmp copy_user_handle_tail
113 .previous 112 .previous
114 113
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
index d5a2b39f882b..321cf720dbb6 100644
--- a/arch/x86/lib/msr-on-cpu.c
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -16,36 +16,46 @@ static void __rdmsr_on_cpu(void *info)
16 rdmsr(rv->msr_no, rv->l, rv->h); 16 rdmsr(rv->msr_no, rv->l, rv->h);
17} 17}
18 18
19static void __rdmsr_safe_on_cpu(void *info) 19static void __wrmsr_on_cpu(void *info)
20{ 20{
21 struct msr_info *rv = info; 21 struct msr_info *rv = info;
22 22
23 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); 23 wrmsr(rv->msr_no, rv->l, rv->h);
24} 24}
25 25
26static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) 26int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
27{ 27{
28 int err = 0; 28 int err;
29 struct msr_info rv; 29 struct msr_info rv;
30 30
31 rv.msr_no = msr_no; 31 rv.msr_no = msr_no;
32 if (safe) { 32 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
33 smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
34 err = rv.err;
35 } else {
36 smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
37 }
38 *l = rv.l; 33 *l = rv.l;
39 *h = rv.h; 34 *h = rv.h;
40 35
41 return err; 36 return err;
42} 37}
43 38
44static void __wrmsr_on_cpu(void *info) 39int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
40{
41 int err;
42 struct msr_info rv;
43
44 rv.msr_no = msr_no;
45 rv.l = l;
46 rv.h = h;
47 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
48
49 return err;
50}
51
52/* These "safe" variants are slower and should be used when the target MSR
53 may not actually exist. */
54static void __rdmsr_safe_on_cpu(void *info)
45{ 55{
46 struct msr_info *rv = info; 56 struct msr_info *rv = info;
47 57
48 wrmsr(rv->msr_no, rv->l, rv->h); 58 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
49} 59}
50 60
51static void __wrmsr_safe_on_cpu(void *info) 61static void __wrmsr_safe_on_cpu(void *info)
@@ -55,44 +65,30 @@ static void __wrmsr_safe_on_cpu(void *info)
55 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h); 65 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
56} 66}
57 67
58static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) 68int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
59{ 69{
60 int err = 0; 70 int err;
61 struct msr_info rv; 71 struct msr_info rv;
62 72
63 rv.msr_no = msr_no; 73 rv.msr_no = msr_no;
64 rv.l = l; 74 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
65 rv.h = h; 75 *l = rv.l;
66 if (safe) { 76 *h = rv.h;
67 smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
68 err = rv.err;
69 } else {
70 smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
71 }
72
73 return err;
74}
75
76void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
77{
78 _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
79}
80 77
81void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) 78 return err ? err : rv.err;
82{
83 _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
84} 79}
85 80
86/* These "safe" variants are slower and should be used when the target MSR
87 may not actually exist. */
88int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) 81int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
89{ 82{
90 return _wrmsr_on_cpu(cpu, msr_no, l, h, 1); 83 int err;
91} 84 struct msr_info rv;
92 85
93int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) 86 rv.msr_no = msr_no;
94{ 87 rv.l = l;
95 return _rdmsr_on_cpu(cpu, msr_no, l, h, 1); 88 rv.h = h;
89 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
90
91 return err ? err : rv.err;
96} 92}
97 93
98EXPORT_SYMBOL(rdmsr_on_cpu); 94EXPORT_SYMBOL(rdmsr_on_cpu);
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 94972e7c094d..82004d2bf05e 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src)
22 "testb %%al,%%al\n\t" 22 "testb %%al,%%al\n\t"
23 "jne 1b" 23 "jne 1b"
24 : "=&S" (d0), "=&D" (d1), "=&a" (d2) 24 : "=&S" (d0), "=&D" (d1), "=&a" (d2)
25 :"0" (src), "1" (dest) : "memory"); 25 : "0" (src), "1" (dest) : "memory");
26 return dest; 26 return dest;
27} 27}
28EXPORT_SYMBOL(strcpy); 28EXPORT_SYMBOL(strcpy);
@@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count)
42 "stosb\n" 42 "stosb\n"
43 "2:" 43 "2:"
44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) 44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
45 :"0" (src), "1" (dest), "2" (count) : "memory"); 45 : "0" (src), "1" (dest), "2" (count) : "memory");
46 return dest; 46 return dest;
47} 47}
48EXPORT_SYMBOL(strncpy); 48EXPORT_SYMBOL(strncpy);
@@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src)
60 "testb %%al,%%al\n\t" 60 "testb %%al,%%al\n\t"
61 "jne 1b" 61 "jne 1b"
62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) 62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); 63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
64 return dest; 64 return dest;
65} 65}
66EXPORT_SYMBOL(strcat); 66EXPORT_SYMBOL(strcat);
@@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct)
105 "2:\tsbbl %%eax,%%eax\n\t" 105 "2:\tsbbl %%eax,%%eax\n\t"
106 "orb $1,%%al\n" 106 "orb $1,%%al\n"
107 "3:" 107 "3:"
108 :"=a" (res), "=&S" (d0), "=&D" (d1) 108 : "=a" (res), "=&S" (d0), "=&D" (d1)
109 :"1" (cs), "2" (ct) 109 : "1" (cs), "2" (ct)
110 :"memory"); 110 : "memory");
111 return res; 111 return res;
112} 112}
113EXPORT_SYMBOL(strcmp); 113EXPORT_SYMBOL(strcmp);
@@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count)
130 "3:\tsbbl %%eax,%%eax\n\t" 130 "3:\tsbbl %%eax,%%eax\n\t"
131 "orb $1,%%al\n" 131 "orb $1,%%al\n"
132 "4:" 132 "4:"
133 :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) 133 : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
134 :"1" (cs), "2" (ct), "3" (count) 134 : "1" (cs), "2" (ct), "3" (count)
135 :"memory"); 135 : "memory");
136 return res; 136 return res;
137} 137}
138EXPORT_SYMBOL(strncmp); 138EXPORT_SYMBOL(strncmp);
@@ -152,9 +152,9 @@ char *strchr(const char *s, int c)
152 "movl $1,%1\n" 152 "movl $1,%1\n"
153 "2:\tmovl %1,%0\n\t" 153 "2:\tmovl %1,%0\n\t"
154 "decl %0" 154 "decl %0"
155 :"=a" (res), "=&S" (d0) 155 : "=a" (res), "=&S" (d0)
156 :"1" (s), "0" (c) 156 : "1" (s), "0" (c)
157 :"memory"); 157 : "memory");
158 return res; 158 return res;
159} 159}
160EXPORT_SYMBOL(strchr); 160EXPORT_SYMBOL(strchr);
@@ -169,9 +169,9 @@ size_t strlen(const char *s)
169 "scasb\n\t" 169 "scasb\n\t"
170 "notl %0\n\t" 170 "notl %0\n\t"
171 "decl %0" 171 "decl %0"
172 :"=c" (res), "=&D" (d0) 172 : "=c" (res), "=&D" (d0)
173 :"1" (s), "a" (0), "0" (0xffffffffu) 173 : "1" (s), "a" (0), "0" (0xffffffffu)
174 :"memory"); 174 : "memory");
175 return res; 175 return res;
176} 176}
177EXPORT_SYMBOL(strlen); 177EXPORT_SYMBOL(strlen);
@@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count)
189 "je 1f\n\t" 189 "je 1f\n\t"
190 "movl $1,%0\n" 190 "movl $1,%0\n"
191 "1:\tdecl %0" 191 "1:\tdecl %0"
192 :"=D" (res), "=&c" (d0) 192 : "=D" (res), "=&c" (d0)
193 :"a" (c), "0" (cs), "1" (count) 193 : "a" (c), "0" (cs), "1" (count)
194 :"memory"); 194 : "memory");
195 return res; 195 return res;
196} 196}
197EXPORT_SYMBOL(memchr); 197EXPORT_SYMBOL(memchr);
@@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count)
228 "cmpl $-1,%1\n\t" 228 "cmpl $-1,%1\n\t"
229 "jne 1b\n" 229 "jne 1b\n"
230 "3:\tsubl %2,%0" 230 "3:\tsubl %2,%0"
231 :"=a" (res), "=&d" (d0) 231 : "=a" (res), "=&d" (d0)
232 :"c" (s), "1" (count) 232 : "c" (s), "1" (count)
233 :"memory"); 233 : "memory");
234 return res; 234 return res;
235} 235}
236EXPORT_SYMBOL(strnlen); 236EXPORT_SYMBOL(strnlen);
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index 42e8a50303f3..8e2d55f754bf 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -23,9 +23,9 @@ __asm__ __volatile__(
23 "jne 1b\n\t" 23 "jne 1b\n\t"
24 "xorl %%eax,%%eax\n\t" 24 "xorl %%eax,%%eax\n\t"
25 "2:" 25 "2:"
26 :"=a" (__res), "=&c" (d0), "=&S" (d1) 26 : "=a" (__res), "=&c" (d0), "=&S" (d1)
27 :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) 27 : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
28 :"dx", "di"); 28 : "dx", "di");
29return __res; 29return __res;
30} 30}
31 31
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 24e60944971a..9e68075544f6 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -14,6 +14,13 @@
14#include <asm/uaccess.h> 14#include <asm/uaccess.h>
15#include <asm/mmx.h> 15#include <asm/mmx.h>
16 16
17#ifdef CONFIG_X86_INTEL_USERCOPY
18/*
19 * Alignment at which movsl is preferred for bulk memory copies.
20 */
21struct movsl_mask movsl_mask __read_mostly;
22#endif
23
17static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n) 24static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
18{ 25{
19#ifdef CONFIG_X86_INTEL_USERCOPY 26#ifdef CONFIG_X86_INTEL_USERCOPY
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 3d317836be9e..37b9ae4d44c5 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -10,13 +10,15 @@
10#include <asm/e820.h> 10#include <asm/e820.h>
11#include <asm/setup.h> 11#include <asm/setup.h>
12 12
13#include <mach_ipi.h>
14
13#ifdef CONFIG_HOTPLUG_CPU 15#ifdef CONFIG_HOTPLUG_CPU
14#define DEFAULT_SEND_IPI (1) 16#define DEFAULT_SEND_IPI (1)
15#else 17#else
16#define DEFAULT_SEND_IPI (0) 18#define DEFAULT_SEND_IPI (0)
17#endif 19#endif
18 20
19int no_broadcast=DEFAULT_SEND_IPI; 21int no_broadcast = DEFAULT_SEND_IPI;
20 22
21/** 23/**
22 * pre_intr_init_hook - initialisation prior to setting up interrupt vectors 24 * pre_intr_init_hook - initialisation prior to setting up interrupt vectors
@@ -36,15 +38,6 @@ void __init pre_intr_init_hook(void)
36 init_ISA_irqs(); 38 init_ISA_irqs();
37} 39}
38 40
39/*
40 * IRQ2 is cascade interrupt to second interrupt controller
41 */
42static struct irqaction irq2 = {
43 .handler = no_action,
44 .mask = CPU_MASK_NONE,
45 .name = "cascade",
46};
47
48/** 41/**
49 * intr_init_hook - post gate setup interrupt initialisation 42 * intr_init_hook - post gate setup interrupt initialisation
50 * 43 *
@@ -60,12 +53,6 @@ void __init intr_init_hook(void)
60 if (x86_quirks->arch_intr_init()) 53 if (x86_quirks->arch_intr_init())
61 return; 54 return;
62 } 55 }
63#ifdef CONFIG_X86_LOCAL_APIC
64 apic_intr_init();
65#endif
66
67 if (!acpi_ioapic)
68 setup_irq(2, &irq2);
69} 56}
70 57
71/** 58/**
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
deleted file mode 100644
index 3ef8b43b62fc..000000000000
--- a/arch/x86/mach-es7000/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-$(CONFIG_X86_ES7000) := es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h
deleted file mode 100644
index c8d5aa132fa0..000000000000
--- a/arch/x86/mach-es7000/es7000.h
+++ /dev/null
@@ -1,114 +0,0 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27/*
28 * ES7000 chipsets
29 */
30
31#define NON_UNISYS 0
32#define ES7000_CLASSIC 1
33#define ES7000_ZORRO 2
34
35
36#define MIP_REG 1
37#define MIP_PSAI_REG 4
38
39#define MIP_BUSY 1
40#define MIP_SPIN 0xf0000
41#define MIP_VALID 0x0100000000000000ULL
42#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
43
44#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
45
46struct mip_reg_info {
47 unsigned long long mip_info;
48 unsigned long long delivery_info;
49 unsigned long long host_reg;
50 unsigned long long mip_reg;
51};
52
53struct part_info {
54 unsigned char type;
55 unsigned char length;
56 unsigned char part_id;
57 unsigned char apic_mode;
58 unsigned long snum;
59 char ptype[16];
60 char sname[64];
61 char pname[64];
62};
63
64struct psai {
65 unsigned long long entry_type;
66 unsigned long long addr;
67 unsigned long long bep_addr;
68};
69
70struct es7000_mem_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char resv[6];
74 unsigned long long start;
75 unsigned long long size;
76};
77
78struct es7000_oem_table {
79 unsigned long long hdr;
80 struct mip_reg_info mip;
81 struct part_info pif;
82 struct es7000_mem_info shm;
83 struct psai psai;
84};
85
86#ifdef CONFIG_ACPI
87
88struct oem_table {
89 struct acpi_table_header Header;
90 u32 OEMTableAddr;
91 u32 OEMTableSize;
92};
93
94extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
95#endif
96
97struct mip_reg {
98 unsigned long long off_0;
99 unsigned long long off_8;
100 unsigned long long off_10;
101 unsigned long long off_18;
102 unsigned long long off_20;
103 unsigned long long off_28;
104 unsigned long long off_30;
105 unsigned long long off_38;
106};
107
108#define MIP_SW_APIC 0x1020b
109#define MIP_FUNC(VALUE) (VALUE & 0xff)
110
111extern int parse_unisys_oem (char *oemptr);
112extern void setup_unisys(void);
113extern int es7000_start_cpu(int cpu, unsigned long eip);
114extern void es7000_sw_apic(void);
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
index 0dbd7803a1d5..6730f4e7c744 100644
--- a/arch/x86/mach-generic/Makefile
+++ b/arch/x86/mach-generic/Makefile
@@ -9,4 +9,3 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o
9obj-$(CONFIG_X86_SUMMIT) += summit.o 9obj-$(CONFIG_X86_SUMMIT) += summit.o
10obj-$(CONFIG_X86_BIGSMP) += bigsmp.o 10obj-$(CONFIG_X86_BIGSMP) += bigsmp.o
11obj-$(CONFIG_X86_ES7000) += es7000.o 11obj-$(CONFIG_X86_ES7000) += es7000.o
12obj-$(CONFIG_X86_ES7000) += ../../x86/mach-es7000/
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 59d771714559..df37fc9d6a26 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -5,18 +5,17 @@
5#define APIC_DEFINITION 1 5#define APIC_DEFINITION 1
6#include <linux/threads.h> 6#include <linux/threads.h>
7#include <linux/cpumask.h> 7#include <linux/cpumask.h>
8#include <asm/smp.h>
9#include <asm/mpspec.h> 8#include <asm/mpspec.h>
10#include <asm/genapic.h> 9#include <asm/genapic.h>
11#include <asm/fixmap.h> 10#include <asm/fixmap.h>
12#include <asm/apicdef.h> 11#include <asm/apicdef.h>
13#include <linux/kernel.h> 12#include <linux/kernel.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <linux/dmi.h> 14#include <linux/dmi.h>
17#include <asm/mach-bigsmp/mach_apic.h> 15#include <asm/bigsmp/apicdef.h>
18#include <asm/mach-bigsmp/mach_apicdef.h> 16#include <linux/smp.h>
19#include <asm/mach-bigsmp/mach_ipi.h> 17#include <asm/bigsmp/apic.h>
18#include <asm/bigsmp/ipi.h>
20#include <asm/mach-default/mach_mpparse.h> 19#include <asm/mach-default/mach_mpparse.h>
21 20
22static int dmi_bigsmp; /* can be set by dmi scanners */ 21static int dmi_bigsmp; /* can be set by dmi scanners */
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 4742626f08c4..520cca0ee04e 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -4,20 +4,19 @@
4#define APIC_DEFINITION 1 4#define APIC_DEFINITION 1
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h> 7#include <asm/mpspec.h>
9#include <asm/genapic.h> 8#include <asm/genapic.h>
10#include <asm/fixmap.h> 9#include <asm/fixmap.h>
11#include <asm/apicdef.h> 10#include <asm/apicdef.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <asm/mach-es7000/mach_apicdef.h> 14#include <asm/es7000/apicdef.h>
17#include <asm/mach-es7000/mach_apic.h> 15#include <linux/smp.h>
18#include <asm/mach-es7000/mach_ipi.h> 16#include <asm/es7000/apic.h>
19#include <asm/mach-es7000/mach_mpparse.h> 17#include <asm/es7000/ipi.h>
20#include <asm/mach-es7000/mach_wakecpu.h> 18#include <asm/es7000/mpparse.h>
19#include <asm/es7000/wakecpu.h>
21 20
22static int probe_es7000(void) 21static int probe_es7000(void)
23{ 22{
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 8091e68764c4..8cf58394975e 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -4,7 +4,6 @@
4#define APIC_DEFINITION 1 4#define APIC_DEFINITION 1
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7#include <linux/smp.h>
8#include <asm/mpspec.h> 7#include <asm/mpspec.h>
9#include <asm/genapic.h> 8#include <asm/genapic.h>
10#include <asm/fixmap.h> 9#include <asm/fixmap.h>
@@ -12,11 +11,12 @@
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/init.h> 13#include <linux/init.h>
15#include <asm/mach-numaq/mach_apic.h> 14#include <asm/numaq/apicdef.h>
16#include <asm/mach-numaq/mach_apicdef.h> 15#include <linux/smp.h>
17#include <asm/mach-numaq/mach_ipi.h> 16#include <asm/numaq/apic.h>
18#include <asm/mach-numaq/mach_mpparse.h> 17#include <asm/numaq/ipi.h>
19#include <asm/mach-numaq/mach_wakecpu.h> 18#include <asm/numaq/mpparse.h>
19#include <asm/numaq/wakecpu.h>
20#include <asm/numaq.h> 20#include <asm/numaq.h>
21 21
22static int mps_oem_check(struct mp_config_table *mpc, char *oem, 22static int mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index a97ea0f35b1e..6ad6b67a723d 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -4,19 +4,18 @@
4#define APIC_DEFINITION 1 4#define APIC_DEFINITION 1
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h> 7#include <asm/mpspec.h>
9#include <asm/genapic.h> 8#include <asm/genapic.h>
10#include <asm/fixmap.h> 9#include <asm/fixmap.h>
11#include <asm/apicdef.h> 10#include <asm/apicdef.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <asm/mach-summit/mach_apic.h> 14#include <asm/summit/apicdef.h>
17#include <asm/mach-summit/mach_apicdef.h> 15#include <linux/smp.h>
18#include <asm/mach-summit/mach_ipi.h> 16#include <asm/summit/apic.h>
19#include <asm/mach-summit/mach_mpparse.h> 17#include <asm/summit/ipi.h>
18#include <asm/summit/mpparse.h>
20 19
21static int probe_summit(void) 20static int probe_summit(void)
22{ 21{
diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c
index a037041817c7..4f4e50c3ad3b 100644
--- a/arch/x86/mach-rdc321x/platform.c
+++ b/arch/x86/mach-rdc321x/platform.c
@@ -25,7 +25,6 @@
25#include <linux/list.h> 25#include <linux/list.h>
26#include <linux/device.h> 26#include <linux/device.h>
27#include <linux/platform_device.h> 27#include <linux/platform_device.h>
28#include <linux/version.h>
29#include <linux/leds.h> 28#include <linux/leds.h>
30 29
31#include <asm/gpio.h> 30#include <asm/gpio.h>
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index ee0fba092157..199a5f4a873c 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -448,6 +448,8 @@ static void __init start_secondary(void *unused)
448 448
449 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); 449 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
450 450
451 notify_cpu_starting(cpuid);
452
451 /* enable interrupts */ 453 /* enable interrupts */
452 local_irq_enable(); 454 local_irq_enable();
453 455
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 1fbb844c3d7a..dfb932dcf136 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o 2 pat.o pgtable.o gup.o
3 3
4obj-$(CONFIG_X86_32) += pgtable_32.o 4obj-$(CONFIG_X86_32) += pgtable_32.o
5 5
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 62fa440678d8..847c164725f4 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -328,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn,
328 328
329 get_memcfg_numa(); 329 get_memcfg_numa();
330 330
331 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); 331 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
332 332
333 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); 333 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
334 do { 334 do {
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a20d1fa64b4e..e7277cbcfb40 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
148 * we have now. "break" is either changing perms, levels or 148 * we have now. "break" is either changing perms, levels or
149 * address space marker. 149 * address space marker.
150 */ 150 */
151 prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK); 151 prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
152 cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK); 152 cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
153 153
154 if (!st->level) { 154 if (!st->level) {
155 /* First entry */ 155 /* First entry */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 455f3fe67b42..a742d753d5b0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -35,6 +35,7 @@
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36#include <asm/proto.h> 36#include <asm/proto.h>
37#include <asm-generic/sections.h> 37#include <asm-generic/sections.h>
38#include <asm/traps.h>
38 39
39/* 40/*
40 * Page fault error code bits 41 * Page fault error code bits
@@ -357,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
357 return 0; 358 return 0;
358} 359}
359 360
360void do_invalid_op(struct pt_regs *, unsigned long);
361
362static int is_f00f_bug(struct pt_regs *regs, unsigned long address) 361static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
363{ 362{
364#ifdef CONFIG_X86_F00F_BUG 363#ifdef CONFIG_X86_F00F_BUG
@@ -915,15 +914,15 @@ LIST_HEAD(pgd_list);
915 914
916void vmalloc_sync_all(void) 915void vmalloc_sync_all(void)
917{ 916{
918#ifdef CONFIG_X86_32
919 unsigned long start = VMALLOC_START & PGDIR_MASK;
920 unsigned long address; 917 unsigned long address;
921 918
919#ifdef CONFIG_X86_32
922 if (SHARED_KERNEL_PMD) 920 if (SHARED_KERNEL_PMD)
923 return; 921 return;
924 922
925 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 923 for (address = VMALLOC_START & PMD_MASK;
926 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 924 address >= TASK_SIZE && address < FIXADDR_TOP;
925 address += PMD_SIZE) {
927 unsigned long flags; 926 unsigned long flags;
928 struct page *page; 927 struct page *page;
929 928
@@ -936,10 +935,8 @@ void vmalloc_sync_all(void)
936 spin_unlock_irqrestore(&pgd_lock, flags); 935 spin_unlock_irqrestore(&pgd_lock, flags);
937 } 936 }
938#else /* CONFIG_X86_64 */ 937#else /* CONFIG_X86_64 */
939 unsigned long start = VMALLOC_START & PGDIR_MASK; 938 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
940 unsigned long address; 939 address += PGDIR_SIZE) {
941
942 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
943 const pgd_t *pgd_ref = pgd_offset_k(address); 940 const pgd_t *pgd_ref = pgd_offset_k(address);
944 unsigned long flags; 941 unsigned long flags;
945 struct page *page; 942 struct page *page;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..007bb06c7504
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,298 @@
1/*
2 * Lockless get_user_pages_fast for x86
3 *
4 * Copyright (C) 2008 Nick Piggin
5 * Copyright (C) 2008 Novell Inc.
6 */
7#include <linux/sched.h>
8#include <linux/mm.h>
9#include <linux/vmstat.h>
10#include <linux/highmem.h>
11
12#include <asm/pgtable.h>
13
14static inline pte_t gup_get_pte(pte_t *ptep)
15{
16#ifndef CONFIG_X86_PAE
17 return *ptep;
18#else
19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking
21 * any locks. For this we would like to load the pointers atomically,
22 * but that is not possible (without expensive cmpxchg8b) on PAE. What
23 * we do have is the guarantee that a pte will only either go from not
24 * present to present, or present to not present or both -- it will not
25 * switch to a completely different present page without a TLB flush in
26 * between; something that we are blocking by holding interrupts off.
27 *
28 * Setting ptes from not present to present goes:
29 * ptep->pte_high = h;
30 * smp_wmb();
31 * ptep->pte_low = l;
32 *
33 * And present to not present goes:
34 * ptep->pte_low = 0;
35 * smp_wmb();
36 * ptep->pte_high = 0;
37 *
38 * We must ensure here that the load of pte_low sees l iff pte_high
39 * sees h. We load pte_high *after* loading pte_low, which ensures we
40 * don't see an older value of pte_high. *Then* we recheck pte_low,
41 * which ensures that we haven't picked up a changed pte high. We might
42 * have got rubbish values from pte_low and pte_high, but we are
43 * guaranteed that pte_low will not have the present bit set *unless*
44 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
45 * we're safe.
46 *
47 * gup_get_pte should not be used or copied outside gup.c without being
48 * very careful -- it does not atomically load the pte or anything that
49 * is likely to be useful for you.
50 */
51 pte_t pte;
52
53retry:
54 pte.pte_low = ptep->pte_low;
55 smp_rmb();
56 pte.pte_high = ptep->pte_high;
57 smp_rmb();
58 if (unlikely(pte.pte_low != ptep->pte_low))
59 goto retry;
60
61 return pte;
62#endif
63}
64
65/*
66 * The performance critical leaf functions are made noinline otherwise gcc
67 * inlines everything into a single function which results in too much
68 * register pressure.
69 */
70static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
71 unsigned long end, int write, struct page **pages, int *nr)
72{
73 unsigned long mask;
74 pte_t *ptep;
75
76 mask = _PAGE_PRESENT|_PAGE_USER;
77 if (write)
78 mask |= _PAGE_RW;
79
80 ptep = pte_offset_map(&pmd, addr);
81 do {
82 pte_t pte = gup_get_pte(ptep);
83 struct page *page;
84
85 if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
86 pte_unmap(ptep);
87 return 0;
88 }
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte);
91 get_page(page);
92 pages[*nr] = page;
93 (*nr)++;
94
95 } while (ptep++, addr += PAGE_SIZE, addr != end);
96 pte_unmap(ptep - 1);
97
98 return 1;
99}
100
101static inline void get_head_page_multiple(struct page *page, int nr)
102{
103 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count);
106}
107
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
109 unsigned long end, int write, struct page **pages, int *nr)
110{
111 unsigned long mask;
112 pte_t pte = *(pte_t *)&pmd;
113 struct page *head, *page;
114 int refs;
115
116 mask = _PAGE_PRESENT|_PAGE_USER;
117 if (write)
118 mask |= _PAGE_RW;
119 if ((pte_val(pte) & mask) != mask)
120 return 0;
121 /* hugepages are never "special" */
122 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
123 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
124
125 refs = 0;
126 head = pte_page(pte);
127 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
128 do {
129 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page;
131 (*nr)++;
132 page++;
133 refs++;
134 } while (addr += PAGE_SIZE, addr != end);
135 get_head_page_multiple(head, refs);
136
137 return 1;
138}
139
140static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
141 int write, struct page **pages, int *nr)
142{
143 unsigned long next;
144 pmd_t *pmdp;
145
146 pmdp = pmd_offset(&pud, addr);
147 do {
148 pmd_t pmd = *pmdp;
149
150 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd))
152 return 0;
153 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
155 return 0;
156 } else {
157 if (!gup_pte_range(pmd, addr, next, write, pages, nr))
158 return 0;
159 }
160 } while (pmdp++, addr = next, addr != end);
161
162 return 1;
163}
164
165static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
166 unsigned long end, int write, struct page **pages, int *nr)
167{
168 unsigned long mask;
169 pte_t pte = *(pte_t *)&pud;
170 struct page *head, *page;
171 int refs;
172
173 mask = _PAGE_PRESENT|_PAGE_USER;
174 if (write)
175 mask |= _PAGE_RW;
176 if ((pte_val(pte) & mask) != mask)
177 return 0;
178 /* hugepages are never "special" */
179 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
180 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
181
182 refs = 0;
183 head = pte_page(pte);
184 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
185 do {
186 VM_BUG_ON(compound_head(page) != head);
187 pages[*nr] = page;
188 (*nr)++;
189 page++;
190 refs++;
191 } while (addr += PAGE_SIZE, addr != end);
192 get_head_page_multiple(head, refs);
193
194 return 1;
195}
196
197static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
198 int write, struct page **pages, int *nr)
199{
200 unsigned long next;
201 pud_t *pudp;
202
203 pudp = pud_offset(&pgd, addr);
204 do {
205 pud_t pud = *pudp;
206
207 next = pud_addr_end(addr, end);
208 if (pud_none(pud))
209 return 0;
210 if (unlikely(pud_large(pud))) {
211 if (!gup_huge_pud(pud, addr, next, write, pages, nr))
212 return 0;
213 } else {
214 if (!gup_pmd_range(pud, addr, next, write, pages, nr))
215 return 0;
216 }
217 } while (pudp++, addr = next, addr != end);
218
219 return 1;
220}
221
222int get_user_pages_fast(unsigned long start, int nr_pages, int write,
223 struct page **pages)
224{
225 struct mm_struct *mm = current->mm;
226 unsigned long addr, len, end;
227 unsigned long next;
228 pgd_t *pgdp;
229 int nr = 0;
230
231 start &= PAGE_MASK;
232 addr = start;
233 len = (unsigned long) nr_pages << PAGE_SHIFT;
234 end = start + len;
235 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
236 start, len)))
237 goto slow_irqon;
238
239 /*
240 * XXX: batch / limit 'nr' to avoid large irq-off latency. This
241 * needs some instrumenting to determine the common sizes used by
242 * important workloads (e.g. DB2), and whether limiting the batch size
243 * will decrease performance.
244 *
245 * It seems like we're in the clear for the moment. Direct-IO is
246 * the main guy that batches up lots of get_user_pages, and even
247 * they are limited to 64-at-a-time which is not so many.
248 */
249 /*
250 * This doesn't prevent pagetable teardown, but does prevent
251 * the pagetables and pages from being freed on x86.
252 *
253 * So long as we atomically load page table pointers versus teardown
254 * (which we do on x86, with the above PAE exception), we can follow the
255 * address down to the page and take a ref on it.
256 */
257 local_irq_disable();
258 pgdp = pgd_offset(mm, addr);
259 do {
260 pgd_t pgd = *pgdp;
261
262 next = pgd_addr_end(addr, end);
263 if (pgd_none(pgd))
264 goto slow;
265 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
266 goto slow;
267 } while (pgdp++, addr = next, addr != end);
268 local_irq_enable();
269
270 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
271 return nr;
272
273 {
274 int ret;
275
276slow:
277 local_irq_enable();
278slow_irqon:
279 /* Try to get the remaining pages with get_user_pages */
280 start += nr << PAGE_SHIFT;
281 pages += nr;
282
283 down_read(&mm->mmap_sem);
284 ret = get_user_pages(current, mm, start,
285 (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
286 up_read(&mm->mmap_sem);
287
288 /* Have to be a bit careful with return values */
289 if (nr > 0) {
290 if (ret < 0)
291 ret = nr;
292 else
293 ret += nr;
294 }
295
296 return ret;
297 }
298}
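
For context, a minimal, illustrative sketch of how a caller might drive get_user_pages_fast() as implemented above; pin_user_buffer() is a hypothetical helper, not part of this merge, and error handling is simplified.

static int pin_user_buffer(unsigned long uaddr, size_t len, int write,
			   struct page **pages)
{
	int nr_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	int pinned;

	/* Returns the number of pages pinned; may be fewer than requested. */
	pinned = get_user_pages_fast(uaddr, nr_pages, write, pages);
	if (pinned < nr_pages) {
		/* Drop whatever was pinned and let the caller fall back. */
		while (pinned > 0)
			put_page(pages[--pinned]);
		return -EFAULT;
	}
	return nr_pages;
}
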
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index d37f29376b0c..bbe044dbe014 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
31#include <linux/cpumask.h> 31#include <linux/cpumask.h>
32 32
33#include <asm/asm.h> 33#include <asm/asm.h>
34#include <asm/bios_ebda.h>
34#include <asm/processor.h> 35#include <asm/processor.h>
35#include <asm/system.h> 36#include <asm/system.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
@@ -47,6 +48,7 @@
47#include <asm/paravirt.h> 48#include <asm/paravirt.h>
48#include <asm/setup.h> 49#include <asm/setup.h>
49#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/smp.h>
50 52
51unsigned int __VMALLOC_RESERVE = 128 << 20; 53unsigned int __VMALLOC_RESERVE = 128 << 20;
52 54
@@ -194,11 +196,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
194 pgd_t *pgd; 196 pgd_t *pgd;
195 pmd_t *pmd; 197 pmd_t *pmd;
196 pte_t *pte; 198 pte_t *pte;
197 unsigned pages_2m = 0, pages_4k = 0; 199 unsigned pages_2m, pages_4k;
200 int mapping_iter;
201
202 /*
203 * First iteration will set up the identity mapping using large/small pages
204 * based on use_pse, with the other attributes the same as set by
205 * the early code in head_32.S.
206 *
207 * Second iteration will set up the appropriate attributes (NX, GLOBAL, ...)
208 * as desired for the kernel identity mapping.
209 *
210 * This two-pass mechanism conforms to the TLB app note, which says:
211 *
212 * "Software should not write to a paging-structure entry in a way
213 * that would change, for any linear address, both the page size
214 * and either the page frame or attributes."
215 */
216 mapping_iter = 1;
198 217
199 if (!cpu_has_pse) 218 if (!cpu_has_pse)
200 use_pse = 0; 219 use_pse = 0;
201 220
221repeat:
222 pages_2m = pages_4k = 0;
202 pfn = start_pfn; 223 pfn = start_pfn;
203 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); 224 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
204 pgd = pgd_base + pgd_idx; 225 pgd = pgd_base + pgd_idx;
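
Condensed, for orientation only: the two-pass shape this hunk introduces can be sketched as below. This is not patch text; map_pfn_range() is a placeholder for the pgd/pmd/pte walk done in kernel_physical_mapping_init().

static void ident_map_two_pass(unsigned long start_pfn, unsigned long end_pfn)
{
	int pass;

	for (pass = 1; pass <= 2; pass++) {
		/*
		 * Pass 1 keeps the boot-time PTE_IDENT_ATTR attributes,
		 * pass 2 rewrites the same entries with the final
		 * protections (NX, GLOBAL, ...), so page size and
		 * attributes never change in the same write.
		 */
		pgprot_t prot = (pass == 1) ? __pgprot(PTE_IDENT_ATTR)
					    : PAGE_KERNEL;

		map_pfn_range(start_pfn, end_pfn, prot);	/* placeholder */

		if (pass == 1)
			__flush_tlb_all();
	}
}
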
@@ -224,6 +245,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
224 if (use_pse) { 245 if (use_pse) {
225 unsigned int addr2; 246 unsigned int addr2;
226 pgprot_t prot = PAGE_KERNEL_LARGE; 247 pgprot_t prot = PAGE_KERNEL_LARGE;
248 /*
249 * first pass will use the same initial
250 * identity mapping attribute + _PAGE_PSE.
251 */
252 pgprot_t init_prot =
253 __pgprot(PTE_IDENT_ATTR |
254 _PAGE_PSE);
227 255
228 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + 256 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
229 PAGE_OFFSET + PAGE_SIZE-1; 257 PAGE_OFFSET + PAGE_SIZE-1;
@@ -233,7 +261,10 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
233 prot = PAGE_KERNEL_LARGE_EXEC; 261 prot = PAGE_KERNEL_LARGE_EXEC;
234 262
235 pages_2m++; 263 pages_2m++;
236 set_pmd(pmd, pfn_pmd(pfn, prot)); 264 if (mapping_iter == 1)
265 set_pmd(pmd, pfn_pmd(pfn, init_prot));
266 else
267 set_pmd(pmd, pfn_pmd(pfn, prot));
237 268
238 pfn += PTRS_PER_PTE; 269 pfn += PTRS_PER_PTE;
239 continue; 270 continue;
@@ -245,17 +276,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
245 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; 276 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
246 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 277 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
247 pgprot_t prot = PAGE_KERNEL; 278 pgprot_t prot = PAGE_KERNEL;
279 /*
280 * first pass will use the same initial
281 * identity mapping attribute.
282 */
283 pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
248 284
249 if (is_kernel_text(addr)) 285 if (is_kernel_text(addr))
250 prot = PAGE_KERNEL_EXEC; 286 prot = PAGE_KERNEL_EXEC;
251 287
252 pages_4k++; 288 pages_4k++;
253 set_pte(pte, pfn_pte(pfn, prot)); 289 if (mapping_iter == 1)
290 set_pte(pte, pfn_pte(pfn, init_prot));
291 else
292 set_pte(pte, pfn_pte(pfn, prot));
254 } 293 }
255 } 294 }
256 } 295 }
257 update_page_count(PG_LEVEL_2M, pages_2m); 296 if (mapping_iter == 1) {
258 update_page_count(PG_LEVEL_4K, pages_4k); 297 /*
298 * update direct mapping page count only in the first
299 * iteration.
300 */
301 update_page_count(PG_LEVEL_2M, pages_2m);
302 update_page_count(PG_LEVEL_4K, pages_4k);
303
304 /*
305 * Do a local global tlb flush, which will flush the previous
306 * mappings present in both the small and large page TLBs.
307 */
308 __flush_tlb_all();
309
310 /*
311 * Second iteration will set the actual desired PTE attributes.
312 */
313 mapping_iter = 2;
314 goto repeat;
315 }
259} 316}
260 317
261/* 318/*
@@ -458,11 +515,7 @@ static void __init pagetable_init(void)
458{ 515{
459 pgd_t *pgd_base = swapper_pg_dir; 516 pgd_t *pgd_base = swapper_pg_dir;
460 517
461 paravirt_pagetable_setup_start(pgd_base);
462
463 permanent_kmaps_init(pgd_base); 518 permanent_kmaps_init(pgd_base);
464
465 paravirt_pagetable_setup_done(pgd_base);
466} 519}
467 520
468#ifdef CONFIG_ACPI_SLEEP 521#ifdef CONFIG_ACPI_SLEEP
@@ -722,7 +775,7 @@ void __init setup_bootmem_allocator(void)
722 after_init_bootmem = 1; 775 after_init_bootmem = 1;
723} 776}
724 777
725static void __init find_early_table_space(unsigned long end) 778static void __init find_early_table_space(unsigned long end, int use_pse)
726{ 779{
727 unsigned long puds, pmds, ptes, tables, start; 780 unsigned long puds, pmds, ptes, tables, start;
728 781
@@ -732,7 +785,7 @@ static void __init find_early_table_space(unsigned long end)
732 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 785 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
733 tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); 786 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
734 787
735 if (cpu_has_pse) { 788 if (use_pse) {
736 unsigned long extra; 789 unsigned long extra;
737 790
738 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 791 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
@@ -772,12 +825,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
772 pgd_t *pgd_base = swapper_pg_dir; 825 pgd_t *pgd_base = swapper_pg_dir;
773 unsigned long start_pfn, end_pfn; 826 unsigned long start_pfn, end_pfn;
774 unsigned long big_page_start; 827 unsigned long big_page_start;
828#ifdef CONFIG_DEBUG_PAGEALLOC
829 /*
830 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
831 * This will simplify cpa(), which otherwise needs to support splitting
832 * large pages into small in interrupt context, etc.
833 */
834 int use_pse = 0;
835#else
836 int use_pse = cpu_has_pse;
837#endif
775 838
776 /* 839 /*
777 * Find space for the kernel direct mapping tables. 840 * Find space for the kernel direct mapping tables.
778 */ 841 */
779 if (!after_init_bootmem) 842 if (!after_init_bootmem)
780 find_early_table_space(end); 843 find_early_table_space(end, use_pse);
781 844
782#ifdef CONFIG_X86_PAE 845#ifdef CONFIG_X86_PAE
783 set_nx(); 846 set_nx();
@@ -823,7 +886,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
823 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); 886 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
824 if (start_pfn < end_pfn) 887 if (start_pfn < end_pfn)
825 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 888 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
826 cpu_has_pse); 889 use_pse);
827 890
828 /* tail is not big page alignment ? */ 891 /* tail is not big page alignment ? */
829 start_pfn = end_pfn; 892 start_pfn = end_pfn;
@@ -907,6 +970,8 @@ void __init mem_init(void)
907 int codesize, reservedpages, datasize, initsize; 970 int codesize, reservedpages, datasize, initsize;
908 int tmp; 971 int tmp;
909 972
973 start_periodic_check_for_corruption();
974
910#ifdef CONFIG_FLATMEM 975#ifdef CONFIG_FLATMEM
911 BUG_ON(!mem_map); 976 BUG_ON(!mem_map);
912#endif 977#endif
@@ -986,7 +1051,6 @@ void __init mem_init(void)
986 if (boot_cpu_data.wp_works_ok < 0) 1051 if (boot_cpu_data.wp_works_ok < 0)
987 test_wp_bit(); 1052 test_wp_bit();
988 1053
989 cpa_init();
990 save_pg_dir(); 1054 save_pg_dir();
991 zap_low_mappings(); 1055 zap_low_mappings();
992} 1056}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec37121f6709..3e10054c5731 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -31,6 +31,7 @@
31#include <linux/nmi.h> 31#include <linux/nmi.h>
32 32
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/bios_ebda.h>
34#include <asm/system.h> 35#include <asm/system.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/pgtable.h> 37#include <asm/pgtable.h>
@@ -60,7 +61,7 @@ static unsigned long dma_reserve __initdata;
60 61
61DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 62DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
62 63
63int direct_gbpages __meminitdata 64int direct_gbpages
64#ifdef CONFIG_DIRECT_GBPAGES 65#ifdef CONFIG_DIRECT_GBPAGES
65 = 1 66 = 1
66#endif 67#endif
@@ -86,46 +87,69 @@ early_param("gbpages", parse_direct_gbpages_on);
86 * around without checking the pgd every time. 87 * around without checking the pgd every time.
87 */ 88 */
88 89
89void show_mem(void) 90int after_bootmem;
90{
91 long i, total = 0, reserved = 0;
92 long shared = 0, cached = 0;
93 struct page *page;
94 pg_data_t *pgdat;
95
96 printk(KERN_INFO "Mem-info:\n");
97 show_free_areas();
98 for_each_online_pgdat(pgdat) {
99 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
100 /*
101 * This loop can take a while with 256 GB and
102 * 4k pages so defer the NMI watchdog:
103 */
104 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
105 touch_nmi_watchdog();
106 91
107 if (!pfn_valid(pgdat->node_start_pfn + i)) 92unsigned long __supported_pte_mask __read_mostly = ~0UL;
108 continue; 93EXPORT_SYMBOL_GPL(__supported_pte_mask);
109 94
110 page = pfn_to_page(pgdat->node_start_pfn + i); 95static int do_not_nx __cpuinitdata;
111 total++; 96
112 if (PageReserved(page)) 97/*
113 reserved++; 98 * noexec=on|off
114 else if (PageSwapCache(page)) 99 * Control non-executable mappings for 64-bit processes.
115 cached++; 100 *
116 else if (page_count(page)) 101 * on Enable (default)
117 shared += page_count(page) - 1; 102 * off Disable
118 } 103 */
104static int __init nonx_setup(char *str)
105{
106 if (!str)
107 return -EINVAL;
108 if (!strncmp(str, "on", 2)) {
109 __supported_pte_mask |= _PAGE_NX;
110 do_not_nx = 0;
111 } else if (!strncmp(str, "off", 3)) {
112 do_not_nx = 1;
113 __supported_pte_mask &= ~_PAGE_NX;
119 } 114 }
120 printk(KERN_INFO "%lu pages of RAM\n", total); 115 return 0;
121 printk(KERN_INFO "%lu reserved pages\n", reserved);
122 printk(KERN_INFO "%lu pages shared\n", shared);
123 printk(KERN_INFO "%lu pages swap cached\n", cached);
124} 116}
117early_param("noexec", nonx_setup);
125 118
126int after_bootmem; 119void __cpuinit check_efer(void)
120{
121 unsigned long efer;
122
123 rdmsrl(MSR_EFER, efer);
124 if (!(efer & EFER_NX) || do_not_nx)
125 __supported_pte_mask &= ~_PAGE_NX;
126}
127 127
128static __init void *spp_getpage(void) 128int force_personality32;
129
130/*
131 * noexec32=on|off
132 * Control non-executable heap for 32-bit processes.
133 * To control the stack too, use noexec=off
134 *
135 * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
136 * off PROT_READ implies PROT_EXEC
137 */
138static int __init nonx32_setup(char *str)
139{
140 if (!strcmp(str, "on"))
141 force_personality32 &= ~READ_IMPLIES_EXEC;
142 else if (!strcmp(str, "off"))
143 force_personality32 |= READ_IMPLIES_EXEC;
144 return 1;
145}
146__setup("noexec32=", nonx32_setup);
147
148/*
149 * NOTE: This function is marked __ref because it calls __init function
150 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
151 */
152static __ref void *spp_getpage(void)
129{ 153{
130 void *ptr; 154 void *ptr;
131 155
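
In practical terms (examples implied by the handlers above, not part of the diff): booting with noexec=off sets do_not_nx and clears _PAGE_NX from __supported_pte_mask, disabling NX for 64-bit mappings, while noexec32=off makes PROT_READ imply PROT_EXEC for 32-bit processes via force_personality32.
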
@@ -258,7 +282,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
258void __init cleanup_highmap(void) 282void __init cleanup_highmap(void)
259{ 283{
260 unsigned long vaddr = __START_KERNEL_map; 284 unsigned long vaddr = __START_KERNEL_map;
261 unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; 285 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
262 pmd_t *pmd = level2_kernel_pgt; 286 pmd_t *pmd = level2_kernel_pgt;
263 pmd_t *last_pmd = pmd + PTRS_PER_PMD; 287 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
264 288
@@ -274,7 +298,7 @@ static unsigned long __initdata table_start;
274static unsigned long __meminitdata table_end; 298static unsigned long __meminitdata table_end;
275static unsigned long __meminitdata table_top; 299static unsigned long __meminitdata table_top;
276 300
277static __meminit void *alloc_low_page(unsigned long *phys) 301static __ref void *alloc_low_page(unsigned long *phys)
278{ 302{
279 unsigned long pfn = table_end++; 303 unsigned long pfn = table_end++;
280 void *adr; 304 void *adr;
@@ -295,7 +319,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
295 return adr; 319 return adr;
296} 320}
297 321
298static __meminit void unmap_low_page(void *adr) 322static __ref void unmap_low_page(void *adr)
299{ 323{
300 if (after_bootmem) 324 if (after_bootmem)
301 return; 325 return;
@@ -304,7 +328,8 @@ static __meminit void unmap_low_page(void *adr)
304} 328}
305 329
306static unsigned long __meminit 330static unsigned long __meminit
307phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) 331phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
332 pgprot_t prot)
308{ 333{
309 unsigned pages = 0; 334 unsigned pages = 0;
310 unsigned long last_map_addr = end; 335 unsigned long last_map_addr = end;
@@ -322,32 +347,40 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
322 break; 347 break;
323 } 348 }
324 349
350 /*
351 * We will re-use the existing mapping.
352 * Xen for example has some special requirements, like mapping
353 * pagetable pages as RO. So assume whoever pre-set up
354 * these mappings knew what they were doing.
355 */
325 if (pte_val(*pte)) 356 if (pte_val(*pte))
326 continue; 357 continue;
327 358
328 if (0) 359 if (0)
329 printk(" pte=%p addr=%lx pte=%016lx\n", 360 printk(" pte=%p addr=%lx pte=%016lx\n",
330 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); 361 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
331 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
332 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
333 pages++; 362 pages++;
363 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
364 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
334 } 365 }
366
335 update_page_count(PG_LEVEL_4K, pages); 367 update_page_count(PG_LEVEL_4K, pages);
336 368
337 return last_map_addr; 369 return last_map_addr;
338} 370}
339 371
340static unsigned long __meminit 372static unsigned long __meminit
341phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) 373phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
374 pgprot_t prot)
342{ 375{
343 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); 376 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
344 377
345 return phys_pte_init(pte, address, end); 378 return phys_pte_init(pte, address, end, prot);
346} 379}
347 380
348static unsigned long __meminit 381static unsigned long __meminit
349phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 382phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
350 unsigned long page_size_mask) 383 unsigned long page_size_mask, pgprot_t prot)
351{ 384{
352 unsigned long pages = 0; 385 unsigned long pages = 0;
353 unsigned long last_map_addr = end; 386 unsigned long last_map_addr = end;
@@ -358,6 +391,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
358 unsigned long pte_phys; 391 unsigned long pte_phys;
359 pmd_t *pmd = pmd_page + pmd_index(address); 392 pmd_t *pmd = pmd_page + pmd_index(address);
360 pte_t *pte; 393 pte_t *pte;
394 pgprot_t new_prot = prot;
361 395
362 if (address >= end) { 396 if (address >= end) {
363 if (!after_bootmem) { 397 if (!after_bootmem) {
@@ -368,25 +402,48 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
368 } 402 }
369 403
370 if (pmd_val(*pmd)) { 404 if (pmd_val(*pmd)) {
371 if (!pmd_large(*pmd)) 405 if (!pmd_large(*pmd)) {
406 spin_lock(&init_mm.page_table_lock);
372 last_map_addr = phys_pte_update(pmd, address, 407 last_map_addr = phys_pte_update(pmd, address,
373 end); 408 end, prot);
374 continue; 409 spin_unlock(&init_mm.page_table_lock);
410 continue;
411 }
412 /*
413 * If we are ok with PG_LEVEL_2M mapping, then we will
414 * use the existing mapping,
415 *
416 * Otherwise, we will split the large page mapping but
417 * use the same existing protection bits except for
418 * large page, so that we don't violate Intel's TLB
419 * Application note (317080) which says, while changing
420 * the page sizes, new and old translations should
421 * not differ with respect to page frame and
422 * attributes.
423 */
424 if (page_size_mask & (1 << PG_LEVEL_2M))
425 continue;
426 new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
375 } 427 }
376 428
377 if (page_size_mask & (1<<PG_LEVEL_2M)) { 429 if (page_size_mask & (1<<PG_LEVEL_2M)) {
378 pages++; 430 pages++;
431 spin_lock(&init_mm.page_table_lock);
379 set_pte((pte_t *)pmd, 432 set_pte((pte_t *)pmd,
380 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 433 pfn_pte(address >> PAGE_SHIFT,
434 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
435 spin_unlock(&init_mm.page_table_lock);
381 last_map_addr = (address & PMD_MASK) + PMD_SIZE; 436 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
382 continue; 437 continue;
383 } 438 }
384 439
385 pte = alloc_low_page(&pte_phys); 440 pte = alloc_low_page(&pte_phys);
386 last_map_addr = phys_pte_init(pte, address, end); 441 last_map_addr = phys_pte_init(pte, address, end, new_prot);
387 unmap_low_page(pte); 442 unmap_low_page(pte);
388 443
444 spin_lock(&init_mm.page_table_lock);
389 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); 445 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
446 spin_unlock(&init_mm.page_table_lock);
390 } 447 }
391 update_page_count(PG_LEVEL_2M, pages); 448 update_page_count(PG_LEVEL_2M, pages);
392 return last_map_addr; 449 return last_map_addr;
@@ -394,14 +451,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
394 451
395static unsigned long __meminit 452static unsigned long __meminit
396phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, 453phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
397 unsigned long page_size_mask) 454 unsigned long page_size_mask, pgprot_t prot)
398{ 455{
399 pmd_t *pmd = pmd_offset(pud, 0); 456 pmd_t *pmd = pmd_offset(pud, 0);
400 unsigned long last_map_addr; 457 unsigned long last_map_addr;
401 458
402 spin_lock(&init_mm.page_table_lock); 459 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
403 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
404 spin_unlock(&init_mm.page_table_lock);
405 __flush_tlb_all(); 460 __flush_tlb_all();
406 return last_map_addr; 461 return last_map_addr;
407} 462}
@@ -418,6 +473,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
418 unsigned long pmd_phys; 473 unsigned long pmd_phys;
419 pud_t *pud = pud_page + pud_index(addr); 474 pud_t *pud = pud_page + pud_index(addr);
420 pmd_t *pmd; 475 pmd_t *pmd;
476 pgprot_t prot = PAGE_KERNEL;
421 477
422 if (addr >= end) 478 if (addr >= end)
423 break; 479 break;
@@ -429,30 +485,49 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
429 } 485 }
430 486
431 if (pud_val(*pud)) { 487 if (pud_val(*pud)) {
432 if (!pud_large(*pud)) 488 if (!pud_large(*pud)) {
433 last_map_addr = phys_pmd_update(pud, addr, end, 489 last_map_addr = phys_pmd_update(pud, addr, end,
434 page_size_mask); 490 page_size_mask, prot);
435 continue; 491 continue;
492 }
493 /*
494 * If we are ok with PG_LEVEL_1G mapping, then we will
495 * use the existing mapping.
496 *
497 * Otherwise, we will split the gbpage mapping but use
498 * the same existing protection bits except for large
499 * page, so that we don't violate Intel's TLB
500 * Application note (317080) which says, while changing
501 * the page sizes, new and old translations should
502 * not differ with respect to page frame and
503 * attributes.
504 */
505 if (page_size_mask & (1 << PG_LEVEL_1G))
506 continue;
507 prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
436 } 508 }
437 509
438 if (page_size_mask & (1<<PG_LEVEL_1G)) { 510 if (page_size_mask & (1<<PG_LEVEL_1G)) {
439 pages++; 511 pages++;
512 spin_lock(&init_mm.page_table_lock);
440 set_pte((pte_t *)pud, 513 set_pte((pte_t *)pud,
441 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 514 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
515 spin_unlock(&init_mm.page_table_lock);
442 last_map_addr = (addr & PUD_MASK) + PUD_SIZE; 516 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
443 continue; 517 continue;
444 } 518 }
445 519
446 pmd = alloc_low_page(&pmd_phys); 520 pmd = alloc_low_page(&pmd_phys);
521 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
522 prot);
523 unmap_low_page(pmd);
447 524
448 spin_lock(&init_mm.page_table_lock); 525 spin_lock(&init_mm.page_table_lock);
449 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
450 unmap_low_page(pmd);
451 pud_populate(&init_mm, pud, __va(pmd_phys)); 526 pud_populate(&init_mm, pud, __va(pmd_phys));
452 spin_unlock(&init_mm.page_table_lock); 527 spin_unlock(&init_mm.page_table_lock);
453
454 } 528 }
455 __flush_tlb_all(); 529 __flush_tlb_all();
530
456 update_page_count(PG_LEVEL_1G, pages); 531 update_page_count(PG_LEVEL_1G, pages);
457 532
458 return last_map_addr; 533 return last_map_addr;
@@ -469,27 +544,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
469 return phys_pud_init(pud, addr, end, page_size_mask); 544 return phys_pud_init(pud, addr, end, page_size_mask);
470} 545}
471 546
472static void __init find_early_table_space(unsigned long end) 547static void __init find_early_table_space(unsigned long end, int use_pse,
548 int use_gbpages)
473{ 549{
474 unsigned long puds, pmds, ptes, tables, start; 550 unsigned long puds, pmds, ptes, tables, start;
475 551
476 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 552 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
477 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 553 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
478 if (direct_gbpages) { 554 if (use_gbpages) {
479 unsigned long extra; 555 unsigned long extra;
480 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); 556 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
481 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; 557 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
482 } else 558 } else
483 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 559 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
484 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 560 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
485 561
486 if (cpu_has_pse) { 562 if (use_pse) {
487 unsigned long extra; 563 unsigned long extra;
488 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 564 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
489 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; 565 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
490 } else 566 } else
491 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; 567 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
492 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); 568 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
493 569
494 /* 570 /*
495 * RED-PEN putting page tables only on node 0 could 571 * RED-PEN putting page tables only on node 0 could
@@ -542,17 +618,16 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
542 continue; 618 continue;
543 } 619 }
544 620
545 if (after_bootmem) 621 pud = alloc_low_page(&pud_phys);
546 pud = pud_offset(pgd, start & PGDIR_MASK);
547 else
548 pud = alloc_low_page(&pud_phys);
549
550 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), 622 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
551 page_size_mask); 623 page_size_mask);
552 unmap_low_page(pud); 624 unmap_low_page(pud);
553 pgd_populate(&init_mm, pgd_offset_k(start), 625
554 __va(pud_phys)); 626 spin_lock(&init_mm.page_table_lock);
627 pgd_populate(&init_mm, pgd, __va(pud_phys));
628 spin_unlock(&init_mm.page_table_lock);
555 } 629 }
630 __flush_tlb_all();
556 631
557 return last_map_addr; 632 return last_map_addr;
558} 633}
@@ -596,6 +671,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
596 671
597 struct map_range mr[NR_RANGE_MR]; 672 struct map_range mr[NR_RANGE_MR];
598 int nr_range, i; 673 int nr_range, i;
674 int use_pse, use_gbpages;
599 675
600 printk(KERN_INFO "init_memory_mapping\n"); 676 printk(KERN_INFO "init_memory_mapping\n");
601 677
@@ -609,9 +685,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
609 if (!after_bootmem) 685 if (!after_bootmem)
610 init_gbpages(); 686 init_gbpages();
611 687
612 if (direct_gbpages) 688#ifdef CONFIG_DEBUG_PAGEALLOC
689 /*
690 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
691 * This will simplify cpa(), which otherwise needs to support splitting
692 * large pages into small in interrupt context, etc.
693 */
694 use_pse = use_gbpages = 0;
695#else
696 use_pse = cpu_has_pse;
697 use_gbpages = direct_gbpages;
698#endif
699
700 if (use_gbpages)
613 page_size_mask |= 1 << PG_LEVEL_1G; 701 page_size_mask |= 1 << PG_LEVEL_1G;
614 if (cpu_has_pse) 702 if (use_pse)
615 page_size_mask |= 1 << PG_LEVEL_2M; 703 page_size_mask |= 1 << PG_LEVEL_2M;
616 704
617 memset(mr, 0, sizeof(mr)); 705 memset(mr, 0, sizeof(mr));
@@ -672,7 +760,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
672 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); 760 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
673 761
674 if (!after_bootmem) 762 if (!after_bootmem)
675 find_early_table_space(end); 763 find_early_table_space(end, use_pse, use_gbpages);
676 764
677 for (i = 0; i < nr_range; i++) 765 for (i = 0; i < nr_range; i++)
678 last_map_addr = kernel_physical_mapping_init( 766 last_map_addr = kernel_physical_mapping_init(
@@ -794,6 +882,8 @@ void __init mem_init(void)
794{ 882{
795 long codesize, reservedpages, datasize, initsize; 883 long codesize, reservedpages, datasize, initsize;
796 884
885 start_periodic_check_for_corruption();
886
797 pci_iommu_alloc(); 887 pci_iommu_alloc();
798 888
799 /* clear_bss() already clear the empty_zero_page */ 889 /* clear_bss() already clear the empty_zero_page */
@@ -831,8 +921,6 @@ void __init mem_init(void)
831 reservedpages << (PAGE_SHIFT-10), 921 reservedpages << (PAGE_SHIFT-10),
832 datasize >> 10, 922 datasize >> 10,
833 initsize >> 10); 923 initsize >> 10);
834
835 cpa_init();
836} 924}
837 925
838void free_init_pages(char *what, unsigned long begin, unsigned long end) 926void free_init_pages(char *what, unsigned long begin, unsigned long end)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 016f335bbeea..8cbeda15cd29 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,18 +24,26 @@
24 24
25#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
26 26
27unsigned long __phys_addr(unsigned long x) 27static inline int phys_addr_valid(unsigned long addr)
28{ 28{
29 if (x >= __START_KERNEL_map) 29 return addr < (1UL << boot_cpu_data.x86_phys_bits);
30 return x - __START_KERNEL_map + phys_base;
31 return x - PAGE_OFFSET;
32} 30}
33EXPORT_SYMBOL(__phys_addr);
34 31
35static inline int phys_addr_valid(unsigned long addr) 32unsigned long __phys_addr(unsigned long x)
36{ 33{
37 return addr < (1UL << boot_cpu_data.x86_phys_bits); 34 if (x >= __START_KERNEL_map) {
35 x -= __START_KERNEL_map;
36 VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
37 x += phys_base;
38 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
42 !phys_addr_valid(x));
43 }
44 return x;
38} 45}
46EXPORT_SYMBOL(__phys_addr);
39 47
40#else 48#else
41 49
@@ -44,6 +52,17 @@ static inline int phys_addr_valid(unsigned long addr)
44 return 1; 52 return 1;
45} 53}
46 54
55#ifdef CONFIG_DEBUG_VIRTUAL
56unsigned long __phys_addr(unsigned long x)
57{
58 /* VMALLOC_* aren't constants; not available at boot time */
59 VIRTUAL_BUG_ON(x < PAGE_OFFSET || (system_state != SYSTEM_BOOTING &&
60 is_vmalloc_addr((void *)x)));
61 return x - PAGE_OFFSET;
62}
63EXPORT_SYMBOL(__phys_addr);
64#endif
65
47#endif 66#endif
48 67
49int page_is_ram(unsigned long pagenr) 68int page_is_ram(unsigned long pagenr)
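
A hedged illustration of what the new VIRTUAL_BUG_ON checks are meant to catch (example code, not from this patch; phys_addr_demo() is hypothetical):

static void phys_addr_demo(void)
{
	void *lin = kmalloc(64, GFP_KERNEL);	/* lowmem, direct-mapped */
	void *vm  = vmalloc(PAGE_SIZE);		/* vmalloc area          */

	if (!lin || !vm)
		goto out;

	pr_info("lin phys: %lx\n", __pa(lin));	/* valid translation */
	/*
	 * __pa(vm) has no meaningful result; with CONFIG_DEBUG_VIRTUAL the
	 * VIRTUAL_BUG_ON in __phys_addr() now catches it instead of
	 * silently returning garbage:
	 *
	 *	pr_info("vm phys: %lx\n", __pa(vm));
	 */
out:
	vfree(vm);
	kfree(lin);
}
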
@@ -83,6 +102,25 @@ int page_is_ram(unsigned long pagenr)
83 return 0; 102 return 0;
84} 103}
85 104
105int pagerange_is_ram(unsigned long start, unsigned long end)
106{
107 int ram_page = 0, not_rampage = 0;
108 unsigned long page_nr;
109
110 for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
111 ++page_nr) {
112 if (page_is_ram(page_nr))
113 ram_page = 1;
114 else
115 not_rampage = 1;
116
117 if (ram_page == not_rampage)
118 return -1;
119 }
120
121 return ram_page;
122}
123
86/* 124/*
87 * Fix up the linear direct mapping of the kernel to avoid cache attribute 125 * Fix up the linear direct mapping of the kernel to avoid cache attribute
88 * conflicts. 126 * conflicts.
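
A hedged usage sketch of the new pagerange_is_ram() helper; the caller is hypothetical, and the return convention (1: all RAM, 0: no RAM, -1: mixed) is taken from the code above.

static int check_phys_range(unsigned long start, unsigned long end)
{
	int is_ram = pagerange_is_ram(start, end);

	/* A range that mixes RAM and non-RAM pages has to be rejected. */
	if (is_ram < 0)
		return -EINVAL;

	return is_ram;	/* 1: every page is RAM, 0: none is */
}
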
@@ -170,7 +208,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
170 phys_addr &= PAGE_MASK; 208 phys_addr &= PAGE_MASK;
171 size = PAGE_ALIGN(last_addr+1) - phys_addr; 209 size = PAGE_ALIGN(last_addr+1) - phys_addr;
172 210
173 retval = reserve_memtype(phys_addr, phys_addr + size, 211 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
174 prot_val, &new_prot_val); 212 prot_val, &new_prot_val);
175 if (retval) { 213 if (retval) {
176 pr_debug("Warning: reserve_memtype returned %d\n", retval); 214 pr_debug("Warning: reserve_memtype returned %d\n", retval);
@@ -421,7 +459,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
421 return; 459 return;
422} 460}
423 461
424int __initdata early_ioremap_debug; 462static int __initdata early_ioremap_debug;
425 463
426static int __init early_ioremap_debug_setup(char *str) 464static int __init early_ioremap_debug_setup(char *str)
427{ 465{
@@ -547,19 +585,17 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
547} 585}
548 586
549 587
550int __initdata early_ioremap_nested; 588static int __initdata early_ioremap_nested;
551 589
552static int __init check_early_ioremap_leak(void) 590static int __init check_early_ioremap_leak(void)
553{ 591{
554 if (!early_ioremap_nested) 592 if (!early_ioremap_nested)
555 return 0; 593 return 0;
556 594 WARN(1, KERN_WARNING
557 printk(KERN_WARNING
558 "Debug warning: early ioremap leak of %d areas detected.\n", 595 "Debug warning: early ioremap leak of %d areas detected.\n",
559 early_ioremap_nested); 596 early_ioremap_nested);
560 printk(KERN_WARNING 597 printk(KERN_WARNING
561 "please boot with early_ioremap_debug and report the dmesg.\n"); 598 "please boot with early_ioremap_debug and report the dmesg.\n");
562 WARN_ON(1);
563 599
564 return 1; 600 return 1;
565} 601}
@@ -597,7 +633,7 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
597 */ 633 */
598 offset = phys_addr & ~PAGE_MASK; 634 offset = phys_addr & ~PAGE_MASK;
599 phys_addr &= PAGE_MASK; 635 phys_addr &= PAGE_MASK;
600 size = PAGE_ALIGN(last_addr) - phys_addr; 636 size = PAGE_ALIGN(last_addr + 1) - phys_addr;
601 637
602 /* 638 /*
603 * Mappings have to fit in the FIX_BTMAP area. 639 * Mappings have to fit in the FIX_BTMAP area.
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index e7397e108beb..635b50e85581 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -430,7 +430,9 @@ static void enter_uniprocessor(void)
430 "may miss events.\n"); 430 "may miss events.\n");
431} 431}
432 432
433static void leave_uniprocessor(void) 433/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
434 but this whole function is ifdefed CONFIG_HOTPLUG_CPU */
435static void __ref leave_uniprocessor(void)
434{ 436{
435 int cpu; 437 int cpu;
436 int err; 438 int err;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a4dd793d6003..cebcbf152d46 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -79,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
79 return 0; 79 return 0;
80 80
81 addr = 0x8000; 81 addr = 0x8000;
82 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 82 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, 83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
84 nodemap_size, L1_CACHE_BYTES); 84 nodemap_size, L1_CACHE_BYTES);
85 if (nodemap_addr == -1UL) { 85 if (nodemap_addr == -1UL) {
@@ -176,10 +176,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
177 unsigned long bootmap_start, nodedata_phys; 177 unsigned long bootmap_start, nodedata_phys;
178 void *bootmap; 178 void *bootmap;
179 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 179 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
180 int nid; 180 int nid;
181 181
182 start = round_up(start, ZONE_ALIGN); 182 start = roundup(start, ZONE_ALIGN);
183 183
184 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 184 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
185 start, end); 185 start, end);
@@ -210,9 +210,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); 210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
211 nid = phys_to_nid(nodedata_phys); 211 nid = phys_to_nid(nodedata_phys);
212 if (nid == nodeid) 212 if (nid == nodeid)
213 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 213 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
214 else 214 else
215 bootmap_start = round_up(start, PAGE_SIZE); 215 bootmap_start = roundup(start, PAGE_SIZE);
216 /* 216 /*
217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like 217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
218 * to use that to align to PAGE_SIZE 218 * to use that to align to PAGE_SIZE
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 0dcd42eb94e6..e1d106909218 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -32,7 +32,7 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) 35#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
36 36
37static int pte_testbit(pte_t pte) 37static int pte_testbit(pte_t pte)
38{ 38{
@@ -118,6 +118,7 @@ static int pageattr_test(void)
118 unsigned int level; 118 unsigned int level;
119 int i, k; 119 int i, k;
120 int err; 120 int err;
121 unsigned long test_addr;
121 122
122 if (print) 123 if (print)
123 printk(KERN_INFO "CPA self-test:\n"); 124 printk(KERN_INFO "CPA self-test:\n");
@@ -172,7 +173,8 @@ static int pageattr_test(void)
172 continue; 173 continue;
173 } 174 }
174 175
175 err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); 176 test_addr = addr[i];
177 err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
176 if (err < 0) { 178 if (err < 0) {
177 printk(KERN_ERR "CPA %d failed %d\n", i, err); 179 printk(KERN_ERR "CPA %d failed %d\n", i, err);
178 failed++; 180 failed++;
@@ -204,7 +206,8 @@ static int pageattr_test(void)
204 failed++; 206 failed++;
205 continue; 207 continue;
206 } 208 }
207 err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); 209 test_addr = addr[i];
210 err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
208 if (err < 0) { 211 if (err < 0) {
209 printk(KERN_ERR "CPA reverting failed: %d\n", err); 212 printk(KERN_ERR "CPA reverting failed: %d\n", err);
210 failed++; 213 failed++;
@@ -221,8 +224,7 @@ static int pageattr_test(void)
221 failed += print_split(&sc); 224 failed += print_split(&sc);
222 225
223 if (failed) { 226 if (failed) {
224 printk(KERN_ERR "NOT PASSED. Please report.\n"); 227 WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
225 WARN_ON(1);
226 return -EINVAL; 228 return -EINVAL;
227 } else { 229 } else {
228 if (print) 230 if (print)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 65c6e46bf059..a9ec89c3fbca 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,27 @@
25 * The current flushing context - we pass it instead of 5 arguments: 25 * The current flushing context - we pass it instead of 5 arguments:
26 */ 26 */
27struct cpa_data { 27struct cpa_data {
28 unsigned long vaddr; 28 unsigned long *vaddr;
29 pgprot_t mask_set; 29 pgprot_t mask_set;
30 pgprot_t mask_clr; 30 pgprot_t mask_clr;
31 int numpages; 31 int numpages;
32 int flushtlb; 32 int flags;
33 unsigned long pfn; 33 unsigned long pfn;
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35 int curpage;
35}; 36};
36 37
38/*
39 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
40 * using cpa_lock, so that no other cpu with stale large tlb
41 * entries can change a page attribute in parallel while some other cpu
42 * is splitting a large page entry and changing the attribute.
43 */
44static DEFINE_SPINLOCK(cpa_lock);
45
46#define CPA_FLUSHTLB 1
47#define CPA_ARRAY 2
48
37#ifdef CONFIG_PROC_FS 49#ifdef CONFIG_PROC_FS
38static unsigned long direct_pages_count[PG_LEVEL_NUM]; 50static unsigned long direct_pages_count[PG_LEVEL_NUM];
39 51
@@ -55,13 +67,19 @@ static void split_page_count(int level)
55 67
56int arch_report_meminfo(char *page) 68int arch_report_meminfo(char *page)
57{ 69{
58 int n = sprintf(page, "DirectMap4k: %8lu\n" 70 int n = sprintf(page, "DirectMap4k: %8lu kB\n",
59 "DirectMap2M: %8lu\n", 71 direct_pages_count[PG_LEVEL_4K] << 2);
60 direct_pages_count[PG_LEVEL_4K], 72#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
61 direct_pages_count[PG_LEVEL_2M]); 73 n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
74 direct_pages_count[PG_LEVEL_2M] << 11);
75#else
76 n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
77 direct_pages_count[PG_LEVEL_2M] << 12);
78#endif
62#ifdef CONFIG_X86_64 79#ifdef CONFIG_X86_64
63 n += sprintf(page + n, "DirectMap1G: %8lu\n", 80 if (direct_gbpages)
64 direct_pages_count[PG_LEVEL_1G]); 81 n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
82 direct_pages_count[PG_LEVEL_1G] << 20);
65#endif 83#endif
66 return n; 84 return n;
67} 85}
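
As a worked example of the new kB accounting above: 300 4k pages are reported as 300 << 2 = 1200 kB, 10 2M pages (64-bit/PAE) as 10 << 11 = 20480 kB, 10 4M pages (non-PAE 32-bit) as 10 << 12 = 40960 kB, and one 1G page as 1 << 20 = 1048576 kB.
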
@@ -78,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void)
78 96
79static inline unsigned long highmap_end_pfn(void) 97static inline unsigned long highmap_end_pfn(void)
80{ 98{
81 return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 99 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
82} 100}
83 101
84#endif 102#endif
@@ -184,6 +202,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
184 } 202 }
185} 203}
186 204
205static void cpa_flush_array(unsigned long *start, int numpages, int cache)
206{
207 unsigned int i, level;
208 unsigned long *addr;
209
210 BUG_ON(irqs_disabled());
211
212 on_each_cpu(__cpa_flush_range, NULL, 1);
213
214 if (!cache)
215 return;
216
217 /* 4M threshold */
218 if (numpages >= 1024) {
219 if (boot_cpu_data.x86_model >= 4)
220 wbinvd();
221 return;
222 }
223 /*
224 * We only need to flush on one CPU:
225 * clflush is a MESI-coherent instruction that
226 * will cause all other CPUs to flush the same
227 * cachelines:
228 */
229 for (i = 0, addr = start; i < numpages; i++, addr++) {
230 pte_t *pte = lookup_address(*addr, &level);
231
232 /*
233 * Only flush present addresses:
234 */
235 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
236 clflush_cache_range((void *) *addr, PAGE_SIZE);
237 }
238}
239
187/* 240/*
188 * Certain areas of memory on x86 require very specific protection flags, 241 * Certain areas of memory on x86 require very specific protection flags,
189 * for example the BIOS area or kernel text. Callers don't always get this 242 * for example the BIOS area or kernel text. Callers don't always get this
@@ -392,7 +445,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
392 */ 445 */
393 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); 446 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
394 __set_pmd_pte(kpte, address, new_pte); 447 __set_pmd_pte(kpte, address, new_pte);
395 cpa->flushtlb = 1; 448 cpa->flags |= CPA_FLUSHTLB;
396 do_split = 0; 449 do_split = 0;
397 } 450 }
398 451
@@ -402,84 +455,6 @@ out_unlock:
402 return do_split; 455 return do_split;
403} 456}
404 457
405static LIST_HEAD(page_pool);
406static unsigned long pool_size, pool_pages, pool_low;
407static unsigned long pool_used, pool_failed;
408
409static void cpa_fill_pool(struct page **ret)
410{
411 gfp_t gfp = GFP_KERNEL;
412 unsigned long flags;
413 struct page *p;
414
415 /*
416 * Avoid recursion (on debug-pagealloc) and also signal
417 * our priority to get to these pagetables:
418 */
419 if (current->flags & PF_MEMALLOC)
420 return;
421 current->flags |= PF_MEMALLOC;
422
423 /*
424 * Allocate atomically from atomic contexts:
425 */
426 if (in_atomic() || irqs_disabled() || debug_pagealloc)
427 gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
428
429 while (pool_pages < pool_size || (ret && !*ret)) {
430 p = alloc_pages(gfp, 0);
431 if (!p) {
432 pool_failed++;
433 break;
434 }
435 /*
436 * If the call site needs a page right now, provide it:
437 */
438 if (ret && !*ret) {
439 *ret = p;
440 continue;
441 }
442 spin_lock_irqsave(&pgd_lock, flags);
443 list_add(&p->lru, &page_pool);
444 pool_pages++;
445 spin_unlock_irqrestore(&pgd_lock, flags);
446 }
447
448 current->flags &= ~PF_MEMALLOC;
449}
450
451#define SHIFT_MB (20 - PAGE_SHIFT)
452#define ROUND_MB_GB ((1 << 10) - 1)
453#define SHIFT_MB_GB 10
454#define POOL_PAGES_PER_GB 16
455
456void __init cpa_init(void)
457{
458 struct sysinfo si;
459 unsigned long gb;
460
461 si_meminfo(&si);
462 /*
463 * Calculate the number of pool pages:
464 *
465 * Convert totalram (nr of pages) to MiB and round to the next
466 * GiB. Shift MiB to Gib and multiply the result by
467 * POOL_PAGES_PER_GB:
468 */
469 if (debug_pagealloc) {
470 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
471 pool_size = POOL_PAGES_PER_GB * gb;
472 } else {
473 pool_size = 1;
474 }
475 pool_low = pool_size;
476
477 cpa_fill_pool(NULL);
478 printk(KERN_DEBUG
479 "CPA: page pool initialized %lu of %lu pages preallocated\n",
480 pool_pages, pool_size);
481}
482
483static int split_large_page(pte_t *kpte, unsigned long address) 458static int split_large_page(pte_t *kpte, unsigned long address)
484{ 459{
485 unsigned long flags, pfn, pfninc = 1; 460 unsigned long flags, pfn, pfninc = 1;
@@ -488,28 +463,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
488 pgprot_t ref_prot; 463 pgprot_t ref_prot;
489 struct page *base; 464 struct page *base;
490 465
491 /* 466 if (!debug_pagealloc)
492 * Get a page from the pool. The pool list is protected by the 467 spin_unlock(&cpa_lock);
493 * pgd_lock, which we have to take anyway for the split 468 base = alloc_pages(GFP_KERNEL, 0);
494 * operation: 469 if (!debug_pagealloc)
495 */ 470 spin_lock(&cpa_lock);
496 spin_lock_irqsave(&pgd_lock, flags); 471 if (!base)
497 if (list_empty(&page_pool)) { 472 return -ENOMEM;
498 spin_unlock_irqrestore(&pgd_lock, flags);
499 base = NULL;
500 cpa_fill_pool(&base);
501 if (!base)
502 return -ENOMEM;
503 spin_lock_irqsave(&pgd_lock, flags);
504 } else {
505 base = list_first_entry(&page_pool, struct page, lru);
506 list_del(&base->lru);
507 pool_pages--;
508
509 if (pool_pages < pool_low)
510 pool_low = pool_pages;
511 }
512 473
474 spin_lock_irqsave(&pgd_lock, flags);
513 /* 475 /*
514 * Check for races, another CPU might have split this page 476 * Check for races, another CPU might have split this page
515 * up for us already: 477 * up for us already:
@@ -566,11 +528,8 @@ out_unlock:
566 * If we dropped out via the lookup_address check under 528 * If we dropped out via the lookup_address check under
567 * pgd_lock then stick the page back into the pool: 529 * pgd_lock then stick the page back into the pool:
568 */ 530 */
569 if (base) { 531 if (base)
570 list_add(&base->lru, &page_pool); 532 __free_page(base);
571 pool_pages++;
572 } else
573 pool_used++;
574 spin_unlock_irqrestore(&pgd_lock, flags); 533 spin_unlock_irqrestore(&pgd_lock, flags);
575 534
576 return 0; 535 return 0;
@@ -578,11 +537,16 @@ out_unlock:
578 537
579static int __change_page_attr(struct cpa_data *cpa, int primary) 538static int __change_page_attr(struct cpa_data *cpa, int primary)
580{ 539{
581 unsigned long address = cpa->vaddr; 540 unsigned long address;
582 int do_split, err; 541 int do_split, err;
583 unsigned int level; 542 unsigned int level;
584 pte_t *kpte, old_pte; 543 pte_t *kpte, old_pte;
585 544
545 if (cpa->flags & CPA_ARRAY)
546 address = cpa->vaddr[cpa->curpage];
547 else
548 address = *cpa->vaddr;
549
586repeat: 550repeat:
587 kpte = lookup_address(address, &level); 551 kpte = lookup_address(address, &level);
588 if (!kpte) 552 if (!kpte)
@@ -592,10 +556,9 @@ repeat:
592 if (!pte_val(old_pte)) { 556 if (!pte_val(old_pte)) {
593 if (!primary) 557 if (!primary)
594 return 0; 558 return 0;
595 printk(KERN_WARNING "CPA: called for zero pte. " 559 WARN(1, KERN_WARNING "CPA: called for zero pte. "
596 "vaddr = %lx cpa->vaddr = %lx\n", address, 560 "vaddr = %lx cpa->vaddr = %lx\n", address,
597 cpa->vaddr); 561 *cpa->vaddr);
598 WARN_ON(1);
599 return -EINVAL; 562 return -EINVAL;
600 } 563 }
601 564
@@ -621,7 +584,7 @@ repeat:
621 */ 584 */
622 if (pte_val(old_pte) != pte_val(new_pte)) { 585 if (pte_val(old_pte) != pte_val(new_pte)) {
623 set_pte_atomic(kpte, new_pte); 586 set_pte_atomic(kpte, new_pte);
624 cpa->flushtlb = 1; 587 cpa->flags |= CPA_FLUSHTLB;
625 } 588 }
626 cpa->numpages = 1; 589 cpa->numpages = 1;
627 return 0; 590 return 0;
@@ -645,7 +608,25 @@ repeat:
645 */ 608 */
646 err = split_large_page(kpte, address); 609 err = split_large_page(kpte, address);
647 if (!err) { 610 if (!err) {
648 cpa->flushtlb = 1; 611 /*
612 * Do a global flush tlb after splitting the large page
613 * and before we do the actual change page attribute in the PTE.
614 *
615 * Without this, we violate the TLB application note, which says
616 * "The TLBs may contain both ordinary and large-page
617 * translations for a 4-KByte range of linear addresses. This
618 * may occur if software modifies the paging structures so that
619 * the page size used for the address range changes. If the two
620 * translations differ with respect to page frame or attributes
621 * (e.g., permissions), processor behavior is undefined and may
622 * be implementation-specific."
623 *
624 * We do this global tlb flush inside the cpa_lock, so that we
625 * don't allow any other cpu, with stale tlb entries, to change the
626 * page attribute in parallel for an address that also falls into the
627 * just-split large page entry.
628 */
629 flush_tlb_all();
649 goto repeat; 630 goto repeat;
650 } 631 }
651 632
@@ -658,6 +639,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
658{ 639{
659 struct cpa_data alias_cpa; 640 struct cpa_data alias_cpa;
660 int ret = 0; 641 int ret = 0;
642 unsigned long temp_cpa_vaddr, vaddr;
661 643
662 if (cpa->pfn >= max_pfn_mapped) 644 if (cpa->pfn >= max_pfn_mapped)
663 return 0; 645 return 0;
@@ -670,16 +652,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
670 * No need to redo, when the primary call touched the direct 652 * No need to redo, when the primary call touched the direct
671 * mapping already: 653 * mapping already:
672 */ 654 */
673 if (!(within(cpa->vaddr, PAGE_OFFSET, 655 if (cpa->flags & CPA_ARRAY)
656 vaddr = cpa->vaddr[cpa->curpage];
657 else
658 vaddr = *cpa->vaddr;
659
660 if (!(within(vaddr, PAGE_OFFSET,
674 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) 661 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
675#ifdef CONFIG_X86_64 662#ifdef CONFIG_X86_64
676 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), 663 || within(vaddr, PAGE_OFFSET + (1UL<<32),
677 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) 664 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
678#endif 665#endif
679 )) { 666 )) {
680 667
681 alias_cpa = *cpa; 668 alias_cpa = *cpa;
682 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 669 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
670 alias_cpa.vaddr = &temp_cpa_vaddr;
671 alias_cpa.flags &= ~CPA_ARRAY;
672
683 673
684 ret = __change_page_attr_set_clr(&alias_cpa, 0); 674 ret = __change_page_attr_set_clr(&alias_cpa, 0);
685 } 675 }
@@ -691,7 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
691 * No need to redo, when the primary call touched the high 681 * No need to redo, when the primary call touched the high
692 * mapping already: 682 * mapping already:
693 */ 683 */
694 if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) 684 if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
695 return 0; 685 return 0;
696 686
697 /* 687 /*
@@ -702,8 +692,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
702 return 0; 692 return 0;
703 693
704 alias_cpa = *cpa; 694 alias_cpa = *cpa;
705 alias_cpa.vaddr = 695 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
706 (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 696 alias_cpa.vaddr = &temp_cpa_vaddr;
697 alias_cpa.flags &= ~CPA_ARRAY;
707 698
708 /* 699 /*
709 * The high mapping range is imprecise, so ignore the return value. 700 * The high mapping range is imprecise, so ignore the return value.
@@ -723,8 +714,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
723 * preservation check. 714 * preservation check.
724 */ 715 */
725 cpa->numpages = numpages; 716 cpa->numpages = numpages;
717 /* for array changes, we can't use large page */
718 if (cpa->flags & CPA_ARRAY)
719 cpa->numpages = 1;
726 720
721 if (!debug_pagealloc)
722 spin_lock(&cpa_lock);
727 ret = __change_page_attr(cpa, checkalias); 723 ret = __change_page_attr(cpa, checkalias);
724 if (!debug_pagealloc)
725 spin_unlock(&cpa_lock);
728 if (ret) 726 if (ret)
729 return ret; 727 return ret;
730 728
@@ -741,7 +739,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
741 */ 739 */
742 BUG_ON(cpa->numpages > numpages); 740 BUG_ON(cpa->numpages > numpages);
743 numpages -= cpa->numpages; 741 numpages -= cpa->numpages;
744 cpa->vaddr += cpa->numpages * PAGE_SIZE; 742 if (cpa->flags & CPA_ARRAY)
743 cpa->curpage++;
744 else
745 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
746
745 } 747 }
746 return 0; 748 return 0;
747} 749}
@@ -752,9 +754,9 @@ static inline int cache_attr(pgprot_t attr)
752 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); 754 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
753} 755}
754 756
755static int change_page_attr_set_clr(unsigned long addr, int numpages, 757static int change_page_attr_set_clr(unsigned long *addr, int numpages,
756 pgprot_t mask_set, pgprot_t mask_clr, 758 pgprot_t mask_set, pgprot_t mask_clr,
757 int force_split) 759 int force_split, int array)
758{ 760{
759 struct cpa_data cpa; 761 struct cpa_data cpa;
760 int ret, cache, checkalias; 762 int ret, cache, checkalias;
@@ -769,21 +771,38 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
769 return 0; 771 return 0;
770 772
771 /* Ensure we are PAGE_SIZE aligned */ 773 /* Ensure we are PAGE_SIZE aligned */
772 if (addr & ~PAGE_MASK) { 774 if (!array) {
773 addr &= PAGE_MASK; 775 if (*addr & ~PAGE_MASK) {
774 /* 776 *addr &= PAGE_MASK;
775 * People should not be passing in unaligned addresses: 777 /*
776 */ 778 * People should not be passing in unaligned addresses:
777 WARN_ON_ONCE(1); 779 */
780 WARN_ON_ONCE(1);
781 }
782 } else {
783 int i;
784 for (i = 0; i < numpages; i++) {
785 if (addr[i] & ~PAGE_MASK) {
786 addr[i] &= PAGE_MASK;
787 WARN_ON_ONCE(1);
788 }
789 }
778 } 790 }
779 791
792 /* Must avoid aliasing mappings in the highmem code */
793 kmap_flush_unused();
794
780 cpa.vaddr = addr; 795 cpa.vaddr = addr;
781 cpa.numpages = numpages; 796 cpa.numpages = numpages;
782 cpa.mask_set = mask_set; 797 cpa.mask_set = mask_set;
783 cpa.mask_clr = mask_clr; 798 cpa.mask_clr = mask_clr;
784 cpa.flushtlb = 0; 799 cpa.flags = 0;
800 cpa.curpage = 0;
785 cpa.force_split = force_split; 801 cpa.force_split = force_split;
786 802
803 if (array)
804 cpa.flags |= CPA_ARRAY;
805
787 /* No alias checking for _NX bit modifications */ 806 /* No alias checking for _NX bit modifications */
788 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 807 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
789 808
@@ -792,7 +811,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
792 /* 811 /*
793 * Check whether we really changed something: 812 * Check whether we really changed something:
794 */ 813 */
795 if (!cpa.flushtlb) 814 if (!(cpa.flags & CPA_FLUSHTLB))
796 goto out; 815 goto out;
797 816
798 /* 817 /*
@@ -807,27 +826,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
807 * error case we fall back to cpa_flush_all (which uses 826 * error case we fall back to cpa_flush_all (which uses
808 * wbinvd): 827 * wbinvd):
809 */ 828 */
810 if (!ret && cpu_has_clflush) 829 if (!ret && cpu_has_clflush) {
811 cpa_flush_range(addr, numpages, cache); 830 if (cpa.flags & CPA_ARRAY)
812 else 831 cpa_flush_array(addr, numpages, cache);
832 else
833 cpa_flush_range(*addr, numpages, cache);
834 } else
813 cpa_flush_all(cache); 835 cpa_flush_all(cache);
814 836
815out: 837out:
816 cpa_fill_pool(NULL);
817
818 return ret; 838 return ret;
819} 839}
820 840
821static inline int change_page_attr_set(unsigned long addr, int numpages, 841static inline int change_page_attr_set(unsigned long *addr, int numpages,
822 pgprot_t mask) 842 pgprot_t mask, int array)
823{ 843{
824 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); 844 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
845 array);
825} 846}
826 847
827static inline int change_page_attr_clear(unsigned long addr, int numpages, 848static inline int change_page_attr_clear(unsigned long *addr, int numpages,
828 pgprot_t mask) 849 pgprot_t mask, int array)
829{ 850{
830 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); 851 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
852 array);
831} 853}
832 854
833int _set_memory_uc(unsigned long addr, int numpages) 855int _set_memory_uc(unsigned long addr, int numpages)
@@ -835,8 +857,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
835 /* 857 /*
836 * for now UC MINUS. see comments in ioremap_nocache() 858 * for now UC MINUS. see comments in ioremap_nocache()
837 */ 859 */
838 return change_page_attr_set(addr, numpages, 860 return change_page_attr_set(&addr, numpages,
839 __pgprot(_PAGE_CACHE_UC_MINUS)); 861 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
840} 862}
841 863
842int set_memory_uc(unsigned long addr, int numpages) 864int set_memory_uc(unsigned long addr, int numpages)
@@ -844,7 +866,7 @@ int set_memory_uc(unsigned long addr, int numpages)
844 /* 866 /*
845 * for now UC MINUS. see comments in ioremap_nocache() 867 * for now UC MINUS. see comments in ioremap_nocache()
846 */ 868 */
847 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 869 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
848 _PAGE_CACHE_UC_MINUS, NULL)) 870 _PAGE_CACHE_UC_MINUS, NULL))
849 return -EINVAL; 871 return -EINVAL;
850 872
@@ -852,10 +874,48 @@ int set_memory_uc(unsigned long addr, int numpages)
852} 874}
853EXPORT_SYMBOL(set_memory_uc); 875EXPORT_SYMBOL(set_memory_uc);
854 876
877int set_memory_array_uc(unsigned long *addr, int addrinarray)
878{
879 unsigned long start;
880 unsigned long end;
881 int i;
882 /*
883 * for now UC MINUS. see comments in ioremap_nocache()
884 */
885 for (i = 0; i < addrinarray; i++) {
886 start = __pa(addr[i]);
887 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
888 if (end != __pa(addr[i + 1]))
889 break;
890 i++;
891 }
892 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
893 goto out;
894 }
895
896 return change_page_attr_set(addr, addrinarray,
897 __pgprot(_PAGE_CACHE_UC_MINUS), 1);
898out:
899 for (i = 0; i < addrinarray; i++) {
900 unsigned long tmp = __pa(addr[i]);
901
902 if (tmp == start)
903 break;
904 for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
905 if (end != __pa(addr[i + 1]))
906 break;
907 i++;
908 }
909 free_memtype(tmp, end);
910 }
911 return -EINVAL;
912}
913EXPORT_SYMBOL(set_memory_array_uc);
914
855int _set_memory_wc(unsigned long addr, int numpages) 915int _set_memory_wc(unsigned long addr, int numpages)
856{ 916{
857 return change_page_attr_set(addr, numpages, 917 return change_page_attr_set(&addr, numpages,
858 __pgprot(_PAGE_CACHE_WC)); 918 __pgprot(_PAGE_CACHE_WC), 0);
859} 919}
860 920
861int set_memory_wc(unsigned long addr, int numpages) 921int set_memory_wc(unsigned long addr, int numpages)
@@ -863,7 +923,7 @@ int set_memory_wc(unsigned long addr, int numpages)
863 if (!pat_enabled) 923 if (!pat_enabled)
864 return set_memory_uc(addr, numpages); 924 return set_memory_uc(addr, numpages);
865 925
866 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 926 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
867 _PAGE_CACHE_WC, NULL)) 927 _PAGE_CACHE_WC, NULL))
868 return -EINVAL; 928 return -EINVAL;
869 929
@@ -873,49 +933,71 @@ EXPORT_SYMBOL(set_memory_wc);
873 933
874int _set_memory_wb(unsigned long addr, int numpages) 934int _set_memory_wb(unsigned long addr, int numpages)
875{ 935{
876 return change_page_attr_clear(addr, numpages, 936 return change_page_attr_clear(&addr, numpages,
877 __pgprot(_PAGE_CACHE_MASK)); 937 __pgprot(_PAGE_CACHE_MASK), 0);
878} 938}
879 939
880int set_memory_wb(unsigned long addr, int numpages) 940int set_memory_wb(unsigned long addr, int numpages)
881{ 941{
882 free_memtype(addr, addr + numpages * PAGE_SIZE); 942 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
883 943
884 return _set_memory_wb(addr, numpages); 944 return _set_memory_wb(addr, numpages);
885} 945}
886EXPORT_SYMBOL(set_memory_wb); 946EXPORT_SYMBOL(set_memory_wb);
887 947
948int set_memory_array_wb(unsigned long *addr, int addrinarray)
949{
950 int i;
951
952 for (i = 0; i < addrinarray; i++) {
953 unsigned long start = __pa(addr[i]);
954 unsigned long end;
955
956 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
957 if (end != __pa(addr[i + 1]))
958 break;
959 i++;
960 }
961 free_memtype(start, end);
962 }
963 return change_page_attr_clear(addr, addrinarray,
964 __pgprot(_PAGE_CACHE_MASK), 1);
965}
966EXPORT_SYMBOL(set_memory_array_wb);
967
888int set_memory_x(unsigned long addr, int numpages) 968int set_memory_x(unsigned long addr, int numpages)
889{ 969{
890 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); 970 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
891} 971}
892EXPORT_SYMBOL(set_memory_x); 972EXPORT_SYMBOL(set_memory_x);
893 973
894int set_memory_nx(unsigned long addr, int numpages) 974int set_memory_nx(unsigned long addr, int numpages)
895{ 975{
896 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); 976 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
897} 977}
898EXPORT_SYMBOL(set_memory_nx); 978EXPORT_SYMBOL(set_memory_nx);
899 979
900int set_memory_ro(unsigned long addr, int numpages) 980int set_memory_ro(unsigned long addr, int numpages)
901{ 981{
902 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); 982 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
903} 983}
984EXPORT_SYMBOL_GPL(set_memory_ro);
904 985
905int set_memory_rw(unsigned long addr, int numpages) 986int set_memory_rw(unsigned long addr, int numpages)
906{ 987{
907 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); 988 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
908} 989}
990EXPORT_SYMBOL_GPL(set_memory_rw);
909 991
910int set_memory_np(unsigned long addr, int numpages) 992int set_memory_np(unsigned long addr, int numpages)
911{ 993{
912 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); 994 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
913} 995}
914 996
915int set_memory_4k(unsigned long addr, int numpages) 997int set_memory_4k(unsigned long addr, int numpages)
916{ 998{
917 return change_page_attr_set_clr(addr, numpages, __pgprot(0), 999 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
918 __pgprot(0), 1); 1000 __pgprot(0), 1, 0);
919} 1001}
920 1002
921int set_pages_uc(struct page *page, int numpages) 1003int set_pages_uc(struct page *page, int numpages)
@@ -968,22 +1050,38 @@ int set_pages_rw(struct page *page, int numpages)
968 1050
969static int __set_pages_p(struct page *page, int numpages) 1051static int __set_pages_p(struct page *page, int numpages)
970{ 1052{
971 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1053 unsigned long tempaddr = (unsigned long) page_address(page);
1054 struct cpa_data cpa = { .vaddr = &tempaddr,
972 .numpages = numpages, 1055 .numpages = numpages,
973 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1056 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
974 .mask_clr = __pgprot(0)}; 1057 .mask_clr = __pgprot(0),
1058 .flags = 0};
975 1059
976 return __change_page_attr_set_clr(&cpa, 1); 1060 /*
1061 * No alias checking needed for setting present flag. Otherwise,
1062 * we may need to break large pages for 64-bit kernel text
1063 * mappings (this adds to complexity if we want to do this from
1064 * atomic context especially). Let's keep it simple!
1065 */
1066 return __change_page_attr_set_clr(&cpa, 0);
977} 1067}
978 1068
979static int __set_pages_np(struct page *page, int numpages) 1069static int __set_pages_np(struct page *page, int numpages)
980{ 1070{
981 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1071 unsigned long tempaddr = (unsigned long) page_address(page);
1072 struct cpa_data cpa = { .vaddr = &tempaddr,
982 .numpages = numpages, 1073 .numpages = numpages,
983 .mask_set = __pgprot(0), 1074 .mask_set = __pgprot(0),
984 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; 1075 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1076 .flags = 0};
985 1077
986 return __change_page_attr_set_clr(&cpa, 1); 1078 /*
1079 * No alias checking needed for setting not present flag. otherwise,
1080 * No alias checking needed for setting not present flag. Otherwise,
1081 * mappings (this adds to complexity if we want to do this from
1082 * atomic context especially). Let's keep it simple!
1083 */
1084 return __change_page_attr_set_clr(&cpa, 0);
987} 1085}
988 1086
989void kernel_map_pages(struct page *page, int numpages, int enable) 1087void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1003,11 +1101,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1003 1101
1004 /* 1102 /*
1005 * The return value is ignored as the calls cannot fail. 1103 * The return value is ignored as the calls cannot fail.
1006 * Large pages are kept enabled at boot time, and are 1104 * Large pages for identity mappings are not used at boot time
1007 * split up quickly with DEBUG_PAGEALLOC. If a splitup 1105 * and hence no memory allocations during large page split.
1008 * fails here (due to temporary memory shortage) no damage
1009 * is done because we just keep the largepage intact up
1010 * to the next attempt when it will likely be split up:
1011 */ 1106 */
1012 if (enable) 1107 if (enable)
1013 __set_pages_p(page, numpages); 1108 __set_pages_p(page, numpages);
@@ -1019,53 +1114,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1019 * but that can deadlock->flush only current cpu: 1114 * but that can deadlock->flush only current cpu:
1020 */ 1115 */
1021 __flush_tlb_all(); 1116 __flush_tlb_all();
1022
1023 /*
1024 * Try to refill the page pool here. We can do this only after
1025 * the tlb flush.
1026 */
1027 cpa_fill_pool(NULL);
1028} 1117}
1029 1118
1030#ifdef CONFIG_DEBUG_FS
1031static int dpa_show(struct seq_file *m, void *v)
1032{
1033 seq_puts(m, "DEBUG_PAGEALLOC\n");
1034 seq_printf(m, "pool_size : %lu\n", pool_size);
1035 seq_printf(m, "pool_pages : %lu\n", pool_pages);
1036 seq_printf(m, "pool_low : %lu\n", pool_low);
1037 seq_printf(m, "pool_used : %lu\n", pool_used);
1038 seq_printf(m, "pool_failed : %lu\n", pool_failed);
1039
1040 return 0;
1041}
1042
1043static int dpa_open(struct inode *inode, struct file *filp)
1044{
1045 return single_open(filp, dpa_show, NULL);
1046}
1047
1048static const struct file_operations dpa_fops = {
1049 .open = dpa_open,
1050 .read = seq_read,
1051 .llseek = seq_lseek,
1052 .release = single_release,
1053};
1054
1055static int __init debug_pagealloc_proc_init(void)
1056{
1057 struct dentry *de;
1058
1059 de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
1060 &dpa_fops);
1061 if (!de)
1062 return -ENOMEM;
1063
1064 return 0;
1065}
1066__initcall(debug_pagealloc_proc_init);
1067#endif
1068
1069#ifdef CONFIG_HIBERNATION 1119#ifdef CONFIG_HIBERNATION
1070 1120
1071bool kernel_page_present(struct page *page) 1121bool kernel_page_present(struct page *page)
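
The pageattr.c changes above rework the cpa_data descriptor around a pointer
to an array of virtual addresses plus a curpage index and a flags word
(CPA_ARRAY, CPA_FLUSHTLB): one call can now change the attributes of many
non-contiguous 4K pages, large pages are not used while CPA_ARRAY is set, the
DEBUG_PAGEALLOC page pool and its debugfs file are removed, and
__change_page_attr() is guarded by cpa_lock unless debug_pagealloc is active.
The new exported entry points are set_memory_array_uc() and
set_memory_array_wb(), which also coalesce physically contiguous runs into
single reserve_memtype()/free_memtype() calls. A minimal caller-side sketch
for a hypothetical driver (my_alloc_uc_pages(), my_free_uc_pages(), pages[],
addrs[] and npages are illustrative names, not part of the patch):

/*
 * Allocate npages individual pages and map them uncached (UC-) with one
 * set_memory_array_uc() call; undo everything on error.
 */
#include <linux/gfp.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>

static int my_alloc_uc_pages(struct page **pages, unsigned long *addrs,
			     int npages)
{
	int i, ret = -ENOMEM;

	for (i = 0; i < npages; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto undo;
		addrs[i] = (unsigned long)page_address(pages[i]);
	}

	/* one call flips every (possibly non-contiguous) page to UC- */
	ret = set_memory_array_uc(addrs, npages);
	if (ret)
		goto undo;
	return 0;

undo:
	while (--i >= 0)
		__free_page(pages[i]);
	return ret;
}

static void my_free_uc_pages(struct page **pages, unsigned long *addrs,
			     int npages)
{
	int i;

	/* restore write-back before returning the pages to the allocator */
	set_memory_array_wb(addrs, npages);
	for (i = 0; i < npages; i++)
		__free_page(pages[i]);
}

Batching the addresses this way flushes TLBs and caches once for the whole
set instead of once per page, which is the motivation for an array interface
when a driver manages many scattered pages.
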
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2fe30916d4b6..738fd0f24958 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,24 +7,24 @@
7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. 7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
8 */ 8 */
9 9
10#include <linux/mm.h> 10#include <linux/seq_file.h>
11#include <linux/bootmem.h>
12#include <linux/debugfs.h>
11#include <linux/kernel.h> 13#include <linux/kernel.h>
12#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/mm.h>
13#include <linux/fs.h> 16#include <linux/fs.h>
14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
17 17
18#include <asm/msr.h> 18#include <asm/cacheflush.h>
19#include <asm/tlbflush.h>
20#include <asm/processor.h> 19#include <asm/processor.h>
21#include <asm/page.h> 20#include <asm/tlbflush.h>
22#include <asm/pgtable.h> 21#include <asm/pgtable.h>
23#include <asm/pat.h>
24#include <asm/e820.h>
25#include <asm/cacheflush.h>
26#include <asm/fcntl.h> 22#include <asm/fcntl.h>
23#include <asm/e820.h>
27#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/page.h>
26#include <asm/msr.h>
27#include <asm/pat.h>
28#include <asm/io.h> 28#include <asm/io.h>
29 29
30#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
@@ -46,6 +46,7 @@ early_param("nopat", nopat);
46 46
47 47
48static int debug_enable; 48static int debug_enable;
49
49static int __init pat_debug_setup(char *str) 50static int __init pat_debug_setup(char *str)
50{ 51{
51 debug_enable = 1; 52 debug_enable = 1;
@@ -145,14 +146,14 @@ static char *cattr_name(unsigned long flags)
145 */ 146 */
146 147
147struct memtype { 148struct memtype {
148 u64 start; 149 u64 start;
149 u64 end; 150 u64 end;
150 unsigned long type; 151 unsigned long type;
151 struct list_head nd; 152 struct list_head nd;
152}; 153};
153 154
154static LIST_HEAD(memtype_list); 155static LIST_HEAD(memtype_list);
155static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 156static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
156 157
157/* 158/*
158 * Does intersection of PAT memory type and MTRR memory type and returns 159 * Does intersection of PAT memory type and MTRR memory type and returns
@@ -180,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
180 return req_type; 181 return req_type;
181} 182}
182 183
183static int chk_conflict(struct memtype *new, struct memtype *entry, 184static int
184 unsigned long *type) 185chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
185{ 186{
186 if (new->type != entry->type) { 187 if (new->type != entry->type) {
187 if (type) { 188 if (type) {
@@ -207,6 +208,69 @@ static int chk_conflict(struct memtype *new, struct memtype *entry,
207 return -EBUSY; 208 return -EBUSY;
208} 209}
209 210
211static struct memtype *cached_entry;
212static u64 cached_start;
213
214/*
215 * For RAM pages, mark the pages as non WB memory type using
216 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
217 * set_memory_wc() on a RAM page at a time before marking it as WB again.
218 * This is ok, because only one driver will be owning the page and
219 * doing set_memory_*() calls.
220 *
221 * For now, we use PageNonWB to track that the RAM page is being mapped
222 * as non WB. In future, we will have to use one more flag
223 * (or some other mechanism in page_struct) to distinguish between
224 * UC and WC mapping.
225 */
226static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
227 unsigned long *new_type)
228{
229 struct page *page;
230 u64 pfn, end_pfn;
231
232 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
233 page = pfn_to_page(pfn);
234 if (page_mapped(page) || PageNonWB(page))
235 goto out;
236
237 SetPageNonWB(page);
238 }
239 return 0;
240
241out:
242 end_pfn = pfn;
243 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
244 page = pfn_to_page(pfn);
245 ClearPageNonWB(page);
246 }
247
248 return -EINVAL;
249}
250
251static int free_ram_pages_type(u64 start, u64 end)
252{
253 struct page *page;
254 u64 pfn, end_pfn;
255
256 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
257 page = pfn_to_page(pfn);
258 if (page_mapped(page) || !PageNonWB(page))
259 goto out;
260
261 ClearPageNonWB(page);
262 }
263 return 0;
264
265out:
266 end_pfn = pfn;
267 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
268 page = pfn_to_page(pfn);
269 SetPageNonWB(page);
270 }
271 return -EINVAL;
272}
273
210/* 274/*
211 * req_type typically has one of the: 275 * req_type typically has one of the:
212 * - _PAGE_CACHE_WB 276 * - _PAGE_CACHE_WB
@@ -223,14 +287,15 @@ static int chk_conflict(struct memtype *new, struct memtype *entry,
223 * it will return a negative return value. 287 * it will return a negative return value.
224 */ 288 */
225int reserve_memtype(u64 start, u64 end, unsigned long req_type, 289int reserve_memtype(u64 start, u64 end, unsigned long req_type,
226 unsigned long *new_type) 290 unsigned long *new_type)
227{ 291{
228 struct memtype *new, *entry; 292 struct memtype *new, *entry;
229 unsigned long actual_type; 293 unsigned long actual_type;
230 struct list_head *where; 294 struct list_head *where;
295 int is_range_ram;
231 int err = 0; 296 int err = 0;
232 297
233 BUG_ON(start >= end); /* end is exclusive */ 298 BUG_ON(start >= end); /* end is exclusive */
234 299
235 if (!pat_enabled) { 300 if (!pat_enabled) {
236 /* This is identical to page table setting without PAT */ 301 /* This is identical to page table setting without PAT */
@@ -263,28 +328,41 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
263 actual_type = _PAGE_CACHE_WB; 328 actual_type = _PAGE_CACHE_WB;
264 else 329 else
265 actual_type = _PAGE_CACHE_UC_MINUS; 330 actual_type = _PAGE_CACHE_UC_MINUS;
266 } else 331 } else {
267 actual_type = pat_x_mtrr_type(start, end, 332 actual_type = pat_x_mtrr_type(start, end,
268 req_type & _PAGE_CACHE_MASK); 333 req_type & _PAGE_CACHE_MASK);
334 }
335
336 is_range_ram = pagerange_is_ram(start, end);
337 if (is_range_ram == 1)
338 return reserve_ram_pages_type(start, end, req_type, new_type);
339 else if (is_range_ram < 0)
340 return -EINVAL;
269 341
270 new = kmalloc(sizeof(struct memtype), GFP_KERNEL); 342 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
271 if (!new) 343 if (!new)
272 return -ENOMEM; 344 return -ENOMEM;
273 345
274 new->start = start; 346 new->start = start;
275 new->end = end; 347 new->end = end;
276 new->type = actual_type; 348 new->type = actual_type;
277 349
278 if (new_type) 350 if (new_type)
279 *new_type = actual_type; 351 *new_type = actual_type;
280 352
281 spin_lock(&memtype_lock); 353 spin_lock(&memtype_lock);
282 354
355 if (cached_entry && start >= cached_start)
356 entry = cached_entry;
357 else
358 entry = list_entry(&memtype_list, struct memtype, nd);
359
283 /* Search for existing mapping that overlaps the current range */ 360 /* Search for existing mapping that overlaps the current range */
284 where = NULL; 361 where = NULL;
285 list_for_each_entry(entry, &memtype_list, nd) { 362 list_for_each_entry_continue(entry, &memtype_list, nd) {
286 if (end <= entry->start) { 363 if (end <= entry->start) {
287 where = entry->nd.prev; 364 where = entry->nd.prev;
365 cached_entry = list_entry(where, struct memtype, nd);
288 break; 366 break;
289 } else if (start <= entry->start) { /* end > entry->start */ 367 } else if (start <= entry->start) { /* end > entry->start */
290 err = chk_conflict(new, entry, new_type); 368 err = chk_conflict(new, entry, new_type);
@@ -292,6 +370,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
292 dprintk("Overlap at 0x%Lx-0x%Lx\n", 370 dprintk("Overlap at 0x%Lx-0x%Lx\n",
293 entry->start, entry->end); 371 entry->start, entry->end);
294 where = entry->nd.prev; 372 where = entry->nd.prev;
373 cached_entry = list_entry(where,
374 struct memtype, nd);
295 } 375 }
296 break; 376 break;
297 } else if (start < entry->end) { /* start > entry->start */ 377 } else if (start < entry->end) { /* start > entry->start */
@@ -299,7 +379,20 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
299 if (!err) { 379 if (!err) {
300 dprintk("Overlap at 0x%Lx-0x%Lx\n", 380 dprintk("Overlap at 0x%Lx-0x%Lx\n",
301 entry->start, entry->end); 381 entry->start, entry->end);
302 where = &entry->nd; 382 cached_entry = list_entry(entry->nd.prev,
383 struct memtype, nd);
384
385 /*
386 * Move to right position in the linked
387 * list to add this new entry
388 */
389 list_for_each_entry_continue(entry,
390 &memtype_list, nd) {
391 if (start <= entry->start) {
392 where = entry->nd.prev;
393 break;
394 }
395 }
303 } 396 }
304 break; 397 break;
305 } 398 }
@@ -311,9 +404,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
311 start, end, cattr_name(new->type), cattr_name(req_type)); 404 start, end, cattr_name(new->type), cattr_name(req_type));
312 kfree(new); 405 kfree(new);
313 spin_unlock(&memtype_lock); 406 spin_unlock(&memtype_lock);
407
314 return err; 408 return err;
315 } 409 }
316 410
411 cached_start = start;
412
317 if (where) 413 if (where)
318 list_add(&new->nd, where); 414 list_add(&new->nd, where);
319 else 415 else
@@ -332,6 +428,7 @@ int free_memtype(u64 start, u64 end)
332{ 428{
333 struct memtype *entry; 429 struct memtype *entry;
334 int err = -EINVAL; 430 int err = -EINVAL;
431 int is_range_ram;
335 432
336 if (!pat_enabled) 433 if (!pat_enabled)
337 return 0; 434 return 0;
@@ -340,9 +437,18 @@ int free_memtype(u64 start, u64 end)
340 if (is_ISA_range(start, end - 1)) 437 if (is_ISA_range(start, end - 1))
341 return 0; 438 return 0;
342 439
440 is_range_ram = pagerange_is_ram(start, end);
441 if (is_range_ram == 1)
442 return free_ram_pages_type(start, end);
443 else if (is_range_ram < 0)
444 return -EINVAL;
445
343 spin_lock(&memtype_lock); 446 spin_lock(&memtype_lock);
344 list_for_each_entry(entry, &memtype_list, nd) { 447 list_for_each_entry(entry, &memtype_list, nd) {
345 if (entry->start == start && entry->end == end) { 448 if (entry->start == start && entry->end == end) {
449 if (cached_entry == entry || cached_start == start)
450 cached_entry = NULL;
451
346 list_del(&entry->nd); 452 list_del(&entry->nd);
347 kfree(entry); 453 kfree(entry);
348 err = 0; 454 err = 0;
@@ -357,18 +463,11 @@ int free_memtype(u64 start, u64 end)
357 } 463 }
358 464
359 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); 465 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
466
360 return err; 467 return err;
361} 468}
362 469
363 470
364/*
365 * /dev/mem mmap interface. The memtype used for mapping varies:
366 * - Use UC for mappings with O_SYNC flag
367 * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
368 * inherit the memtype from existing mapping.
369 * - Else use UC_MINUS memtype (for backward compatibility with existing
370 * X drivers.
371 */
372pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 471pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
373 unsigned long size, pgprot_t vma_prot) 472 unsigned long size, pgprot_t vma_prot)
374{ 473{
@@ -406,14 +505,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
406 unsigned long size, pgprot_t *vma_prot) 505 unsigned long size, pgprot_t *vma_prot)
407{ 506{
408 u64 offset = ((u64) pfn) << PAGE_SHIFT; 507 u64 offset = ((u64) pfn) << PAGE_SHIFT;
409 unsigned long flags = _PAGE_CACHE_UC_MINUS; 508 unsigned long flags = -1;
410 int retval; 509 int retval;
411 510
412 if (!range_is_allowed(pfn, size)) 511 if (!range_is_allowed(pfn, size))
413 return 0; 512 return 0;
414 513
415 if (file->f_flags & O_SYNC) { 514 if (file->f_flags & O_SYNC) {
416 flags = _PAGE_CACHE_UC; 515 flags = _PAGE_CACHE_UC_MINUS;
417 } 516 }
418 517
419#ifdef CONFIG_X86_32 518#ifdef CONFIG_X86_32
@@ -436,13 +535,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
436#endif 535#endif
437 536
438 /* 537 /*
439 * With O_SYNC, we can only take UC mapping. Fail if we cannot. 538 * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
539 *
440 * Without O_SYNC, we want to get 540 * Without O_SYNC, we want to get
441 * - WB for WB-able memory and no other conflicting mappings 541 * - WB for WB-able memory and no other conflicting mappings
442 * - UC_MINUS for non-WB-able memory with no other conflicting mappings 542 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
443 * - Inherit from conflicting mappings otherwise 543 * - Inherit from conflicting mappings otherwise
444 */ 544 */
445 if (flags != _PAGE_CACHE_UC_MINUS) { 545 if (flags != -1) {
446 retval = reserve_memtype(offset, offset + size, flags, NULL); 546 retval = reserve_memtype(offset, offset + size, flags, NULL);
447 } else { 547 } else {
448 retval = reserve_memtype(offset, offset + size, -1, &flags); 548 retval = reserve_memtype(offset, offset + size, -1, &flags);
@@ -470,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
470 570
471void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) 571void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
472{ 572{
573 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
473 u64 addr = (u64)pfn << PAGE_SHIFT; 574 u64 addr = (u64)pfn << PAGE_SHIFT;
474 unsigned long flags; 575 unsigned long flags;
475 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
476 576
477 reserve_memtype(addr, addr + size, want_flags, &flags); 577 reserve_memtype(addr, addr + size, want_flags, &flags);
478 if (flags != want_flags) { 578 if (flags != want_flags) {
@@ -492,7 +592,7 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
492 free_memtype(addr, addr + size); 592 free_memtype(addr, addr + size);
493} 593}
494 594
495#if defined(CONFIG_DEBUG_FS) 595#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
496 596
497/* get Nth element of the linked list */ 597/* get Nth element of the linked list */
498static struct memtype *memtype_get_idx(loff_t pos) 598static struct memtype *memtype_get_idx(loff_t pos)
@@ -515,6 +615,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
515 } 615 }
516 spin_unlock(&memtype_lock); 616 spin_unlock(&memtype_lock);
517 kfree(print_entry); 617 kfree(print_entry);
618
518 return NULL; 619 return NULL;
519} 620}
520 621
@@ -545,6 +646,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v)
545 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), 646 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
546 print_entry->start, print_entry->end); 647 print_entry->start, print_entry->end);
547 kfree(print_entry); 648 kfree(print_entry);
649
548 return 0; 650 return 0;
549} 651}
550 652
@@ -576,4 +678,4 @@ static int __init pat_memtype_list_init(void)
576 678
577late_initcall(pat_memtype_list_init); 679late_initcall(pat_memtype_list_init);
578 680
579#endif /* CONFIG_DEBUG_FS */ 681#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
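
In pat.c, reserve_memtype() and free_memtype() now branch on
pagerange_is_ram(): regular RAM pages are tracked with the per-page PageNonWB
(PG_arch_1) bit, with only one set_memory_uc()/set_memory_wc() allowed on a
page before it is set back to WB, while reserved/MMIO ranges keep using the
memtype list, now with a cached_entry/cached_start hint so the list is not
rescanned from its head on every insertion; /dev/mem O_SYNC mappings are also
relaxed from UC to UC-. A minimal sketch of the reserve/flip/restore pairing
this contract expects, mirroring what set_memory_uc()/set_memory_wb() do in
the pageattr.c hunks above (my_with_uc_buffer(), buf and npages are invented
names):

/*
 * set_memory_uc()/set_memory_wb() already perform the
 * reserve_memtype()/free_memtype() calls shown in this diff, so a
 * well-behaved user just keeps the two calls paired.
 */
#include <asm/cacheflush.h>

static int my_with_uc_buffer(void *buf, int npages)
{
	unsigned long addr = (unsigned long)buf;
	int ret;

	ret = set_memory_uc(addr, npages);	/* reserve_memtype() + CPA */
	if (ret)
		return ret;

	/* ... use the buffer with UC- semantics ... */

	return set_memory_wb(addr, npages);	/* free_memtype() + CPA */
}
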
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 557b2abceef8..86f2ffc43c3d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,10 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
63#define UNSHARED_PTRS_PER_PGD \ 63#define UNSHARED_PTRS_PER_PGD \
64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
65 65
66static void pgd_ctor(void *p) 66static void pgd_ctor(pgd_t *pgd)
67{ 67{
68 pgd_t *pgd = p;
69
70 /* If the pgd points to a shared pagetable level (either the 68 /* If the pgd points to a shared pagetable level (either the
71 ptes in non-PAE, or shared PMD in PAE), then just copy the 69 ptes in non-PAE, or shared PMD in PAE), then just copy the
72 references from swapper_pg_dir. */ 70 references from swapper_pg_dir. */
@@ -87,7 +85,7 @@ static void pgd_ctor(void *p)
87 pgd_list_add(pgd); 85 pgd_list_add(pgd);
88} 86}
89 87
90static void pgd_dtor(void *pgd) 88static void pgd_dtor(pgd_t *pgd)
91{ 89{
92 unsigned long flags; /* can be called from interrupt context */ 90 unsigned long flags; /* can be called from interrupt context */
93 91
@@ -207,6 +205,9 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
207 unsigned long addr; 205 unsigned long addr;
208 int i; 206 int i;
209 207
208 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
209 return;
210
210 pud = pud_offset(pgd, 0); 211 pud = pud_offset(pgd, 0);
211 212
212 for (addr = i = 0; i < PREALLOCATED_PMDS; 213 for (addr = i = 0; i < PREALLOCATED_PMDS;
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b4becbf8c570..0951db9ee519 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,53 +20,6 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23void show_mem(void)
24{
25 int total = 0, reserved = 0;
26 int shared = 0, cached = 0;
27 int highmem = 0;
28 struct page *page;
29 pg_data_t *pgdat;
30 unsigned long i;
31 unsigned long flags;
32
33 printk(KERN_INFO "Mem-info:\n");
34 show_free_areas();
35 for_each_online_pgdat(pgdat) {
36 pgdat_resize_lock(pgdat, &flags);
37 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
38 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
39 touch_nmi_watchdog();
40 page = pgdat_page_nr(pgdat, i);
41 total++;
42 if (PageHighMem(page))
43 highmem++;
44 if (PageReserved(page))
45 reserved++;
46 else if (PageSwapCache(page))
47 cached++;
48 else if (page_count(page))
49 shared += page_count(page) - 1;
50 }
51 pgdat_resize_unlock(pgdat, &flags);
52 }
53 printk(KERN_INFO "%d pages of RAM\n", total);
54 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
55 printk(KERN_INFO "%d reserved pages\n", reserved);
56 printk(KERN_INFO "%d pages shared\n", shared);
57 printk(KERN_INFO "%d pages swap cached\n", cached);
58
59 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
60 printk(KERN_INFO "%lu pages writeback\n",
61 global_page_state(NR_WRITEBACK));
62 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
63 printk(KERN_INFO "%lu pages slab\n",
64 global_page_state(NR_SLAB_RECLAIMABLE) +
65 global_page_state(NR_SLAB_UNRECLAIMABLE));
66 printk(KERN_INFO "%lu pages pagetables\n",
67 global_page_state(NR_PAGETABLE));
68}
69
70/* 23/*
71 * Associate a virtual page frame with a given physical page frame 24 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 25 * and protection flags for that frame.
@@ -170,7 +123,8 @@ static int __init parse_vmalloc(char *arg)
170 if (!arg) 123 if (!arg)
171 return -EINVAL; 124 return -EINVAL;
172 125
173 __VMALLOC_RESERVE = memparse(arg, &arg); 126 /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
127 __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
174 return 0; 128 return 0;
175} 129}
176early_param("vmalloc", parse_vmalloc); 130early_param("vmalloc", parse_vmalloc);
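
On 32-bit, the vmalloc= early parameter now adds VMALLOC_OFFSET to the parsed
size, so the guard hole between the direct mapping and the vmalloc area no
longer eats into what the user asked for. A worked example, assuming
VMALLOC_OFFSET has its conventional 8 MB value on these kernels:

/*
 * Illustrative arithmetic only (the 8 MB figure is an assumption about
 * the 32-bit layout, not something this hunk states):
 *
 *	vmalloc=128M  ->  __VMALLOC_RESERVE = 128 MB + 8 MB = 136 MB
 *
 * leaving the full requested 128 MB usable once the guard hole is
 * carved out.
 */
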
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 1eb2973a301c..16ae70fc57e7 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -178,7 +178,7 @@ void acpi_numa_arch_fixup(void)
178 * start of the node, and that the current "end" address is after 178 * start of the node, and that the current "end" address is after
179 * the previous one. 179 * the previous one.
180 */ 180 */
181static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) 181static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
182{ 182{
183 /* 183 /*
184 * Only add present memory as told by the e820. 184 * Only add present memory as told by the e820.
@@ -189,10 +189,10 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
189 if (memory_chunk->start_pfn >= max_pfn) { 189 if (memory_chunk->start_pfn >= max_pfn) {
190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", 190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
191 memory_chunk->start_pfn, memory_chunk->end_pfn); 191 memory_chunk->start_pfn, memory_chunk->end_pfn);
192 return; 192 return -1;
193 } 193 }
194 if (memory_chunk->nid != nid) 194 if (memory_chunk->nid != nid)
195 return; 195 return -1;
196 196
197 if (!node_has_online_mem(nid)) 197 if (!node_has_online_mem(nid))
198 node_start_pfn[nid] = memory_chunk->start_pfn; 198 node_start_pfn[nid] = memory_chunk->start_pfn;
@@ -202,6 +202,8 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
202 202
203 if (node_end_pfn[nid] < memory_chunk->end_pfn) 203 if (node_end_pfn[nid] < memory_chunk->end_pfn)
204 node_end_pfn[nid] = memory_chunk->end_pfn; 204 node_end_pfn[nid] = memory_chunk->end_pfn;
205
206 return 0;
205} 207}
206 208
207int __init get_memcfg_from_srat(void) 209int __init get_memcfg_from_srat(void)
@@ -259,7 +261,9 @@ int __init get_memcfg_from_srat(void)
259 printk(KERN_DEBUG 261 printk(KERN_DEBUG
260 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", 262 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
261 j, chunk->nid, chunk->start_pfn, chunk->end_pfn); 263 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
262 node_read_chunk(chunk->nid, chunk); 264 if (node_read_chunk(chunk->nid, chunk))
265 continue;
266
263 e820_register_active_regions(chunk->nid, chunk->start_pfn, 267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
264 min(chunk->end_pfn, max_pfn)); 268 min(chunk->end_pfn, max_pfn));
265 } 269 }
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 287513a09819..57f6c9088081 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -16,6 +16,7 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
18#include <linux/kdebug.h> 18#include <linux/kdebug.h>
19#include <linux/cpu.h>
19#include <asm/nmi.h> 20#include <asm/nmi.h>
20#include <asm/msr.h> 21#include <asm/msr.h>
21#include <asm/apic.h> 22#include <asm/apic.h>
@@ -29,23 +30,48 @@ static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
29 30
30static int nmi_start(void); 31static int nmi_start(void);
31static void nmi_stop(void); 32static void nmi_stop(void);
33static void nmi_cpu_start(void *dummy);
34static void nmi_cpu_stop(void *dummy);
32 35
33/* 0 == registered but off, 1 == registered and on */ 36/* 0 == registered but off, 1 == registered and on */
34static int nmi_enabled = 0; 37static int nmi_enabled = 0;
35 38
39#ifdef CONFIG_SMP
40static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
41 void *data)
42{
43 int cpu = (unsigned long)data;
44 switch (action) {
45 case CPU_DOWN_FAILED:
46 case CPU_ONLINE:
47 smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
48 break;
49 case CPU_DOWN_PREPARE:
50 smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
51 break;
52 }
53 return NOTIFY_DONE;
54}
55
56static struct notifier_block oprofile_cpu_nb = {
57 .notifier_call = oprofile_cpu_notifier
58};
59#endif
60
36#ifdef CONFIG_PM 61#ifdef CONFIG_PM
37 62
38static int nmi_suspend(struct sys_device *dev, pm_message_t state) 63static int nmi_suspend(struct sys_device *dev, pm_message_t state)
39{ 64{
65 /* Only one CPU left, just stop that one */
40 if (nmi_enabled == 1) 66 if (nmi_enabled == 1)
41 nmi_stop(); 67 nmi_cpu_stop(NULL);
42 return 0; 68 return 0;
43} 69}
44 70
45static int nmi_resume(struct sys_device *dev) 71static int nmi_resume(struct sys_device *dev)
46{ 72{
47 if (nmi_enabled == 1) 73 if (nmi_enabled == 1)
48 nmi_start(); 74 nmi_cpu_start(NULL);
49 return 0; 75 return 0;
50} 76}
51 77
@@ -270,10 +296,12 @@ static void nmi_cpu_shutdown(void *dummy)
270 296
271static void nmi_shutdown(void) 297static void nmi_shutdown(void)
272{ 298{
273 struct op_msrs *msrs = &get_cpu_var(cpu_msrs); 299 struct op_msrs *msrs;
300
274 nmi_enabled = 0; 301 nmi_enabled = 0;
275 on_each_cpu(nmi_cpu_shutdown, NULL, 1); 302 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
276 unregister_die_notifier(&profile_exceptions_nb); 303 unregister_die_notifier(&profile_exceptions_nb);
304 msrs = &get_cpu_var(cpu_msrs);
277 model->shutdown(msrs); 305 model->shutdown(msrs);
278 free_msrs(); 306 free_msrs();
279 put_cpu_var(cpu_msrs); 307 put_cpu_var(cpu_msrs);
@@ -468,6 +496,9 @@ int __init op_nmi_init(struct oprofile_operations *ops)
468 return -ENODEV; 496 return -ENODEV;
469 } 497 }
470 498
499#ifdef CONFIG_SMP
500 register_cpu_notifier(&oprofile_cpu_nb);
501#endif
471 /* default values, can be overwritten by model */ 502 /* default values, can be overwritten by model */
472 ops->create_files = nmi_create_files; 503 ops->create_files = nmi_create_files;
473 ops->setup = nmi_setup; 504 ops->setup = nmi_setup;
@@ -489,8 +520,12 @@ int __init op_nmi_init(struct oprofile_operations *ops)
489 520
490void op_nmi_exit(void) 521void op_nmi_exit(void)
491{ 522{
492 if (using_nmi) 523 if (using_nmi) {
493 exit_sysfs(); 524 exit_sysfs();
525#ifdef CONFIG_SMP
526 unregister_cpu_notifier(&oprofile_cpu_nb);
527#endif
528 }
494 if (model->exit) 529 if (model->exit)
495 model->exit(); 530 model->exit();
496} 531}
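
The nmi_int.c changes register a CPU hotplug notifier so the profiling
counters are started on a CPU that comes online (or whose offlining failed)
and stopped on a CPU about to go down; suspend/resume call
nmi_cpu_stop()/nmi_cpu_start() directly because only one CPU is left at that
point, and nmi_shutdown() now takes the per-CPU cpu_msrs reference only after
the cross-CPU shutdown. A compact sketch of the hotplug-notifier pattern the
patch follows (every my_* name is illustrative; the notifier and IPI calls
are the real 2008-era APIs and, as in the patch, only make sense under
CONFIG_SMP):

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>

static void my_cpu_start(void *dummy)
{
	/* program and start the counters on the local CPU */
}

static void my_cpu_stop(void *dummy)
{
	/* stop the counters on the local CPU */
}

static int my_cpu_notifier(struct notifier_block *b, unsigned long action,
			   void *data)
{
	int cpu = (unsigned long)data;

	switch (action) {
	case CPU_DOWN_FAILED:
	case CPU_ONLINE:
		/* CPU is (back) online: fire and forget */
		smp_call_function_single(cpu, my_cpu_start, NULL, 0);
		break;
	case CPU_DOWN_PREPARE:
		/* CPU is about to go away: wait until it has stopped */
		smp_call_function_single(cpu, my_cpu_stop, NULL, 1);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block my_cpu_nb = {
	.notifier_call = my_cpu_notifier,
};

/* paired in the init/exit paths, as op_nmi_init()/op_nmi_exit() do:
 *	register_cpu_notifier(&my_cpu_nb);
 *	...
 *	unregister_cpu_notifier(&my_cpu_nb);
 */
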
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 56b4757a1f47..43ac5af338d8 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -10,11 +10,12 @@
10 10
11#include <linux/oprofile.h> 11#include <linux/oprofile.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/ptrace.h>
14#include <linux/nmi.h>
13#include <asm/msr.h> 15#include <asm/msr.h>
14#include <asm/ptrace.h>
15#include <asm/fixmap.h> 16#include <asm/fixmap.h>
16#include <asm/apic.h> 17#include <asm/apic.h>
17#include <asm/nmi.h> 18
18 19
19#include "op_x86_model.h" 20#include "op_x86_model.h"
20#include "op_counter.h" 21#include "op_counter.h"
@@ -40,7 +41,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT;
40static inline void setup_num_counters(void) 41static inline void setup_num_counters(void)
41{ 42{
42#ifdef CONFIG_SMP 43#ifdef CONFIG_SMP
43 if (smp_num_siblings == 2){ 44 if (smp_num_siblings == 2) {
44 num_counters = NUM_COUNTERS_HT2; 45 num_counters = NUM_COUNTERS_HT2;
45 num_controls = NUM_CONTROLS_HT2; 46 num_controls = NUM_CONTROLS_HT2;
46 } 47 }
@@ -86,7 +87,7 @@ struct p4_event_binding {
86#define CTR_FLAME_2 (1 << 6) 87#define CTR_FLAME_2 (1 << 6)
87#define CTR_IQ_5 (1 << 7) 88#define CTR_IQ_5 (1 << 7)
88 89
89static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { 90static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
90 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, 91 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
91 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, 92 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
92 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, 93 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
@@ -97,32 +98,32 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
97 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } 98 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
98}; 99};
99 100
100#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT 101#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
101 102
102/* p4 event codes in libop/op_event.h are indices into this table. */ 103/* p4 event codes in libop/op_event.h are indices into this table. */
103 104
104static struct p4_event_binding p4_events[NUM_EVENTS] = { 105static struct p4_event_binding p4_events[NUM_EVENTS] = {
105 106
106 { /* BRANCH_RETIRED */ 107 { /* BRANCH_RETIRED */
107 0x05, 0x06, 108 0x05, 0x06,
108 { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, 109 { {CTR_IQ_4, MSR_P4_CRU_ESCR2},
109 {CTR_IQ_5, MSR_P4_CRU_ESCR3} } 110 {CTR_IQ_5, MSR_P4_CRU_ESCR3} }
110 }, 111 },
111 112
112 { /* MISPRED_BRANCH_RETIRED */ 113 { /* MISPRED_BRANCH_RETIRED */
113 0x04, 0x03, 114 0x04, 0x03,
114 { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, 115 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
115 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 116 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
116 }, 117 },
117 118
118 { /* TC_DELIVER_MODE */ 119 { /* TC_DELIVER_MODE */
119 0x01, 0x01, 120 0x01, 0x01,
120 { { CTR_MS_0, MSR_P4_TC_ESCR0}, 121 { { CTR_MS_0, MSR_P4_TC_ESCR0},
121 { CTR_MS_2, MSR_P4_TC_ESCR1} } 122 { CTR_MS_2, MSR_P4_TC_ESCR1} }
122 }, 123 },
123 124
124 { /* BPU_FETCH_REQUEST */ 125 { /* BPU_FETCH_REQUEST */
125 0x00, 0x03, 126 0x00, 0x03,
126 { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, 127 { { CTR_BPU_0, MSR_P4_BPU_ESCR0},
127 { CTR_BPU_2, MSR_P4_BPU_ESCR1} } 128 { CTR_BPU_2, MSR_P4_BPU_ESCR1} }
128 }, 129 },
@@ -146,7 +147,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
146 }, 147 },
147 148
148 { /* LOAD_PORT_REPLAY */ 149 { /* LOAD_PORT_REPLAY */
149 0x02, 0x04, 150 0x02, 0x04,
150 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, 151 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
151 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } 152 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
152 }, 153 },
@@ -170,43 +171,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
170 }, 171 },
171 172
172 { /* BSQ_CACHE_REFERENCE */ 173 { /* BSQ_CACHE_REFERENCE */
173 0x07, 0x0c, 174 0x07, 0x0c,
174 { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, 175 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
175 { CTR_BPU_2, MSR_P4_BSU_ESCR1} } 176 { CTR_BPU_2, MSR_P4_BSU_ESCR1} }
176 }, 177 },
177 178
178 { /* IOQ_ALLOCATION */ 179 { /* IOQ_ALLOCATION */
179 0x06, 0x03, 180 0x06, 0x03,
180 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 181 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
181 { 0, 0 } } 182 { 0, 0 } }
182 }, 183 },
183 184
184 { /* IOQ_ACTIVE_ENTRIES */ 185 { /* IOQ_ACTIVE_ENTRIES */
185 0x06, 0x1a, 186 0x06, 0x1a,
186 { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, 187 { { CTR_BPU_2, MSR_P4_FSB_ESCR1},
187 { 0, 0 } } 188 { 0, 0 } }
188 }, 189 },
189 190
190 { /* FSB_DATA_ACTIVITY */ 191 { /* FSB_DATA_ACTIVITY */
191 0x06, 0x17, 192 0x06, 0x17,
192 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 193 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
193 { CTR_BPU_2, MSR_P4_FSB_ESCR1} } 194 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
194 }, 195 },
195 196
196 { /* BSQ_ALLOCATION */ 197 { /* BSQ_ALLOCATION */
197 0x07, 0x05, 198 0x07, 0x05,
198 { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, 199 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
199 { 0, 0 } } 200 { 0, 0 } }
200 }, 201 },
201 202
202 { /* BSQ_ACTIVE_ENTRIES */ 203 { /* BSQ_ACTIVE_ENTRIES */
203 0x07, 0x06, 204 0x07, 0x06,
204 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, 205 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
205 { 0, 0 } } 206 { 0, 0 } }
206 }, 207 },
207 208
208 { /* X87_ASSIST */ 209 { /* X87_ASSIST */
209 0x05, 0x03, 210 0x05, 0x03,
210 { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, 211 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
211 { CTR_IQ_5, MSR_P4_CRU_ESCR3} } 212 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
212 }, 213 },
@@ -216,21 +217,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
216 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 217 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
217 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 218 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
218 }, 219 },
219 220
220 { /* PACKED_SP_UOP */ 221 { /* PACKED_SP_UOP */
221 0x01, 0x08, 222 0x01, 0x08,
222 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 223 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
223 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 224 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
224 }, 225 },
225 226
226 { /* PACKED_DP_UOP */ 227 { /* PACKED_DP_UOP */
227 0x01, 0x0c, 228 0x01, 0x0c,
228 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 229 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
229 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 230 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
230 }, 231 },
231 232
232 { /* SCALAR_SP_UOP */ 233 { /* SCALAR_SP_UOP */
233 0x01, 0x0a, 234 0x01, 0x0a,
234 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 235 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
235 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 236 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
236 }, 237 },
@@ -242,31 +243,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
242 }, 243 },
243 244
244 { /* 64BIT_MMX_UOP */ 245 { /* 64BIT_MMX_UOP */
245 0x01, 0x02, 246 0x01, 0x02,
246 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 247 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
247 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 248 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
248 }, 249 },
249 250
250 { /* 128BIT_MMX_UOP */ 251 { /* 128BIT_MMX_UOP */
251 0x01, 0x1a, 252 0x01, 0x1a,
252 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 253 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
253 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 254 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
254 }, 255 },
255 256
256 { /* X87_FP_UOP */ 257 { /* X87_FP_UOP */
257 0x01, 0x04, 258 0x01, 0x04,
258 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 259 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
259 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 260 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
260 }, 261 },
261 262
262 { /* X87_SIMD_MOVES_UOP */ 263 { /* X87_SIMD_MOVES_UOP */
263 0x01, 0x2e, 264 0x01, 0x2e,
264 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 265 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
265 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 266 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
266 }, 267 },
267 268
268 { /* MACHINE_CLEAR */ 269 { /* MACHINE_CLEAR */
269 0x05, 0x02, 270 0x05, 0x02,
270 { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, 271 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
271 { CTR_IQ_5, MSR_P4_CRU_ESCR3} } 272 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
272 }, 273 },
@@ -276,9 +277,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
276 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 277 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
277 { CTR_BPU_2, MSR_P4_FSB_ESCR1} } 278 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
278 }, 279 },
279 280
280 { /* TC_MS_XFER */ 281 { /* TC_MS_XFER */
281 0x00, 0x05, 282 0x00, 0x05,
282 { { CTR_MS_0, MSR_P4_MS_ESCR0}, 283 { { CTR_MS_0, MSR_P4_MS_ESCR0},
283 { CTR_MS_2, MSR_P4_MS_ESCR1} } 284 { CTR_MS_2, MSR_P4_MS_ESCR1} }
284 }, 285 },
@@ -308,7 +309,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
308 }, 309 },
309 310
310 { /* INSTR_RETIRED */ 311 { /* INSTR_RETIRED */
311 0x04, 0x02, 312 0x04, 0x02,
312 { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, 313 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
313 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 314 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
314 }, 315 },
@@ -319,14 +320,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
319 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 320 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
320 }, 321 },
321 322
322 { /* UOP_TYPE */ 323 { /* UOP_TYPE */
323 0x02, 0x02, 324 0x02, 0x02,
324 { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, 325 { { CTR_IQ_4, MSR_P4_RAT_ESCR0},
325 { CTR_IQ_5, MSR_P4_RAT_ESCR1} } 326 { CTR_IQ_5, MSR_P4_RAT_ESCR1} }
326 }, 327 },
327 328
328 { /* RETIRED_MISPRED_BRANCH_TYPE */ 329 { /* RETIRED_MISPRED_BRANCH_TYPE */
329 0x02, 0x05, 330 0x02, 0x05,
330 { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, 331 { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
331 { CTR_MS_2, MSR_P4_TBPU_ESCR1} } 332 { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
332 }, 333 },
@@ -349,8 +350,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
349#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) 350#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
350#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) 351#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
351#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) 352#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
352#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) 353#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
353#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) 354#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
354 355
355#define CCCR_RESERVED_BITS 0x38030FFF 356#define CCCR_RESERVED_BITS 0x38030FFF
356#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) 357#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -360,15 +361,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
360#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) 361#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
361#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) 362#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
362#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) 363#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
363#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) 364#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
364#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) 365#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
365#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) 366#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
366#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) 367#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
367 368
368#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) 369#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
369#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) 370#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
370#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0) 371#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
371#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0) 372#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
372#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) 373#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
373 374
374 375
@@ -380,7 +381,7 @@ static unsigned int get_stagger(void)
380#ifdef CONFIG_SMP 381#ifdef CONFIG_SMP
381 int cpu = smp_processor_id(); 382 int cpu = smp_processor_id();
382 return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); 383 return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu)));
383#endif 384#endif
384 return 0; 385 return 0;
385} 386}
386 387
@@ -395,25 +396,23 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT];
395 396
396static void p4_fill_in_addresses(struct op_msrs * const msrs) 397static void p4_fill_in_addresses(struct op_msrs * const msrs)
397{ 398{
398 unsigned int i; 399 unsigned int i;
399 unsigned int addr, cccraddr, stag; 400 unsigned int addr, cccraddr, stag;
400 401
401 setup_num_counters(); 402 setup_num_counters();
402 stag = get_stagger(); 403 stag = get_stagger();
403 404
404 /* initialize some registers */ 405 /* initialize some registers */
405 for (i = 0; i < num_counters; ++i) { 406 for (i = 0; i < num_counters; ++i)
406 msrs->counters[i].addr = 0; 407 msrs->counters[i].addr = 0;
407 } 408 for (i = 0; i < num_controls; ++i)
408 for (i = 0; i < num_controls; ++i) {
409 msrs->controls[i].addr = 0; 409 msrs->controls[i].addr = 0;
410 } 410
411
412 /* the counter & cccr registers we pay attention to */ 411 /* the counter & cccr registers we pay attention to */
413 for (i = 0; i < num_counters; ++i) { 412 for (i = 0; i < num_counters; ++i) {
414 addr = p4_counters[VIRT_CTR(stag, i)].counter_address; 413 addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
415 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; 414 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
416 if (reserve_perfctr_nmi(addr)){ 415 if (reserve_perfctr_nmi(addr)) {
417 msrs->counters[i].addr = addr; 416 msrs->counters[i].addr = addr;
418 msrs->controls[i].addr = cccraddr; 417 msrs->controls[i].addr = cccraddr;
419 } 418 }
@@ -447,22 +446,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
447 if (reserve_evntsel_nmi(addr)) 446 if (reserve_evntsel_nmi(addr))
448 msrs->controls[i].addr = addr; 447 msrs->controls[i].addr = addr;
449 } 448 }
450 449
451 for (addr = MSR_P4_MS_ESCR0 + stag; 450 for (addr = MSR_P4_MS_ESCR0 + stag;
452 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { 451 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
453 if (reserve_evntsel_nmi(addr)) 452 if (reserve_evntsel_nmi(addr))
454 msrs->controls[i].addr = addr; 453 msrs->controls[i].addr = addr;
455 } 454 }
456 455
457 for (addr = MSR_P4_IX_ESCR0 + stag; 456 for (addr = MSR_P4_IX_ESCR0 + stag;
458 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { 457 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
459 if (reserve_evntsel_nmi(addr)) 458 if (reserve_evntsel_nmi(addr))
460 msrs->controls[i].addr = addr; 459 msrs->controls[i].addr = addr;
461 } 460 }
462 461
463 /* there are 2 remaining non-contiguously located ESCRs */ 462 /* there are 2 remaining non-contiguously located ESCRs */
464 463
465 if (num_counters == NUM_COUNTERS_NON_HT) { 464 if (num_counters == NUM_COUNTERS_NON_HT) {
466 /* standard non-HT CPUs handle both remaining ESCRs*/ 465 /* standard non-HT CPUs handle both remaining ESCRs*/
467 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) 466 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
468 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; 467 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
@@ -498,20 +497,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
498 unsigned int stag; 497 unsigned int stag;
499 498
500 stag = get_stagger(); 499 stag = get_stagger();
501 500
502 /* convert from counter *number* to counter *bit* */ 501 /* convert from counter *number* to counter *bit* */
503 counter_bit = 1 << VIRT_CTR(stag, ctr); 502 counter_bit = 1 << VIRT_CTR(stag, ctr);
504 503
505 /* find our event binding structure. */ 504 /* find our event binding structure. */
506 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { 505 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
507 printk(KERN_ERR 506 printk(KERN_ERR
508 "oprofile: P4 event code 0x%lx out of range\n", 507 "oprofile: P4 event code 0x%lx out of range\n",
509 counter_config[ctr].event); 508 counter_config[ctr].event);
510 return; 509 return;
511 } 510 }
512 511
513 ev = &(p4_events[counter_config[ctr].event - 1]); 512 ev = &(p4_events[counter_config[ctr].event - 1]);
514 513
515 for (i = 0; i < maxbind; i++) { 514 for (i = 0; i < maxbind; i++) {
516 if (ev->bindings[i].virt_counter & counter_bit) { 515 if (ev->bindings[i].virt_counter & counter_bit) {
517 516
@@ -526,25 +525,24 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
526 ESCR_SET_OS_1(escr, counter_config[ctr].kernel); 525 ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
527 } 526 }
528 ESCR_SET_EVENT_SELECT(escr, ev->event_select); 527 ESCR_SET_EVENT_SELECT(escr, ev->event_select);
529 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); 528 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
530 ESCR_WRITE(escr, high, ev, i); 529 ESCR_WRITE(escr, high, ev, i);
531 530
532 /* modify CCCR */ 531 /* modify CCCR */
533 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); 532 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
534 CCCR_CLEAR(cccr); 533 CCCR_CLEAR(cccr);
535 CCCR_SET_REQUIRED_BITS(cccr); 534 CCCR_SET_REQUIRED_BITS(cccr);
536 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); 535 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
537 if (stag == 0) { 536 if (stag == 0)
538 CCCR_SET_PMI_OVF_0(cccr); 537 CCCR_SET_PMI_OVF_0(cccr);
539 } else { 538 else
540 CCCR_SET_PMI_OVF_1(cccr); 539 CCCR_SET_PMI_OVF_1(cccr);
541 }
542 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); 540 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
543 return; 541 return;
544 } 542 }
545 } 543 }
546 544
547 printk(KERN_ERR 545 printk(KERN_ERR
548 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", 546 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
549 counter_config[ctr].event, stag, ctr); 547 counter_config[ctr].event, stag, ctr);
550} 548}
@@ -559,14 +557,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
559 stag = get_stagger(); 557 stag = get_stagger();
560 558
561 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 559 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
562 if (! MISC_PMC_ENABLED_P(low)) { 560 if (!MISC_PMC_ENABLED_P(low)) {
563 printk(KERN_ERR "oprofile: P4 PMC not available\n"); 561 printk(KERN_ERR "oprofile: P4 PMC not available\n");
564 return; 562 return;
565 } 563 }
566 564
567 /* clear the cccrs we will use */ 565 /* clear the cccrs we will use */
568 for (i = 0 ; i < num_counters ; i++) { 566 for (i = 0 ; i < num_counters ; i++) {
569 if (unlikely(!CTRL_IS_RESERVED(msrs,i))) 567 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
570 continue; 568 continue;
571 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); 569 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
572 CCCR_CLEAR(low); 570 CCCR_CLEAR(low);
@@ -576,14 +574,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
576 574
577 /* clear all escrs (including those outside our concern) */ 575 /* clear all escrs (including those outside our concern) */
578 for (i = num_counters; i < num_controls; i++) { 576 for (i = num_counters; i < num_controls; i++) {
579 if (unlikely(!CTRL_IS_RESERVED(msrs,i))) 577 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
580 continue; 578 continue;
581 wrmsr(msrs->controls[i].addr, 0, 0); 579 wrmsr(msrs->controls[i].addr, 0, 0);
582 } 580 }
583 581
584 /* setup all counters */ 582 /* setup all counters */
585 for (i = 0 ; i < num_counters ; ++i) { 583 for (i = 0 ; i < num_counters ; ++i) {
586 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) { 584 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) {
587 reset_value[i] = counter_config[i].count; 585 reset_value[i] = counter_config[i].count;
588 pmc_setup_one_p4_counter(i); 586 pmc_setup_one_p4_counter(i);
589 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); 587 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
@@ -603,11 +601,11 @@ static int p4_check_ctrs(struct pt_regs * const regs,
603 stag = get_stagger(); 601 stag = get_stagger();
604 602
605 for (i = 0; i < num_counters; ++i) { 603 for (i = 0; i < num_counters; ++i) {
606 604
607 if (!reset_value[i]) 605 if (!reset_value[i])
608 continue; 606 continue;
609 607
610 /* 608 /*
611 * there is some eccentricity in the hardware which 609 * there is some eccentricity in the hardware which
612 * requires that we perform 2 extra corrections: 610 * requires that we perform 2 extra corrections:
613 * 611 *
@@ -616,24 +614,24 @@ static int p4_check_ctrs(struct pt_regs * const regs,
616 * 614 *
617 * - write the counter back twice to ensure it gets 615 * - write the counter back twice to ensure it gets
618 * updated properly. 616 * updated properly.
619 * 617 *
620 * the former seems to be related to extra NMIs happening 618 * the former seems to be related to extra NMIs happening
621 * during the current NMI; the latter is reported as errata 619 * during the current NMI; the latter is reported as errata
622 * N15 in intel doc 249199-029, pentium 4 specification 620 * N15 in intel doc 249199-029, pentium 4 specification
623 * update, though their suggested work-around does not 621 * update, though their suggested work-around does not
624 * appear to solve the problem. 622 * appear to solve the problem.
625 */ 623 */
626 624
627 real = VIRT_CTR(stag, i); 625 real = VIRT_CTR(stag, i);
628 626
629 CCCR_READ(low, high, real); 627 CCCR_READ(low, high, real);
630 CTR_READ(ctr, high, real); 628 CTR_READ(ctr, high, real);
631 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { 629 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
632 oprofile_add_sample(regs, i); 630 oprofile_add_sample(regs, i);
633 CTR_WRITE(reset_value[i], real); 631 CTR_WRITE(reset_value[i], real);
634 CCCR_CLEAR_OVF(low); 632 CCCR_CLEAR_OVF(low);
635 CCCR_WRITE(low, high, real); 633 CCCR_WRITE(low, high, real);
636 CTR_WRITE(reset_value[i], real); 634 CTR_WRITE(reset_value[i], real);
637 } 635 }
638 } 636 }
639 637
@@ -683,15 +681,16 @@ static void p4_shutdown(struct op_msrs const * const msrs)
683 int i; 681 int i;
684 682
685 for (i = 0 ; i < num_counters ; ++i) { 683 for (i = 0 ; i < num_counters ; ++i) {
686 if (CTR_IS_RESERVED(msrs,i)) 684 if (CTR_IS_RESERVED(msrs, i))
687 release_perfctr_nmi(msrs->counters[i].addr); 685 release_perfctr_nmi(msrs->counters[i].addr);
688 } 686 }
689 /* some of the control registers are specially reserved in 687 /*
688 * some of the control registers are specially reserved in
690 * conjunction with the counter registers (hence the starting offset). 689 * conjunction with the counter registers (hence the starting offset).
691 * This saves a few bits. 690 * This saves a few bits.
692 */ 691 */
693 for (i = num_counters ; i < num_controls ; ++i) { 692 for (i = num_counters ; i < num_controls ; ++i) {
694 if (CTRL_IS_RESERVED(msrs,i)) 693 if (CTRL_IS_RESERVED(msrs, i))
695 release_evntsel_nmi(msrs->controls[i].addr); 694 release_evntsel_nmi(msrs->controls[i].addr);
696 } 695 }
697} 696}
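
The p4_shutdown() path above only releases MSRs whose NMI-watchdog reservation actually succeeded during setup. For reference, a minimal sketch of that reserve/release pairing, assuming the <asm/nmi.h> helpers used in the hunk (reserve_evntsel_nmi() returns non-zero when the reservation is granted; the example_* names and bookkeeping flag are illustrative only, not part of the patch):

#include <asm/nmi.h>
#include <asm/msr-index.h>

static int escr5_reserved;	/* hypothetical bookkeeping flag */

static void example_setup(void)
{
	/* record the reservation so shutdown only releases what we own */
	escr5_reserved = reserve_evntsel_nmi(MSR_P4_CRU_ESCR5);
}

static void example_shutdown(void)
{
	if (escr5_reserved)
		release_evntsel_nmi(MSR_P4_CRU_ESCR5);
}
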
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 19af06927fbc..1d88d2b39771 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -250,10 +250,5 @@ int __init pci_acpi_init(void)
250 acpi_pci_irq_enable(dev); 250 acpi_pci_irq_enable(dev);
251 } 251 }
252 252
253#ifdef CONFIG_X86_IO_APIC
254 if (acpi_ioapic)
255 print_IO_APIC();
256#endif
257
258 return 0; 253 return 0;
259} 254}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index dbf532369711..22e057665e55 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -1,6 +1,7 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/topology.h> 3#include <linux/topology.h>
4#include <linux/cpu.h>
4#include "pci.h" 5#include "pci.h"
5 6
6#ifdef CONFIG_X86_64 7#ifdef CONFIG_X86_64
@@ -555,15 +556,17 @@ static int __init early_fill_mp_bus_info(void)
555 return 0; 556 return 0;
556} 557}
557 558
558postcore_initcall(early_fill_mp_bus_info); 559#else /* !CONFIG_X86_64 */
559 560
560#endif 561static int __init early_fill_mp_bus_info(void) { return 0; }
562
563#endif /* !CONFIG_X86_64 */
561 564
562/* common 32/64 bit code */ 565/* common 32/64 bit code */
563 566
564#define ENABLE_CF8_EXT_CFG (1ULL << 46) 567#define ENABLE_CF8_EXT_CFG (1ULL << 46)
565 568
566static void enable_pci_io_ecs_per_cpu(void *unused) 569static void enable_pci_io_ecs(void *unused)
567{ 570{
568 u64 reg; 571 u64 reg;
569 rdmsrl(MSR_AMD64_NB_CFG, reg); 572 rdmsrl(MSR_AMD64_NB_CFG, reg);
@@ -573,14 +576,51 @@ static void enable_pci_io_ecs_per_cpu(void *unused)
573 } 576 }
574} 577}
575 578
576static int __init enable_pci_io_ecs(void) 579static int __cpuinit amd_cpu_notify(struct notifier_block *self,
580 unsigned long action, void *hcpu)
577{ 581{
582 int cpu = (long)hcpu;
583 switch (action) {
584 case CPU_ONLINE:
585 case CPU_ONLINE_FROZEN:
586 smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0);
587 break;
588 default:
589 break;
590 }
591 return NOTIFY_OK;
592}
593
594static struct notifier_block __cpuinitdata amd_cpu_notifier = {
595 .notifier_call = amd_cpu_notify,
596};
597
598static int __init pci_io_ecs_init(void)
599{
600 int cpu;
601
578 /* assume all cpus from fam10h have IO ECS */ 602 /* assume all cpus from fam10h have IO ECS */
579 if (boot_cpu_data.x86 < 0x10) 603 if (boot_cpu_data.x86 < 0x10)
580 return 0; 604 return 0;
581 on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1); 605
606 register_cpu_notifier(&amd_cpu_notifier);
607 for_each_online_cpu(cpu)
608 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
609 (void *)(long)cpu);
582 pci_probe |= PCI_HAS_IO_ECS; 610 pci_probe |= PCI_HAS_IO_ECS;
611
612 return 0;
613}
614
615static int __init amd_postcore_init(void)
616{
617 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
618 return 0;
619
620 early_fill_mp_bus_info();
621 pci_io_ecs_init();
622
583 return 0; 623 return 0;
584} 624}
585 625
586postcore_initcall(enable_pci_io_ecs); 626postcore_initcall(amd_postcore_init);
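
The rework above replaces a one-shot on_each_cpu() call with a CPU hotplug notifier, so IO ECS is also enabled on CPUs that come online after boot (and after resume, via CPU_ONLINE_FROZEN). A stripped-down sketch of that notifier pattern as used in kernels of this era follows; the example_* names are hypothetical stand-ins for enable_pci_io_ecs()/pci_io_ecs_init():

#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>

static void example_percpu_setup(void *unused)
{
	/* per-CPU MSR setup would go here, cf. enable_pci_io_ecs() */
}

static int __cpuinit example_cpu_notify(struct notifier_block *self,
					unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		/* run the setup on the CPU that just came online */
		smp_call_function_single(cpu, example_percpu_setup, NULL, 0);
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata example_cpu_notifier = {
	.notifier_call = example_cpu_notify,
};

static int __init example_init(void)
{
	int cpu;

	register_cpu_notifier(&example_cpu_notifier);
	/* CPUs already online never generate CPU_ONLINE, so cover them here */
	for_each_online_cpu(cpu)
		example_cpu_notify(&example_cpu_notifier, CPU_ONLINE,
				   (void *)(long)cpu);
	return 0;
}
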
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index ff3a6a336342..4bdaa590375d 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -23,7 +23,8 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
23 pci_read_config_byte(d, reg++, &busno); 23 pci_read_config_byte(d, reg++, &busno);
24 pci_read_config_byte(d, reg++, &suba); 24 pci_read_config_byte(d, reg++, &suba);
25 pci_read_config_byte(d, reg++, &subb); 25 pci_read_config_byte(d, reg++, &subb);
26 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); 26 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno,
27 suba, subb);
27 if (busno) 28 if (busno)
28 pci_scan_bus_with_sysdata(busno); /* Bus A */ 29 pci_scan_bus_with_sysdata(busno); /* Bus A */
29 if (suba < subb) 30 if (suba < subb)
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index a09505806b82..844df0cbbd3e 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34 34
35#include <asm/pat.h> 35#include <asm/pat.h>
36#include <asm/e820.h>
36 37
37#include "pci.h" 38#include "pci.h"
38 39
@@ -128,10 +129,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
128 pr = pci_find_parent_resource(dev, r); 129 pr = pci_find_parent_resource(dev, r);
129 if (!r->start || !pr || 130 if (!r->start || !pr ||
130 request_resource(pr, r) < 0) { 131 request_resource(pr, r) < 0) {
131 printk(KERN_ERR "PCI: Cannot allocate " 132 dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
132 "resource region %d "
133 "of bridge %s\n",
134 idx, pci_name(dev));
135 /* 133 /*
136 * Something is wrong with the region. 134 * Something is wrong with the region.
137 * Invalidate the resource to prevent 135 * Invalidate the resource to prevent
@@ -166,15 +164,13 @@ static void __init pcibios_allocate_resources(int pass)
166 else 164 else
167 disabled = !(command & PCI_COMMAND_MEMORY); 165 disabled = !(command & PCI_COMMAND_MEMORY);
168 if (pass == disabled) { 166 if (pass == disabled) {
169 DBG("PCI: Resource %08lx-%08lx " 167 dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n",
170 "(f=%lx, d=%d, p=%d)\n", 168 (unsigned long long) r->start,
171 r->start, r->end, r->flags, disabled, pass); 169 (unsigned long long) r->end,
170 r->flags, disabled, pass);
172 pr = pci_find_parent_resource(dev, r); 171 pr = pci_find_parent_resource(dev, r);
173 if (!pr || request_resource(pr, r) < 0) { 172 if (!pr || request_resource(pr, r) < 0) {
174 printk(KERN_ERR "PCI: Cannot allocate " 173 dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
175 "resource region %d "
176 "of device %s\n",
177 idx, pci_name(dev));
178 /* We'll assign a new address later */ 174 /* We'll assign a new address later */
179 r->end -= r->start; 175 r->end -= r->start;
180 r->start = 0; 176 r->start = 0;
@@ -187,8 +183,7 @@ static void __init pcibios_allocate_resources(int pass)
187 /* Turn the ROM off, leave the resource region, 183 /* Turn the ROM off, leave the resource region,
188 * but keep it unregistered. */ 184 * but keep it unregistered. */
189 u32 reg; 185 u32 reg;
190 DBG("PCI: Switching off ROM of %s\n", 186 dev_dbg(&dev->dev, "disabling ROM\n");
191 pci_name(dev));
192 r->flags &= ~IORESOURCE_ROM_ENABLE; 187 r->flags &= ~IORESOURCE_ROM_ENABLE;
193 pci_read_config_dword(dev, 188 pci_read_config_dword(dev,
194 dev->rom_base_reg, &reg); 189 dev->rom_base_reg, &reg);
@@ -233,6 +228,8 @@ void __init pcibios_resource_survey(void)
233 pcibios_allocate_bus_resources(&pci_root_buses); 228 pcibios_allocate_bus_resources(&pci_root_buses);
234 pcibios_allocate_resources(0); 229 pcibios_allocate_resources(0);
235 pcibios_allocate_resources(1); 230 pcibios_allocate_resources(1);
231
232 e820_reserve_resources_late();
236} 233}
237 234
238/** 235/**
@@ -257,8 +254,7 @@ void pcibios_set_master(struct pci_dev *dev)
257 lat = pcibios_max_latency; 254 lat = pcibios_max_latency;
258 else 255 else
259 return; 256 return;
260 printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n", 257 dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
261 pci_name(dev), lat);
262 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); 258 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
263} 259}
264 260
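
The printk()/DBG() to dev_*() conversions in this file (and in fixup.c above and irq.c below) let the kernel prefix every message with the device's driver/bus name and address instead of open-coding pci_name(). Roughly, assuming a struct pci_dev *dev is in hand (example_report() is illustrative only):

#include <linux/device.h>
#include <linux/pci.h>

static void example_report(struct pci_dev *dev, int bar)
{
	/* each prints as "<driver or bus> <dddd:bb:dd.f>: ..." at the given level */
	dev_err(&dev->dev, "BAR %d: can't allocate resource\n", bar);
	dev_dbg(&dev->dev, "disabling ROM\n");
	dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", 64);
}
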
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 6a06a2eb0597..006599db0dc7 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -436,7 +436,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
436{ 436{
437 WARN_ON_ONCE(pirq >= 9); 437 WARN_ON_ONCE(pirq >= 9);
438 if (pirq > 8) { 438 if (pirq > 8) {
439 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 439 dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
440 return 0; 440 return 0;
441 } 441 }
442 return read_config_nybble(router, 0x74, pirq-1); 442 return read_config_nybble(router, 0x74, pirq-1);
@@ -446,7 +446,7 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
446{ 446{
447 WARN_ON_ONCE(pirq >= 9); 447 WARN_ON_ONCE(pirq >= 9);
448 if (pirq > 8) { 448 if (pirq > 8) {
449 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 449 dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
450 return 0; 450 return 0;
451 } 451 }
452 write_config_nybble(router, 0x74, pirq-1, irq); 452 write_config_nybble(router, 0x74, pirq-1, irq);
@@ -492,15 +492,17 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
492 irq = 0; 492 irq = 0;
493 if (pirq <= 4) 493 if (pirq <= 4)
494 irq = read_config_nybble(router, 0x56, pirq - 1); 494 irq = read_config_nybble(router, 0x56, pirq - 1);
495 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", 495 dev_info(&dev->dev,
496 dev->vendor, dev->device, pirq, irq); 496 "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
497 dev->vendor, dev->device, pirq, irq);
497 return irq; 498 return irq;
498} 499}
499 500
500static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 501static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
501{ 502{
502 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", 503 dev_info(&dev->dev,
503 dev->vendor, dev->device, pirq, irq); 504 "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
505 dev->vendor, dev->device, pirq, irq);
504 if (pirq <= 4) 506 if (pirq <= 4)
505 write_config_nybble(router, 0x56, pirq - 1, irq); 507 write_config_nybble(router, 0x56, pirq - 1, irq);
506 return 1; 508 return 1;
@@ -588,6 +590,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
588 case PCI_DEVICE_ID_INTEL_ICH10_1: 590 case PCI_DEVICE_ID_INTEL_ICH10_1:
589 case PCI_DEVICE_ID_INTEL_ICH10_2: 591 case PCI_DEVICE_ID_INTEL_ICH10_2:
590 case PCI_DEVICE_ID_INTEL_ICH10_3: 592 case PCI_DEVICE_ID_INTEL_ICH10_3:
593 case PCI_DEVICE_ID_INTEL_PCH_0:
594 case PCI_DEVICE_ID_INTEL_PCH_1:
591 r->name = "PIIX/ICH"; 595 r->name = "PIIX/ICH";
592 r->get = pirq_piix_get; 596 r->get = pirq_piix_get;
593 r->set = pirq_piix_set; 597 r->set = pirq_piix_set;
@@ -730,7 +734,6 @@ static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router,
730 switch (device) { 734 switch (device) {
731 case PCI_DEVICE_ID_AL_M1533: 735 case PCI_DEVICE_ID_AL_M1533:
732 case PCI_DEVICE_ID_AL_M1563: 736 case PCI_DEVICE_ID_AL_M1563:
733 printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
734 r->name = "ALI"; 737 r->name = "ALI";
735 r->get = pirq_ali_get; 738 r->get = pirq_ali_get;
736 r->set = pirq_ali_set; 739 r->set = pirq_ali_set;
@@ -840,11 +843,9 @@ static void __init pirq_find_router(struct irq_router *r)
840 h->probe(r, pirq_router_dev, pirq_router_dev->device)) 843 h->probe(r, pirq_router_dev, pirq_router_dev->device))
841 break; 844 break;
842 } 845 }
843 printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", 846 dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
844 pirq_router.name, 847 pirq_router.name,
845 pirq_router_dev->vendor, 848 pirq_router_dev->vendor, pirq_router_dev->device);
846 pirq_router_dev->device,
847 pci_name(pirq_router_dev));
848 849
849 /* The device remains referenced for the kernel lifetime */ 850 /* The device remains referenced for the kernel lifetime */
850} 851}
@@ -877,7 +878,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
877 /* Find IRQ pin */ 878 /* Find IRQ pin */
878 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 879 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
879 if (!pin) { 880 if (!pin) {
880 DBG(KERN_DEBUG " -> no interrupt pin\n"); 881 dev_dbg(&dev->dev, "no interrupt pin\n");
881 return 0; 882 return 0;
882 } 883 }
883 pin = pin - 1; 884 pin = pin - 1;
@@ -887,20 +888,20 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
887 if (!pirq_table) 888 if (!pirq_table)
888 return 0; 889 return 0;
889 890
890 DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
891 info = pirq_get_info(dev); 891 info = pirq_get_info(dev);
892 if (!info) { 892 if (!info) {
893 DBG(" -> not found in routing table\n" KERN_DEBUG); 893 dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
894 'A' + pin);
894 return 0; 895 return 0;
895 } 896 }
896 pirq = info->irq[pin].link; 897 pirq = info->irq[pin].link;
897 mask = info->irq[pin].bitmap; 898 mask = info->irq[pin].bitmap;
898 if (!pirq) { 899 if (!pirq) {
899 DBG(" -> not routed\n" KERN_DEBUG); 900 dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
900 return 0; 901 return 0;
901 } 902 }
902 DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, 903 dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
903 pirq_table->exclusive_irqs); 904 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
904 mask &= pcibios_irq_mask; 905 mask &= pcibios_irq_mask;
905 906
906 /* Work around broken HP Pavilion Notebooks which assign USB to 907 /* Work around broken HP Pavilion Notebooks which assign USB to
@@ -930,10 +931,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
930 if (pci_probe & PCI_USE_PIRQ_MASK) 931 if (pci_probe & PCI_USE_PIRQ_MASK)
931 newirq = 0; 932 newirq = 0;
932 else 933 else
933 printk("\n" KERN_WARNING 934 dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
934 "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n" 935 "%#x; try pci=usepirqmask\n", newirq, mask);
935 KERN_DEBUG, newirq,
936 pci_name(dev));
937 } 936 }
938 if (!newirq && assign) { 937 if (!newirq && assign) {
939 for (i = 0; i < 16; i++) { 938 for (i = 0; i < 16; i++) {
@@ -944,39 +943,35 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
944 newirq = i; 943 newirq = i;
945 } 944 }
946 } 945 }
947 DBG(" -> newirq=%d", newirq); 946 dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
948 947
949 /* Check if it is hardcoded */ 948 /* Check if it is hardcoded */
950 if ((pirq & 0xf0) == 0xf0) { 949 if ((pirq & 0xf0) == 0xf0) {
951 irq = pirq & 0xf; 950 irq = pirq & 0xf;
952 DBG(" -> hardcoded IRQ %d\n", irq); 951 msg = "hardcoded";
953 msg = "Hardcoded";
954 } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ 952 } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
955 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { 953 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
956 DBG(" -> got IRQ %d\n", irq); 954 msg = "found";
957 msg = "Found";
958 eisa_set_level_irq(irq); 955 eisa_set_level_irq(irq);
959 } else if (newirq && r->set && 956 } else if (newirq && r->set &&
960 (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { 957 (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
961 DBG(" -> assigning IRQ %d", newirq);
962 if (r->set(pirq_router_dev, dev, pirq, newirq)) { 958 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
963 eisa_set_level_irq(newirq); 959 eisa_set_level_irq(newirq);
964 DBG(" ... OK\n"); 960 msg = "assigned";
965 msg = "Assigned";
966 irq = newirq; 961 irq = newirq;
967 } 962 }
968 } 963 }
969 964
970 if (!irq) { 965 if (!irq) {
971 DBG(" ... failed\n");
972 if (newirq && mask == (1 << newirq)) { 966 if (newirq && mask == (1 << newirq)) {
973 msg = "Guessed"; 967 msg = "guessed";
974 irq = newirq; 968 irq = newirq;
975 } else 969 } else {
970 dev_dbg(&dev->dev, "can't route interrupt\n");
976 return 0; 971 return 0;
972 }
977 } 973 }
978 printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, 974 dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
979 pci_name(dev));
980 975
981 /* Update IRQ for all devices with the same pirq value */ 976 /* Update IRQ for all devices with the same pirq value */
982 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { 977 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
@@ -996,17 +991,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
996 (!(pci_probe & PCI_USE_PIRQ_MASK) || \ 991 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
997 ((1 << dev2->irq) & mask))) { 992 ((1 << dev2->irq) & mask))) {
998#ifndef CONFIG_PCI_MSI 993#ifndef CONFIG_PCI_MSI
999 printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", 994 dev_info(&dev2->dev, "IRQ routing conflict: "
1000 pci_name(dev2), dev2->irq, irq); 995 "have IRQ %d, want IRQ %d\n",
996 dev2->irq, irq);
1001#endif 997#endif
1002 continue; 998 continue;
1003 } 999 }
1004 dev2->irq = irq; 1000 dev2->irq = irq;
1005 pirq_penalty[irq]++; 1001 pirq_penalty[irq]++;
1006 if (dev != dev2) 1002 if (dev != dev2)
1007 printk(KERN_INFO 1003 dev_info(&dev->dev, "sharing IRQ %d with %s\n",
1008 "PCI: Sharing IRQ %d with %s\n", 1004 irq, pci_name(dev2));
1009 irq, pci_name(dev2));
1010 } 1005 }
1011 } 1006 }
1012 return 1; 1007 return 1;
@@ -1025,8 +1020,7 @@ static void __init pcibios_fixup_irqs(void)
1025 * already in use. 1020 * already in use.
1026 */ 1021 */
1027 if (dev->irq >= 16) { 1022 if (dev->irq >= 16) {
1028 DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", 1023 dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
1029 pci_name(dev), dev->irq);
1030 dev->irq = 0; 1024 dev->irq = 0;
1031 } 1025 }
1032 /* 1026 /*
@@ -1049,35 +1043,44 @@ static void __init pcibios_fixup_irqs(void)
1049 if (io_apic_assign_pci_irqs) { 1043 if (io_apic_assign_pci_irqs) {
1050 int irq; 1044 int irq;
1051 1045
1052 if (pin) { 1046 if (!pin)
1053 /* 1047 continue;
1054 * interrupt pins are numbered starting 1048
1055 * from 1 1049 /*
1056 */ 1050 * interrupt pins are numbered starting from 1
1057 pin--; 1051 */
1058 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, 1052 pin--;
1059 PCI_SLOT(dev->devfn), pin); 1053 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1060 /* 1054 PCI_SLOT(dev->devfn), pin);
1061 * Busses behind bridges are typically not listed in the MP-table. 1055 /*
1062 * In this case we have to look up the IRQ based on the parent bus, 1056 * Busses behind bridges are typically not listed in the
1063 * parent slot, and pin number. The SMP code detects such bridged 1057 * MP-table. In this case we have to look up the IRQ
1064 * busses itself so we should get into this branch reliably. 1058 * based on the parent bus, parent slot, and pin number.
1065 */ 1059 * The SMP code detects such bridged busses itself so we
1066 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ 1060 * should get into this branch reliably.
1067 struct pci_dev *bridge = dev->bus->self; 1061 */
1068 1062 if (irq < 0 && dev->bus->parent) {
1069 pin = (pin + PCI_SLOT(dev->devfn)) % 4; 1063 /* go back to the bridge */
1070 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1064 struct pci_dev *bridge = dev->bus->self;
1071 PCI_SLOT(bridge->devfn), pin); 1065 int bus;
1072 if (irq >= 0) 1066
1073 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", 1067 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
1074 pci_name(bridge), 'A' + pin, irq); 1068 bus = bridge->bus->number;
1075 } 1069 irq = IO_APIC_get_PCI_irq_vector(bus,
1076 if (irq >= 0) { 1070 PCI_SLOT(bridge->devfn), pin);
1077 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1071 if (irq >= 0)
1078 pci_name(dev), 'A' + pin, irq); 1072 dev_warn(&dev->dev,
1079 dev->irq = irq; 1073 "using bridge %s INT %c to "
1080 } 1074 "get IRQ %d\n",
1075 pci_name(bridge),
1076 'A' + pin, irq);
1077 }
1078 if (irq >= 0) {
1079 dev_info(&dev->dev,
1080 "PCI->APIC IRQ transform: INT %c "
1081 "-> IRQ %d\n",
1082 'A' + pin, irq);
1083 dev->irq = irq;
1081 } 1084 }
1082 } 1085 }
1083#endif 1086#endif
@@ -1231,25 +1234,24 @@ static int pirq_enable_irq(struct pci_dev *dev)
1231 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1234 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1232 PCI_SLOT(bridge->devfn), pin); 1235 PCI_SLOT(bridge->devfn), pin);
1233 if (irq >= 0) 1236 if (irq >= 0)
1234 printk(KERN_WARNING 1237 dev_warn(&dev->dev, "using bridge %s "
1235 "PCI: using PPB %s[%c] to get irq %d\n", 1238 "INT %c to get IRQ %d\n",
1236 pci_name(bridge), 1239 pci_name(bridge), 'A' + pin,
1237 'A' + pin, irq); 1240 irq);
1238 dev = bridge; 1241 dev = bridge;
1239 } 1242 }
1240 dev = temp_dev; 1243 dev = temp_dev;
1241 if (irq >= 0) { 1244 if (irq >= 0) {
1242 printk(KERN_INFO 1245 dev_info(&dev->dev, "PCI->APIC IRQ transform: "
1243 "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1246 "INT %c -> IRQ %d\n", 'A' + pin, irq);
1244 pci_name(dev), 'A' + pin, irq);
1245 dev->irq = irq; 1247 dev->irq = irq;
1246 return 0; 1248 return 0;
1247 } else 1249 } else
1248 msg = " Probably buggy MP table."; 1250 msg = "; probably buggy MP table";
1249 } else if (pci_probe & PCI_BIOS_IRQ_SCAN) 1251 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
1250 msg = ""; 1252 msg = "";
1251 else 1253 else
1252 msg = " Please try using pci=biosirq."; 1254 msg = "; please try using pci=biosirq";
1253 1255
1254 /* 1256 /*
1255 * With IDE legacy devices the IRQ lookup failure is not 1257 * With IDE legacy devices the IRQ lookup failure is not
@@ -1259,9 +1261,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
1259 !(dev->class & 0x5)) 1261 !(dev->class & 0x5))
1260 return 0; 1262 return 0;
1261 1263
1262 printk(KERN_WARNING 1264 dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
1263 "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", 1265 'A' + pin, msg);
1264 'A' + pin, pci_name(dev), msg);
1265 } 1266 }
1266 return 0; 1267 return 0;
1267} 1268}
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index ec9ce35e44d6..b722dd481b39 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -14,7 +14,7 @@ static void __devinit pcibios_fixup_peer_bridges(void)
14 int n, devfn; 14 int n, devfn;
15 long node; 15 long node;
16 16
17 if (pcibios_last_bus <= 0 || pcibios_last_bus >= 0xff) 17 if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff)
18 return; 18 return;
19 DBG("PCI: Peer bridge fixup\n"); 19 DBG("PCI: Peer bridge fixup\n");
20 20
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 23faaa890ffc..654a2234f8f3 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -209,7 +209,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
209 return name != NULL; 209 return name != NULL;
210} 210}
211 211
212static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) 212static void __init pci_mmcfg_insert_resources(void)
213{ 213{
214#define PCI_MMCFG_RESOURCE_NAME_LEN 19 214#define PCI_MMCFG_RESOURCE_NAME_LEN 19
215 int i; 215 int i;
@@ -233,7 +233,7 @@ static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
233 cfg->pci_segment); 233 cfg->pci_segment);
234 res->start = cfg->address; 234 res->start = cfg->address;
235 res->end = res->start + (num_buses << 20) - 1; 235 res->end = res->start + (num_buses << 20) - 1;
236 res->flags = IORESOURCE_MEM | resource_flags; 236 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
237 insert_resource(&iomem_resource, res); 237 insert_resource(&iomem_resource, res);
238 names += PCI_MMCFG_RESOURCE_NAME_LEN; 238 names += PCI_MMCFG_RESOURCE_NAME_LEN;
239 } 239 }
@@ -293,7 +293,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl,
293 return AE_OK; 293 return AE_OK;
294} 294}
295 295
296static int __init is_acpi_reserved(unsigned long start, unsigned long end) 296static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
297{ 297{
298 struct resource mcfg_res; 298 struct resource mcfg_res;
299 299
@@ -310,6 +310,41 @@ static int __init is_acpi_reserved(unsigned long start, unsigned long end)
310 return mcfg_res.flags; 310 return mcfg_res.flags;
311} 311}
312 312
313typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
314
315static int __init is_mmconf_reserved(check_reserved_t is_reserved,
316 u64 addr, u64 size, int i,
317 typeof(pci_mmcfg_config[0]) *cfg, int with_e820)
318{
319 u64 old_size = size;
320 int valid = 0;
321
322 while (!is_reserved(addr, addr + size - 1, E820_RESERVED)) {
323 size >>= 1;
324 if (size < (16UL<<20))
325 break;
326 }
327
328 if (size >= (16UL<<20) || size == old_size) {
329 printk(KERN_NOTICE
330 "PCI: MCFG area at %Lx reserved in %s\n",
331 addr, with_e820?"E820":"ACPI motherboard resources");
332 valid = 1;
333
334 if (old_size != size) {
335 /* update end_bus_number */
336 cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1);
337 printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx "
338 "segment %hu buses %u - %u\n",
339 i, (unsigned long)cfg->address, cfg->pci_segment,
340 (unsigned int)cfg->start_bus_number,
341 (unsigned int)cfg->end_bus_number);
342 }
343 }
344
345 return valid;
346}
347
313static void __init pci_mmcfg_reject_broken(int early) 348static void __init pci_mmcfg_reject_broken(int early)
314{ 349{
315 typeof(pci_mmcfg_config[0]) *cfg; 350 typeof(pci_mmcfg_config[0]) *cfg;
@@ -324,21 +359,22 @@ static void __init pci_mmcfg_reject_broken(int early)
324 359
325 for (i = 0; i < pci_mmcfg_config_num; i++) { 360 for (i = 0; i < pci_mmcfg_config_num; i++) {
326 int valid = 0; 361 int valid = 0;
327 u32 size = (cfg->end_bus_number + 1) << 20; 362 u64 addr, size;
363
328 cfg = &pci_mmcfg_config[i]; 364 cfg = &pci_mmcfg_config[i];
365 addr = cfg->start_bus_number;
366 addr <<= 20;
367 addr += cfg->address;
368 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
369 size <<= 20;
329 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " 370 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx "
330 "segment %hu buses %u - %u\n", 371 "segment %hu buses %u - %u\n",
331 i, (unsigned long)cfg->address, cfg->pci_segment, 372 i, (unsigned long)cfg->address, cfg->pci_segment,
332 (unsigned int)cfg->start_bus_number, 373 (unsigned int)cfg->start_bus_number,
333 (unsigned int)cfg->end_bus_number); 374 (unsigned int)cfg->end_bus_number);
334 375
335 if (!early && 376 if (!early)
336 is_acpi_reserved(cfg->address, cfg->address + size - 1)) { 377 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0);
337 printk(KERN_NOTICE "PCI: MCFG area at %Lx reserved "
338 "in ACPI motherboard resources\n",
339 cfg->address);
340 valid = 1;
341 }
342 378
343 if (valid) 379 if (valid)
344 continue; 380 continue;
@@ -347,16 +383,11 @@ static void __init pci_mmcfg_reject_broken(int early)
347 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" 383 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not"
348 " reserved in ACPI motherboard resources\n", 384 " reserved in ACPI motherboard resources\n",
349 cfg->address); 385 cfg->address);
386
350 /* Don't try to do this check unless configuration 387 /* Don't try to do this check unless configuration
351 type 1 is available. how about type 2 ?*/ 388 type 1 is available. how about type 2 ?*/
352 if (raw_pci_ops && e820_all_mapped(cfg->address, 389 if (raw_pci_ops)
353 cfg->address + size - 1, 390 valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1);
354 E820_RESERVED)) {
355 printk(KERN_NOTICE
356 "PCI: MCFG area at %Lx reserved in E820\n",
357 cfg->address);
358 valid = 1;
359 }
360 391
361 if (!valid) 392 if (!valid)
362 goto reject; 393 goto reject;
@@ -365,7 +396,7 @@ static void __init pci_mmcfg_reject_broken(int early)
365 return; 396 return;
366 397
367reject: 398reject:
368 printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); 399 printk(KERN_INFO "PCI: Not using MMCONFIG.\n");
369 pci_mmcfg_arch_free(); 400 pci_mmcfg_arch_free();
370 kfree(pci_mmcfg_config); 401 kfree(pci_mmcfg_config);
371 pci_mmcfg_config = NULL; 402 pci_mmcfg_config = NULL;
@@ -403,11 +434,9 @@ static void __init __pci_mmcfg_init(int early)
403 (pci_mmcfg_config[0].address == 0)) 434 (pci_mmcfg_config[0].address == 0))
404 return; 435 return;
405 436
406 if (pci_mmcfg_arch_init()) { 437 if (pci_mmcfg_arch_init())
407 if (known_bridge)
408 pci_mmcfg_insert_resources(IORESOURCE_BUSY);
409 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 438 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
410 } else { 439 else {
411 /* 440 /*
412 * Signal not to attempt to insert mmcfg resources because 441 * Signal not to attempt to insert mmcfg resources because
413 * the architecture mmcfg setup could not initialize. 442 * the architecture mmcfg setup could not initialize.
@@ -444,7 +473,7 @@ static int __init pci_mmcfg_late_insert_resources(void)
444 * marked so it won't cause request errors when __request_region is 473 * marked so it won't cause request errors when __request_region is
445 * called. 474 * called.
446 */ 475 */
447 pci_mmcfg_insert_resources(0); 476 pci_mmcfg_insert_resources();
448 477
449 return 0; 478 return 0;
450} 479}
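
A short worked example of the new is_mmconf_reserved() halving logic above: an MCFG entry covering buses 0-255 claims 256 buses x 1 MB = 256 MB starting at cfg->address. If only the first 64 MB of that range turns out to be reserved (in the ACPI motherboard resources, or in the E820 map for the type-1 fallback), the loop halves the size 256 MB -> 128 MB -> 64 MB, accepts the region because 64 MB is still above the 16 MB floor, and rewrites end_bus_number to start_bus_number + 63 so MMCONFIG is only used for the buses that are actually covered.
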
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index f4b16dc11dad..1177845d3186 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -131,13 +131,14 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
131 u8 busno, suba, subb; 131 u8 busno, suba, subb;
132 int quad = BUS2QUAD(d->bus->number); 132 int quad = BUS2QUAD(d->bus->number);
133 133
134 printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); 134 dev_info(&d->dev, "searching for i450NX host bridges\n");
135 reg = 0xd0; 135 reg = 0xd0;
136 for(pxb=0; pxb<2; pxb++) { 136 for(pxb=0; pxb<2; pxb++) {
137 pci_read_config_byte(d, reg++, &busno); 137 pci_read_config_byte(d, reg++, &busno);
138 pci_read_config_byte(d, reg++, &suba); 138 pci_read_config_byte(d, reg++, &suba);
139 pci_read_config_byte(d, reg++, &subb); 139 pci_read_config_byte(d, reg++, &subb);
140 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); 140 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n",
141 pxb, busno, suba, subb);
141 if (busno) { 142 if (busno) {
142 /* Bus A */ 143 /* Bus A */
143 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno)); 144 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno));
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index 7dc5d5cf50a2..274d06082f48 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -11,6 +11,7 @@
11#include <linux/suspend.h> 11#include <linux/suspend.h>
12#include <asm/mtrr.h> 12#include <asm/mtrr.h>
13#include <asm/mce.h> 13#include <asm/mce.h>
14#include <asm/xcr.h>
14 15
15static struct saved_context saved_context; 16static struct saved_context saved_context;
16 17
@@ -45,7 +46,7 @@ static void __save_processor_state(struct saved_context *ctxt)
45 ctxt->cr0 = read_cr0(); 46 ctxt->cr0 = read_cr0();
46 ctxt->cr2 = read_cr2(); 47 ctxt->cr2 = read_cr2();
47 ctxt->cr3 = read_cr3(); 48 ctxt->cr3 = read_cr3();
48 ctxt->cr4 = read_cr4(); 49 ctxt->cr4 = read_cr4_safe();
49} 50}
50 51
51/* Needed by apm.c */ 52/* Needed by apm.c */
@@ -98,7 +99,9 @@ static void __restore_processor_state(struct saved_context *ctxt)
98 /* 99 /*
99 * control registers 100 * control registers
100 */ 101 */
101 write_cr4(ctxt->cr4); 102 /* cr4 was introduced in the Pentium CPU */
103 if (ctxt->cr4)
104 write_cr4(ctxt->cr4);
102 write_cr3(ctxt->cr3); 105 write_cr3(ctxt->cr3);
103 write_cr2(ctxt->cr2); 106 write_cr2(ctxt->cr2);
104 write_cr0(ctxt->cr0); 107 write_cr0(ctxt->cr0);
@@ -124,6 +127,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
124 if (boot_cpu_has(X86_FEATURE_SEP)) 127 if (boot_cpu_has(X86_FEATURE_SEP))
125 enable_sep_cpu(); 128 enable_sep_cpu();
126 129
130 /*
131 * restore XCR0 for xsave capable cpu's.
132 */
133 if (cpu_has_xsave)
134 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
135
127 fix_processor_context(); 136 fix_processor_context();
128 do_fpu_end(); 137 do_fpu_end();
129 mtrr_ap_init(); 138 mtrr_ap_init();
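
The new XCR0 restore above uses the xsetbv() helper from the freshly included <asm/xcr.h>. Conceptually it is just a wrapper around the XSETBV instruction, which takes the XCR index in %ecx and the 64-bit value in %edx:%eax; a rough sketch of such a wrapper (the exact helper in asm/xcr.h may differ in detail):

#include <linux/types.h>

static inline void example_xsetbv(u32 index, u64 value)
{
	u32 eax = (u32)value;
	u32 edx = (u32)(value >> 32);

	/* xsetbv: write XCR[index] = edx:eax; index 0 is XCR0, the feature-enable mask */
	asm volatile(".byte 0x0f,0x01,0xd1"
		     : : "a" (eax), "d" (edx), "c" (index));
}
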
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c
index 66bdfb591fd8..e3b6cf70d62c 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -14,6 +14,7 @@
14#include <asm/page.h> 14#include <asm/page.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/mtrr.h> 16#include <asm/mtrr.h>
17#include <asm/xcr.h>
17 18
18static void fix_processor_context(void); 19static void fix_processor_context(void);
19 20
@@ -122,6 +123,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
122 wrmsrl(MSR_GS_BASE, ctxt->gs_base); 123 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
123 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 124 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
124 125
126 /*
127 * restore XCR0 for xsave capable cpu's.
128 */
129 if (cpu_has_xsave)
130 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
131
125 fix_processor_context(); 132 fix_processor_context();
126 133
127 do_fpu_end(); 134 do_fpu_end();
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index b95aa6cfe3cb..d1e9b53f9d33 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -1,5 +1,3 @@
1.text
2
3/* 1/*
4 * This may not use any stack, nor any variable that is not "NoSave": 2 * This may not use any stack, nor any variable that is not "NoSave":
5 * 3 *
@@ -12,25 +10,26 @@
12#include <asm/segment.h> 10#include <asm/segment.h>
13#include <asm/page.h> 11#include <asm/page.h>
14#include <asm/asm-offsets.h> 12#include <asm/asm-offsets.h>
13#include <asm/processor-flags.h>
15 14
16 .text 15.text
17 16
18ENTRY(swsusp_arch_suspend) 17ENTRY(swsusp_arch_suspend)
19
20 movl %esp, saved_context_esp 18 movl %esp, saved_context_esp
21 movl %ebx, saved_context_ebx 19 movl %ebx, saved_context_ebx
22 movl %ebp, saved_context_ebp 20 movl %ebp, saved_context_ebp
23 movl %esi, saved_context_esi 21 movl %esi, saved_context_esi
24 movl %edi, saved_context_edi 22 movl %edi, saved_context_edi
25 pushfl ; popl saved_context_eflags 23 pushfl
24 popl saved_context_eflags
26 25
27 call swsusp_save 26 call swsusp_save
28 ret 27 ret
29 28
30ENTRY(restore_image) 29ENTRY(restore_image)
31 movl resume_pg_dir, %ecx 30 movl resume_pg_dir, %eax
32 subl $__PAGE_OFFSET, %ecx 31 subl $__PAGE_OFFSET, %eax
33 movl %ecx, %cr3 32 movl %eax, %cr3
34 33
35 movl restore_pblist, %edx 34 movl restore_pblist, %edx
36 .p2align 4,,7 35 .p2align 4,,7
@@ -52,17 +51,21 @@ copy_loop:
52 51
53done: 52done:
54 /* go back to the original page tables */ 53 /* go back to the original page tables */
55 movl $swapper_pg_dir, %ecx 54 movl $swapper_pg_dir, %eax
56 subl $__PAGE_OFFSET, %ecx 55 subl $__PAGE_OFFSET, %eax
57 movl %ecx, %cr3 56 movl %eax, %cr3
58 /* Flush TLB, including "global" things (vmalloc) */ 57 /* Flush TLB, including "global" things (vmalloc) */
59 movl mmu_cr4_features, %eax 58 movl mmu_cr4_features, %ecx
60 movl %eax, %edx 59 jecxz 1f # cr4 Pentium and higher, skip if zero
61 andl $~(1<<7), %edx; # PGE 60 movl %ecx, %edx
61 andl $~(X86_CR4_PGE), %edx
62 movl %edx, %cr4; # turn off PGE 62 movl %edx, %cr4; # turn off PGE
63 movl %cr3, %ecx; # flush TLB 631:
64 movl %ecx, %cr3 64 movl %cr3, %eax; # flush TLB
65 movl %eax, %cr4; # turn PGE back on 65 movl %eax, %cr3
66 jecxz 1f # cr4 Pentium and higher, skip if zero
67 movl %ecx, %cr4; # turn PGE back on
681:
66 69
67 movl saved_context_esp, %esp 70 movl saved_context_esp, %esp
68 movl saved_context_ebp, %ebp 71 movl saved_context_ebp, %ebp
@@ -70,7 +73,8 @@ done:
70 movl saved_context_esi, %esi 73 movl saved_context_esi, %esi
71 movl saved_context_edi, %edi 74 movl saved_context_edi, %edi
72 75
73 pushl saved_context_eflags ; popfl 76 pushl saved_context_eflags
77 popfl
74 78
75 xorl %eax, %eax 79 xorl %eax, %eax
76 80
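
The reworked assembly above guards every %cr4 access with jecxz so that CPUs without CR4 (mmu_cr4_features == 0 on pre-Pentium parts) skip those instructions while still getting the CR3 reload. In C the same logic would look roughly like the sketch below, assuming the read_cr4_safe()/write_cr4()/read_cr3()/write_cr3() helpers from <asm/system.h> that the cpu_32.c hunk above already relies on (the example_* name is hypothetical):

#include <asm/system.h>
#include <asm/processor-flags.h>

static void example_flush_tlb_all(void)
{
	unsigned long cr4 = read_cr4_safe();	/* returns 0 on CPUs without CR4 */

	if (cr4)
		write_cr4(cr4 & ~X86_CR4_PGE);	/* turn off global pages */
	write_cr3(read_cr3());			/* flush non-global TLB entries */
	if (cr4)
		write_cr4(cr4);			/* turn PGE back on */
}
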
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 3815e425f470..87b9ab166423 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -26,5 +26,13 @@ config XEN_MAX_DOMAIN_MEMORY
26 26
27config XEN_SAVE_RESTORE 27config XEN_SAVE_RESTORE
28 bool 28 bool
29 depends on PM 29 depends on XEN && PM
30 default y \ No newline at end of file 30 default y
31
32config XEN_DEBUG_FS
33 bool "Enable Xen debug and tuning parameters in debugfs"
34 depends on XEN && DEBUG_FS
35 default n
36 help
37 Enable statistics output and various tuning options in debugfs.
38 Enabling this option may incur a significant performance overhead.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 59c1e539aed2..313947940a1a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,12 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1ifdef CONFIG_FTRACE
2# Do not profile debug and lowlevel utilities
3CFLAGS_REMOVE_spinlock.o = -pg
4CFLAGS_REMOVE_time.o = -pg
5CFLAGS_REMOVE_irq.o = -pg
6endif
7
8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
2 time.o xen-asm_$(BITS).o grant-table.o suspend.o 9 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 10
4obj-$(CONFIG_SMP) += smp.o 11obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 000000000000..b53225d2cac3
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,123 @@
1#include <linux/init.h>
2#include <linux/debugfs.h>
3#include <linux/module.h>
4
5#include "debugfs.h"
6
7static struct dentry *d_xen_debug;
8
9struct dentry * __init xen_init_debugfs(void)
10{
11 if (!d_xen_debug) {
12 d_xen_debug = debugfs_create_dir("xen", NULL);
13
14 if (!d_xen_debug)
15 pr_warning("Could not create 'xen' debugfs directory\n");
16 }
17
18 return d_xen_debug;
19}
20
21struct array_data
22{
23 void *array;
24 unsigned elements;
25};
26
27static int u32_array_open(struct inode *inode, struct file *file)
28{
29 file->private_data = NULL;
30 return nonseekable_open(inode, file);
31}
32
33static size_t format_array(char *buf, size_t bufsize, const char *fmt,
34 u32 *array, unsigned array_size)
35{
36 size_t ret = 0;
37 unsigned i;
38
39 for(i = 0; i < array_size; i++) {
40 size_t len;
41
42 len = snprintf(buf, bufsize, fmt, array[i]);
43 len++; /* ' ' or '\n' */
44 ret += len;
45
46 if (buf) {
47 buf += len;
48 bufsize -= len;
49 buf[-1] = (i == array_size-1) ? '\n' : ' ';
50 }
51 }
52
53 ret++; /* \0 */
54 if (buf)
55 *buf = '\0';
56
57 return ret;
58}
59
60static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
61{
62 size_t len = format_array(NULL, 0, fmt, array, array_size);
63 char *ret;
64
65 ret = kmalloc(len, GFP_KERNEL);
66 if (ret == NULL)
67 return NULL;
68
69 format_array(ret, len, fmt, array, array_size);
70 return ret;
71}
72
73static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
74 loff_t *ppos)
75{
76 struct inode *inode = file->f_path.dentry->d_inode;
77 struct array_data *data = inode->i_private;
78 size_t size;
79
80 if (*ppos == 0) {
81 if (file->private_data) {
82 kfree(file->private_data);
83 file->private_data = NULL;
84 }
85
86 file->private_data = format_array_alloc("%u", data->array, data->elements);
87 }
88
89 size = 0;
90 if (file->private_data)
91 size = strlen(file->private_data);
92
93 return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
94}
95
96static int xen_array_release(struct inode *inode, struct file *file)
97{
98 kfree(file->private_data);
99
100 return 0;
101}
102
103static struct file_operations u32_array_fops = {
104 .owner = THIS_MODULE,
105 .open = u32_array_open,
106 .release= xen_array_release,
107 .read = u32_array_read,
108};
109
110struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
111 struct dentry *parent,
112 u32 *array, unsigned elements)
113{
114 struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
115
116 if (data == NULL)
117 return NULL;
118
119 data->array = array;
120 data->elements = elements;
121
122 return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
123}
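
A hypothetical caller of the new helper above would look something like the sketch below; the array name, the 0444 mode and the fs_initcall hook are illustrative only. The resulting debugfs file reads back as a space-separated list of %u values terminated by a newline, as produced by format_array():

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/debugfs.h>
#include "debugfs.h"

static u32 example_histogram[32];

static int __init example_debugfs_init(void)
{
	struct dentry *d = xen_init_debugfs();

	if (!d)
		return -ENODEV;

	xen_debugfs_create_u32_array("example_histogram", 0444, d,
				     example_histogram,
				     ARRAY_SIZE(example_histogram));
	return 0;
}
fs_initcall(example_debugfs_init);
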
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 000000000000..e28132084832
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,10 @@
1#ifndef _XEN_DEBUGFS_H
2#define _XEN_DEBUGFS_H
3
4struct dentry * __init xen_init_debugfs(void);
5
6struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
7 struct dentry *parent,
8 u32 *array, unsigned elements);
9
10#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 9ff6e3cbf08f..0013a729b41d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,12 +30,12 @@
30#include <xen/interface/xen.h> 30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h> 31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h> 32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h> 33#include <xen/features.h>
35#include <xen/page.h> 34#include <xen/page.h>
36#include <xen/hvc-console.h> 35#include <xen/hvc-console.h>
37 36
38#include <asm/paravirt.h> 37#include <asm/paravirt.h>
38#include <asm/apic.h>
39#include <asm/page.h> 39#include <asm/page.h>
40#include <asm/xen/hypercall.h> 40#include <asm/xen/hypercall.h>
41#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
@@ -57,6 +57,9 @@ EXPORT_SYMBOL_GPL(hypercall_page);
57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); 57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
59 59
60enum xen_domain_type xen_domain_type = XEN_NATIVE;
61EXPORT_SYMBOL_GPL(xen_domain_type);
62
60/* 63/*
61 * Identity map, in addition to plain kernel map. This needs to be 64 * Identity map, in addition to plain kernel map. This needs to be
62 * large enough to allocate page table pages to allocate the rest. 65 * large enough to allocate page table pages to allocate the rest.
@@ -110,7 +113,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
110 * 113 *
111 * 0: not available, 1: available 114 * 0: not available, 1: available
112 */ 115 */
113static int have_vcpu_info_placement = 1; 116static int have_vcpu_info_placement =
117#ifdef CONFIG_X86_32
118 1
119#else
120 0
121#endif
122 ;
123
114 124
115static void xen_vcpu_setup(int cpu) 125static void xen_vcpu_setup(int cpu)
116{ 126{
@@ -226,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg)
226 return HYPERVISOR_get_debugreg(reg); 236 return HYPERVISOR_get_debugreg(reg);
227} 237}
228 238
229static unsigned long xen_save_fl(void) 239static void xen_leave_lazy(void)
230{ 240{
231 struct vcpu_info *vcpu; 241 paravirt_leave_lazy(paravirt_get_lazy_mode());
232 unsigned long flags; 242 xen_mc_flush();
233
234 vcpu = x86_read_percpu(xen_vcpu);
235
236 /* flag has opposite sense of mask */
237 flags = !vcpu->evtchn_upcall_mask;
238
239 /* convert to IF type flag
240 -0 -> 0x00000000
241 -1 -> 0xffffffff
242 */
243 return (-flags) & X86_EFLAGS_IF;
244} 243}
245 244
246static void xen_restore_fl(unsigned long flags) 245static unsigned long xen_store_tr(void)
247{ 246{
248 struct vcpu_info *vcpu; 247 return 0;
249
250 /* convert from IF type flag */
251 flags = !(flags & X86_EFLAGS_IF);
252
253 /* There's a one instruction preempt window here. We need to
254 make sure we're don't switch CPUs between getting the vcpu
255 pointer and updating the mask. */
256 preempt_disable();
257 vcpu = x86_read_percpu(xen_vcpu);
258 vcpu->evtchn_upcall_mask = flags;
259 preempt_enable_no_resched();
260
261 /* Doesn't matter if we get preempted here, because any
262 pending event will get dealt with anyway. */
263
264 if (flags == 0) {
265 preempt_check_resched();
266 barrier(); /* unmask then check (avoid races) */
267 if (unlikely(vcpu->evtchn_upcall_pending))
268 force_evtchn_callback();
269 }
270} 248}
271 249
272static void xen_irq_disable(void) 250/*
251 * Set the page permissions for a particular virtual address. If the
252 * address is a vmalloc mapping (or other non-linear mapping), then
253 * find the linear mapping of the page and also set its protections to
254 * match.
255 */
256static void set_aliased_prot(void *v, pgprot_t prot)
273{ 257{
274 /* There's a one instruction preempt window here. We need to 258 int level;
275 make sure we're don't switch CPUs between getting the vcpu 259 pte_t *ptep;
276 pointer and updating the mask. */ 260 pte_t pte;
277 preempt_disable(); 261 unsigned long pfn;
278 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; 262 struct page *page;
279 preempt_enable_no_resched();
280}
281 263
282static void xen_irq_enable(void) 264 ptep = lookup_address((unsigned long)v, &level);
283{ 265 BUG_ON(ptep == NULL);
284 struct vcpu_info *vcpu;
285 266
286 /* We don't need to worry about being preempted here, since 267 pfn = pte_pfn(*ptep);
287 either a) interrupts are disabled, so no preemption, or b) 268 page = pfn_to_page(pfn);
288 the caller is confused and is trying to re-enable interrupts
289 on an indeterminate processor. */
290 269
291 vcpu = x86_read_percpu(xen_vcpu); 270 pte = pfn_pte(pfn, prot);
292 vcpu->evtchn_upcall_mask = 0;
293 271
294 /* Doesn't matter if we get preempted here, because any 272 if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
295 pending event will get dealt with anyway. */ 273 BUG();
296 274
297 barrier(); /* unmask then check (avoid races) */ 275 if (!PageHighMem(page)) {
298 if (unlikely(vcpu->evtchn_upcall_pending)) 276 void *av = __va(PFN_PHYS(pfn));
299 force_evtchn_callback();
300}
301 277
302static void xen_safe_halt(void) 278 if (av != v)
303{ 279 if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
304 /* Blocking includes an implicit local_irq_enable(). */ 280 BUG();
305 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) 281 } else
306 BUG(); 282 kmap_flush_unused();
307} 283}
308 284
309static void xen_halt(void) 285static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
310{ 286{
311 if (irqs_disabled()) 287 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
312 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 288 int i;
313 else
314 xen_safe_halt();
315}
316 289
317static void xen_leave_lazy(void) 290 for(i = 0; i < entries; i += entries_per_page)
318{ 291 set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
319 paravirt_leave_lazy(paravirt_get_lazy_mode());
320 xen_mc_flush();
321} 292}
322 293
323static unsigned long xen_store_tr(void) 294static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
324{ 295{
325 return 0; 296 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
297 int i;
298
299 for(i = 0; i < entries; i += entries_per_page)
300 set_aliased_prot(ldt + i, PAGE_KERNEL);
326} 301}
327 302
328static void xen_set_ldt(const void *addr, unsigned entries) 303static void xen_set_ldt(const void *addr, unsigned entries)
@@ -425,8 +400,7 @@ static void xen_load_gs_index(unsigned int idx)
425static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 400static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
426 const void *ptr) 401 const void *ptr)
427{ 402{
428 unsigned long lp = (unsigned long)&dt[entrynum]; 403 xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
429 xmaddr_t mach_lp = virt_to_machine(lp);
430 u64 entry = *(u64 *)ptr; 404 u64 entry = *(u64 *)ptr;
431 405
432 preempt_disable(); 406 preempt_disable();
@@ -559,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
559} 533}
560 534
561static void xen_load_sp0(struct tss_struct *tss, 535static void xen_load_sp0(struct tss_struct *tss,
562 struct thread_struct *thread) 536 struct thread_struct *thread)
563{ 537{
564 struct multicall_space mcs = xen_mc_entry(0); 538 struct multicall_space mcs = xen_mc_entry(0);
565 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 539 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -580,16 +554,47 @@ static void xen_io_delay(void)
580} 554}
581 555
582#ifdef CONFIG_X86_LOCAL_APIC 556#ifdef CONFIG_X86_LOCAL_APIC
583static u32 xen_apic_read(unsigned long reg) 557static u32 xen_apic_read(u32 reg)
584{ 558{
585 return 0; 559 return 0;
586} 560}
587 561
588static void xen_apic_write(unsigned long reg, u32 val) 562static void xen_apic_write(u32 reg, u32 val)
589{ 563{
590 /* Warn to see if there's any stray references */ 564 /* Warn to see if there's any stray references */
591 WARN_ON(1); 565 WARN_ON(1);
592} 566}
567
568static u64 xen_apic_icr_read(void)
569{
570 return 0;
571}
572
573static void xen_apic_icr_write(u32 low, u32 id)
574{
575 /* Warn to see if there's any stray references */
576 WARN_ON(1);
577}
578
579static void xen_apic_wait_icr_idle(void)
580{
581 return;
582}
583
584static u32 xen_safe_apic_wait_icr_idle(void)
585{
586 return 0;
587}
588
589static struct apic_ops xen_basic_apic_ops = {
590 .read = xen_apic_read,
591 .write = xen_apic_write,
592 .icr_read = xen_apic_icr_read,
593 .icr_write = xen_apic_icr_write,
594 .wait_icr_idle = xen_apic_wait_icr_idle,
595 .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
596};
597
593#endif 598#endif
594 599
595static void xen_flush_tlb(void) 600static void xen_flush_tlb(void)
@@ -803,6 +808,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
803 ret = -EFAULT; 808 ret = -EFAULT;
804 break; 809 break;
805#endif 810#endif
811
812 case MSR_STAR:
813 case MSR_CSTAR:
814 case MSR_LSTAR:
815 case MSR_SYSCALL_MASK:
816 case MSR_IA32_SYSENTER_CS:
817 case MSR_IA32_SYSENTER_ESP:
818 case MSR_IA32_SYSENTER_EIP:
819 /* Fast syscall setup is all done in hypercalls, so
820 these are all ignored. Stub them out here to stop
821 Xen console noise. */
822 break;
823
806 default: 824 default:
807 ret = native_write_msr_safe(msr, low, high); 825 ret = native_write_msr_safe(msr, low, high);
808 } 826 }
@@ -812,7 +830,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
812 830
813/* Early in boot, while setting up the initial pagetable, assume 831/* Early in boot, while setting up the initial pagetable, assume
814 everything is pinned. */ 832 everything is pinned. */
815static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 833static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
816{ 834{
817#ifdef CONFIG_FLATMEM 835#ifdef CONFIG_FLATMEM
818 BUG_ON(mem_map); /* should only be used early */ 836 BUG_ON(mem_map); /* should only be used early */
@@ -822,7 +840,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
822 840
823/* Early release_pte assumes that all pts are pinned, since there's 841/* Early release_pte assumes that all pts are pinned, since there's
824 only init_mm and anything attached to that is pinned. */ 842 only init_mm and anything attached to that is pinned. */
825static void xen_release_pte_init(u32 pfn) 843static void xen_release_pte_init(unsigned long pfn)
826{ 844{
827 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 845 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
828} 846}
@@ -838,7 +856,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
838 856
839/* This needs to make sure the new pte page is pinned iff its being 857/* This needs to make sure the new pte page is pinned iff its being
840 attached to a pinned pagetable. */ 858 attached to a pinned pagetable. */
841static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) 859static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
842{ 860{
843 struct page *page = pfn_to_page(pfn); 861 struct page *page = pfn_to_page(pfn);
844 862
@@ -846,8 +864,8 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
846 SetPagePinned(page); 864 SetPagePinned(page);
847 865
848 if (!PageHighMem(page)) { 866 if (!PageHighMem(page)) {
849 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 867 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
850 if (level == PT_PTE) 868 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
851 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 869 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
852 } else 870 } else
853 /* make sure there are no stray mappings of 871 /* make sure there are no stray mappings of
@@ -856,12 +874,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
856 } 874 }
857} 875}
858 876
859static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) 877static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
860{ 878{
861 xen_alloc_ptpage(mm, pfn, PT_PTE); 879 xen_alloc_ptpage(mm, pfn, PT_PTE);
862} 880}
863 881
864static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) 882static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
865{ 883{
866 xen_alloc_ptpage(mm, pfn, PT_PMD); 884 xen_alloc_ptpage(mm, pfn, PT_PMD);
867} 885}
@@ -909,13 +927,13 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
909} 927}
910 928
911/* This should never happen until we're OK to use struct page */ 929/* This should never happen until we're OK to use struct page */
912static void xen_release_ptpage(u32 pfn, unsigned level) 930static void xen_release_ptpage(unsigned long pfn, unsigned level)
913{ 931{
914 struct page *page = pfn_to_page(pfn); 932 struct page *page = pfn_to_page(pfn);
915 933
916 if (PagePinned(page)) { 934 if (PagePinned(page)) {
917 if (!PageHighMem(page)) { 935 if (!PageHighMem(page)) {
918 if (level == PT_PTE) 936 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
919 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 937 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
920 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 938 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
921 } 939 }
@@ -923,23 +941,23 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
923 } 941 }
924} 942}
925 943
926static void xen_release_pte(u32 pfn) 944static void xen_release_pte(unsigned long pfn)
927{ 945{
928 xen_release_ptpage(pfn, PT_PTE); 946 xen_release_ptpage(pfn, PT_PTE);
929} 947}
930 948
931static void xen_release_pmd(u32 pfn) 949static void xen_release_pmd(unsigned long pfn)
932{ 950{
933 xen_release_ptpage(pfn, PT_PMD); 951 xen_release_ptpage(pfn, PT_PMD);
934} 952}
935 953
936#if PAGETABLE_LEVELS == 4 954#if PAGETABLE_LEVELS == 4
937static void xen_alloc_pud(struct mm_struct *mm, u32 pfn) 955static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
938{ 956{
939 xen_alloc_ptpage(mm, pfn, PT_PUD); 957 xen_alloc_ptpage(mm, pfn, PT_PUD);
940} 958}
941 959
942static void xen_release_pud(u32 pfn) 960static void xen_release_pud(unsigned long pfn)
943{ 961{
944 xen_release_ptpage(pfn, PT_PUD); 962 xen_release_ptpage(pfn, PT_PUD);
945} 963}
@@ -962,6 +980,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
962} 980}
963#endif 981#endif
964 982
983#ifdef CONFIG_X86_32
965static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 984static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
966{ 985{
967 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 986 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
@@ -980,6 +999,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
980 999
981 xen_set_pte(ptep, pte); 1000 xen_set_pte(ptep, pte);
982} 1001}
1002#endif
983 1003
984static __init void xen_pagetable_setup_start(pgd_t *base) 1004static __init void xen_pagetable_setup_start(pgd_t *base)
985{ 1005{
@@ -1046,7 +1066,6 @@ void xen_setup_vcpu_info_placement(void)
1046 1066
1047 /* xen_vcpu_setup managed to place the vcpu_info within the 1067 /* xen_vcpu_setup managed to place the vcpu_info within the
1048 percpu area for all cpus, so make use of it */ 1068 percpu area for all cpus, so make use of it */
1049#ifdef CONFIG_X86_32
1050 if (have_vcpu_info_placement) { 1069 if (have_vcpu_info_placement) {
1051 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1070 printk(KERN_INFO "Xen: using vcpu_info placement\n");
1052 1071
@@ -1056,7 +1075,6 @@ void xen_setup_vcpu_info_placement(void)
1056 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1075 pv_irq_ops.irq_enable = xen_irq_enable_direct;
1057 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1076 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
1058 } 1077 }
1059#endif
1060} 1078}
1061 1079
1062static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1080static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -1077,12 +1095,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1077 goto patch_site 1095 goto patch_site
1078 1096
1079 switch (type) { 1097 switch (type) {
1080#ifdef CONFIG_X86_32
1081 SITE(pv_irq_ops, irq_enable); 1098 SITE(pv_irq_ops, irq_enable);
1082 SITE(pv_irq_ops, irq_disable); 1099 SITE(pv_irq_ops, irq_disable);
1083 SITE(pv_irq_ops, save_fl); 1100 SITE(pv_irq_ops, save_fl);
1084 SITE(pv_irq_ops, restore_fl); 1101 SITE(pv_irq_ops, restore_fl);
1085#endif /* CONFIG_X86_32 */
1086#undef SITE 1102#undef SITE
1087 1103
1088 patch_site: 1104 patch_site:
@@ -1220,6 +1236,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1220 .load_gs_index = xen_load_gs_index, 1236 .load_gs_index = xen_load_gs_index,
1221#endif 1237#endif
1222 1238
1239 .alloc_ldt = xen_alloc_ldt,
1240 .free_ldt = xen_free_ldt,
1241
1223 .store_gdt = native_store_gdt, 1242 .store_gdt = native_store_gdt,
1224 .store_idt = native_store_idt, 1243 .store_idt = native_store_idt,
1225 .store_tr = xen_store_tr, 1244 .store_tr = xen_store_tr,
@@ -1241,40 +1260,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1241 }, 1260 },
1242}; 1261};
1243 1262
1244static void __init __xen_init_IRQ(void)
1245{
1246#ifdef CONFIG_X86_64
1247 int i;
1248
1249 /* Create identity vector->irq map */
1250 for(i = 0; i < NR_VECTORS; i++) {
1251 int cpu;
1252
1253 for_each_possible_cpu(cpu)
1254 per_cpu(vector_irq, cpu)[i] = i;
1255 }
1256#endif /* CONFIG_X86_64 */
1257
1258 xen_init_IRQ();
1259}
1260
1261static const struct pv_irq_ops xen_irq_ops __initdata = {
1262 .init_IRQ = __xen_init_IRQ,
1263 .save_fl = xen_save_fl,
1264 .restore_fl = xen_restore_fl,
1265 .irq_disable = xen_irq_disable,
1266 .irq_enable = xen_irq_enable,
1267 .safe_halt = xen_safe_halt,
1268 .halt = xen_halt,
1269#ifdef CONFIG_X86_64
1270 .adjust_exception_frame = xen_adjust_exception_frame,
1271#endif
1272};
1273
1274static const struct pv_apic_ops xen_apic_ops __initdata = { 1263static const struct pv_apic_ops xen_apic_ops __initdata = {
1275#ifdef CONFIG_X86_LOCAL_APIC 1264#ifdef CONFIG_X86_LOCAL_APIC
1276 .apic_write = xen_apic_write,
1277 .apic_read = xen_apic_read,
1278 .setup_boot_clock = paravirt_nop, 1265 .setup_boot_clock = paravirt_nop,
1279 .setup_secondary_clock = paravirt_nop, 1266 .setup_secondary_clock = paravirt_nop,
1280 .startup_ipi_hook = paravirt_nop, 1267 .startup_ipi_hook = paravirt_nop,
@@ -1324,7 +1311,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1324 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 1311 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1325 1312
1326 .pte_val = xen_pte_val, 1313 .pte_val = xen_pte_val,
1327 .pte_flags = native_pte_val, 1314 .pte_flags = native_pte_flags,
1328 .pgd_val = xen_pgd_val, 1315 .pgd_val = xen_pgd_val,
1329 1316
1330 .make_pte = xen_make_pte, 1317 .make_pte = xen_make_pte,
@@ -1413,7 +1400,7 @@ static void __init xen_reserve_top(void)
1413 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) 1400 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1414 top = pp.virt_start; 1401 top = pp.virt_start;
1415 1402
1416 reserve_top_address(-top + 2 * PAGE_SIZE); 1403 reserve_top_address(-top);
1417#endif /* CONFIG_X86_32 */ 1404#endif /* CONFIG_X86_32 */
1418} 1405}
1419 1406
@@ -1447,48 +1434,11 @@ static void *m2v(phys_addr_t maddr)
1447 return __ka(m2p(maddr)); 1434 return __ka(m2p(maddr));
1448} 1435}
1449 1436
1450#ifdef CONFIG_X86_64
1451static void walk(pgd_t *pgd, unsigned long addr)
1452{
1453 unsigned l4idx = pgd_index(addr);
1454 unsigned l3idx = pud_index(addr);
1455 unsigned l2idx = pmd_index(addr);
1456 unsigned l1idx = pte_index(addr);
1457 pgd_t l4;
1458 pud_t l3;
1459 pmd_t l2;
1460 pte_t l1;
1461
1462 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1463 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1464
1465 l4 = pgd[l4idx];
1466 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1467 xen_raw_printk(" %016lx\n", pgd_val(l4));
1468
1469 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1470 xen_raw_printk(" l3: %016lx\n", l3.pud);
1471 xen_raw_printk(" %016lx\n", pud_val(l3));
1472
1473 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1474 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1475 xen_raw_printk(" %016lx\n", pmd_val(l2));
1476
1477 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1478 xen_raw_printk(" l1: %016lx\n", l1.pte);
1479 xen_raw_printk(" %016lx\n", pte_val(l1));
1480}
1481#endif
1482
1483static void set_page_prot(void *addr, pgprot_t prot) 1437static void set_page_prot(void *addr, pgprot_t prot)
1484{ 1438{
1485 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1439 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1486 pte_t pte = pfn_pte(pfn, prot); 1440 pte_t pte = pfn_pte(pfn, prot);
1487 1441
1488 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1489 addr, pfn, get_phys_to_machine(pfn),
1490 pgprot_val(prot), pte.pte);
1491
1492 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) 1442 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1493 BUG(); 1443 BUG();
1494} 1444}
@@ -1664,6 +1614,8 @@ asmlinkage void __init xen_start_kernel(void)
1664 if (!xen_start_info) 1614 if (!xen_start_info)
1665 return; 1615 return;
1666 1616
1617 xen_domain_type = XEN_PV_DOMAIN;
1618
1667 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); 1619 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
1668 1620
1669 xen_setup_features(); 1621 xen_setup_features();
@@ -1673,10 +1625,18 @@ asmlinkage void __init xen_start_kernel(void)
1673 pv_init_ops = xen_init_ops; 1625 pv_init_ops = xen_init_ops;
1674 pv_time_ops = xen_time_ops; 1626 pv_time_ops = xen_time_ops;
1675 pv_cpu_ops = xen_cpu_ops; 1627 pv_cpu_ops = xen_cpu_ops;
1676 pv_irq_ops = xen_irq_ops;
1677 pv_apic_ops = xen_apic_ops; 1628 pv_apic_ops = xen_apic_ops;
1678 pv_mmu_ops = xen_mmu_ops; 1629 pv_mmu_ops = xen_mmu_ops;
1679 1630
1631 xen_init_irq_ops();
1632
1633#ifdef CONFIG_X86_LOCAL_APIC
1634 /*
1635 * set up the basic apic ops.
1636 */
1637 apic_ops = &xen_basic_apic_ops;
1638#endif
1639
1680 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { 1640 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1681 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; 1641 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
1682 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; 1642 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -1700,7 +1660,7 @@ asmlinkage void __init xen_start_kernel(void)
1700 1660
1701 /* Prevent unwanted bits from being set in PTEs. */ 1661 /* Prevent unwanted bits from being set in PTEs. */
1702 __supported_pte_mask &= ~_PAGE_GLOBAL; 1662 __supported_pte_mask &= ~_PAGE_GLOBAL;
1703 if (!is_initial_xendomain()) 1663 if (!xen_initial_domain())
1704 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); 1664 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1705 1665
1706 /* Don't do the full vcpu_info placement stuff until we have a 1666 /* Don't do the full vcpu_info placement stuff until we have a
@@ -1735,7 +1695,7 @@ asmlinkage void __init xen_start_kernel(void)
1735 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1695 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1736 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); 1696 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1737 1697
1738 if (!is_initial_xendomain()) { 1698 if (!xen_initial_domain()) {
1739 add_preferred_console("xenboot", 0, NULL); 1699 add_preferred_console("xenboot", 0, NULL);
1740 add_preferred_console("tty", 0, NULL); 1700 add_preferred_console("tty", 0, NULL);
1741 add_preferred_console("hvc", 0, NULL); 1701 add_preferred_console("hvc", 0, NULL);
@@ -1743,15 +1703,6 @@ asmlinkage void __init xen_start_kernel(void)
1743 1703
1744 xen_raw_console_write("about to get started...\n"); 1704 xen_raw_console_write("about to get started...\n");
1745 1705
1746#if 0
1747 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1748 &boot_params, __pa_symbol(&boot_params),
1749 __va(__pa_symbol(&boot_params)));
1750
1751 walk(pgd, &boot_params);
1752 walk(pgd, __va(__pa(&boot_params)));
1753#endif
1754
1755 /* Start the world */ 1706 /* Start the world */
1756#ifdef CONFIG_X86_32 1707#ifdef CONFIG_X86_32
1757 i386_start_kernel(); 1708 i386_start_kernel();
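
With this commit the irq-related pv_ops setup leaves enlighten.c and is installed by xen_init_irq_ops() in the new irq.c below. The underlying pattern is just a function-pointer table swapped in by a small init hook; a minimal sketch in plain C (all names here are made up, not kernel symbols):

#include <stdio.h>

struct irq_ops {
	void (*irq_disable)(void);
	void (*irq_enable)(void);
};

/* Default (native) implementations. */
static void native_irq_disable(void) { puts("native: cli"); }
static void native_irq_enable(void)  { puts("native: sti"); }

/* Alternative (paravirt-style) implementations. */
static void pv_irq_disable(void) { puts("pv: mask event channel"); }
static void pv_irq_enable(void)  { puts("pv: unmask event channel"); }

/* The live ops table starts out native... */
static struct irq_ops irq_ops = { native_irq_disable, native_irq_enable };

/* ...and an init hook swaps in the paravirt table, mirroring the way
 * xen_init_irq_ops() assigns pv_irq_ops = xen_irq_ops in the new file. */
static const struct irq_ops pv_ops = { pv_irq_disable, pv_irq_enable };

static void init_irq_ops(void) { irq_ops = pv_ops; }

int main(void)
{
	irq_ops.irq_disable();	/* native: cli */
	init_irq_ops();
	irq_ops.irq_disable();	/* pv: mask event channel */
	return 0;
}
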
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 000000000000..28b85ab8422e
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,143 @@
1#include <linux/hardirq.h>
2
3#include <xen/interface/xen.h>
4#include <xen/interface/sched.h>
5#include <xen/interface/vcpu.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/hypervisor.h>
9
10#include "xen-ops.h"
11
12/*
13 * Force a proper event-channel callback from Xen after clearing the
14 * callback mask. We do this in a very simple manner, by making a call
15 * down into Xen. The pending flag will be checked by Xen on return.
16 */
17void xen_force_evtchn_callback(void)
18{
19 (void)HYPERVISOR_xen_version(0, NULL);
20}
21
22static void __init __xen_init_IRQ(void)
23{
24#ifdef CONFIG_X86_64
25 int i;
26
27 /* Create identity vector->irq map */
28 for(i = 0; i < NR_VECTORS; i++) {
29 int cpu;
30
31 for_each_possible_cpu(cpu)
32 per_cpu(vector_irq, cpu)[i] = i;
33 }
34#endif /* CONFIG_X86_64 */
35
36 xen_init_IRQ();
37}
38
39static unsigned long xen_save_fl(void)
40{
41 struct vcpu_info *vcpu;
42 unsigned long flags;
43
44 vcpu = x86_read_percpu(xen_vcpu);
45
46 /* flag has opposite sense of mask */
47 flags = !vcpu->evtchn_upcall_mask;
48
49 /* convert to IF type flag
50 -0 -> 0x00000000
51 -1 -> 0xffffffff
52 */
53 return (-flags) & X86_EFLAGS_IF;
54}
55
56static void xen_restore_fl(unsigned long flags)
57{
58 struct vcpu_info *vcpu;
59
60 /* convert from IF type flag */
61 flags = !(flags & X86_EFLAGS_IF);
62
63 /* There's a one instruction preempt window here. We need to
 64	   make sure we don't switch CPUs between getting the vcpu
65 pointer and updating the mask. */
66 preempt_disable();
67 vcpu = x86_read_percpu(xen_vcpu);
68 vcpu->evtchn_upcall_mask = flags;
69 preempt_enable_no_resched();
70
71 /* Doesn't matter if we get preempted here, because any
72 pending event will get dealt with anyway. */
73
74 if (flags == 0) {
75 preempt_check_resched();
76 barrier(); /* unmask then check (avoid races) */
77 if (unlikely(vcpu->evtchn_upcall_pending))
78 xen_force_evtchn_callback();
79 }
80}
81
82static void xen_irq_disable(void)
83{
84 /* There's a one instruction preempt window here. We need to
 85	   make sure we don't switch CPUs between getting the vcpu
86 pointer and updating the mask. */
87 preempt_disable();
88 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
89 preempt_enable_no_resched();
90}
91
92static void xen_irq_enable(void)
93{
94 struct vcpu_info *vcpu;
95
96 /* We don't need to worry about being preempted here, since
97 either a) interrupts are disabled, so no preemption, or b)
98 the caller is confused and is trying to re-enable interrupts
99 on an indeterminate processor. */
100
101 vcpu = x86_read_percpu(xen_vcpu);
102 vcpu->evtchn_upcall_mask = 0;
103
104 /* Doesn't matter if we get preempted here, because any
105 pending event will get dealt with anyway. */
106
107 barrier(); /* unmask then check (avoid races) */
108 if (unlikely(vcpu->evtchn_upcall_pending))
109 xen_force_evtchn_callback();
110}
111
112static void xen_safe_halt(void)
113{
114 /* Blocking includes an implicit local_irq_enable(). */
115 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
116 BUG();
117}
118
119static void xen_halt(void)
120{
121 if (irqs_disabled())
122 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
123 else
124 xen_safe_halt();
125}
126
127static const struct pv_irq_ops xen_irq_ops __initdata = {
128 .init_IRQ = __xen_init_IRQ,
129 .save_fl = xen_save_fl,
130 .restore_fl = xen_restore_fl,
131 .irq_disable = xen_irq_disable,
132 .irq_enable = xen_irq_enable,
133 .safe_halt = xen_safe_halt,
134 .halt = xen_halt,
135#ifdef CONFIG_X86_64
136 .adjust_exception_frame = xen_adjust_exception_frame,
137#endif
138};
139
140void __init xen_init_irq_ops()
141{
142 pv_irq_ops = xen_irq_ops;
143}
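
xen_save_fl()/xen_restore_fl() above translate between the per-vcpu upcall mask (1 = events blocked) and the x86 IF convention (IF set = interrupts enabled), which have opposite sense. The (-flags) & X86_EFLAGS_IF idiom can be checked in isolation with plain C and no kernel headers; this is only a sketch of the conversion, not the kernel code:

#include <assert.h>
#include <stdio.h>

#define X86_EFLAGS_IF 0x200UL	/* the interrupt-enable flag bit */

/* mask: 1 = events blocked, 0 = events delivered (opposite of IF) */
static unsigned long mask_to_flags(unsigned char mask)
{
	unsigned long enabled = !mask;	/* 0 or 1 */

	/* -0 -> all zeros, -1 -> all ones; keep only the IF bit */
	return (-enabled) & X86_EFLAGS_IF;
}

static unsigned char flags_to_mask(unsigned long flags)
{
	return !(flags & X86_EFLAGS_IF);	/* IF clear -> masked */
}

int main(void)
{
	assert(mask_to_flags(1) == 0);
	assert(mask_to_flags(0) == X86_EFLAGS_IF);
	assert(flags_to_mask(X86_EFLAGS_IF) == 0);
	assert(flags_to_mask(0) == 1);
	puts("mask <-> IF conversion round-trips");
	return 0;
}
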
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aa37469da696..ae173f6edd8b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,6 +40,7 @@
40 */ 40 */
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h>
43#include <linux/bug.h> 44#include <linux/bug.h>
44 45
45#include <asm/pgtable.h> 46#include <asm/pgtable.h>
@@ -57,6 +58,61 @@
57 58
58#include "multicalls.h" 59#include "multicalls.h"
59#include "mmu.h" 60#include "mmu.h"
61#include "debugfs.h"
62
63#define MMU_UPDATE_HISTO 30
64
65#ifdef CONFIG_XEN_DEBUG_FS
66
67static struct {
68 u32 pgd_update;
69 u32 pgd_update_pinned;
70 u32 pgd_update_batched;
71
72 u32 pud_update;
73 u32 pud_update_pinned;
74 u32 pud_update_batched;
75
76 u32 pmd_update;
77 u32 pmd_update_pinned;
78 u32 pmd_update_batched;
79
80 u32 pte_update;
81 u32 pte_update_pinned;
82 u32 pte_update_batched;
83
84 u32 mmu_update;
85 u32 mmu_update_extended;
86 u32 mmu_update_histo[MMU_UPDATE_HISTO];
87
88 u32 prot_commit;
89 u32 prot_commit_batched;
90
91 u32 set_pte_at;
92 u32 set_pte_at_batched;
93 u32 set_pte_at_pinned;
94 u32 set_pte_at_current;
95 u32 set_pte_at_kernel;
96} mmu_stats;
97
98static u8 zero_stats;
99
100static inline void check_zero(void)
101{
102 if (unlikely(zero_stats)) {
103 memset(&mmu_stats, 0, sizeof(mmu_stats));
104 zero_stats = 0;
105 }
106}
107
108#define ADD_STATS(elem, val) \
109 do { check_zero(); mmu_stats.elem += (val); } while(0)
110
111#else /* !CONFIG_XEN_DEBUG_FS */
112
113#define ADD_STATS(elem, val) do { (void)(val); } while(0)
114
115#endif /* CONFIG_XEN_DEBUG_FS */
60 116
61/* 117/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a 118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
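
The MMU stats block added in the hunk above (mmu_stats, zero_stats, check_zero(), ADD_STATS()) gives counters that user space can reset through a debugfs u8: the next ADD_STATS() notices zero_stats and clears everything before counting. A compressed, userspace-only sketch of that reset-on-next-update idea, assuming nothing beyond standard C:

#include <stdio.h>
#include <string.h>

static struct {
	unsigned pte_update;
	unsigned pmd_update;
} stats;

static unsigned char zero_stats;	/* set to 1 to request a reset */

static void check_zero(void)
{
	if (zero_stats) {
		memset(&stats, 0, sizeof(stats));
		zero_stats = 0;
	}
}

#define ADD_STATS(elem, val) \
	do { check_zero(); stats.elem += (val); } while (0)

int main(void)
{
	ADD_STATS(pte_update, 3);
	zero_stats = 1;			/* what writing the debugfs "zero_stats" file does */
	ADD_STATS(pmd_update, 1);	/* reset happens here, then the count */
	printf("pte=%u pmd=%u\n", stats.pte_update, stats.pmd_update);	/* 0 1 */
	return 0;
}
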
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
229} 285}
230 286
231 287
232static bool page_pinned(void *ptr) 288static bool xen_page_pinned(void *ptr)
233{ 289{
234 struct page *page = virt_to_page(ptr); 290 struct page *page = virt_to_page(ptr);
235 291
236 return PagePinned(page); 292 return PagePinned(page);
237} 293}
238 294
239static void extend_mmu_update(const struct mmu_update *update) 295static void xen_extend_mmu_update(const struct mmu_update *update)
240{ 296{
241 struct multicall_space mcs; 297 struct multicall_space mcs;
242 struct mmu_update *u; 298 struct mmu_update *u;
243 299
244 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 300 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
245 301
246 if (mcs.mc != NULL) 302 if (mcs.mc != NULL) {
303 ADD_STATS(mmu_update_extended, 1);
304 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
305
247 mcs.mc->args[1]++; 306 mcs.mc->args[1]++;
248 else { 307
308 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
309 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
310 else
311 ADD_STATS(mmu_update_histo[0], 1);
312 } else {
313 ADD_STATS(mmu_update, 1);
249 mcs = __xen_mc_entry(sizeof(*u)); 314 mcs = __xen_mc_entry(sizeof(*u));
250 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 315 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
316 ADD_STATS(mmu_update_histo[1], 1);
251 } 317 }
252 318
253 u = mcs.args; 319 u = mcs.args;
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
265 /* ptr may be ioremapped for 64-bit pagetable setup */ 331 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 332 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
267 u.val = pmd_val_ma(val); 333 u.val = pmd_val_ma(val);
268 extend_mmu_update(&u); 334 xen_extend_mmu_update(&u);
335
336 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
269 337
270 xen_mc_issue(PARAVIRT_LAZY_MMU); 338 xen_mc_issue(PARAVIRT_LAZY_MMU);
271 339
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
274 342
275void xen_set_pmd(pmd_t *ptr, pmd_t val) 343void xen_set_pmd(pmd_t *ptr, pmd_t val)
276{ 344{
345 ADD_STATS(pmd_update, 1);
346
277 /* If page is not pinned, we can just update the entry 347 /* If page is not pinned, we can just update the entry
278 directly */ 348 directly */
279 if (!page_pinned(ptr)) { 349 if (!xen_page_pinned(ptr)) {
280 *ptr = val; 350 *ptr = val;
281 return; 351 return;
282 } 352 }
283 353
354 ADD_STATS(pmd_update_pinned, 1);
355
284 xen_set_pmd_hyper(ptr, val); 356 xen_set_pmd_hyper(ptr, val);
285} 357}
286 358
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
300 if (mm == &init_mm) 372 if (mm == &init_mm)
301 preempt_disable(); 373 preempt_disable();
302 374
375 ADD_STATS(set_pte_at, 1);
376// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
377 ADD_STATS(set_pte_at_current, mm == current->mm);
378 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
379
303 if (mm == current->mm || mm == &init_mm) { 380 if (mm == current->mm || mm == &init_mm) {
304 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 381 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
305 struct multicall_space mcs; 382 struct multicall_space mcs;
306 mcs = xen_mc_entry(0); 383 mcs = xen_mc_entry(0);
307 384
308 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 385 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
386 ADD_STATS(set_pte_at_batched, 1);
309 xen_mc_issue(PARAVIRT_LAZY_MMU); 387 xen_mc_issue(PARAVIRT_LAZY_MMU);
310 goto out; 388 goto out;
311 } else 389 } else
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
334 412
335 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 413 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
336 u.val = pte_val_ma(pte); 414 u.val = pte_val_ma(pte);
337 extend_mmu_update(&u); 415 xen_extend_mmu_update(&u);
416
417 ADD_STATS(prot_commit, 1);
418 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
338 419
339 xen_mc_issue(PARAVIRT_LAZY_MMU); 420 xen_mc_issue(PARAVIRT_LAZY_MMU);
340} 421}
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
400 /* ptr may be ioremapped for 64-bit pagetable setup */ 481 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 482 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
402 u.val = pud_val_ma(val); 483 u.val = pud_val_ma(val);
403 extend_mmu_update(&u); 484 xen_extend_mmu_update(&u);
485
486 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
404 487
405 xen_mc_issue(PARAVIRT_LAZY_MMU); 488 xen_mc_issue(PARAVIRT_LAZY_MMU);
406 489
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
409 492
410void xen_set_pud(pud_t *ptr, pud_t val) 493void xen_set_pud(pud_t *ptr, pud_t val)
411{ 494{
495 ADD_STATS(pud_update, 1);
496
412 /* If page is not pinned, we can just update the entry 497 /* If page is not pinned, we can just update the entry
413 directly */ 498 directly */
414 if (!page_pinned(ptr)) { 499 if (!xen_page_pinned(ptr)) {
415 *ptr = val; 500 *ptr = val;
416 return; 501 return;
417 } 502 }
418 503
504 ADD_STATS(pud_update_pinned, 1);
505
419 xen_set_pud_hyper(ptr, val); 506 xen_set_pud_hyper(ptr, val);
420} 507}
421 508
422void xen_set_pte(pte_t *ptep, pte_t pte) 509void xen_set_pte(pte_t *ptep, pte_t pte)
423{ 510{
511 ADD_STATS(pte_update, 1);
512// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
513 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
514
424#ifdef CONFIG_X86_PAE 515#ifdef CONFIG_X86_PAE
425 ptep->pte_high = pte.pte_high; 516 ptep->pte_high = pte.pte_high;
426 smp_wmb(); 517 smp_wmb();
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
490 581
491 u.ptr = virt_to_machine(ptr).maddr; 582 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val); 583 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u); 584 xen_extend_mmu_update(&u);
494} 585}
495 586
496/* 587/*
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{ 608{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr); 609 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519 610
611 ADD_STATS(pgd_update, 1);
612
520 /* If page is not pinned, we can just update the entry 613 /* If page is not pinned, we can just update the entry
521 directly */ 614 directly */
522 if (!page_pinned(ptr)) { 615 if (!xen_page_pinned(ptr)) {
523 *ptr = val; 616 *ptr = val;
524 if (user_ptr) { 617 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr)); 618 WARN_ON(xen_page_pinned(user_ptr));
526 *user_ptr = val; 619 *user_ptr = val;
527 } 620 }
528 return; 621 return;
529 } 622 }
530 623
624 ADD_STATS(pgd_update_pinned, 1);
625 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
626
531 /* If it's pinned, then we can at least batch the kernel and 627 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */ 628 user updates together. */
533 xen_mc_batch(); 629 xen_mc_batch();
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
555 * For 64-bit, we must skip the Xen hole in the middle of the address 651 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole. 652 * space, just after the big x86-64 virtual hole.
557 */ 653 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), 654static int xen_pgd_walk(struct mm_struct *mm,
559 unsigned long limit) 655 int (*func)(struct mm_struct *mm, struct page *,
656 enum pt_level),
657 unsigned long limit)
560{ 658{
659 pgd_t *pgd = mm->pgd;
561 int flush = 0; 660 int flush = 0;
562 unsigned hole_low, hole_high; 661 unsigned hole_low, hole_high;
563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; 662 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
590 pmdidx_limit = 0; 689 pmdidx_limit = 0;
591#endif 690#endif
592 691
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { 692 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
596 pud_t *pud; 693 pud_t *pud;
597 694
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
604 pud = pud_offset(&pgd[pgdidx], 0); 701 pud = pud_offset(&pgd[pgdidx], 0);
605 702
606 if (PTRS_PER_PUD > 1) /* not folded */ 703 if (PTRS_PER_PUD > 1) /* not folded */
607 flush |= (*func)(virt_to_page(pud), PT_PUD); 704 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
608 705
609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { 706 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
610 pmd_t *pmd; 707 pmd_t *pmd;
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
619 pmd = pmd_offset(&pud[pudidx], 0); 716 pmd = pmd_offset(&pud[pudidx], 0);
620 717
621 if (PTRS_PER_PMD > 1) /* not folded */ 718 if (PTRS_PER_PMD > 1) /* not folded */
622 flush |= (*func)(virt_to_page(pmd), PT_PMD); 719 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
623 720
624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { 721 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
625 struct page *pte; 722 struct page *pte;
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
633 continue; 730 continue;
634 731
635 pte = pmd_page(pmd[pmdidx]); 732 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE); 733 flush |= (*func)(mm, pte, PT_PTE);
637 } 734 }
638 } 735 }
639 } 736 }
737
640out: 738out:
739 /* Do the top level last, so that the callbacks can use it as
740 a cue to do final things like tlb flushes. */
741 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
641 742
642 return flush; 743 return flush;
643} 744}
644 745
645static spinlock_t *lock_pte(struct page *page) 746/* If we're using split pte locks, then take the page's lock and
747 return a pointer to it. Otherwise return NULL. */
748static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
646{ 749{
647 spinlock_t *ptl = NULL; 750 spinlock_t *ptl = NULL;
648 751
649#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 752#if USE_SPLIT_PTLOCKS
650 ptl = __pte_lockptr(page); 753 ptl = __pte_lockptr(page);
651 spin_lock(ptl); 754 spin_lock_nest_lock(ptl, &mm->page_table_lock);
652#endif 755#endif
653 756
654 return ptl; 757 return ptl;
655} 758}
656 759
657static void do_unlock(void *v) 760static void xen_pte_unlock(void *v)
658{ 761{
659 spinlock_t *ptl = v; 762 spinlock_t *ptl = v;
660 spin_unlock(ptl); 763 spin_unlock(ptl);
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
672 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 775 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
673} 776}
674 777
675static int pin_page(struct page *page, enum pt_level level) 778static int xen_pin_page(struct mm_struct *mm, struct page *page,
779 enum pt_level level)
676{ 780{
677 unsigned pgfl = TestSetPagePinned(page); 781 unsigned pgfl = TestSetPagePinned(page);
678 int flush; 782 int flush;
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
691 795
692 flush = 0; 796 flush = 0;
693 797
798 /*
799 * We need to hold the pagetable lock between the time
800 * we make the pagetable RO and when we actually pin
801 * it. If we don't, then other users may come in and
802 * attempt to update the pagetable by writing it,
803 * which will fail because the memory is RO but not
804 * pinned, so Xen won't do the trap'n'emulate.
805 *
806 * If we're using split pte locks, we can't hold the
807 * entire pagetable's worth of locks during the
808 * traverse, because we may wrap the preempt count (8
809 * bits). The solution is to mark RO and pin each PTE
810 * page while holding the lock. This means the number
811 * of locks we end up holding is never more than a
812 * batch size (~32 entries, at present).
813 *
814 * If we're not using split pte locks, we needn't pin
815 * the PTE pages independently, because we're
816 * protected by the overall pagetable lock.
817 */
694 ptl = NULL; 818 ptl = NULL;
695 if (level == PT_PTE) 819 if (level == PT_PTE)
696 ptl = lock_pte(page); 820 ptl = xen_pte_lock(page, mm);
697 821
698 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 822 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
699 pfn_pte(pfn, PAGE_KERNEL_RO), 823 pfn_pte(pfn, PAGE_KERNEL_RO),
700 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 824 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
701 825
702 if (level == PT_PTE) 826 if (ptl) {
703 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 827 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
704 828
705 if (ptl) {
706 /* Queue a deferred unlock for when this batch 829 /* Queue a deferred unlock for when this batch
707 is completed. */ 830 is completed. */
708 xen_mc_callback(do_unlock, ptl); 831 xen_mc_callback(xen_pte_unlock, ptl);
709 } 832 }
710 } 833 }
711 834
@@ -715,11 +838,11 @@ static int pin_page(struct page *page, enum pt_level level)
715/* This is called just after a mm has been created, but it has not 838/* This is called just after a mm has been created, but it has not
716 been used yet. We need to make sure that its pagetable is all 839 been used yet. We need to make sure that its pagetable is all
717 read-only, and can be pinned. */ 840 read-only, and can be pinned. */
718void xen_pgd_pin(pgd_t *pgd) 841static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
719{ 842{
720 xen_mc_batch(); 843 xen_mc_batch();
721 844
722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) { 845 if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
723 /* re-enable interrupts for kmap_flush_unused */ 846 /* re-enable interrupts for kmap_flush_unused */
724 xen_mc_issue(0); 847 xen_mc_issue(0);
725 kmap_flush_unused(); 848 kmap_flush_unused();
@@ -733,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd)
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 856 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734 857
735 if (user_pgd) { 858 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD); 859 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); 860 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 } 861 }
739 } 862 }
740#else /* CONFIG_X86_32 */ 863#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE 864#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */ 865 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 866 xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
867 PT_PMD);
744#endif 868#endif
745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 869 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */ 870#endif /* CONFIG_X86_64 */
747 xen_mc_issue(0); 871 xen_mc_issue(0);
748} 872}
749 873
874static void xen_pgd_pin(struct mm_struct *mm)
875{
876 __xen_pgd_pin(mm, mm->pgd);
877}
878
750/* 879/*
751 * On save, we need to pin all pagetables to make sure they get their 880 * On save, we need to pin all pagetables to make sure they get their
752 * mfns turned into pfns. Search the list for any unpinned pgds and pin 881 * mfns turned into pfns. Search the list for any unpinned pgds and pin
753 * them (unpinned pgds are not currently in use, probably because the 882 * them (unpinned pgds are not currently in use, probably because the
754 * process is under construction or destruction). 883 * process is under construction or destruction).
884 *
885 * Expected to be called in stop_machine() ("equivalent to taking
886 * every spinlock in the system"), so the locking doesn't really
887 * matter all that much.
755 */ 888 */
756void xen_mm_pin_all(void) 889void xen_mm_pin_all(void)
757{ 890{
@@ -762,7 +895,7 @@ void xen_mm_pin_all(void)
762 895
763 list_for_each_entry(page, &pgd_list, lru) { 896 list_for_each_entry(page, &pgd_list, lru) {
764 if (!PagePinned(page)) { 897 if (!PagePinned(page)) {
765 xen_pgd_pin((pgd_t *)page_address(page)); 898 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
766 SetPageSavePinned(page); 899 SetPageSavePinned(page);
767 } 900 }
768 } 901 }
@@ -775,7 +908,8 @@ void xen_mm_pin_all(void)
775 * that's before we have page structures to store the bits. So do all 908 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now. 909 * the book-keeping now.
777 */ 910 */
778static __init int mark_pinned(struct page *page, enum pt_level level) 911static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
912 enum pt_level level)
779{ 913{
780 SetPagePinned(page); 914 SetPagePinned(page);
781 return 0; 915 return 0;
@@ -783,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
783 917
784void __init xen_mark_init_mm_pinned(void) 918void __init xen_mark_init_mm_pinned(void)
785{ 919{
786 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 920 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
787} 921}
788 922
789static int unpin_page(struct page *page, enum pt_level level) 923static int xen_unpin_page(struct mm_struct *mm, struct page *page,
924 enum pt_level level)
790{ 925{
791 unsigned pgfl = TestClearPagePinned(page); 926 unsigned pgfl = TestClearPagePinned(page);
792 927
@@ -796,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level)
796 spinlock_t *ptl = NULL; 931 spinlock_t *ptl = NULL;
797 struct multicall_space mcs; 932 struct multicall_space mcs;
798 933
934 /*
935 * Do the converse to pin_page. If we're using split
 936	 * pte locks, we must be holding the lock while
937 * the pte page is unpinned but still RO to prevent
938 * concurrent updates from seeing it in this
939 * partially-pinned state.
940 */
799 if (level == PT_PTE) { 941 if (level == PT_PTE) {
800 ptl = lock_pte(page); 942 ptl = xen_pte_lock(page, mm);
801 943
802 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 944 if (ptl)
945 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
803 } 946 }
804 947
805 mcs = __xen_mc_entry(0); 948 mcs = __xen_mc_entry(0);
@@ -810,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level)
810 953
811 if (ptl) { 954 if (ptl) {
812 /* unlock when batch completed */ 955 /* unlock when batch completed */
813 xen_mc_callback(do_unlock, ptl); 956 xen_mc_callback(xen_pte_unlock, ptl);
814 } 957 }
815 } 958 }
816 959
@@ -818,7 +961,7 @@ static int unpin_page(struct page *page, enum pt_level level)
818} 961}
819 962
 820/* Release a pagetable's pages back as normal RW */ 963/* Release a pagetable's pages back as normal RW */
821static void xen_pgd_unpin(pgd_t *pgd) 964static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
822{ 965{
823 xen_mc_batch(); 966 xen_mc_batch();
824 967
@@ -830,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd)
830 973
831 if (user_pgd) { 974 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); 975 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD); 976 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
834 } 977 }
835 } 978 }
836#endif 979#endif
837 980
838#ifdef CONFIG_X86_PAE 981#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */ 982 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 983 xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
984 PT_PMD);
841#endif 985#endif
842 986
843 pgd_walk(pgd, unpin_page, USER_LIMIT); 987 xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
844 988
845 xen_mc_issue(0); 989 xen_mc_issue(0);
846} 990}
847 991
992static void xen_pgd_unpin(struct mm_struct *mm)
993{
994 __xen_pgd_unpin(mm, mm->pgd);
995}
996
848/* 997/*
849 * On resume, undo any pinning done at save, so that the rest of the 998 * On resume, undo any pinning done at save, so that the rest of the
850 * kernel doesn't see any unexpected pinned pagetables. 999 * kernel doesn't see any unexpected pinned pagetables.
@@ -859,7 +1008,7 @@ void xen_mm_unpin_all(void)
859 list_for_each_entry(page, &pgd_list, lru) { 1008 list_for_each_entry(page, &pgd_list, lru) {
860 if (PageSavePinned(page)) { 1009 if (PageSavePinned(page)) {
861 BUG_ON(!PagePinned(page)); 1010 BUG_ON(!PagePinned(page));
862 xen_pgd_unpin((pgd_t *)page_address(page)); 1011 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
863 ClearPageSavePinned(page); 1012 ClearPageSavePinned(page);
864 } 1013 }
865 } 1014 }
@@ -870,14 +1019,14 @@ void xen_mm_unpin_all(void)
870void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1019void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
871{ 1020{
872 spin_lock(&next->page_table_lock); 1021 spin_lock(&next->page_table_lock);
873 xen_pgd_pin(next->pgd); 1022 xen_pgd_pin(next);
874 spin_unlock(&next->page_table_lock); 1023 spin_unlock(&next->page_table_lock);
875} 1024}
876 1025
877void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1026void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
878{ 1027{
879 spin_lock(&mm->page_table_lock); 1028 spin_lock(&mm->page_table_lock);
880 xen_pgd_pin(mm->pgd); 1029 xen_pgd_pin(mm);
881 spin_unlock(&mm->page_table_lock); 1030 spin_unlock(&mm->page_table_lock);
882} 1031}
883 1032
@@ -907,7 +1056,7 @@ static void drop_other_mm_ref(void *info)
907 } 1056 }
908} 1057}
909 1058
910static void drop_mm_ref(struct mm_struct *mm) 1059static void xen_drop_mm_ref(struct mm_struct *mm)
911{ 1060{
912 cpumask_t mask; 1061 cpumask_t mask;
913 unsigned cpu; 1062 unsigned cpu;
@@ -937,7 +1086,7 @@ static void drop_mm_ref(struct mm_struct *mm)
937 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1086 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
938} 1087}
939#else 1088#else
940static void drop_mm_ref(struct mm_struct *mm) 1089static void xen_drop_mm_ref(struct mm_struct *mm)
941{ 1090{
942 if (current->active_mm == mm) 1091 if (current->active_mm == mm)
943 load_cr3(swapper_pg_dir); 1092 load_cr3(swapper_pg_dir);
@@ -961,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm)
961void xen_exit_mmap(struct mm_struct *mm) 1110void xen_exit_mmap(struct mm_struct *mm)
962{ 1111{
963 get_cpu(); /* make sure we don't move around */ 1112 get_cpu(); /* make sure we don't move around */
964 drop_mm_ref(mm); 1113 xen_drop_mm_ref(mm);
965 put_cpu(); 1114 put_cpu();
966 1115
967 spin_lock(&mm->page_table_lock); 1116 spin_lock(&mm->page_table_lock);
968 1117
969 /* pgd may not be pinned in the error exit path of execve */ 1118 /* pgd may not be pinned in the error exit path of execve */
970 if (page_pinned(mm->pgd)) 1119 if (xen_page_pinned(mm->pgd))
971 xen_pgd_unpin(mm->pgd); 1120 xen_pgd_unpin(mm);
972 1121
973 spin_unlock(&mm->page_table_lock); 1122 spin_unlock(&mm->page_table_lock);
974} 1123}
1124
1125#ifdef CONFIG_XEN_DEBUG_FS
1126
1127static struct dentry *d_mmu_debug;
1128
1129static int __init xen_mmu_debugfs(void)
1130{
1131 struct dentry *d_xen = xen_init_debugfs();
1132
1133 if (d_xen == NULL)
1134 return -ENOMEM;
1135
1136 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1137
1138 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1139
1140 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1141 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1142 &mmu_stats.pgd_update_pinned);
1143 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1144 &mmu_stats.pgd_update_pinned);
1145
1146 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1147 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1148 &mmu_stats.pud_update_pinned);
1149 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1150 &mmu_stats.pud_update_pinned);
1151
1152 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1153 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1154 &mmu_stats.pmd_update_pinned);
1155 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1156 &mmu_stats.pmd_update_pinned);
1157
1158 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1159// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1160// &mmu_stats.pte_update_pinned);
1161 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1162 &mmu_stats.pte_update_pinned);
1163
1164 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1165 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1166 &mmu_stats.mmu_update_extended);
1167 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1168 mmu_stats.mmu_update_histo, 20);
1169
1170 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1171 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1172 &mmu_stats.set_pte_at_batched);
1173 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1174 &mmu_stats.set_pte_at_current);
1175 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1176 &mmu_stats.set_pte_at_kernel);
1177
1178 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1179 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1180 &mmu_stats.prot_commit_batched);
1181
1182 return 0;
1183}
1184fs_initcall(xen_mmu_debugfs);
1185
1186#endif /* CONFIG_XEN_DEBUG_FS */
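
A recurring change through mmu.c is that the page-table walker and its pin/unpin callbacks now carry the owning mm_struct, so the PTE-level callback can take the correct split pte lock, and the top-level PGD is visited last as a completion cue for things like TLB flushes. The general "walker passes a context to its callbacks, root visited last" shape can be sketched generically (hypothetical types, not the kernel's):

#include <stdio.h>

struct ctx { const char *name; };	/* plays the role of the mm_struct */

/* The callback now receives the walk context as well as the node. */
typedef int (*visit_fn)(struct ctx *ctx, int node, int level);

static int walk(struct ctx *ctx, const int *nodes, int n, visit_fn fn)
{
	int flush = 0, i;

	for (i = 1; i < n; i++)			/* children first... */
		flush |= fn(ctx, nodes[i], 1);
	flush |= fn(ctx, nodes[0], 0);		/* ...root last, as a completion cue */
	return flush;
}

static int print_visit(struct ctx *ctx, int node, int level)
{
	printf("%s: node %d at level %d\n", ctx->name, node, level);
	return 0;
}

int main(void)
{
	struct ctx c = { "demo-mm" };
	int nodes[] = { 100, 1, 2, 3 };		/* nodes[0] stands in for the pgd */

	walk(&c, nodes, 4, print_visit);
	return 0;
}
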
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 0f59bd03f9e3..98d71659da5a 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); 18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
19void xen_exit_mmap(struct mm_struct *mm); 19void xen_exit_mmap(struct mm_struct *mm);
20 20
21void xen_pgd_pin(pgd_t *pgd);
22//void xen_pgd_unpin(pgd_t *pgd);
23
24pteval_t xen_pte_val(pte_t); 21pteval_t xen_pte_val(pte_t);
25pmdval_t xen_pmd_val(pmd_t); 22pmdval_t xen_pmd_val(pmd_t);
26pgdval_t xen_pgd_val(pgd_t); 23pgdval_t xen_pgd_val(pgd_t);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 9efd1c6c9776..8ea8a0d0b0de 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,16 +21,20 @@
21 */ 21 */
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/hardirq.h> 23#include <linux/hardirq.h>
24#include <linux/debugfs.h>
24 25
25#include <asm/xen/hypercall.h> 26#include <asm/xen/hypercall.h>
26 27
27#include "multicalls.h" 28#include "multicalls.h"
29#include "debugfs.h"
30
31#define MC_BATCH 32
28 32
29#define MC_DEBUG 1 33#define MC_DEBUG 1
30 34
31#define MC_BATCH 32
32#define MC_ARGS (MC_BATCH * 16) 35#define MC_ARGS (MC_BATCH * 16)
33 36
37
34struct mc_buffer { 38struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 39 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 40#if MC_DEBUG
@@ -47,6 +51,76 @@ struct mc_buffer {
47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 51static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
48DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); 52DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
49 53
54/* flush reasons 0- slots, 1- args, 2- callbacks */
55enum flush_reasons
56{
57 FL_SLOTS,
58 FL_ARGS,
59 FL_CALLBACKS,
60
61 FL_N_REASONS
62};
63
64#ifdef CONFIG_XEN_DEBUG_FS
65#define NHYPERCALLS 40 /* not really */
66
67static struct {
68 unsigned histo[MC_BATCH+1];
69
70 unsigned issued;
71 unsigned arg_total;
72 unsigned hypercalls;
73 unsigned histo_hypercalls[NHYPERCALLS];
74
75 unsigned flush[FL_N_REASONS];
76} mc_stats;
77
78static u8 zero_stats;
79
80static inline void check_zero(void)
81{
82 if (unlikely(zero_stats)) {
83 memset(&mc_stats, 0, sizeof(mc_stats));
84 zero_stats = 0;
85 }
86}
87
88static void mc_add_stats(const struct mc_buffer *mc)
89{
90 int i;
91
92 check_zero();
93
94 mc_stats.issued++;
95 mc_stats.hypercalls += mc->mcidx;
96 mc_stats.arg_total += mc->argidx;
97
98 mc_stats.histo[mc->mcidx]++;
99 for(i = 0; i < mc->mcidx; i++) {
100 unsigned op = mc->entries[i].op;
101 if (op < NHYPERCALLS)
102 mc_stats.histo_hypercalls[op]++;
103 }
104}
105
106static void mc_stats_flush(enum flush_reasons idx)
107{
108 check_zero();
109
110 mc_stats.flush[idx]++;
111}
112
113#else /* !CONFIG_XEN_DEBUG_FS */
114
115static inline void mc_add_stats(const struct mc_buffer *mc)
116{
117}
118
119static inline void mc_stats_flush(enum flush_reasons idx)
120{
121}
122#endif /* CONFIG_XEN_DEBUG_FS */
123
50void xen_mc_flush(void) 124void xen_mc_flush(void)
51{ 125{
52 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 126 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
60 something in the middle */ 134 something in the middle */
61 local_irq_save(flags); 135 local_irq_save(flags);
62 136
137 mc_add_stats(b);
138
63 if (b->mcidx) { 139 if (b->mcidx) {
64#if MC_DEBUG 140#if MC_DEBUG
65 memcpy(b->debug, b->entries, 141 memcpy(b->debug, b->entries,
@@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args)
115 191
116 if (b->mcidx == MC_BATCH || 192 if (b->mcidx == MC_BATCH ||
117 (argidx + args) > MC_ARGS) { 193 (argidx + args) > MC_ARGS) {
194 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
118 xen_mc_flush(); 195 xen_mc_flush();
119 argidx = roundup(b->argidx, sizeof(u64)); 196 argidx = roundup(b->argidx, sizeof(u64));
120 } 197 }
@@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
158 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 235 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
159 struct callback *cb; 236 struct callback *cb;
160 237
161 if (b->cbidx == MC_BATCH) 238 if (b->cbidx == MC_BATCH) {
239 mc_stats_flush(FL_CALLBACKS);
162 xen_mc_flush(); 240 xen_mc_flush();
241 }
163 242
164 cb = &b->callbacks[b->cbidx++]; 243 cb = &b->callbacks[b->cbidx++];
165 cb->fn = fn; 244 cb->fn = fn;
166 cb->data = data; 245 cb->data = data;
167} 246}
247
248#ifdef CONFIG_XEN_DEBUG_FS
249
250static struct dentry *d_mc_debug;
251
252static int __init xen_mc_debugfs(void)
253{
254 struct dentry *d_xen = xen_init_debugfs();
255
256 if (d_xen == NULL)
257 return -ENOMEM;
258
259 d_mc_debug = debugfs_create_dir("multicalls", d_xen);
260
261 debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
262
263 debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
264 debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
265 debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
266
267 xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
268 mc_stats.histo, MC_BATCH);
269 xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
270 mc_stats.histo_hypercalls, NHYPERCALLS);
271 xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
272 mc_stats.flush, FL_N_REASONS);
273
274 return 0;
275}
276fs_initcall(xen_mc_debugfs);
277
278#endif /* CONFIG_XEN_DEBUG_FS */
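
The multicalls change wraps the existing batcher with statistics: every forced flush is attributed to a reason (out of entry slots, out of argument space, or out of callback slots) and each batch's size feeds a histogram. A toy batcher showing the flush-and-count-the-reason structure, simplified and not using the kernel interfaces:

#include <stdio.h>

#define BATCH 4

enum flush_reason { FL_SLOTS, FL_ARGS, FL_CALLBACKS, FL_N_REASONS };

static int pending[BATCH];
static int npending;
static unsigned flushes[FL_N_REASONS];

static void flush(enum flush_reason why)
{
	flushes[why]++;
	printf("flushing %d entries (reason %d)\n", npending, (int)why);
	npending = 0;
}

/* Queue one operation, flushing first if the batch is already full. */
static void queue(int op)
{
	if (npending == BATCH)
		flush(FL_SLOTS);
	pending[npending++] = op;
}

int main(void)
{
	int i;

	for (i = 0; i < 10; i++)
		queue(i);
	flush(FL_CALLBACKS);	/* pretend a callback overflow forced this one */
	printf("slot flushes=%u, callback flushes=%u\n",
	       flushes[FL_SLOTS], flushes[FL_CALLBACKS]);
	return 0;
}
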
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b6acc3a0af46..d67901083888 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -42,7 +42,7 @@ char * __init xen_memory_setup(void)
42 42
43 e820.nr_map = 0; 43 e820.nr_map = 0;
44 44
45 e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM); 45 e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
46 46
47 /* 47 /*
48 * Even though this is normal, usable memory under Xen, reserve 48 * Even though this is normal, usable memory under Xen, reserve
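
The one-liner in setup.c casts max_pfn to u64 before the shift: on 32-bit, PFN_PHYS() would otherwise compute the shift in unsigned long and wrap for machines with memory above 4GB, truncating the E820 region size. The effect is easy to reproduce in plain C (this assumes the usual x86 PAGE_SHIFT of 12; it is only an illustration of the overflow, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint32_t max_pfn = 0x100001;	/* just over 4GB worth of 4K pages */

	/* 32-bit arithmetic: the shift wraps and the size is bogus. */
	uint32_t wrong = max_pfn << PAGE_SHIFT;

	/* Widen first (the (u64) cast in the patch), then shift. */
	uint64_t right = (uint64_t)max_pfn << PAGE_SHIFT;

	printf("truncated: %#x\n", wrong);			/* 0x1000      */
	printf("correct:   %#llx\n", (unsigned long long)right);/* 0x100001000 */
	return 0;
}
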
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d8faf79a0a1d..d77da613b1d2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,11 +11,8 @@
11 * useful topology information for the kernel to make use of. As a 11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and 12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded. 13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */ 14 */
17#include <linux/sched.h> 15#include <linux/sched.h>
18#include <linux/kernel_stat.h>
19#include <linux/err.h> 16#include <linux/err.h>
20#include <linux/smp.h> 17#include <linux/smp.h>
21 18
@@ -36,8 +33,6 @@
36#include "xen-ops.h" 33#include "xen-ops.h"
37#include "mmu.h" 34#include "mmu.h"
38 35
39static void __cpuinit xen_init_lock_cpu(int cpu);
40
41cpumask_t xen_cpu_initialized_map; 36cpumask_t xen_cpu_initialized_map;
42 37
43static DEFINE_PER_CPU(int, resched_irq); 38static DEFINE_PER_CPU(int, resched_irq);
@@ -64,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
64 return IRQ_HANDLED; 59 return IRQ_HANDLED;
65} 60}
66 61
67static __cpuinit void cpu_bringup_and_idle(void) 62static __cpuinit void cpu_bringup(void)
68{ 63{
69 int cpu = smp_processor_id(); 64 int cpu = smp_processor_id();
70 65
71 cpu_init(); 66 cpu_init();
67 touch_softlockup_watchdog();
72 preempt_disable(); 68 preempt_disable();
73 69
74 xen_enable_sysenter(); 70 xen_enable_sysenter();
@@ -89,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void)
89 local_irq_enable(); 85 local_irq_enable();
90 86
91 wmb(); /* make sure everything is out */ 87 wmb(); /* make sure everything is out */
88}
89
90static __cpuinit void cpu_bringup_and_idle(void)
91{
92 cpu_bringup();
92 cpu_idle(); 93 cpu_idle();
93} 94}
94 95
@@ -212,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
212 213
213 cpu_set(cpu, cpu_present_map); 214 cpu_set(cpu, cpu_present_map);
214 } 215 }
215
216 //init_xenbus_allowed_cpumask();
217} 216}
218 217
219static __cpuinit int 218static __cpuinit int
@@ -281,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
281 struct task_struct *idle = idle_task(cpu); 280 struct task_struct *idle = idle_task(cpu);
282 int rc; 281 int rc;
283 282
284#if 0
285 rc = cpu_up_check(cpu);
286 if (rc)
287 return rc;
288#endif
289
290#ifdef CONFIG_X86_64 283#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */ 284 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0); 285 WARN_ON(cpu == 0);
@@ -339,6 +332,60 @@ static void xen_smp_cpus_done(unsigned int max_cpus)
339{ 332{
340} 333}
341 334
335#ifdef CONFIG_HOTPLUG_CPU
336static int xen_cpu_disable(void)
337{
338 unsigned int cpu = smp_processor_id();
339 if (cpu == 0)
340 return -EBUSY;
341
342 cpu_disable_common();
343
344 load_cr3(swapper_pg_dir);
345 return 0;
346}
347
348static void xen_cpu_die(unsigned int cpu)
349{
350 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
351 current->state = TASK_UNINTERRUPTIBLE;
352 schedule_timeout(HZ/10);
353 }
354 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
355 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
356 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
357 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
358 xen_uninit_lock_cpu(cpu);
359 xen_teardown_timer(cpu);
360
361 if (num_online_cpus() == 1)
362 alternatives_smp_switch(0);
363}
364
365static void xen_play_dead(void)
366{
367 play_dead_common();
368 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
369 cpu_bringup();
370}
371
372#else /* !CONFIG_HOTPLUG_CPU */
373static int xen_cpu_disable(void)
374{
375 return -ENOSYS;
376}
377
378static void xen_cpu_die(unsigned int cpu)
379{
380 BUG();
381}
382
383static void xen_play_dead(void)
384{
385 BUG();
386}
387
388#endif
342static void stop_self(void *v) 389static void stop_self(void *v)
343{ 390{
344 int cpu = smp_processor_id(); 391 int cpu = smp_processor_id();
@@ -419,176 +466,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
419 return IRQ_HANDLED; 466 return IRQ_HANDLED;
420} 467}
421 468
422struct xen_spinlock {
423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
428{
429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
430
431 return xl->lock != 0;
432}
433
434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
437
438 /* Not strictly true; this is only the count of contended
439 lock-takers entering the slow path. */
440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
489 /* check again make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
539 }
540}
541
542static void xen_spin_unlock(struct raw_spinlock *lock)
543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
545
546 smp_wmb(); /* make sure no writes get moved after unlock */
547 xl->lock = 0; /* release lock */
548
549 /* make sure unlock happens before kick */
550 barrier();
551
552 if (unlikely(xl->spinners))
553 xen_spin_unlock_slow(xl);
554}
555
556static __cpuinit void xen_init_lock_cpu(int cpu)
557{
558 int irq;
559 const char *name;
560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
573
574 printk("cpu %d spinlock event irq %d\n", cpu, irq);
575}
576
577static void __init xen_init_spinlocks(void)
578{
579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
585
586static const struct smp_ops xen_smp_ops __initdata = { 469static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 470 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus, 471 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done, 472 .smp_cpus_done = xen_smp_cpus_done,
591 473
474 .cpu_up = xen_cpu_up,
475 .cpu_die = xen_cpu_die,
476 .cpu_disable = xen_cpu_disable,
477 .play_dead = xen_play_dead,
478
592 .smp_send_stop = xen_smp_send_stop, 479 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule, 480 .smp_send_reschedule = xen_smp_send_reschedule,
594 481
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 000000000000..dd71e3a021cd
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,428 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/kernel_stat.h>
6#include <linux/spinlock.h>
7#include <linux/debugfs.h>
8#include <linux/log2.h>
9
10#include <asm/paravirt.h>
11
12#include <xen/interface/xen.h>
13#include <xen/events.h>
14
15#include "xen-ops.h"
16#include "debugfs.h"
17
18#ifdef CONFIG_XEN_DEBUG_FS
19static struct xen_spinlock_stats
20{
21 u64 taken;
22 u32 taken_slow;
23 u32 taken_slow_nested;
24 u32 taken_slow_pickup;
25 u32 taken_slow_spurious;
26 u32 taken_slow_irqenable;
27
28 u64 released;
29 u32 released_slow;
30 u32 released_slow_kicked;
31
32#define HISTO_BUCKETS 30
33 u32 histo_spin_total[HISTO_BUCKETS+1];
34 u32 histo_spin_spinning[HISTO_BUCKETS+1];
35 u32 histo_spin_blocked[HISTO_BUCKETS+1];
36
37 u64 time_total;
38 u64 time_spinning;
39 u64 time_blocked;
40} spinlock_stats;
41
42static u8 zero_stats;
43
44static unsigned lock_timeout = 1 << 10;
45#define TIMEOUT lock_timeout
46
47static inline void check_zero(void)
48{
49 if (unlikely(zero_stats)) {
50 memset(&spinlock_stats, 0, sizeof(spinlock_stats));
51 zero_stats = 0;
52 }
53}
54
55#define ADD_STATS(elem, val) \
56 do { check_zero(); spinlock_stats.elem += (val); } while(0)
57
58static inline u64 spin_time_start(void)
59{
60 return xen_clocksource_read();
61}
62
63static void __spin_time_accum(u64 delta, u32 *array)
64{
65 unsigned index = ilog2(delta);
66
67 check_zero();
68
69 if (index < HISTO_BUCKETS)
70 array[index]++;
71 else
72 array[HISTO_BUCKETS]++;
73}
74
75static inline void spin_time_accum_spinning(u64 start)
76{
77 u32 delta = xen_clocksource_read() - start;
78
79 __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
80 spinlock_stats.time_spinning += delta;
81}
82
83static inline void spin_time_accum_total(u64 start)
84{
85 u32 delta = xen_clocksource_read() - start;
86
87 __spin_time_accum(delta, spinlock_stats.histo_spin_total);
88 spinlock_stats.time_total += delta;
89}
90
91static inline void spin_time_accum_blocked(u64 start)
92{
93 u32 delta = xen_clocksource_read() - start;
94
95 __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
96 spinlock_stats.time_blocked += delta;
97}
98#else /* !CONFIG_XEN_DEBUG_FS */
99#define TIMEOUT (1 << 10)
100#define ADD_STATS(elem, val) do { (void)(val); } while(0)
101
102static inline u64 spin_time_start(void)
103{
104 return 0;
105}
106
107static inline void spin_time_accum_total(u64 start)
108{
109}
110static inline void spin_time_accum_spinning(u64 start)
111{
112}
113static inline void spin_time_accum_blocked(u64 start)
114{
115}
116#endif /* CONFIG_XEN_DEBUG_FS */
117
118struct xen_spinlock {
119 unsigned char lock; /* 0 -> free; 1 -> locked */
120 unsigned short spinners; /* count of waiting cpus */
121};
122
123static int xen_spin_is_locked(struct raw_spinlock *lock)
124{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126
127 return xl->lock != 0;
128}
129
130static int xen_spin_is_contended(struct raw_spinlock *lock)
131{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133
134 /* Not strictly true; this is only the count of contended
135 lock-takers entering the slow path. */
136 return xl->spinners != 0;
137}
138
139static int xen_spin_trylock(struct raw_spinlock *lock)
140{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1;
143
144 asm("xchgb %b0,%1"
145 : "+q" (old), "+m" (xl->lock) : : "memory");
146
147 return old == 0;
148}
149
150static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
151static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
152
153/*
154 * Mark a cpu as interested in a lock. Returns the CPU's previous
155 * lock of interest, in case we got preempted by an interrupt.
156 */
157static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
158{
159 struct xen_spinlock *prev;
160
161 prev = __get_cpu_var(lock_spinners);
162 __get_cpu_var(lock_spinners) = xl;
163
164 wmb(); /* set lock of interest before count */
165
166 asm(LOCK_PREFIX " incw %0"
167 : "+m" (xl->spinners) : : "memory");
168
169 return prev;
170}
171
172/*
173 * Mark a cpu as no longer interested in a lock. Restores previous
174 * lock of interest (NULL for none).
175 */
176static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
177{
178 asm(LOCK_PREFIX " decw %0"
179 : "+m" (xl->spinners) : : "memory");
180 wmb(); /* decrement count before restoring lock */
181 __get_cpu_var(lock_spinners) = prev;
182}
183
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
185{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev;
188 int irq = __get_cpu_var(lock_kicker_irq);
189 int ret;
190 unsigned long flags;
191 u64 start;
192
193 /* If kicker interrupts not initialized yet, just spin */
194 if (irq == -1)
195 return 0;
196
197 start = spin_time_start();
198
199 /* announce we're spinning */
200 prev = spinning_lock(xl);
201
202 flags = __raw_local_save_flags();
203 if (irq_enable) {
204 ADD_STATS(taken_slow_irqenable, 1);
205 raw_local_irq_enable();
206 }
207
208 ADD_STATS(taken_slow, 1);
209 ADD_STATS(taken_slow_nested, prev != NULL);
210
211 do {
212 /* clear pending */
213 xen_clear_irq_pending(irq);
214
215 /* check again make sure it didn't become free while
216 we weren't looking */
217 ret = xen_spin_trylock(lock);
218 if (ret) {
219 ADD_STATS(taken_slow_pickup, 1);
220
221 /*
222 * If we interrupted another spinlock while it
223 * was blocking, make sure it doesn't block
224 * without rechecking the lock.
225 */
226 if (prev != NULL)
227 xen_set_irq_pending(irq);
228 goto out;
229 }
230
231 /*
232 * Block until irq becomes pending. If we're
233 * interrupted at this point (after the trylock but
234 * before entering the block), then the nested lock
235 * handler guarantees that the irq will be left
236 * pending if there's any chance the lock became free;
237 * xen_poll_irq() returns immediately if the irq is
238 * pending.
239 */
240 xen_poll_irq(irq);
241 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
242 } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
243
244 kstat_this_cpu.irqs[irq]++;
245
246out:
247 raw_local_irq_restore(flags);
248 unspinning_lock(xl, prev);
249 spin_time_accum_blocked(start);
250
251 return ret;
252}
253
254static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
255{
256 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
257 unsigned timeout;
258 u8 oldval;
259 u64 start_spin;
260
261 ADD_STATS(taken, 1);
262
263 start_spin = spin_time_start();
264
265 do {
266 u64 start_spin_fast = spin_time_start();
267
268 timeout = TIMEOUT;
269
270 asm("1: xchgb %1,%0\n"
271 " testb %1,%1\n"
272 " jz 3f\n"
273 "2: rep;nop\n"
274 " cmpb $0,%0\n"
275 " je 1b\n"
276 " dec %2\n"
277 " jnz 2b\n"
278 "3:\n"
279 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
280 : "1" (1)
281 : "memory");
282
283 spin_time_accum_spinning(start_spin_fast);
284
285 } while (unlikely(oldval != 0 &&
286 (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
287
288 spin_time_accum_total(start_spin);
289}
290
291static void xen_spin_lock(struct raw_spinlock *lock)
292{
293 __xen_spin_lock(lock, false);
294}
295
296static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
297{
298 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
299}
300
301static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
302{
303 int cpu;
304
305 ADD_STATS(released_slow, 1);
306
307 for_each_online_cpu(cpu) {
308 /* XXX should mix up next cpu selection */
309 if (per_cpu(lock_spinners, cpu) == xl) {
310 ADD_STATS(released_slow_kicked, 1);
311 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
312 break;
313 }
314 }
315}
316
317static void xen_spin_unlock(struct raw_spinlock *lock)
318{
319 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
320
321 ADD_STATS(released, 1);
322
323 smp_wmb(); /* make sure no writes get moved after unlock */
324 xl->lock = 0; /* release lock */
325
326 /* make sure unlock happens before kick */
327 barrier();
328
329 if (unlikely(xl->spinners))
330 xen_spin_unlock_slow(xl);
331}
332
333static irqreturn_t dummy_handler(int irq, void *dev_id)
334{
335 BUG();
336 return IRQ_HANDLED;
337}
338
339void __cpuinit xen_init_lock_cpu(int cpu)
340{
341 int irq;
342 const char *name;
343
344 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
345 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
346 cpu,
347 dummy_handler,
348 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
349 name,
350 NULL);
351
352 if (irq >= 0) {
353 disable_irq(irq); /* make sure it's never delivered */
354 per_cpu(lock_kicker_irq, cpu) = irq;
355 }
356
357 printk("cpu %d spinlock event irq %d\n", cpu, irq);
358}
359
360void xen_uninit_lock_cpu(int cpu)
361{
362 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
363}
364
365void __init xen_init_spinlocks(void)
366{
367 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
368 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
369 pv_lock_ops.spin_lock = xen_spin_lock;
370 pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
371 pv_lock_ops.spin_trylock = xen_spin_trylock;
372 pv_lock_ops.spin_unlock = xen_spin_unlock;
373}
374
375#ifdef CONFIG_XEN_DEBUG_FS
376
377static struct dentry *d_spin_debug;
378
379static int __init xen_spinlock_debugfs(void)
380{
381 struct dentry *d_xen = xen_init_debugfs();
382
383 if (d_xen == NULL)
384 return -ENOMEM;
385
386 d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
387
388 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
389
390 debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
391
392 debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
393 debugfs_create_u32("taken_slow", 0444, d_spin_debug,
394 &spinlock_stats.taken_slow);
395 debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
396 &spinlock_stats.taken_slow_nested);
397 debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
398 &spinlock_stats.taken_slow_pickup);
399 debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
400 &spinlock_stats.taken_slow_spurious);
401 debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
402 &spinlock_stats.taken_slow_irqenable);
403
404 debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
405 debugfs_create_u32("released_slow", 0444, d_spin_debug,
406 &spinlock_stats.released_slow);
407 debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
408 &spinlock_stats.released_slow_kicked);
409
410 debugfs_create_u64("time_spinning", 0444, d_spin_debug,
411 &spinlock_stats.time_spinning);
412 debugfs_create_u64("time_blocked", 0444, d_spin_debug,
413 &spinlock_stats.time_blocked);
414 debugfs_create_u64("time_total", 0444, d_spin_debug,
415 &spinlock_stats.time_total);
416
417 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
418 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
419 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
420 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
421 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
422 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
423
424 return 0;
425}
426fs_initcall(xen_spinlock_debugfs);
427
428#endif /* CONFIG_XEN_DEBUG_FS */
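
[Editor's note, not part of the patch above] The new spinlock.c implements the paravirtual slow path as: a one-byte lock plus a 16-bit count of blocked waiters, a bounded xchg spin, and an event-channel "kick" that wakes exactly one CPU recorded in the per-cpu lock_spinners pointer. The following is a minimal user-space analogue of that shape, assuming POSIX threads, with a per-thread counting semaphore standing in for the per-cpu kicker IRQ; every name in it (demo_lock, demo_spin_lock, demo_spin_unlock, demo_trylock, waiting_on, kick, worker) is invented for illustration and appears nowhere in the commit.

        /* Illustrative sketch only -- not kernel code and not part of this commit. */
        #include <pthread.h>
        #include <semaphore.h>
        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define NTHREADS     4
        #define SPIN_TIMEOUT (1 << 10)              /* bounded spin, like TIMEOUT above */

        struct demo_lock {
                atomic_uchar  lock;                 /* 0 -> free; 1 -> locked */
                atomic_ushort spinners;             /* count of blocked waiters */
        };

        static _Thread_local int me;                          /* this thread's index */
        static struct demo_lock *_Atomic waiting_on[NTHREADS];/* ~ lock_spinners */
        static sem_t kick[NTHREADS];                          /* ~ the per-cpu kicker irq */

        static bool demo_trylock(struct demo_lock *l)
        {
                return atomic_exchange(&l->lock, 1) == 0;     /* like the xchgb */
        }

        static void demo_lock_slow(struct demo_lock *l)
        {
                waiting_on[me] = l;                 /* announce interest first... */
                atomic_fetch_add(&l->spinners, 1);  /* ...then bump the count */

                while (!demo_trylock(l))            /* re-check, then block */
                        sem_wait(&kick[me]);        /* spurious wakeups just retry */

                atomic_fetch_sub(&l->spinners, 1);
                waiting_on[me] = NULL;
        }

        static void demo_spin_lock(struct demo_lock *l)
        {
                for (int timeout = SPIN_TIMEOUT; timeout; timeout--)
                        if (demo_trylock(l))
                                return;
                demo_lock_slow(l);                  /* spun too long: block */
        }

        static void demo_spin_unlock(struct demo_lock *l)
        {
                atomic_store(&l->lock, 0);          /* release the byte lock */
                if (atomic_load(&l->spinners) == 0) /* uncontended: nothing to kick */
                        return;
                for (int t = 0; t < NTHREADS; t++)  /* kick one blocked waiter */
                        if (waiting_on[t] == l) {
                                sem_post(&kick[t]);
                                break;
                        }
        }

        static struct demo_lock biglock;
        static long counter;

        static void *worker(void *arg)
        {
                me = (int)(intptr_t)arg;
                for (int i = 0; i < 100000; i++) {
                        demo_spin_lock(&biglock);
                        counter++;                  /* protected by biglock */
                        demo_spin_unlock(&biglock);
                }
                return NULL;
        }

        int main(void)
        {
                pthread_t tid[NTHREADS];

                for (int t = 0; t < NTHREADS; t++)
                        sem_init(&kick[t], 0, 0);
                for (int t = 0; t < NTHREADS; t++)
                        pthread_create(&tid[t], NULL, worker, (void *)(intptr_t)t);
                for (int t = 0; t < NTHREADS; t++)
                        pthread_join(tid[t], NULL);

                printf("counter = %ld (expected %d)\n", counter, NTHREADS * 100000);
                return 0;
        }

As in the patch, interest in a lock is published before the waiter count is bumped, and the unlocker clears the lock before reading the count, so a waiter either sees the lock free on its final trylock or is guaranteed a kick. The counting semaphore plays the role of the clear-pending/poll-pending sequence in xen_spin_lock_slow(): a kick posted before the waiter actually blocks is remembered, so the only cost of a stale kick is one spurious retry.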
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 685b77470fc3..004ba86326ae 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,8 +30,6 @@
30#define TIMER_SLOP 100000 30#define TIMER_SLOP 100000
31#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
32 32
33static cycle_t xen_clocksource_read(void);
34
35/* runstate info updated by Xen */ 33/* runstate info updated by Xen */
36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
37 35
@@ -213,7 +211,7 @@ unsigned long xen_tsc_khz(void)
213 return xen_khz; 211 return xen_khz;
214} 212}
215 213
216static cycle_t xen_clocksource_read(void) 214cycle_t xen_clocksource_read(void)
217{ 215{
218 struct pvclock_vcpu_time_info *src; 216 struct pvclock_vcpu_time_info *src;
219 cycle_t ret; 217 cycle_t ret;
@@ -452,6 +450,14 @@ void xen_setup_timer(int cpu)
452 setup_runstate_info(cpu); 450 setup_runstate_info(cpu);
453} 451}
454 452
453void xen_teardown_timer(int cpu)
454{
455 struct clock_event_device *evt;
456 BUG_ON(cpu == 0);
457 evt = &per_cpu(xen_clock_events, cpu);
458 unbind_from_irqhandler(evt->irq, NULL);
459}
460
455void xen_setup_cpu_clockevents(void) 461void xen_setup_cpu_clockevents(void)
456{ 462{
457 BUG_ON(preemptible()); 463 BUG_ON(preemptible());
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..42786f59d9c0 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -298,7 +298,7 @@ check_events:
298 push %eax 298 push %eax
299 push %ecx 299 push %ecx
300 push %edx 300 push %edx
301 call force_evtchn_callback 301 call xen_force_evtchn_callback
302 pop %edx 302 pop %edx
303 pop %ecx 303 pop %ecx
304 pop %eax 304 pop %eax
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 7f58304fafb3..05794c566e87 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -26,8 +26,15 @@
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */ 26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000 27#define XEN_EFLAGS_NMI 0x80000000
28 28
29#if 0 29#if 1
30#include <asm/percpu.h> 30/*
31 x86-64 does not yet support direct access to percpu variables
32 via a segment override, so we just need to make sure this code
33 never gets used
34 */
35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif
31 38
32/* 39/*
33 Enable events. This clears the event mask and tests the pending 40 Enable events. This clears the event mask and tests the pending
@@ -35,6 +42,8 @@
35 events, then enter the hypervisor to get them handled. 42 events, then enter the hypervisor to get them handled.
36 */ 43 */
37ENTRY(xen_irq_enable_direct) 44ENTRY(xen_irq_enable_direct)
45 BUG
46
38 /* Unmask events */ 47 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40 49
@@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct)
58 non-zero. 67 non-zero.
59 */ 68 */
60ENTRY(xen_irq_disable_direct) 69ENTRY(xen_irq_disable_direct)
70 BUG
71
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct) 73ENDPATCH(xen_irq_disable_direct)
63 ret 74 ret
@@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct)
74 Xen and x86 use opposite senses (mask vs enable). 85 Xen and x86 use opposite senses (mask vs enable).
75 */ 86 */
76ENTRY(xen_save_fl_direct) 87ENTRY(xen_save_fl_direct)
88 BUG
89
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah 91 setz %ah
79 addb %ah,%ah 92 addb %ah,%ah
@@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct)
91 if so. 104 if so.
92 */ 105 */
93ENTRY(xen_restore_fl_direct) 106ENTRY(xen_restore_fl_direct)
107 BUG
108
94 testb $X86_EFLAGS_IF>>8, %ah 109 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with 111 /* Preempt here doesn't matter because that will deal with
@@ -122,7 +137,7 @@ check_events:
122 push %r9 137 push %r9
123 push %r10 138 push %r10
124 push %r11 139 push %r11
125 call force_evtchn_callback 140 call xen_force_evtchn_callback
126 pop %r11 141 pop %r11
127 pop %r10 142 pop %r10
128 pop %r9 143 pop %r9
@@ -133,7 +148,6 @@ check_events:
133 pop %rcx 148 pop %rcx
134 pop %rax 149 pop %rax
135 ret 150 ret
136#endif
137 151
138ENTRY(xen_adjust_exception_frame) 152ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx 153 mov 8+0(%rsp),%rcx
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index dd3c23152a2e..d7422dc2a55c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
2#define XEN_OPS_H 2#define XEN_OPS_H
3 3
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/clocksource.h>
5#include <linux/irqreturn.h> 6#include <linux/irqreturn.h>
6#include <xen/xen-ops.h> 7#include <xen/xen-ops.h>
7 8
@@ -31,7 +32,10 @@ void xen_vcpu_restore(void);
31 32
32void __init xen_build_dynamic_phys_to_machine(void); 33void __init xen_build_dynamic_phys_to_machine(void);
33 34
35void xen_init_irq_ops(void);
34void xen_setup_timer(int cpu); 36void xen_setup_timer(int cpu);
37void xen_teardown_timer(int cpu);
38cycle_t xen_clocksource_read(void);
35void xen_setup_cpu_clockevents(void); 39void xen_setup_cpu_clockevents(void);
36unsigned long xen_tsc_khz(void); 40unsigned long xen_tsc_khz(void);
37void __init xen_time_init(void); 41void __init xen_time_init(void);
@@ -50,6 +54,10 @@ void __init xen_setup_vcpu_info_placement(void);
50#ifdef CONFIG_SMP 54#ifdef CONFIG_SMP
51void xen_smp_init(void); 55void xen_smp_init(void);
52 56
57void __init xen_init_spinlocks(void);
58__cpuinit void xen_init_lock_cpu(int cpu);
59void xen_uninit_lock_cpu(int cpu);
60
53extern cpumask_t xen_cpu_initialized_map; 61extern cpumask_t xen_cpu_initialized_map;
54#else 62#else
55static inline void xen_smp_init(void) {} 63static inline void xen_smp_init(void) {}