aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-10-12 15:04:59 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-10-12 15:05:14 -0400
commit807f4f8cdd5b65a8a5fcfda266c074f6a23818dd (patch)
tree395afdf45badd02d03871c827b8baa850cbe5841 /arch
parent1a2217a9516b134e0a0e54cb4629e1e075d97b17 (diff)
parent8daf14cf56816303d64d1a705fcbc389211ba36e (diff)
Merge branch 'x86-core-v2-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
This merges in: x86/build, x86/microcode, x86/spinlocks, x86/memory-corruption-check, x86/early-printk, x86/xsave, x86/quirks, x86/setup, x86/signal, core/signal, x86/urgent, x86/xen * 'x86-core-v2-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (142 commits) x86: make processor type select depend on CONFIG_EMBEDDED x86: extend processor type select help text x86, amd-iommu: propagate PCI device enabling error warnings: fix arch/x86/kernel/io_apic_64.c warnings: fix arch/x86/kernel/early_printk.c x86, fpu: check __clear_user() return value x86: memory corruption check - cleanup x86: ioperm user_regset xen: do not reserve 2 pages of padding between hypervisor and fixmap. xen: use spin_lock_nest_lock when pinning a pagetable x86: xsave: set FP, SSE bits in the xsave header in the user sigcontext x86: xsave: fix error condition in save_i387_xstate() x86: SB450: deprioritize DMI quirks x86: SB450: skip IRQ0 override if it is not routed to INT2 of IOAPIC x86: replace a magic number with a named constant in the VESA boot code x86 setup: remove IMAGE_OFFSET x86 setup: remove DEF_INITSEG and DEF_SETUPSEG Revert "x86: fix ghost EDD devices in /sys again" x86 setup: fix ghost entries under /sys/firmware/edd take 3 x86: signal: remove indent in restore_sigcontext() ...
Diffstat (limited to 'arch')
-rw-r--r--arch/ia64/include/asm/siginfo.h5
-rw-r--r--arch/powerpc/include/asm/siginfo.h5
-rw-r--r--arch/x86/Kconfig90
-rw-r--r--arch/x86/Kconfig.cpu64
-rw-r--r--arch/x86/Kconfig.debug13
-rw-r--r--arch/x86/Makefile_32.cpu5
-rw-r--r--arch/x86/boot/Makefile5
-rw-r--r--arch/x86/boot/compressed/Makefile8
-rw-r--r--arch/x86/boot/edd.c7
-rw-r--r--arch/x86/boot/video-vesa.c2
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/ia32/ia32_signal.c68
-rw-r--r--arch/x86/kernel/Makefile10
-rw-r--r--arch/x86/kernel/acpi/boot.c12
-rw-r--r--arch/x86/kernel/amd_iommu_init.c4
-rw-r--r--arch/x86/kernel/cpu/common.c11
-rw-r--r--arch/x86/kernel/doublefault_32.c2
-rw-r--r--arch/x86/kernel/early-quirks.c48
-rw-r--r--arch/x86/kernel/early_printk.c748
-rw-r--r--arch/x86/kernel/i387.c14
-rw-r--r--arch/x86/kernel/io_apic_64.c4
-rw-r--r--arch/x86/kernel/ldt.c9
-rw-r--r--arch/x86/kernel/microcode.c853
-rw-r--r--arch/x86/kernel/microcode_amd.c435
-rw-r--r--arch/x86/kernel/microcode_core.c508
-rw-r--r--arch/x86/kernel/microcode_intel.c480
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c37
-rw-r--r--arch/x86/kernel/paravirt.c27
-rw-r--r--arch/x86/kernel/process_32.c39
-rw-r--r--arch/x86/kernel/process_64.c22
-rw-r--r--arch/x86/kernel/ptrace.c44
-rw-r--r--arch/x86/kernel/setup.c196
-rw-r--r--arch/x86/kernel/signal_32.c222
-rw-r--r--arch/x86/kernel/signal_64.c207
-rw-r--r--arch/x86/kernel/smp.c6
-rw-r--r--arch/x86/kernel/smpboot.c77
-rw-r--r--arch/x86/kernel/tlb_32.c8
-rw-r--r--arch/x86/kernel/traps_32.c4
-rw-r--r--arch/x86/kernel/traps_64.c2
-rw-r--r--arch/x86/kernel/xsave.c33
-rw-r--r--arch/x86/mm/fault.c14
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c3
-rw-r--r--arch/x86/mm/ioremap.c33
-rw-r--r--arch/x86/xen/Kconfig12
-rw-r--r--arch/x86/xen/Makefile12
-rw-r--r--arch/x86/xen/debugfs.c123
-rw-r--r--arch/x86/xen/debugfs.h10
-rw-r--r--arch/x86/xen/enlighten.c252
-rw-r--r--arch/x86/xen/irq.c143
-rw-r--r--arch/x86/xen/mmu.c314
-rw-r--r--arch/x86/xen/mmu.h3
-rw-r--r--arch/x86/xen/multicalls.c115
-rw-r--r--arch/x86/xen/smp.c245
-rw-r--r--arch/x86/xen/spinlock.c428
-rw-r--r--arch/x86/xen/time.c12
-rw-r--r--arch/x86/xen/xen-asm_32.S2
-rw-r--r--arch/x86/xen/xen-asm_64.S22
-rw-r--r--arch/x86/xen/xen-ops.h8
60 files changed, 4418 insertions, 1672 deletions
diff --git a/arch/ia64/include/asm/siginfo.h b/arch/ia64/include/asm/siginfo.h
index 9294e4b0c8bc..118d42979003 100644
--- a/arch/ia64/include/asm/siginfo.h
+++ b/arch/ia64/include/asm/siginfo.h
@@ -113,11 +113,6 @@ typedef struct siginfo {
113#undef NSIGSEGV 113#undef NSIGSEGV
114#define NSIGSEGV 3 114#define NSIGSEGV 3
115 115
116/*
117 * SIGTRAP si_codes
118 */
119#define TRAP_BRANCH (__SI_FAULT|3) /* process taken branch trap */
120#define TRAP_HWBKPT (__SI_FAULT|4) /* hardware breakpoint or watchpoint */
121#undef NSIGTRAP 116#undef NSIGTRAP
122#define NSIGTRAP 4 117#define NSIGTRAP 4
123 118
diff --git a/arch/powerpc/include/asm/siginfo.h b/arch/powerpc/include/asm/siginfo.h
index 12f1bce037be..49495b0534ed 100644
--- a/arch/powerpc/include/asm/siginfo.h
+++ b/arch/powerpc/include/asm/siginfo.h
@@ -15,11 +15,6 @@
15 15
16#include <asm-generic/siginfo.h> 16#include <asm-generic/siginfo.h>
17 17
18/*
19 * SIGTRAP si_codes
20 */
21#define TRAP_BRANCH (__SI_FAULT|3) /* process taken branch trap */
22#define TRAP_HWBKPT (__SI_FAULT|4) /* hardware breakpoint or watchpoint */
23#undef NSIGTRAP 18#undef NSIGTRAP
24#define NSIGTRAP 4 19#define NSIGTRAP 4
25 20
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 44d4f2130d01..fc8351f374fd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -778,23 +778,45 @@ config X86_REBOOTFIXUPS
778 Say N otherwise. 778 Say N otherwise.
779 779
780config MICROCODE 780config MICROCODE
781 tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" 781 tristate "/dev/cpu/microcode - microcode support"
782 select FW_LOADER 782 select FW_LOADER
783 ---help--- 783 ---help---
784 If you say Y here, you will be able to update the microcode on 784 If you say Y here, you will be able to update the microcode on
785 Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, 785 certain Intel and AMD processors. The Intel support is for the
786 Pentium III, Pentium 4, Xeon etc. You will obviously need the 786 IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
787 actual microcode binary data itself which is not shipped with the 787 Pentium 4, Xeon etc. The AMD support is for family 0x10 and
788 Linux kernel. 788 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
789 You will obviously need the actual microcode binary data itself
790 which is not shipped with the Linux kernel.
789 791
790 For latest news and information on obtaining all the required 792 This option selects the general module only, you need to select
791 ingredients for this driver, check: 793 at least one vendor specific module as well.
792 <http://www.urbanmyth.org/microcode/>.
793 794
794 To compile this driver as a module, choose M here: the 795 To compile this driver as a module, choose M here: the
795 module will be called microcode. 796 module will be called microcode.
796 797
797config MICROCODE_OLD_INTERFACE 798config MICROCODE_INTEL
799 bool "Intel microcode patch loading support"
800 depends on MICROCODE
801 default MICROCODE
802 select FW_LOADER
803 --help---
804 This options enables microcode patch loading support for Intel
805 processors.
806
807 For latest news and information on obtaining all the required
808 Intel ingredients for this driver, check:
809 <http://www.urbanmyth.org/microcode/>.
810
811config MICROCODE_AMD
812 bool "AMD microcode patch loading support"
813 depends on MICROCODE
814 select FW_LOADER
815 --help---
816 If you select this option, microcode patch loading support for AMD
817 processors will be enabled.
818
819 config MICROCODE_OLD_INTERFACE
798 def_bool y 820 def_bool y
799 depends on MICROCODE 821 depends on MICROCODE
800 822
@@ -1061,6 +1083,56 @@ config HIGHPTE
1061 low memory. Setting this option will put user-space page table 1083 low memory. Setting this option will put user-space page table
1062 entries in high memory. 1084 entries in high memory.
1063 1085
1086config X86_CHECK_BIOS_CORRUPTION
1087 bool "Check for low memory corruption"
1088 help
1089 Periodically check for memory corruption in low memory, which
1090 is suspected to be caused by BIOS. Even when enabled in the
1091 configuration, it is disabled at runtime. Enable it by
1092 setting "memory_corruption_check=1" on the kernel command
1093 line. By default it scans the low 64k of memory every 60
1094 seconds; see the memory_corruption_check_size and
1095 memory_corruption_check_period parameters in
1096 Documentation/kernel-parameters.txt to adjust this.
1097
1098 When enabled with the default parameters, this option has
1099 almost no overhead, as it reserves a relatively small amount
1100 of memory and scans it infrequently. It both detects corruption
1101 and prevents it from affecting the running system.
1102
1103 It is, however, intended as a diagnostic tool; if repeatable
1104 BIOS-originated corruption always affects the same memory,
1105 you can use memmap= to prevent the kernel from using that
1106 memory.
1107
1108config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
1109 bool "Set the default setting of memory_corruption_check"
1110 depends on X86_CHECK_BIOS_CORRUPTION
1111 default y
1112 help
1113 Set whether the default state of memory_corruption_check is
1114 on or off.
1115
1116config X86_RESERVE_LOW_64K
1117 bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
1118 default y
1119 help
1120 Reserve the first 64K of physical RAM on BIOSes that are known
1121 to potentially corrupt that memory range. A numbers of BIOSes are
1122 known to utilize this area during suspend/resume, so it must not
1123 be used by the kernel.
1124
1125 Set this to N if you are absolutely sure that you trust the BIOS
1126 to get all its memory reservations and usages right.
1127
1128 If you have doubts about the BIOS (e.g. suspend/resume does not
1129 work or there's kernel crashes after certain hardware hotplug
1130 events) and it's not AMI or Phoenix, then you might want to enable
1131 X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
1132 corruption patterns.
1133
1134 Say Y if unsure.
1135
1064config MATH_EMULATION 1136config MATH_EMULATION
1065 bool 1137 bool
1066 prompt "Math emulation" if X86_32 1138 prompt "Math emulation" if X86_32
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index f8843c3ae77d..c5f101360520 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -420,7 +420,6 @@ config X86_DEBUGCTLMSR
420 depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) 420 depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
421 421
422menuconfig PROCESSOR_SELECT 422menuconfig PROCESSOR_SELECT
423 default y
424 bool "Supported processor vendors" if EMBEDDED 423 bool "Supported processor vendors" if EMBEDDED
425 help 424 help
426 This lets you choose what x86 vendor support code your kernel 425 This lets you choose what x86 vendor support code your kernel
@@ -430,48 +429,97 @@ config CPU_SUP_INTEL
430 default y 429 default y
431 bool "Support Intel processors" if PROCESSOR_SELECT 430 bool "Support Intel processors" if PROCESSOR_SELECT
432 help 431 help
433 This enables extended support for Intel processors 432 This enables detection, tunings and quirks for Intel processors
433
434 You need this enabled if you want your kernel to run on an
435 Intel CPU. Disabling this option on other types of CPUs
436 makes the kernel a tiny bit smaller. Disabling it on an Intel
437 CPU might render the kernel unbootable.
438
439 If unsure, say N.
434 440
435config CPU_SUP_CYRIX_32 441config CPU_SUP_CYRIX_32
436 default y 442 default y
437 bool "Support Cyrix processors" if PROCESSOR_SELECT 443 bool "Support Cyrix processors" if PROCESSOR_SELECT
438 depends on !64BIT 444 depends on !64BIT
439 help 445 help
440 This enables extended support for Cyrix processors 446 This enables detection, tunings and quirks for Cyrix processors
447
448 You need this enabled if you want your kernel to run on a
449 Cyrix CPU. Disabling this option on other types of CPUs
450 makes the kernel a tiny bit smaller. Disabling it on a Cyrix
451 CPU might render the kernel unbootable.
452
453 If unsure, say N.
441 454
442config CPU_SUP_AMD 455config CPU_SUP_AMD
443 default y 456 default y
444 bool "Support AMD processors" if PROCESSOR_SELECT 457 bool "Support AMD processors" if PROCESSOR_SELECT
445 help 458 help
446 This enables extended support for AMD processors 459 This enables detection, tunings and quirks for AMD processors
460
461 You need this enabled if you want your kernel to run on an
462 AMD CPU. Disabling this option on other types of CPUs
463 makes the kernel a tiny bit smaller. Disabling it on an AMD
464 CPU might render the kernel unbootable.
465
466 If unsure, say N.
447 467
448config CPU_SUP_CENTAUR_32 468config CPU_SUP_CENTAUR_32
449 default y 469 default y
450 bool "Support Centaur processors" if PROCESSOR_SELECT 470 bool "Support Centaur processors" if PROCESSOR_SELECT
451 depends on !64BIT 471 depends on !64BIT
452 help 472 help
453 This enables extended support for Centaur processors 473 This enables detection, tunings and quirks for Centaur processors
474
475 You need this enabled if you want your kernel to run on a
476 Centaur CPU. Disabling this option on other types of CPUs
477 makes the kernel a tiny bit smaller. Disabling it on a Centaur
478 CPU might render the kernel unbootable.
479
480 If unsure, say N.
454 481
455config CPU_SUP_CENTAUR_64 482config CPU_SUP_CENTAUR_64
456 default y 483 default y
457 bool "Support Centaur processors" if PROCESSOR_SELECT 484 bool "Support Centaur processors" if PROCESSOR_SELECT
458 depends on 64BIT 485 depends on 64BIT
459 help 486 help
460 This enables extended support for Centaur processors 487 This enables detection, tunings and quirks for Centaur processors
488
489 You need this enabled if you want your kernel to run on a
490 Centaur CPU. Disabling this option on other types of CPUs
491 makes the kernel a tiny bit smaller. Disabling it on a Centaur
492 CPU might render the kernel unbootable.
493
494 If unsure, say N.
461 495
462config CPU_SUP_TRANSMETA_32 496config CPU_SUP_TRANSMETA_32
463 default y 497 default y
464 bool "Support Transmeta processors" if PROCESSOR_SELECT 498 bool "Support Transmeta processors" if PROCESSOR_SELECT
465 depends on !64BIT 499 depends on !64BIT
466 help 500 help
467 This enables extended support for Transmeta processors 501 This enables detection, tunings and quirks for Transmeta processors
502
503 You need this enabled if you want your kernel to run on a
504 Transmeta CPU. Disabling this option on other types of CPUs
505 makes the kernel a tiny bit smaller. Disabling it on a Transmeta
506 CPU might render the kernel unbootable.
507
508 If unsure, say N.
468 509
469config CPU_SUP_UMC_32 510config CPU_SUP_UMC_32
470 default y 511 default y
471 bool "Support UMC processors" if PROCESSOR_SELECT 512 bool "Support UMC processors" if PROCESSOR_SELECT
472 depends on !64BIT 513 depends on !64BIT
473 help 514 help
474 This enables extended support for UMC processors 515 This enables detection, tunings and quirks for UMC processors
516
517 You need this enabled if you want your kernel to run on a
518 UMC CPU. Disabling this option on other types of CPUs
519 makes the kernel a tiny bit smaller. Disabling it on a UMC
520 CPU might render the kernel unbootable.
521
522 If unsure, say N.
475 523
476config X86_DS 524config X86_DS
477 bool "Debug Store support" 525 bool "Debug Store support"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 092f019e033a..2a3dfbd5e677 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,19 @@ config EARLY_PRINTK
43 with klogd/syslogd or the X server. You should normally N here, 43 with klogd/syslogd or the X server. You should normally N here,
44 unless you want to debug such a crash. 44 unless you want to debug such a crash.
45 45
46config EARLY_PRINTK_DBGP
47 bool "Early printk via EHCI debug port"
48 default n
49 depends on EARLY_PRINTK && PCI
50 help
51 Write kernel log output directly into the EHCI debug port.
52
53 This is useful for kernel debugging when your machine crashes very
54 early before the console code is initialized. For normal operation
55 it is not recommended because it looks ugly and doesn't cooperate
56 with klogd/syslogd or the X server. You should normally N here,
57 unless you want to debug such a crash. You need usb debug device.
58
46config DEBUG_STACKOVERFLOW 59config DEBUG_STACKOVERFLOW
47 bool "Check for stack overflows" 60 bool "Check for stack overflows"
48 depends on DEBUG_KERNEL 61 depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index e372b584e919..b72b4f753113 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -45,3 +45,8 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
45# cpu entries 45# cpu entries
46cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) 46cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
47 47
48# Bug fix for binutils: this option is required in order to keep
49# binutils from generating NOPL instructions against our will.
50ifneq ($(CONFIG_X86_P6_NOP),y)
51cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,)
52endif
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 7ee102f9c4f8..cd48c7210016 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
72KBUILD_CFLAGS += $(call cc-option,-m32) 72KBUILD_CFLAGS += $(call cc-option,-m32)
73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
74 74
75$(obj)/zImage: IMAGE_OFFSET := 0x1000
76$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) 75$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK)
77$(obj)/bzImage: IMAGE_OFFSET := 0x100000
78$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ 76$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
79$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ 77$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
80$(obj)/bzImage: BUILDFLAGS := -b 78$(obj)/bzImage: BUILDFLAGS := -b
@@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
117 $(call if_changed,objcopy) 115 $(call if_changed,objcopy)
118 116
119$(obj)/compressed/vmlinux: FORCE 117$(obj)/compressed/vmlinux: FORCE
120 $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ 118 $(Q)$(MAKE) $(build)=$(obj)/compressed $@
121 119
122# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel 120# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
123FDARGS = 121FDARGS =
@@ -181,6 +179,7 @@ isoimage: $(BOOTIMAGE)
181 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ 179 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
182 -no-emul-boot -boot-load-size 4 -boot-info-table \ 180 -no-emul-boot -boot-load-size 4 -boot-info-table \
183 $(obj)/isoimage 181 $(obj)/isoimage
182 isohybrid $(obj)/image.iso 2>/dev/null || true
184 rm -rf $(obj)/isoimage 183 rm -rf $(obj)/isoimage
185 184
186zlilo: $(BOOTIMAGE) 185zlilo: $(BOOTIMAGE)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 92fdd35bd93e..1771c804e02f 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE
27 $(call if_changed,objcopy) 27 $(call if_changed,objcopy)
28 28
29 29
30ifeq ($(CONFIG_X86_32),y) 30targets += vmlinux.bin.all vmlinux.relocs relocs
31targets += vmlinux.bin.all vmlinux.relocs 31hostprogs-$(CONFIG_X86_32) += relocs
32hostprogs-y := relocs
33 32
34quiet_cmd_relocs = RELOCS $@ 33quiet_cmd_relocs = RELOCS $@
35 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< 34 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD $@
43$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE 42$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
44 $(call if_changed,relocbin) 43 $(call if_changed,relocbin)
45 44
45ifeq ($(CONFIG_X86_32),y)
46
46ifdef CONFIG_RELOCATABLE 47ifdef CONFIG_RELOCATABLE
47$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE 48$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
48 $(call if_changed,gzip) 49 $(call if_changed,gzip)
@@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
59LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T 60LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
60endif 61endif
61 62
62
63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
64 $(call if_changed,ld) 64 $(call if_changed,ld)
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index d93cbc6464d0..1aae8f3e5ca1 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -41,6 +41,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
41 char *mbrbuf_ptr, *mbrbuf_end; 41 char *mbrbuf_ptr, *mbrbuf_end;
42 u32 buf_base, mbr_base; 42 u32 buf_base, mbr_base;
43 extern char _end[]; 43 extern char _end[];
44 u16 mbr_magic;
44 45
45 sector_size = ei->params.bytes_per_sector; 46 sector_size = ei->params.bytes_per_sector;
46 if (!sector_size) 47 if (!sector_size)
@@ -58,11 +59,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
58 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) 59 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
59 return -1; 60 return -1;
60 61
62 memset(mbrbuf_ptr, 0, sector_size);
61 if (read_mbr(devno, mbrbuf_ptr)) 63 if (read_mbr(devno, mbrbuf_ptr))
62 return -1; 64 return -1;
63 65
64 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; 66 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
65 return 0; 67 mbr_magic = *(u16 *)&mbrbuf_ptr[510];
68
69 /* check for valid MBR magic */
70 return mbr_magic == 0xAA55 ? 0 : -1;
66} 71}
67 72
68static int get_edd_info(u8 devno, struct edd_info *ei) 73static int get_edd_info(u8 devno, struct edd_info *ei)
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 401ad998ad08..1e6fe0214c85 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -224,7 +224,7 @@ static void vesa_store_pm_info(void)
224static void vesa_store_mode_params_graphics(void) 224static void vesa_store_mode_params_graphics(void)
225{ 225{
226 /* Tell the kernel we're in VESA graphics mode */ 226 /* Tell the kernel we're in VESA graphics mode */
227 boot_params.screen_info.orig_video_isVGA = 0x23; 227 boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
228 228
229 /* Mode parameters */ 229 /* Mode parameters */
230 boot_params.screen_info.vesa_attributes = vminfo.mode_attr; 230 boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index ef9a52005ec9..ca226ca31288 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1535,7 +1535,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
1535CONFIG_VGA_CONSOLE=y 1535CONFIG_VGA_CONSOLE=y
1536CONFIG_VGACON_SOFT_SCROLLBACK=y 1536CONFIG_VGACON_SOFT_SCROLLBACK=y
1537CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 1537CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1538CONFIG_VIDEO_SELECT=y
1539CONFIG_DUMMY_CONSOLE=y 1538CONFIG_DUMMY_CONSOLE=y
1540# CONFIG_FRAMEBUFFER_CONSOLE is not set 1539# CONFIG_FRAMEBUFFER_CONSOLE is not set
1541CONFIG_LOGO=y 1540CONFIG_LOGO=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index e620ea6e2a7a..2c4b1c771e28 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1505,7 +1505,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
1505CONFIG_VGA_CONSOLE=y 1505CONFIG_VGA_CONSOLE=y
1506CONFIG_VGACON_SOFT_SCROLLBACK=y 1506CONFIG_VGACON_SOFT_SCROLLBACK=y
1507CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 1507CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1508CONFIG_VIDEO_SELECT=y
1509CONFIG_DUMMY_CONSOLE=y 1508CONFIG_DUMMY_CONSOLE=y
1510# CONFIG_FRAMEBUFFER_CONSOLE is not set 1509# CONFIG_FRAMEBUFFER_CONSOLE is not set
1511CONFIG_LOGO=y 1510CONFIG_LOGO=y
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 8d64c1bc8474..4bc02b23674b 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -351,31 +351,28 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
351 savesegment(es, tmp); 351 savesegment(es, tmp);
352 err |= __put_user(tmp, (unsigned int __user *)&sc->es); 352 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
353 353
354 err |= __put_user((u32)regs->di, &sc->di); 354 err |= __put_user(regs->di, &sc->di);
355 err |= __put_user((u32)regs->si, &sc->si); 355 err |= __put_user(regs->si, &sc->si);
356 err |= __put_user((u32)regs->bp, &sc->bp); 356 err |= __put_user(regs->bp, &sc->bp);
357 err |= __put_user((u32)regs->sp, &sc->sp); 357 err |= __put_user(regs->sp, &sc->sp);
358 err |= __put_user((u32)regs->bx, &sc->bx); 358 err |= __put_user(regs->bx, &sc->bx);
359 err |= __put_user((u32)regs->dx, &sc->dx); 359 err |= __put_user(regs->dx, &sc->dx);
360 err |= __put_user((u32)regs->cx, &sc->cx); 360 err |= __put_user(regs->cx, &sc->cx);
361 err |= __put_user((u32)regs->ax, &sc->ax); 361 err |= __put_user(regs->ax, &sc->ax);
362 err |= __put_user((u32)regs->cs, &sc->cs); 362 err |= __put_user(regs->cs, &sc->cs);
363 err |= __put_user((u32)regs->ss, &sc->ss); 363 err |= __put_user(regs->ss, &sc->ss);
364 err |= __put_user(current->thread.trap_no, &sc->trapno); 364 err |= __put_user(current->thread.trap_no, &sc->trapno);
365 err |= __put_user(current->thread.error_code, &sc->err); 365 err |= __put_user(current->thread.error_code, &sc->err);
366 err |= __put_user((u32)regs->ip, &sc->ip); 366 err |= __put_user(regs->ip, &sc->ip);
367 err |= __put_user((u32)regs->flags, &sc->flags); 367 err |= __put_user(regs->flags, &sc->flags);
368 err |= __put_user((u32)regs->sp, &sc->sp_at_signal); 368 err |= __put_user(regs->sp, &sc->sp_at_signal);
369 369
370 tmp = save_i387_xstate_ia32(fpstate); 370 tmp = save_i387_xstate_ia32(fpstate);
371 if (tmp < 0) 371 if (tmp < 0)
372 err = -EFAULT; 372 err = -EFAULT;
373 else { 373 else
374 clear_used_math();
375 stts();
376 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), 374 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
377 &sc->fpstate); 375 &sc->fpstate);
378 }
379 376
380 /* non-iBCS2 extensions.. */ 377 /* non-iBCS2 extensions.. */
381 err |= __put_user(mask, &sc->oldmask); 378 err |= __put_user(mask, &sc->oldmask);
@@ -444,21 +441,18 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
444 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); 441 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
445 442
446 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 443 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
447 goto give_sigsegv; 444 return -EFAULT;
448 445
449 err |= __put_user(sig, &frame->sig); 446 if (__put_user(sig, &frame->sig))
450 if (err) 447 return -EFAULT;
451 goto give_sigsegv;
452 448
453 err |= ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); 449 if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
454 if (err) 450 return -EFAULT;
455 goto give_sigsegv;
456 451
457 if (_COMPAT_NSIG_WORDS > 1) { 452 if (_COMPAT_NSIG_WORDS > 1) {
458 err |= __copy_to_user(frame->extramask, &set->sig[1], 453 if (__copy_to_user(frame->extramask, &set->sig[1],
459 sizeof(frame->extramask)); 454 sizeof(frame->extramask)))
460 if (err) 455 return -EFAULT;
461 goto give_sigsegv;
462 } 456 }
463 457
464 if (ka->sa.sa_flags & SA_RESTORER) { 458 if (ka->sa.sa_flags & SA_RESTORER) {
@@ -479,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
479 */ 473 */
480 err |= __copy_to_user(frame->retcode, &code, 8); 474 err |= __copy_to_user(frame->retcode, &code, 8);
481 if (err) 475 if (err)
482 goto give_sigsegv; 476 return -EFAULT;
483 477
484 /* Set up registers for signal handler */ 478 /* Set up registers for signal handler */
485 regs->sp = (unsigned long) frame; 479 regs->sp = (unsigned long) frame;
@@ -502,10 +496,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
502#endif 496#endif
503 497
504 return 0; 498 return 0;
505
506give_sigsegv:
507 force_sigsegv(sig, current);
508 return -EFAULT;
509} 499}
510 500
511int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 501int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
@@ -533,14 +523,14 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
533 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); 523 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
534 524
535 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 525 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
536 goto give_sigsegv; 526 return -EFAULT;
537 527
538 err |= __put_user(sig, &frame->sig); 528 err |= __put_user(sig, &frame->sig);
539 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 529 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
540 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 530 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
541 err |= copy_siginfo_to_user32(&frame->info, info); 531 err |= copy_siginfo_to_user32(&frame->info, info);
542 if (err) 532 if (err)
543 goto give_sigsegv; 533 return -EFAULT;
544 534
545 /* Create the ucontext. */ 535 /* Create the ucontext. */
546 if (cpu_has_xsave) 536 if (cpu_has_xsave)
@@ -556,7 +546,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
556 regs, set->sig[0]); 546 regs, set->sig[0]);
557 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 547 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
558 if (err) 548 if (err)
559 goto give_sigsegv; 549 return -EFAULT;
560 550
561 if (ka->sa.sa_flags & SA_RESTORER) 551 if (ka->sa.sa_flags & SA_RESTORER)
562 restorer = ka->sa.sa_restorer; 552 restorer = ka->sa.sa_restorer;
@@ -571,7 +561,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
571 */ 561 */
572 err |= __copy_to_user(frame->retcode, &code, 8); 562 err |= __copy_to_user(frame->retcode, &code, 8);
573 if (err) 563 if (err)
574 goto give_sigsegv; 564 return -EFAULT;
575 565
576 /* Set up registers for signal handler */ 566 /* Set up registers for signal handler */
577 regs->sp = (unsigned long) frame; 567 regs->sp = (unsigned long) frame;
@@ -599,8 +589,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
599#endif 589#endif
600 590
601 return 0; 591 return 0;
602
603give_sigsegv:
604 force_sigsegv(sig, current);
605 return -EFAULT;
606} 592}
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index c9be69fedb70..5098585f87ce 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE
10# Do not profile debug and lowlevel utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg 11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt.o = -pg 13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
14endif 14endif
15 15
16# 16#
@@ -51,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
51obj-$(CONFIG_MCA) += mca_32.o 51obj-$(CONFIG_MCA) += mca_32.o
52obj-$(CONFIG_X86_MSR) += msr.o 52obj-$(CONFIG_X86_MSR) += msr.o
53obj-$(CONFIG_X86_CPUID) += cpuid.o 53obj-$(CONFIG_X86_CPUID) += cpuid.o
54obj-$(CONFIG_MICROCODE) += microcode.o
55obj-$(CONFIG_PCI) += early-quirks.o 54obj-$(CONFIG_PCI) += early-quirks.o
56apm-y := apm_32.o 55apm-y := apm_32.o
57obj-$(CONFIG_APM) += apm.o 56obj-$(CONFIG_APM) += apm.o
@@ -90,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
90obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o 89obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
91obj-$(CONFIG_KVM_GUEST) += kvm.o 90obj-$(CONFIG_KVM_GUEST) += kvm.o
92obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 91obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
93obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 92obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
94obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 93obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
95 94
96obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 95obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
@@ -100,6 +99,11 @@ scx200-y += scx200_32.o
100 99
101obj-$(CONFIG_OLPC) += olpc.o 100obj-$(CONFIG_OLPC) += olpc.o
102 101
102microcode-y := microcode_core.o
103microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
104microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
105obj-$(CONFIG_MICROCODE) += microcode.o
106
103### 107###
104# 64 bit specific files 108# 64 bit specific files
105ifeq ($(CONFIG_X86_64),y) 109ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c2ac1b4515a0..eb875cdc7367 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1418,8 +1418,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
1418 */ 1418 */
1419static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) 1419static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
1420{ 1420{
1421 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); 1421 /*
1422 acpi_skip_timer_override = 1; 1422 * The ati_ixp4x0_rev() early PCI quirk should have set
1423 * the acpi_skip_timer_override flag already:
1424 */
1425 if (!acpi_skip_timer_override) {
1426 WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
1427 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
1428 d->ident);
1429 acpi_skip_timer_override = 1;
1430 }
1423 return 0; 1431 return 0;
1424} 1432}
1425 1433
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 148fcfe22f17..4cd8083c58be 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -723,9 +723,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
723 init_iommu_from_acpi(iommu, h); 723 init_iommu_from_acpi(iommu, h);
724 init_iommu_devices(iommu); 724 init_iommu_devices(iommu);
725 725
726 pci_enable_device(iommu->dev); 726 return pci_enable_device(iommu->dev);
727
728 return 0;
729} 727}
730 728
731/* 729/*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7581b62df184..fb789dd9e691 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1121,16 +1121,5 @@ void __cpuinit cpu_init(void)
1121 xsave_init(); 1121 xsave_init();
1122} 1122}
1123 1123
1124#ifdef CONFIG_HOTPLUG_CPU
1125void __cpuinit cpu_uninit(void)
1126{
1127 int cpu = raw_smp_processor_id();
1128 cpu_clear(cpu, cpu_initialized);
1129
1130 /* lazy TLB state */
1131 per_cpu(cpu_tlbstate, cpu).state = 0;
1132 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
1133}
1134#endif
1135 1124
1136#endif 1125#endif
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index a47798b59f07..395acb12b0d1 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
66 .ds = __USER_DS, 66 .ds = __USER_DS,
67 .fs = __KERNEL_PERCPU, 67 .fs = __KERNEL_PERCPU,
68 68
69 .__cr3 = __pa(swapper_pg_dir) 69 .__cr3 = __phys_addr_const((unsigned long)swapper_pg_dir)
70 } 70 }
71}; 71};
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 24bb5faf5efa..733c4f8d42ea 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -95,6 +95,52 @@ static void __init nvidia_bugs(int num, int slot, int func)
95 95
96} 96}
97 97
98static u32 ati_ixp4x0_rev(int num, int slot, int func)
99{
100 u32 d;
101 u8 b;
102
103 b = read_pci_config_byte(num, slot, func, 0xac);
104 b &= ~(1<<5);
105 write_pci_config_byte(num, slot, func, 0xac, b);
106
107 d = read_pci_config(num, slot, func, 0x70);
108 d |= 1<<8;
109 write_pci_config(num, slot, func, 0x70, d);
110
111 d = read_pci_config(num, slot, func, 0x8);
112 d &= 0xff;
113 return d;
114}
115
116static void __init ati_bugs(int num, int slot, int func)
117{
118#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC)
119 u32 d;
120 u8 b;
121
122 if (acpi_use_timer_override)
123 return;
124
125 d = ati_ixp4x0_rev(num, slot, func);
126 if (d < 0x82)
127 acpi_skip_timer_override = 1;
128 else {
129 /* check for IRQ0 interrupt swap */
130 outb(0x72, 0xcd6); b = inb(0xcd7);
131 if (!(b & 0x2))
132 acpi_skip_timer_override = 1;
133 }
134
135 if (acpi_skip_timer_override) {
136 printk(KERN_INFO "SB4X0 revision 0x%x\n", d);
137 printk(KERN_INFO "Ignoring ACPI timer override.\n");
138 printk(KERN_INFO "If you got timer trouble "
139 "try acpi_use_timer_override\n");
140 }
141#endif
142}
143
98#ifdef CONFIG_DMAR 144#ifdef CONFIG_DMAR
99static void __init intel_g33_dmar(int num, int slot, int func) 145static void __init intel_g33_dmar(int num, int slot, int func)
100{ 146{
@@ -128,6 +174,8 @@ static struct chipset early_qrk[] __initdata = {
128 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 174 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
129 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 175 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
130 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 176 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
177 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
178 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
131#ifdef CONFIG_DMAR 179#ifdef CONFIG_DMAR
132 { PCI_VENDOR_ID_INTEL, 0x29c0, 180 { PCI_VENDOR_ID_INTEL, 0x29c0,
133 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar }, 181 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index ff9e7350da54..34ad997d3834 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -3,11 +3,19 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/screen_info.h> 5#include <linux/screen_info.h>
6#include <linux/usb/ch9.h>
7#include <linux/pci_regs.h>
8#include <linux/pci_ids.h>
9#include <linux/errno.h>
6#include <asm/io.h> 10#include <asm/io.h>
7#include <asm/processor.h> 11#include <asm/processor.h>
8#include <asm/fcntl.h> 12#include <asm/fcntl.h>
9#include <asm/setup.h> 13#include <asm/setup.h>
10#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h>
16#include <asm/pgtable.h>
17#include <asm/fixmap.h>
18#include <linux/usb/ehci_def.h>
11 19
12/* Simple VGA output */ 20/* Simple VGA output */
13#define VGABASE (__ISA_IO_base + 0xb8000) 21#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
78static int early_serial_putc(unsigned char ch) 86static int early_serial_putc(unsigned char ch)
79{ 87{
80 unsigned timeout = 0xffff; 88 unsigned timeout = 0xffff;
89
81 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) 90 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
82 cpu_relax(); 91 cpu_relax();
83 outb(ch, early_serial_base + TXR); 92 outb(ch, early_serial_base + TXR);
@@ -111,7 +120,7 @@ static __init void early_serial_init(char *s)
111 if (!strncmp(s, "0x", 2)) { 120 if (!strncmp(s, "0x", 2)) {
112 early_serial_base = simple_strtoul(s, &e, 16); 121 early_serial_base = simple_strtoul(s, &e, 16);
113 } else { 122 } else {
114 static int bases[] = { 0x3f8, 0x2f8 }; 123 static const int __initconst bases[] = { 0x3f8, 0x2f8 };
115 124
116 if (!strncmp(s, "ttyS", 4)) 125 if (!strncmp(s, "ttyS", 4))
117 s += 4; 126 s += 4;
@@ -151,6 +160,721 @@ static struct console early_serial_console = {
151 .index = -1, 160 .index = -1,
152}; 161};
153 162
163#ifdef CONFIG_EARLY_PRINTK_DBGP
164
165static struct ehci_caps __iomem *ehci_caps;
166static struct ehci_regs __iomem *ehci_regs;
167static struct ehci_dbg_port __iomem *ehci_debug;
168static unsigned int dbgp_endpoint_out;
169
170struct ehci_dev {
171 u32 bus;
172 u32 slot;
173 u32 func;
174};
175
176static struct ehci_dev ehci_dev;
177
178#define USB_DEBUG_DEVNUM 127
179
180#define DBGP_DATA_TOGGLE 0x8800
181
182static inline u32 dbgp_pid_update(u32 x, u32 tok)
183{
184 return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
185}
186
187static inline u32 dbgp_len_update(u32 x, u32 len)
188{
189 return (x & ~0x0f) | (len & 0x0f);
190}
191
192/*
193 * USB Packet IDs (PIDs)
194 */
195
196/* token */
197#define USB_PID_OUT 0xe1
198#define USB_PID_IN 0x69
199#define USB_PID_SOF 0xa5
200#define USB_PID_SETUP 0x2d
201/* handshake */
202#define USB_PID_ACK 0xd2
203#define USB_PID_NAK 0x5a
204#define USB_PID_STALL 0x1e
205#define USB_PID_NYET 0x96
206/* data */
207#define USB_PID_DATA0 0xc3
208#define USB_PID_DATA1 0x4b
209#define USB_PID_DATA2 0x87
210#define USB_PID_MDATA 0x0f
211/* Special */
212#define USB_PID_PREAMBLE 0x3c
213#define USB_PID_ERR 0x3c
214#define USB_PID_SPLIT 0x78
215#define USB_PID_PING 0xb4
216#define USB_PID_UNDEF_0 0xf0
217
218#define USB_PID_DATA_TOGGLE 0x88
219#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
220
221#define PCI_CAP_ID_EHCI_DEBUG 0xa
222
223#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
224#define HUB_SHORT_RESET_TIME 10
225#define HUB_LONG_RESET_TIME 200
226#define HUB_RESET_TIMEOUT 500
227
228#define DBGP_MAX_PACKET 8
229
230static int dbgp_wait_until_complete(void)
231{
232 u32 ctrl;
233 int loop = 0x100000;
234
235 do {
236 ctrl = readl(&ehci_debug->control);
237 /* Stop when the transaction is finished */
238 if (ctrl & DBGP_DONE)
239 break;
240 } while (--loop > 0);
241
242 if (!loop)
243 return -1;
244
245 /*
246 * Now that we have observed the completed transaction,
247 * clear the done bit.
248 */
249 writel(ctrl | DBGP_DONE, &ehci_debug->control);
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251}
252
253static void dbgp_mdelay(int ms)
254{
255 int i;
256
257 while (ms--) {
258 for (i = 0; i < 1000; i++)
259 outb(0x1, 0x80);
260 }
261}
262
263static void dbgp_breath(void)
264{
265 /* Sleep to give the debug port a chance to breathe */
266}
267
268static int dbgp_wait_until_done(unsigned ctrl)
269{
270 u32 pids, lpid;
271 int ret;
272 int loop = 3;
273
274retry:
275 writel(ctrl | DBGP_GO, &ehci_debug->control);
276 ret = dbgp_wait_until_complete();
277 pids = readl(&ehci_debug->pids);
278 lpid = DBGP_PID_GET(pids);
279
280 if (ret < 0)
281 return ret;
282
283 /*
284 * If the port is getting full or it has dropped data
285 * start pacing ourselves, not necessary but it's friendly.
286 */
287 if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
288 dbgp_breath();
289
290 /* If I get a NACK reissue the transmission */
291 if (lpid == USB_PID_NAK) {
292 if (--loop > 0)
293 goto retry;
294 }
295
296 return ret;
297}
298
299static void dbgp_set_data(const void *buf, int size)
300{
301 const unsigned char *bytes = buf;
302 u32 lo, hi;
303 int i;
304
305 lo = hi = 0;
306 for (i = 0; i < 4 && i < size; i++)
307 lo |= bytes[i] << (8*i);
308 for (; i < 8 && i < size; i++)
309 hi |= bytes[i] << (8*(i - 4));
310 writel(lo, &ehci_debug->data03);
311 writel(hi, &ehci_debug->data47);
312}
313
314static void dbgp_get_data(void *buf, int size)
315{
316 unsigned char *bytes = buf;
317 u32 lo, hi;
318 int i;
319
320 lo = readl(&ehci_debug->data03);
321 hi = readl(&ehci_debug->data47);
322 for (i = 0; i < 4 && i < size; i++)
323 bytes[i] = (lo >> (8*i)) & 0xff;
324 for (; i < 8 && i < size; i++)
325 bytes[i] = (hi >> (8*(i - 4))) & 0xff;
326}
327
328static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
329 const char *bytes, int size)
330{
331 u32 pids, addr, ctrl;
332 int ret;
333
334 if (size > DBGP_MAX_PACKET)
335 return -1;
336
337 addr = DBGP_EPADDR(devnum, endpoint);
338
339 pids = readl(&ehci_debug->pids);
340 pids = dbgp_pid_update(pids, USB_PID_OUT);
341
342 ctrl = readl(&ehci_debug->control);
343 ctrl = dbgp_len_update(ctrl, size);
344 ctrl |= DBGP_OUT;
345 ctrl |= DBGP_GO;
346
347 dbgp_set_data(bytes, size);
348 writel(addr, &ehci_debug->address);
349 writel(pids, &ehci_debug->pids);
350
351 ret = dbgp_wait_until_done(ctrl);
352 if (ret < 0)
353 return ret;
354
355 return ret;
356}
357
358static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size)
360{
361 u32 pids, addr, ctrl;
362 int ret;
363
364 if (size > DBGP_MAX_PACKET)
365 return -1;
366
367 addr = DBGP_EPADDR(devnum, endpoint);
368
369 pids = readl(&ehci_debug->pids);
370 pids = dbgp_pid_update(pids, USB_PID_IN);
371
372 ctrl = readl(&ehci_debug->control);
373 ctrl = dbgp_len_update(ctrl, size);
374 ctrl &= ~DBGP_OUT;
375 ctrl |= DBGP_GO;
376
377 writel(addr, &ehci_debug->address);
378 writel(pids, &ehci_debug->pids);
379 ret = dbgp_wait_until_done(ctrl);
380 if (ret < 0)
381 return ret;
382
383 if (size > ret)
384 size = ret;
385 dbgp_get_data(data, size);
386 return ret;
387}
388
389static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
390 int value, int index, void *data, int size)
391{
392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req;
394 int read;
395 int ret;
396
397 read = (requesttype & USB_DIR_IN) != 0;
398 if (size > (read ? DBGP_MAX_PACKET:0))
399 return -1;
400
401 /* Compute the control message */
402 req.bRequestType = requesttype;
403 req.bRequest = request;
404 req.wValue = cpu_to_le16(value);
405 req.wIndex = cpu_to_le16(index);
406 req.wLength = cpu_to_le16(size);
407
408 pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
409 addr = DBGP_EPADDR(devnum, 0);
410
411 ctrl = readl(&ehci_debug->control);
412 ctrl = dbgp_len_update(ctrl, sizeof(req));
413 ctrl |= DBGP_OUT;
414 ctrl |= DBGP_GO;
415
416 /* Send the setup message */
417 dbgp_set_data(&req, sizeof(req));
418 writel(addr, &ehci_debug->address);
419 writel(pids, &ehci_debug->pids);
420 ret = dbgp_wait_until_done(ctrl);
421 if (ret < 0)
422 return ret;
423
424 /* Read the result */
425 return dbgp_bulk_read(devnum, 0, data, size);
426}
427
428
429/* Find a PCI capability */
430static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
431{
432 u8 pos;
433 int bytes;
434
435 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
436 PCI_STATUS_CAP_LIST))
437 return 0;
438
439 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
440 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
441 u8 id;
442
443 pos &= ~3;
444 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
445 if (id == 0xff)
446 break;
447 if (id == cap)
448 return pos;
449
450 pos = read_pci_config_byte(num, slot, func,
451 pos+PCI_CAP_LIST_NEXT);
452 }
453 return 0;
454}
455
456static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
457{
458 u32 class;
459
460 class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
461 if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
462 return 0;
463
464 return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
465}
466
467static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
468{
469 u32 bus, slot, func;
470
471 for (bus = 0; bus < 256; bus++) {
472 for (slot = 0; slot < 32; slot++) {
473 for (func = 0; func < 8; func++) {
474 unsigned cap;
475
476 cap = __find_dbgp(bus, slot, func);
477
478 if (!cap)
479 continue;
480 if (ehci_num-- != 0)
481 continue;
482 *rbus = bus;
483 *rslot = slot;
484 *rfunc = func;
485 return cap;
486 }
487 }
488 }
489 return 0;
490}
491
492static int ehci_reset_port(int port)
493{
494 u32 portsc;
495 u32 delay_time, delay;
496 int loop;
497
498 /* Reset the usb debug port */
499 portsc = readl(&ehci_regs->port_status[port - 1]);
500 portsc &= ~PORT_PE;
501 portsc |= PORT_RESET;
502 writel(portsc, &ehci_regs->port_status[port - 1]);
503
504 delay = HUB_ROOT_RESET_TIME;
505 for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
506 delay_time += delay) {
507 dbgp_mdelay(delay);
508
509 portsc = readl(&ehci_regs->port_status[port - 1]);
510 if (portsc & PORT_RESET) {
511 /* force reset to complete */
512 loop = 2;
513 writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
514 &ehci_regs->port_status[port - 1]);
515 do {
516 portsc = readl(&ehci_regs->port_status[port-1]);
517 } while ((portsc & PORT_RESET) && (--loop > 0));
518 }
519
520 /* Device went away? */
521 if (!(portsc & PORT_CONNECT))
522 return -ENOTCONN;
523
524 /* bomb out completely if something weird happend */
525 if ((portsc & PORT_CSC))
526 return -EINVAL;
527
528 /* If we've finished resetting, then break out of the loop */
529 if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
530 return 0;
531 }
532 return -EBUSY;
533}
534
535static int ehci_wait_for_port(int port)
536{
537 u32 status;
538 int ret, reps;
539
540 for (reps = 0; reps < 3; reps++) {
541 dbgp_mdelay(100);
542 status = readl(&ehci_regs->status);
543 if (status & STS_PCD) {
544 ret = ehci_reset_port(port);
545 if (ret == 0)
546 return 0;
547 }
548 }
549 return -ENOTCONN;
550}
551
552#ifdef DBGP_DEBUG
553# define dbgp_printk early_printk
554#else
555static inline void dbgp_printk(const char *fmt, ...) { }
556#endif
557
558typedef void (*set_debug_port_t)(int port);
559
560static void default_set_debug_port(int port)
561{
562}
563
564static set_debug_port_t set_debug_port = default_set_debug_port;
565
566static void nvidia_set_debug_port(int port)
567{
568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
570 0x74);
571 dword &= ~(0x0f<<12);
572 dword |= ((port & 0x0f)<<12);
573 write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
574 dword);
575 dbgp_printk("set debug port to %d\n", port);
576}
577
578static void __init detect_set_debug_port(void)
579{
580 u32 vendorid;
581
582 vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
583 0x00);
584
585 if ((vendorid & 0xffff) == 0x10de) {
586 dbgp_printk("using nvidia set_debug_port\n");
587 set_debug_port = nvidia_set_debug_port;
588 }
589}
590
591static int __init ehci_setup(void)
592{
593 struct usb_debug_descriptor dbgp_desc;
594 u32 cmd, ctrl, status, portsc, hcs_params;
595 u32 debug_port, new_debug_port = 0, n_ports;
596 u32 devnum;
597 int ret, i;
598 int loop;
599 int port_map_tried;
600 int playtimes = 3;
601
602try_next_time:
603 port_map_tried = 0;
604
605try_next_port:
606
607 hcs_params = readl(&ehci_caps->hcs_params);
608 debug_port = HCS_DEBUG_PORT(hcs_params);
609 n_ports = HCS_N_PORTS(hcs_params);
610
611 dbgp_printk("debug_port: %d\n", debug_port);
612 dbgp_printk("n_ports: %d\n", n_ports);
613
614 for (i = 1; i <= n_ports; i++) {
615 portsc = readl(&ehci_regs->port_status[i-1]);
616 dbgp_printk("portstatus%d: %08x\n", i, portsc);
617 }
618
619 if (port_map_tried && (new_debug_port != debug_port)) {
620 if (--playtimes) {
621 set_debug_port(new_debug_port);
622 goto try_next_time;
623 }
624 return -1;
625 }
626
627 loop = 10;
628 /* Reset the EHCI controller */
629 cmd = readl(&ehci_regs->command);
630 cmd |= CMD_RESET;
631 writel(cmd, &ehci_regs->command);
632 do {
633 cmd = readl(&ehci_regs->command);
634 } while ((cmd & CMD_RESET) && (--loop > 0));
635
636 if (!loop) {
637 dbgp_printk("can not reset ehci\n");
638 return -1;
639 }
640 dbgp_printk("ehci reset done\n");
641
642 /* Claim ownership, but do not enable yet */
643 ctrl = readl(&ehci_debug->control);
644 ctrl |= DBGP_OWNER;
645 ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
646 writel(ctrl, &ehci_debug->control);
647
648 /* Start the ehci running */
649 cmd = readl(&ehci_regs->command);
650 cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
651 cmd |= CMD_RUN;
652 writel(cmd, &ehci_regs->command);
653
654 /* Ensure everything is routed to the EHCI */
655 writel(FLAG_CF, &ehci_regs->configured_flag);
656
657 /* Wait until the controller is no longer halted */
658 loop = 10;
659 do {
660 status = readl(&ehci_regs->status);
661 } while ((status & STS_HALT) && (--loop > 0));
662
663 if (!loop) {
664 dbgp_printk("ehci can be started\n");
665 return -1;
666 }
667 dbgp_printk("ehci started\n");
668
669 /* Wait for a device to show up in the debug port */
670 ret = ehci_wait_for_port(debug_port);
671 if (ret < 0) {
672 dbgp_printk("No device found in debug port\n");
673 goto next_debug_port;
674 }
675 dbgp_printk("ehci wait for port done\n");
676
677 /* Enable the debug port */
678 ctrl = readl(&ehci_debug->control);
679 ctrl |= DBGP_CLAIM;
680 writel(ctrl, &ehci_debug->control);
681 ctrl = readl(&ehci_debug->control);
682 if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
683 dbgp_printk("No device in debug port\n");
684 writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
685 goto err;
686 }
687 dbgp_printk("debug ported enabled\n");
688
689 /* Completely transfer the debug device to the debug controller */
690 portsc = readl(&ehci_regs->port_status[debug_port - 1]);
691 portsc &= ~PORT_PE;
692 writel(portsc, &ehci_regs->port_status[debug_port - 1]);
693
694 dbgp_mdelay(100);
695
696 /* Find the debug device and make it device number 127 */
697 for (devnum = 0; devnum <= 127; devnum++) {
698 ret = dbgp_control_msg(devnum,
699 USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
700 USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
701 &dbgp_desc, sizeof(dbgp_desc));
702 if (ret > 0)
703 break;
704 }
705 if (devnum > 127) {
706 dbgp_printk("Could not find attached debug device\n");
707 goto err;
708 }
709 if (ret < 0) {
710 dbgp_printk("Attached device is not a debug device\n");
711 goto err;
712 }
713 dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
714
715 /* Move the device to 127 if it isn't already there */
716 if (devnum != USB_DEBUG_DEVNUM) {
717 ret = dbgp_control_msg(devnum,
718 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
719 USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
720 if (ret < 0) {
721 dbgp_printk("Could not move attached device to %d\n",
722 USB_DEBUG_DEVNUM);
723 goto err;
724 }
725 devnum = USB_DEBUG_DEVNUM;
726 dbgp_printk("debug device renamed to 127\n");
727 }
728
729 /* Enable the debug interface */
730 ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
731 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
732 USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
733 if (ret < 0) {
734 dbgp_printk(" Could not enable the debug device\n");
735 goto err;
736 }
737 dbgp_printk("debug interface enabled\n");
738
739 /* Perform a small write to get the even/odd data state in sync
740 */
741 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
742 if (ret < 0) {
743 dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
744 goto err;
745 }
746 dbgp_printk("small write doned\n");
747
748 return 0;
749err:
750 /* Things didn't work so remove my claim */
751 ctrl = readl(&ehci_debug->control);
752 ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
753 writel(ctrl, &ehci_debug->control);
754 return -1;
755
756next_debug_port:
757 port_map_tried |= (1<<(debug_port - 1));
758 new_debug_port = ((debug_port-1+1)%n_ports) + 1;
759 if (port_map_tried != ((1<<n_ports) - 1)) {
760 set_debug_port(new_debug_port);
761 goto try_next_port;
762 }
763 if (--playtimes) {
764 set_debug_port(new_debug_port);
765 goto try_next_time;
766 }
767
768 return -1;
769}
770
771static int __init early_dbgp_init(char *s)
772{
773 u32 debug_port, bar, offset;
774 u32 bus, slot, func, cap;
775 void __iomem *ehci_bar;
776 u32 dbgp_num;
777 u32 bar_val;
778 char *e;
779 int ret;
780 u8 byte;
781
782 if (!early_pci_allowed())
783 return -1;
784
785 dbgp_num = 0;
786 if (*s)
787 dbgp_num = simple_strtoul(s, &e, 10);
788 dbgp_printk("dbgp_num: %d\n", dbgp_num);
789
790 cap = find_dbgp(dbgp_num, &bus, &slot, &func);
791 if (!cap)
792 return -1;
793
794 dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
795 func);
796
797 debug_port = read_pci_config(bus, slot, func, cap);
798 bar = (debug_port >> 29) & 0x7;
799 bar = (bar * 4) + 0xc;
800 offset = (debug_port >> 16) & 0xfff;
801 dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
802 if (bar != PCI_BASE_ADDRESS_0) {
803 dbgp_printk("only debug ports on bar 1 handled.\n");
804
805 return -1;
806 }
807
808 bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
809 dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
810 if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
811 dbgp_printk("only simple 32bit mmio bars supported\n");
812
813 return -1;
814 }
815
816 /* double check if the mem space is enabled */
817 byte = read_pci_config_byte(bus, slot, func, 0x04);
818 if (!(byte & 0x2)) {
819 byte |= 0x02;
820 write_pci_config_byte(bus, slot, func, 0x04, byte);
821 dbgp_printk("mmio for ehci enabled\n");
822 }
823
824 /*
825 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
826 * than enough. 1K is the biggest I have seen.
827 */
828 set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
829 ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
830 ehci_bar += bar_val & ~PAGE_MASK;
831 dbgp_printk("ehci_bar: %p\n", ehci_bar);
832
833 ehci_caps = ehci_bar;
834 ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
835 ehci_debug = ehci_bar + offset;
836 ehci_dev.bus = bus;
837 ehci_dev.slot = slot;
838 ehci_dev.func = func;
839
840 detect_set_debug_port();
841
842 ret = ehci_setup();
843 if (ret < 0) {
844 dbgp_printk("ehci_setup failed\n");
845 ehci_debug = NULL;
846
847 return -1;
848 }
849
850 return 0;
851}
852
853static void early_dbgp_write(struct console *con, const char *str, u32 n)
854{
855 int chunk, ret;
856
857 if (!ehci_debug)
858 return;
859 while (n > 0) {
860 chunk = n;
861 if (chunk > DBGP_MAX_PACKET)
862 chunk = DBGP_MAX_PACKET;
863 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
864 dbgp_endpoint_out, str, chunk);
865 str += chunk;
866 n -= chunk;
867 }
868}
869
870static struct console early_dbgp_console = {
871 .name = "earlydbg",
872 .write = early_dbgp_write,
873 .flags = CON_PRINTBUFFER,
874 .index = -1,
875};
876#endif
877
154/* Console interface to a host file on AMD's SimNow! */ 878/* Console interface to a host file on AMD's SimNow! */
155 879
156static int simnow_fd; 880static int simnow_fd;
@@ -165,6 +889,7 @@ enum {
165static noinline long simnow(long cmd, long a, long b, long c) 889static noinline long simnow(long cmd, long a, long b, long c)
166{ 890{
167 long ret; 891 long ret;
892
168 asm volatile("cpuid" : 893 asm volatile("cpuid" :
169 "=a" (ret) : 894 "=a" (ret) :
170 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); 895 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
174static void __init simnow_init(char *str) 899static void __init simnow_init(char *str)
175{ 900{
176 char *fn = "klog"; 901 char *fn = "klog";
902
177 if (*str == '=') 903 if (*str == '=')
178 fn = ++str; 904 fn = ++str;
179 /* error ignored */ 905 /* error ignored */
@@ -194,7 +920,7 @@ static struct console simnow_console = {
194 920
195/* Direct interface for emergencies */ 921/* Direct interface for emergencies */
196static struct console *early_console = &early_vga_console; 922static struct console *early_console = &early_vga_console;
197static int early_console_initialized; 923static int __initdata early_console_initialized;
198 924
199asmlinkage void early_printk(const char *fmt, ...) 925asmlinkage void early_printk(const char *fmt, ...)
200{ 926{
@@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...)
208 va_end(ap); 934 va_end(ap);
209} 935}
210 936
211static int __initdata keep_early;
212 937
213static int __init setup_early_printk(char *buf) 938static int __init setup_early_printk(char *buf)
214{ 939{
940 int keep_early;
941
215 if (!buf) 942 if (!buf)
216 return 0; 943 return 0;
217 944
@@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf)
219 return 0; 946 return 0;
220 early_console_initialized = 1; 947 early_console_initialized = 1;
221 948
222 if (strstr(buf, "keep")) 949 keep_early = (strstr(buf, "keep") != NULL);
223 keep_early = 1;
224 950
225 if (!strncmp(buf, "serial", 6)) { 951 if (!strncmp(buf, "serial", 6)) {
226 early_serial_init(buf + 6); 952 early_serial_init(buf + 6);
@@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf)
238 simnow_init(buf + 6); 964 simnow_init(buf + 6);
239 early_console = &simnow_console; 965 early_console = &simnow_console;
240 keep_early = 1; 966 keep_early = 1;
967#ifdef CONFIG_EARLY_PRINTK_DBGP
968 } else if (!strncmp(buf, "dbgp", 4)) {
969 if (early_dbgp_init(buf+4) < 0)
970 return 0;
971 early_console = &early_dbgp_console;
972 /*
973 * usb subsys will reset ehci controller, so don't keep
974 * that early console
975 */
976 keep_early = 0;
977#endif
241#ifdef CONFIG_HVC_XEN 978#ifdef CONFIG_HVC_XEN
242 } else if (!strncmp(buf, "xen", 3)) { 979 } else if (!strncmp(buf, "xen", 3)) {
243 early_console = &xenboot_console; 980 early_console = &xenboot_console;
@@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf)
251 register_console(early_console); 988 register_console(early_console);
252 return 0; 989 return 0;
253} 990}
991
254early_param("earlyprintk", setup_early_printk); 992early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 45723f1fe198..1f20608d4ca8 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -468,9 +468,23 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
468 468
469static int save_i387_xsave(void __user *buf) 469static int save_i387_xsave(void __user *buf)
470{ 470{
471 struct task_struct *tsk = current;
471 struct _fpstate_ia32 __user *fx = buf; 472 struct _fpstate_ia32 __user *fx = buf;
472 int err = 0; 473 int err = 0;
473 474
475 /*
476 * For legacy compatible, we always set FP/SSE bits in the bit
477 * vector while saving the state to the user context.
478 * This will enable us capturing any changes(during sigreturn) to
479 * the FP/SSE bits by the legacy applications which don't touch
480 * xstate_bv in the xsave header.
481 *
482 * xsave aware applications can change the xstate_bv in the xsave
483 * header as well as change any contents in the memory layout.
484 * xrestore as part of sigreturn will capture all the changes.
485 */
486 tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
487
474 if (save_i387_fxsave(fx) < 0) 488 if (save_i387_fxsave(fx) < 0)
475 return -1; 489 return -1;
476 490
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index a1bec2969c6a..02063ae042f7 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1281,8 +1281,8 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1281 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1281 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1282 1282
1283 icr = apic_icr_read(); 1283 icr = apic_icr_read();
1284 printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); 1284 printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
1285 printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); 1285 printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
1286 1286
1287 v = apic_read(APIC_LVTT); 1287 v = apic_read(APIC_LVTT);
1288 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); 1288 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0ed5f939b905..eee32b43fee3 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -52,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
52 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, 52 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
53 (mincount - oldsize) * LDT_ENTRY_SIZE); 53 (mincount - oldsize) * LDT_ENTRY_SIZE);
54 54
55 paravirt_alloc_ldt(newldt, mincount);
56
55#ifdef CONFIG_X86_64 57#ifdef CONFIG_X86_64
56 /* CHECKME: Do we really need this ? */ 58 /* CHECKME: Do we really need this ? */
57 wmb(); 59 wmb();
@@ -74,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
74#endif 76#endif
75 } 77 }
76 if (oldsize) { 78 if (oldsize) {
79 paravirt_free_ldt(oldldt, oldsize);
77 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) 80 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
78 vfree(oldldt); 81 vfree(oldldt);
79 else 82 else
@@ -85,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
85static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 88static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
86{ 89{
87 int err = alloc_ldt(new, old->size, 0); 90 int err = alloc_ldt(new, old->size, 0);
91 int i;
88 92
89 if (err < 0) 93 if (err < 0)
90 return err; 94 return err;
91 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); 95
96 for(i = 0; i < old->size; i++)
97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
92 return 0; 98 return 0;
93} 99}
94 100
@@ -125,6 +131,7 @@ void destroy_context(struct mm_struct *mm)
125 if (mm == current->active_mm) 131 if (mm == current->active_mm)
126 clear_LDT(); 132 clear_LDT();
127#endif 133#endif
134 paravirt_free_ldt(mm->context.ldt, mm->context.size);
128 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) 135 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
129 vfree(mm->context.ldt); 136 vfree(mm->context.ldt);
130 else 137 else
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
deleted file mode 100644
index 652fa5c38ebe..000000000000
--- a/arch/x86/kernel/microcode.c
+++ /dev/null
@@ -1,853 +0,0 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to speculative
59 * nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73
74//#define DEBUG /* pr_debug */
75#include <linux/capability.h>
76#include <linux/kernel.h>
77#include <linux/init.h>
78#include <linux/sched.h>
79#include <linux/smp_lock.h>
80#include <linux/cpumask.h>
81#include <linux/module.h>
82#include <linux/slab.h>
83#include <linux/vmalloc.h>
84#include <linux/miscdevice.h>
85#include <linux/spinlock.h>
86#include <linux/mm.h>
87#include <linux/fs.h>
88#include <linux/mutex.h>
89#include <linux/cpu.h>
90#include <linux/firmware.h>
91#include <linux/platform_device.h>
92
93#include <asm/msr.h>
94#include <asm/uaccess.h>
95#include <asm/processor.h>
96
97MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
98MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
99MODULE_LICENSE("GPL");
100
101#define MICROCODE_VERSION "1.14a"
102
103#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
104#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
105#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
106#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
107#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
108#define DWSIZE (sizeof (u32))
109#define get_totalsize(mc) \
110 (((microcode_t *)mc)->hdr.totalsize ? \
111 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
112#define get_datasize(mc) \
113 (((microcode_t *)mc)->hdr.datasize ? \
114 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
115
116#define sigmatch(s1, s2, p1, p2) \
117 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
118
119#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
120
121/* serialize access to the physical write to MSR 0x79 */
122static DEFINE_SPINLOCK(microcode_update_lock);
123
124/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
125static DEFINE_MUTEX(microcode_mutex);
126
127static struct ucode_cpu_info {
128 int valid;
129 unsigned int sig;
130 unsigned int pf;
131 unsigned int rev;
132 microcode_t *mc;
133} ucode_cpu_info[NR_CPUS];
134
135static void collect_cpu_info(int cpu_num)
136{
137 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
138 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
139 unsigned int val[2];
140
141 /* We should bind the task to the CPU */
142 BUG_ON(raw_smp_processor_id() != cpu_num);
143 uci->pf = uci->rev = 0;
144 uci->mc = NULL;
145 uci->valid = 1;
146
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
150 "processor\n", cpu_num);
151 uci->valid = 0;
152 return;
153 }
154
155 uci->sig = cpuid_eax(0x00000001);
156
157 if ((c->x86_model >= 5) || (c->x86 > 6)) {
158 /* get processor flags from MSR 0x17 */
159 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
160 uci->pf = 1 << ((val[1] >> 18) & 7);
161 }
162
163 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
164 /* see notes above for revision 1.07. Apparent chip bug */
165 sync_core();
166 /* get the current revision from MSR 0x8B */
167 rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
168 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
169 uci->sig, uci->pf, uci->rev);
170}
171
172static inline int microcode_update_match(int cpu_num,
173 microcode_header_t *mc_header, int sig, int pf)
174{
175 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
176
177 if (!sigmatch(sig, uci->sig, pf, uci->pf)
178 || mc_header->rev <= uci->rev)
179 return 0;
180 return 1;
181}
182
183static int microcode_sanity_check(void *mc)
184{
185 microcode_header_t *mc_header = mc;
186 struct extended_sigtable *ext_header = NULL;
187 struct extended_signature *ext_sig;
188 unsigned long total_size, data_size, ext_table_size;
189 int sum, orig_sum, ext_sigcount = 0, i;
190
191 total_size = get_totalsize(mc_header);
192 data_size = get_datasize(mc_header);
193 if (data_size + MC_HEADER_SIZE > total_size) {
194 printk(KERN_ERR "microcode: error! "
195 "Bad data size in microcode data file\n");
196 return -EINVAL;
197 }
198
199 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
200 printk(KERN_ERR "microcode: error! "
201 "Unknown microcode update format\n");
202 return -EINVAL;
203 }
204 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
205 if (ext_table_size) {
206 if ((ext_table_size < EXT_HEADER_SIZE)
207 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
208 printk(KERN_ERR "microcode: error! "
209 "Small exttable size in microcode data file\n");
210 return -EINVAL;
211 }
212 ext_header = mc + MC_HEADER_SIZE + data_size;
213 if (ext_table_size != exttable_size(ext_header)) {
214 printk(KERN_ERR "microcode: error! "
215 "Bad exttable size in microcode data file\n");
216 return -EFAULT;
217 }
218 ext_sigcount = ext_header->count;
219 }
220
221 /* check extended table checksum */
222 if (ext_table_size) {
223 int ext_table_sum = 0;
224 int *ext_tablep = (int *)ext_header;
225
226 i = ext_table_size / DWSIZE;
227 while (i--)
228 ext_table_sum += ext_tablep[i];
229 if (ext_table_sum) {
230 printk(KERN_WARNING "microcode: aborting, "
231 "bad extended signature table checksum\n");
232 return -EINVAL;
233 }
234 }
235
236 /* calculate the checksum */
237 orig_sum = 0;
238 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
239 while (i--)
240 orig_sum += ((int *)mc)[i];
241 if (orig_sum) {
242 printk(KERN_ERR "microcode: aborting, bad checksum\n");
243 return -EINVAL;
244 }
245 if (!ext_table_size)
246 return 0;
247 /* check extended signature checksum */
248 for (i = 0; i < ext_sigcount; i++) {
249 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
250 EXT_SIGNATURE_SIZE * i;
251 sum = orig_sum
252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
254 if (sum) {
255 printk(KERN_ERR "microcode: aborting, bad checksum\n");
256 return -EINVAL;
257 }
258 }
259 return 0;
260}
261
262/*
263 * return 0 - no update found
264 * return 1 - found update
265 * return < 0 - error
266 */
267static int get_maching_microcode(void *mc, int cpu)
268{
269 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
270 microcode_header_t *mc_header = mc;
271 struct extended_sigtable *ext_header;
272 unsigned long total_size = get_totalsize(mc_header);
273 int ext_sigcount, i;
274 struct extended_signature *ext_sig;
275 void *new_mc;
276
277 if (microcode_update_match(cpu, mc_header,
278 mc_header->sig, mc_header->pf))
279 goto find;
280
281 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
282 return 0;
283
284 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
285 ext_sigcount = ext_header->count;
286 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
287 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf))
290 goto find;
291 ext_sig++;
292 }
293 return 0;
294find:
295 pr_debug("microcode: CPU%d found a matching microcode update with"
296 " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
297 new_mc = vmalloc(total_size);
298 if (!new_mc) {
299 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
300 return -ENOMEM;
301 }
302
303 /* free previous update file */
304 vfree(uci->mc);
305
306 memcpy(new_mc, mc, total_size);
307 uci->mc = new_mc;
308 return 1;
309}
310
311static void apply_microcode(int cpu)
312{
313 unsigned long flags;
314 unsigned int val[2];
315 int cpu_num = raw_smp_processor_id();
316 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
317
318 /* We should bind the task to the CPU */
319 BUG_ON(cpu_num != cpu);
320
321 if (uci->mc == NULL)
322 return;
323
324 /* serialize access to the physical write to MSR 0x79 */
325 spin_lock_irqsave(&microcode_update_lock, flags);
326
327 /* write microcode via MSR 0x79 */
328 wrmsr(MSR_IA32_UCODE_WRITE,
329 (unsigned long) uci->mc->bits,
330 (unsigned long) uci->mc->bits >> 16 >> 16);
331 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
332
333 /* see notes above for revision 1.07. Apparent chip bug */
334 sync_core();
335
336 /* get the current revision from MSR 0x8B */
337 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
338
339 spin_unlock_irqrestore(&microcode_update_lock, flags);
340 if (val[1] != uci->mc->hdr.rev) {
341 printk(KERN_ERR "microcode: CPU%d update from revision "
342 "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
343 return;
344 }
345 printk(KERN_INFO "microcode: CPU%d updated from revision "
346 "0x%x to 0x%x, date = %08x \n",
347 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
348 uci->rev = val[1];
349}
350
351#ifdef CONFIG_MICROCODE_OLD_INTERFACE
352static void __user *user_buffer; /* user area microcode data buffer */
353static unsigned int user_buffer_size; /* it's size */
354
355static long get_next_ucode(void **mc, long offset)
356{
357 microcode_header_t mc_header;
358 unsigned long total_size;
359
360 /* No more data */
361 if (offset >= user_buffer_size)
362 return 0;
363 if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
364 printk(KERN_ERR "microcode: error! Can not read user data\n");
365 return -EFAULT;
366 }
367 total_size = get_totalsize(&mc_header);
368 if (offset + total_size > user_buffer_size) {
369 printk(KERN_ERR "microcode: error! Bad total size in microcode "
370 "data file\n");
371 return -EINVAL;
372 }
373 *mc = vmalloc(total_size);
374 if (!*mc)
375 return -ENOMEM;
376 if (copy_from_user(*mc, user_buffer + offset, total_size)) {
377 printk(KERN_ERR "microcode: error! Can not read user data\n");
378 vfree(*mc);
379 return -EFAULT;
380 }
381 return offset + total_size;
382}
383
384static int do_microcode_update (void)
385{
386 long cursor = 0;
387 int error = 0;
388 void *new_mc = NULL;
389 int cpu;
390 cpumask_t old;
391
392 old = current->cpus_allowed;
393
394 while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
395 error = microcode_sanity_check(new_mc);
396 if (error)
397 goto out;
398 /*
399 * It's possible the data file has multiple matching ucode,
400 * lets keep searching till the latest version
401 */
402 for_each_online_cpu(cpu) {
403 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
404
405 if (!uci->valid)
406 continue;
407 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
408 error = get_maching_microcode(new_mc, cpu);
409 if (error < 0)
410 goto out;
411 if (error == 1)
412 apply_microcode(cpu);
413 }
414 vfree(new_mc);
415 }
416out:
417 if (cursor > 0)
418 vfree(new_mc);
419 if (cursor < 0)
420 error = cursor;
421 set_cpus_allowed_ptr(current, &old);
422 return error;
423}
424
425static int microcode_open (struct inode *unused1, struct file *unused2)
426{
427 cycle_kernel_lock();
428 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
429}
430
431static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
432{
433 ssize_t ret;
434
435 if ((len >> PAGE_SHIFT) > num_physpages) {
436 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
437 return -EINVAL;
438 }
439
440 get_online_cpus();
441 mutex_lock(&microcode_mutex);
442
443 user_buffer = (void __user *) buf;
444 user_buffer_size = (int) len;
445
446 ret = do_microcode_update();
447 if (!ret)
448 ret = (ssize_t)len;
449
450 mutex_unlock(&microcode_mutex);
451 put_online_cpus();
452
453 return ret;
454}
455
456static const struct file_operations microcode_fops = {
457 .owner = THIS_MODULE,
458 .write = microcode_write,
459 .open = microcode_open,
460};
461
462static struct miscdevice microcode_dev = {
463 .minor = MICROCODE_MINOR,
464 .name = "microcode",
465 .fops = &microcode_fops,
466};
467
468static int __init microcode_dev_init (void)
469{
470 int error;
471
472 error = misc_register(&microcode_dev);
473 if (error) {
474 printk(KERN_ERR
475 "microcode: can't misc_register on minor=%d\n",
476 MICROCODE_MINOR);
477 return error;
478 }
479
480 return 0;
481}
482
483static void microcode_dev_exit (void)
484{
485 misc_deregister(&microcode_dev);
486}
487
488MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
489#else
490#define microcode_dev_init() 0
491#define microcode_dev_exit() do { } while(0)
492#endif
493
494static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
495 unsigned long size, long offset)
496{
497 microcode_header_t *mc_header;
498 unsigned long total_size;
499
500 /* No more data */
501 if (offset >= size)
502 return 0;
503 mc_header = (microcode_header_t *)(buf + offset);
504 total_size = get_totalsize(mc_header);
505
506 if (offset + total_size > size) {
507 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
508 return -EINVAL;
509 }
510
511 *mc = vmalloc(total_size);
512 if (!*mc) {
513 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
514 return -ENOMEM;
515 }
516 memcpy(*mc, buf + offset, total_size);
517 return offset + total_size;
518}
519
520/* fake device for request_firmware */
521static struct platform_device *microcode_pdev;
522
523static int cpu_request_microcode(int cpu)
524{
525 char name[30];
526 struct cpuinfo_x86 *c = &cpu_data(cpu);
527 const struct firmware *firmware;
528 const u8 *buf;
529 unsigned long size;
530 long offset = 0;
531 int error;
532 void *mc;
533
534 /* We should bind the task to the CPU */
535 BUG_ON(cpu != raw_smp_processor_id());
536 sprintf(name,"intel-ucode/%02x-%02x-%02x",
537 c->x86, c->x86_model, c->x86_mask);
538 error = request_firmware(&firmware, name, &microcode_pdev->dev);
539 if (error) {
540 pr_debug("microcode: data file %s load failed\n", name);
541 return error;
542 }
543 buf = firmware->data;
544 size = firmware->size;
545 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
546 > 0) {
547 error = microcode_sanity_check(mc);
548 if (error)
549 break;
550 error = get_maching_microcode(mc, cpu);
551 if (error < 0)
552 break;
553 /*
554 * It's possible the data file has multiple matching ucode,
555 * lets keep searching till the latest version
556 */
557 if (error == 1) {
558 apply_microcode(cpu);
559 error = 0;
560 }
561 vfree(mc);
562 }
563 if (offset > 0)
564 vfree(mc);
565 if (offset < 0)
566 error = offset;
567 release_firmware(firmware);
568
569 return error;
570}
571
572static int apply_microcode_check_cpu(int cpu)
573{
574 struct cpuinfo_x86 *c = &cpu_data(cpu);
575 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
576 cpumask_t old;
577 unsigned int val[2];
578 int err = 0;
579
580 /* Check if the microcode is available */
581 if (!uci->mc)
582 return 0;
583
584 old = current->cpus_allowed;
585 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
586
587 /* Check if the microcode we have in memory matches the CPU */
588 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
589 cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
590 err = -EINVAL;
591
592 if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
593 /* get processor flags from MSR 0x17 */
594 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
595 if (uci->pf != (1 << ((val[1] >> 18) & 7)))
596 err = -EINVAL;
597 }
598
599 if (!err) {
600 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
601 /* see notes above for revision 1.07. Apparent chip bug */
602 sync_core();
603 /* get the current revision from MSR 0x8B */
604 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
605 if (uci->rev != val[1])
606 err = -EINVAL;
607 }
608
609 if (!err)
610 apply_microcode(cpu);
611 else
612 printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
613 " sig=0x%x, pf=0x%x, rev=0x%x\n",
614 cpu, uci->sig, uci->pf, uci->rev);
615
616 set_cpus_allowed_ptr(current, &old);
617 return err;
618}
619
620static void microcode_init_cpu(int cpu, int resume)
621{
622 cpumask_t old;
623 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
624
625 old = current->cpus_allowed;
626
627 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
628 mutex_lock(&microcode_mutex);
629 collect_cpu_info(cpu);
630 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
631 cpu_request_microcode(cpu);
632 mutex_unlock(&microcode_mutex);
633 set_cpus_allowed_ptr(current, &old);
634}
635
636static void microcode_fini_cpu(int cpu)
637{
638 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
639
640 mutex_lock(&microcode_mutex);
641 uci->valid = 0;
642 vfree(uci->mc);
643 uci->mc = NULL;
644 mutex_unlock(&microcode_mutex);
645}
646
647static ssize_t reload_store(struct sys_device *dev,
648 struct sysdev_attribute *attr,
649 const char *buf, size_t sz)
650{
651 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
652 char *end;
653 unsigned long val = simple_strtoul(buf, &end, 0);
654 int err = 0;
655 int cpu = dev->id;
656
657 if (end == buf)
658 return -EINVAL;
659 if (val == 1) {
660 cpumask_t old = current->cpus_allowed;
661
662 get_online_cpus();
663 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
664
665 mutex_lock(&microcode_mutex);
666 if (uci->valid)
667 err = cpu_request_microcode(cpu);
668 mutex_unlock(&microcode_mutex);
669 put_online_cpus();
670 set_cpus_allowed_ptr(current, &old);
671 }
672 if (err)
673 return err;
674 return sz;
675}
676
677static ssize_t version_show(struct sys_device *dev,
678 struct sysdev_attribute *attr, char *buf)
679{
680 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
681
682 return sprintf(buf, "0x%x\n", uci->rev);
683}
684
685static ssize_t pf_show(struct sys_device *dev,
686 struct sysdev_attribute *attr, char *buf)
687{
688 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
689
690 return sprintf(buf, "0x%x\n", uci->pf);
691}
692
693static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
694static SYSDEV_ATTR(version, 0400, version_show, NULL);
695static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
696
697static struct attribute *mc_default_attrs[] = {
698 &attr_reload.attr,
699 &attr_version.attr,
700 &attr_processor_flags.attr,
701 NULL
702};
703
704static struct attribute_group mc_attr_group = {
705 .attrs = mc_default_attrs,
706 .name = "microcode",
707};
708
709static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
710{
711 int err, cpu = sys_dev->id;
712 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
713
714 if (!cpu_online(cpu))
715 return 0;
716
717 pr_debug("microcode: CPU%d added\n", cpu);
718 memset(uci, 0, sizeof(*uci));
719
720 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
721 if (err)
722 return err;
723
724 microcode_init_cpu(cpu, resume);
725
726 return 0;
727}
728
729static int mc_sysdev_add(struct sys_device *sys_dev)
730{
731 return __mc_sysdev_add(sys_dev, 0);
732}
733
734static int mc_sysdev_remove(struct sys_device *sys_dev)
735{
736 int cpu = sys_dev->id;
737
738 if (!cpu_online(cpu))
739 return 0;
740
741 pr_debug("microcode: CPU%d removed\n", cpu);
742 microcode_fini_cpu(cpu);
743 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
744 return 0;
745}
746
747static int mc_sysdev_resume(struct sys_device *dev)
748{
749 int cpu = dev->id;
750
751 if (!cpu_online(cpu))
752 return 0;
753 pr_debug("microcode: CPU%d resumed\n", cpu);
754 /* only CPU 0 will apply ucode here */
755 apply_microcode(0);
756 return 0;
757}
758
759static struct sysdev_driver mc_sysdev_driver = {
760 .add = mc_sysdev_add,
761 .remove = mc_sysdev_remove,
762 .resume = mc_sysdev_resume,
763};
764
765static __cpuinit int
766mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
767{
768 unsigned int cpu = (unsigned long)hcpu;
769 struct sys_device *sys_dev;
770
771 sys_dev = get_cpu_sysdev(cpu);
772 switch (action) {
773 case CPU_UP_CANCELED_FROZEN:
774 /* The CPU refused to come up during a system resume */
775 microcode_fini_cpu(cpu);
776 break;
777 case CPU_ONLINE:
778 case CPU_DOWN_FAILED:
779 mc_sysdev_add(sys_dev);
780 break;
781 case CPU_ONLINE_FROZEN:
782 /* System-wide resume is in progress, try to apply microcode */
783 if (apply_microcode_check_cpu(cpu)) {
784 /* The application of microcode failed */
785 microcode_fini_cpu(cpu);
786 __mc_sysdev_add(sys_dev, 1);
787 break;
788 }
789 case CPU_DOWN_FAILED_FROZEN:
790 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
791 printk(KERN_ERR "microcode: Failed to create the sysfs "
792 "group for CPU%d\n", cpu);
793 break;
794 case CPU_DOWN_PREPARE:
795 mc_sysdev_remove(sys_dev);
796 break;
797 case CPU_DOWN_PREPARE_FROZEN:
798 /* Suspend is in progress, only remove the interface */
799 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
800 break;
801 }
802 return NOTIFY_OK;
803}
804
805static struct notifier_block __refdata mc_cpu_notifier = {
806 .notifier_call = mc_cpu_callback,
807};
808
809static int __init microcode_init (void)
810{
811 int error;
812
813 printk(KERN_INFO
814 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
815
816 error = microcode_dev_init();
817 if (error)
818 return error;
819 microcode_pdev = platform_device_register_simple("microcode", -1,
820 NULL, 0);
821 if (IS_ERR(microcode_pdev)) {
822 microcode_dev_exit();
823 return PTR_ERR(microcode_pdev);
824 }
825
826 get_online_cpus();
827 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
828 put_online_cpus();
829 if (error) {
830 microcode_dev_exit();
831 platform_device_unregister(microcode_pdev);
832 return error;
833 }
834
835 register_hotcpu_notifier(&mc_cpu_notifier);
836 return 0;
837}
838
839static void __exit microcode_exit (void)
840{
841 microcode_dev_exit();
842
843 unregister_hotcpu_notifier(&mc_cpu_notifier);
844
845 get_online_cpus();
846 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
847 put_online_cpus();
848
849 platform_device_unregister(microcode_pdev);
850}
851
852module_init(microcode_init)
853module_exit(microcode_exit)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
new file mode 100644
index 000000000000..7a1f8eeac2c7
--- /dev/null
+++ b/arch/x86/kernel/microcode_amd.c
@@ -0,0 +1,435 @@
1/*
2 * AMD CPU Microcode Update Driver for Linux
3 * Copyright (C) 2008 Advanced Micro Devices Inc.
4 *
5 * Author: Peter Oruba <peter.oruba@amd.com>
6 *
7 * Based on work by:
8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
9 *
10 * This driver allows to upgrade microcode on AMD
11 * family 0x10 and 0x11 processors.
12 *
13 * Licensed unter the terms of the GNU General Public
14 * License version 2. See file COPYING for details.
15*/
16
17#include <linux/capability.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/sched.h>
21#include <linux/cpumask.h>
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/vmalloc.h>
25#include <linux/miscdevice.h>
26#include <linux/spinlock.h>
27#include <linux/mm.h>
28#include <linux/fs.h>
29#include <linux/mutex.h>
30#include <linux/cpu.h>
31#include <linux/firmware.h>
32#include <linux/platform_device.h>
33#include <linux/pci.h>
34#include <linux/pci_ids.h>
35
36#include <asm/msr.h>
37#include <asm/uaccess.h>
38#include <asm/processor.h>
39#include <asm/microcode.h>
40
41MODULE_DESCRIPTION("AMD Microcode Update Driver");
42MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>");
43MODULE_LICENSE("GPL v2");
44
45#define UCODE_MAGIC 0x00414d44
46#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
47#define UCODE_UCODE_TYPE 0x00000001
48
49struct equiv_cpu_entry {
50 unsigned int installed_cpu;
51 unsigned int fixed_errata_mask;
52 unsigned int fixed_errata_compare;
53 unsigned int equiv_cpu;
54};
55
56struct microcode_header_amd {
57 unsigned int data_code;
58 unsigned int patch_id;
59 unsigned char mc_patch_data_id[2];
60 unsigned char mc_patch_data_len;
61 unsigned char init_flag;
62 unsigned int mc_patch_data_checksum;
63 unsigned int nb_dev_id;
64 unsigned int sb_dev_id;
65 unsigned char processor_rev_id[2];
66 unsigned char nb_rev_id;
67 unsigned char sb_rev_id;
68 unsigned char bios_api_rev;
69 unsigned char reserved1[3];
70 unsigned int match_reg[8];
71};
72
73struct microcode_amd {
74 struct microcode_header_amd hdr;
75 unsigned int mpb[0];
76};
77
78#define UCODE_MAX_SIZE (2048)
79#define DEFAULT_UCODE_DATASIZE (896)
80#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd))
81#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
82#define DWSIZE (sizeof(u32))
83/* For now we support a fixed ucode total size only */
84#define get_totalsize(mc) \
85 ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
86 + MC_HEADER_SIZE)
87
88/* serialize access to the physical write */
89static DEFINE_SPINLOCK(microcode_update_lock);
90
91static struct equiv_cpu_entry *equiv_cpu_table;
92
93static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
94{
95 struct cpuinfo_x86 *c = &cpu_data(cpu);
96
97 memset(csig, 0, sizeof(*csig));
98
99 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
100 printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
101 cpu);
102 return -1;
103 }
104
105 asm volatile("movl %1, %%ecx; rdmsr"
106 : "=a" (csig->rev)
107 : "i" (0x0000008B) : "ecx");
108
109 printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
110 csig->rev);
111
112 return 0;
113}
114
115static int get_matching_microcode(int cpu, void *mc, int rev)
116{
117 struct microcode_header_amd *mc_header = mc;
118 struct pci_dev *nb_pci_dev, *sb_pci_dev;
119 unsigned int current_cpu_id;
120 unsigned int equiv_cpu_id = 0x00;
121 unsigned int i = 0;
122
123 BUG_ON(equiv_cpu_table == NULL);
124 current_cpu_id = cpuid_eax(0x00000001);
125
126 while (equiv_cpu_table[i].installed_cpu != 0) {
127 if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
128 equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
129 break;
130 }
131 i++;
132 }
133
134 if (!equiv_cpu_id) {
135 printk(KERN_ERR "microcode: CPU%d cpu_id "
136 "not found in equivalent cpu table \n", cpu);
137 return 0;
138 }
139
140 if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) {
141 printk(KERN_ERR
142 "microcode: CPU%d patch does not match "
143 "(patch is %x, cpu extended is %x) \n",
144 cpu, mc_header->processor_rev_id[0],
145 (equiv_cpu_id & 0xff));
146 return 0;
147 }
148
149 if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) {
150 printk(KERN_ERR "microcode: CPU%d patch does not match "
151 "(patch is %x, cpu base id is %x) \n",
152 cpu, mc_header->processor_rev_id[1],
153 ((equiv_cpu_id >> 16) & 0xff));
154
155 return 0;
156 }
157
158 /* ucode may be northbridge specific */
159 if (mc_header->nb_dev_id) {
160 nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
161 (mc_header->nb_dev_id & 0xff),
162 NULL);
163 if ((!nb_pci_dev) ||
164 (mc_header->nb_rev_id != nb_pci_dev->revision)) {
165 printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
166 pci_dev_put(nb_pci_dev);
167 return 0;
168 }
169 pci_dev_put(nb_pci_dev);
170 }
171
172 /* ucode may be southbridge specific */
173 if (mc_header->sb_dev_id) {
174 sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
175 (mc_header->sb_dev_id & 0xff),
176 NULL);
177 if ((!sb_pci_dev) ||
178 (mc_header->sb_rev_id != sb_pci_dev->revision)) {
179 printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
180 pci_dev_put(sb_pci_dev);
181 return 0;
182 }
183 pci_dev_put(sb_pci_dev);
184 }
185
186 if (mc_header->patch_id <= rev)
187 return 0;
188
189 return 1;
190}
191
192static void apply_microcode_amd(int cpu)
193{
194 unsigned long flags;
195 unsigned int eax, edx;
196 unsigned int rev;
197 int cpu_num = raw_smp_processor_id();
198 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
199 struct microcode_amd *mc_amd = uci->mc;
200 unsigned long addr;
201
202 /* We should bind the task to the CPU */
203 BUG_ON(cpu_num != cpu);
204
205 if (mc_amd == NULL)
206 return;
207
208 spin_lock_irqsave(&microcode_update_lock, flags);
209
210 addr = (unsigned long)&mc_amd->hdr.data_code;
211 edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
212 eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
213
214 asm volatile("movl %0, %%ecx; wrmsr" :
215 : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
216
217 /* get patch id after patching */
218 asm volatile("movl %1, %%ecx; rdmsr"
219 : "=a" (rev)
220 : "i" (0x0000008B) : "ecx");
221
222 spin_unlock_irqrestore(&microcode_update_lock, flags);
223
224 /* check current patch id and patch's id for match */
225 if (rev != mc_amd->hdr.patch_id) {
226 printk(KERN_ERR "microcode: CPU%d update from revision "
227 "0x%x to 0x%x failed\n", cpu_num,
228 mc_amd->hdr.patch_id, rev);
229 return;
230 }
231
232 printk(KERN_INFO "microcode: CPU%d updated from revision "
233 "0x%x to 0x%x \n",
234 cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
235
236 uci->cpu_sig.rev = rev;
237}
238
239static void * get_next_ucode(u8 *buf, unsigned int size,
240 int (*get_ucode_data)(void *, const void *, size_t),
241 unsigned int *mc_size)
242{
243 unsigned int total_size;
244#define UCODE_CONTAINER_SECTION_HDR 8
245 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
246 void *mc;
247
248 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
249 return NULL;
250
251 if (section_hdr[0] != UCODE_UCODE_TYPE) {
252 printk(KERN_ERR "microcode: error! "
253 "Wrong microcode payload type field\n");
254 return NULL;
255 }
256
257 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
258
259 printk(KERN_INFO "microcode: size %u, total_size %u\n",
260 size, total_size);
261
262 if (total_size > size || total_size > UCODE_MAX_SIZE) {
263 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
264 return NULL;
265 }
266
267 mc = vmalloc(UCODE_MAX_SIZE);
268 if (mc) {
269 memset(mc, 0, UCODE_MAX_SIZE);
270 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
271 vfree(mc);
272 mc = NULL;
273 } else
274 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
275 }
276#undef UCODE_CONTAINER_SECTION_HDR
277 return mc;
278}
279
280
281static int install_equiv_cpu_table(u8 *buf,
282 int (*get_ucode_data)(void *, const void *, size_t))
283{
284#define UCODE_CONTAINER_HEADER_SIZE 12
285 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
286 unsigned int *buf_pos = (unsigned int *)container_hdr;
287 unsigned long size;
288
289 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
290 return 0;
291
292 size = buf_pos[2];
293
294 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
295 printk(KERN_ERR "microcode: error! "
296 "Wrong microcode equivalnet cpu table\n");
297 return 0;
298 }
299
300 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
301 if (!equiv_cpu_table) {
302 printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
303 return 0;
304 }
305
306 buf += UCODE_CONTAINER_HEADER_SIZE;
307 if (get_ucode_data(equiv_cpu_table, buf, size)) {
308 vfree(equiv_cpu_table);
309 return 0;
310 }
311
312 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
313#undef UCODE_CONTAINER_HEADER_SIZE
314}
315
316static void free_equiv_cpu_table(void)
317{
318 if (equiv_cpu_table) {
319 vfree(equiv_cpu_table);
320 equiv_cpu_table = NULL;
321 }
322}
323
324static int generic_load_microcode(int cpu, void *data, size_t size,
325 int (*get_ucode_data)(void *, const void *, size_t))
326{
327 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
328 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
329 int new_rev = uci->cpu_sig.rev;
330 unsigned int leftover;
331 unsigned long offset;
332
333 offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
334 if (!offset) {
335 printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
336 return -EINVAL;
337 }
338
339 ucode_ptr += offset;
340 leftover = size - offset;
341
342 while (leftover) {
343 unsigned int uninitialized_var(mc_size);
344 struct microcode_header_amd *mc_header;
345
346 mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
347 if (!mc)
348 break;
349
350 mc_header = (struct microcode_header_amd *)mc;
351 if (get_matching_microcode(cpu, mc, new_rev)) {
352 if (new_mc)
353 vfree(new_mc);
354 new_rev = mc_header->patch_id;
355 new_mc = mc;
356 } else
357 vfree(mc);
358
359 ucode_ptr += mc_size;
360 leftover -= mc_size;
361 }
362
363 if (new_mc) {
364 if (!leftover) {
365 if (uci->mc)
366 vfree(uci->mc);
367 uci->mc = new_mc;
368 pr_debug("microcode: CPU%d found a matching microcode update with"
369 " version 0x%x (current=0x%x)\n",
370 cpu, new_rev, uci->cpu_sig.rev);
371 } else
372 vfree(new_mc);
373 }
374
375 free_equiv_cpu_table();
376
377 return (int)leftover;
378}
379
380static int get_ucode_fw(void *to, const void *from, size_t n)
381{
382 memcpy(to, from, n);
383 return 0;
384}
385
386static int request_microcode_fw(int cpu, struct device *device)
387{
388 const char *fw_name = "amd-ucode/microcode_amd.bin";
389 const struct firmware *firmware;
390 int ret;
391
392 /* We should bind the task to the CPU */
393 BUG_ON(cpu != raw_smp_processor_id());
394
395 ret = request_firmware(&firmware, fw_name, device);
396 if (ret) {
397 printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name);
398 return ret;
399 }
400
401 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
402 &get_ucode_fw);
403
404 release_firmware(firmware);
405
406 return ret;
407}
408
409static int request_microcode_user(int cpu, const void __user *buf, size_t size)
410{
411 printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode"
412 "is not supported\n");
413 return -1;
414}
415
416static void microcode_fini_cpu_amd(int cpu)
417{
418 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
419
420 vfree(uci->mc);
421 uci->mc = NULL;
422}
423
424static struct microcode_ops microcode_amd_ops = {
425 .request_microcode_user = request_microcode_user,
426 .request_microcode_fw = request_microcode_fw,
427 .collect_cpu_info = collect_cpu_info_amd,
428 .apply_microcode = apply_microcode_amd,
429 .microcode_fini_cpu = microcode_fini_cpu_amd,
430};
431
432struct microcode_ops * __init init_amd_microcode(void)
433{
434 return &microcode_amd_ops;
435}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
new file mode 100644
index 000000000000..936d8d55f230
--- /dev/null
+++ b/arch/x86/kernel/microcode_core.c
@@ -0,0 +1,508 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100#define MICROCODE_VERSION "2.00"
101
102struct microcode_ops *microcode_ops;
103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex);
106
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info);
109
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size)
112{
113 cpumask_t old;
114 int error = 0;
115 int cpu;
116
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
121
122 if (!uci->valid)
123 continue;
124
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
126 error = microcode_ops->request_microcode_user(cpu, buf, size);
127 if (error < 0)
128 goto out;
129 if (!error)
130 microcode_ops->apply_microcode(cpu);
131 }
132out:
133 set_cpus_allowed_ptr(current, &old);
134 return error;
135}
136
137static int microcode_open(struct inode *unused1, struct file *unused2)
138{
139 cycle_kernel_lock();
140 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
141}
142
143static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos)
145{
146 ssize_t ret;
147
148 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
150 num_physpages);
151 return -EINVAL;
152 }
153
154 get_online_cpus();
155 mutex_lock(&microcode_mutex);
156
157 ret = do_microcode_update(buf, len);
158 if (!ret)
159 ret = (ssize_t)len;
160
161 mutex_unlock(&microcode_mutex);
162 put_online_cpus();
163
164 return ret;
165}
166
167static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE,
169 .write = microcode_write,
170 .open = microcode_open,
171};
172
173static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR,
175 .name = "microcode",
176 .fops = &microcode_fops,
177};
178
179static int __init microcode_dev_init(void)
180{
181 int error;
182
183 error = misc_register(&microcode_dev);
184 if (error) {
185 printk(KERN_ERR
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error;
189 }
190
191 return 0;
192}
193
194static void microcode_dev_exit(void)
195{
196 misc_deregister(&microcode_dev);
197}
198
199MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
200#else
201#define microcode_dev_init() 0
202#define microcode_dev_exit() do { } while (0)
203#endif
204
205/* fake device for request_firmware */
206struct platform_device *microcode_pdev;
207
208static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr,
210 const char *buf, size_t sz)
211{
212 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
213 char *end;
214 unsigned long val = simple_strtoul(buf, &end, 0);
215 int err = 0;
216 int cpu = dev->id;
217
218 if (end == buf)
219 return -EINVAL;
220 if (val == 1) {
221 cpumask_t old = current->cpus_allowed;
222
223 get_online_cpus();
224 if (cpu_online(cpu)) {
225 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
226 mutex_lock(&microcode_mutex);
227 if (uci->valid) {
228 err = microcode_ops->request_microcode_fw(cpu,
229 &microcode_pdev->dev);
230 if (!err)
231 microcode_ops->apply_microcode(cpu);
232 }
233 mutex_unlock(&microcode_mutex);
234 set_cpus_allowed_ptr(current, &old);
235 }
236 put_online_cpus();
237 }
238 if (err)
239 return err;
240 return sz;
241}
242
243static ssize_t version_show(struct sys_device *dev,
244 struct sysdev_attribute *attr, char *buf)
245{
246 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
247
248 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
249}
250
251static ssize_t pf_show(struct sys_device *dev,
252 struct sysdev_attribute *attr, char *buf)
253{
254 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
255
256 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
257}
258
259static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
260static SYSDEV_ATTR(version, 0400, version_show, NULL);
261static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
262
263static struct attribute *mc_default_attrs[] = {
264 &attr_reload.attr,
265 &attr_version.attr,
266 &attr_processor_flags.attr,
267 NULL
268};
269
270static struct attribute_group mc_attr_group = {
271 .attrs = mc_default_attrs,
272 .name = "microcode",
273};
274
275static void microcode_fini_cpu(int cpu)
276{
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278
279 mutex_lock(&microcode_mutex);
280 microcode_ops->microcode_fini_cpu(cpu);
281 uci->valid = 0;
282 mutex_unlock(&microcode_mutex);
283}
284
285static void collect_cpu_info(int cpu)
286{
287 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
288
289 memset(uci, 0, sizeof(*uci));
290 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
291 uci->valid = 1;
292}
293
294static int microcode_resume_cpu(int cpu)
295{
296 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
297 struct cpu_signature nsig;
298
299 pr_debug("microcode: CPU%d resumed\n", cpu);
300
301 if (!uci->mc)
302 return 1;
303
304 /*
305 * Let's verify that the 'cached' ucode does belong
306 * to this cpu (a bit of paranoia):
307 */
308 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
309 microcode_fini_cpu(cpu);
310 return -1;
311 }
312
313 if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) {
314 microcode_fini_cpu(cpu);
315 /* Should we look for a new ucode here? */
316 return 1;
317 }
318
319 return 0;
320}
321
322void microcode_update_cpu(int cpu)
323{
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
325 int err = 0;
326
327 /*
328 * Check if the system resume is in progress (uci->valid != NULL),
329 * otherwise just request a firmware:
330 */
331 if (uci->valid) {
332 err = microcode_resume_cpu(cpu);
333 } else {
334 collect_cpu_info(cpu);
335 if (uci->valid && system_state == SYSTEM_RUNNING)
336 err = microcode_ops->request_microcode_fw(cpu,
337 &microcode_pdev->dev);
338 }
339 if (!err)
340 microcode_ops->apply_microcode(cpu);
341}
342
343static void microcode_init_cpu(int cpu)
344{
345 cpumask_t old = current->cpus_allowed;
346
347 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
348 /* We should bind the task to the CPU */
349 BUG_ON(raw_smp_processor_id() != cpu);
350
351 mutex_lock(&microcode_mutex);
352 microcode_update_cpu(cpu);
353 mutex_unlock(&microcode_mutex);
354
355 set_cpus_allowed_ptr(current, &old);
356}
357
358static int mc_sysdev_add(struct sys_device *sys_dev)
359{
360 int err, cpu = sys_dev->id;
361 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
362
363 if (!cpu_online(cpu))
364 return 0;
365
366 pr_debug("microcode: CPU%d added\n", cpu);
367 memset(uci, 0, sizeof(*uci));
368
369 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
370 if (err)
371 return err;
372
373 microcode_init_cpu(cpu);
374 return 0;
375}
376
377static int mc_sysdev_remove(struct sys_device *sys_dev)
378{
379 int cpu = sys_dev->id;
380
381 if (!cpu_online(cpu))
382 return 0;
383
384 pr_debug("microcode: CPU%d removed\n", cpu);
385 microcode_fini_cpu(cpu);
386 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
387 return 0;
388}
389
390static int mc_sysdev_resume(struct sys_device *dev)
391{
392 int cpu = dev->id;
393
394 if (!cpu_online(cpu))
395 return 0;
396
397 /* only CPU 0 will apply ucode here */
398 microcode_update_cpu(0);
399 return 0;
400}
401
402static struct sysdev_driver mc_sysdev_driver = {
403 .add = mc_sysdev_add,
404 .remove = mc_sysdev_remove,
405 .resume = mc_sysdev_resume,
406};
407
408static __cpuinit int
409mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
410{
411 unsigned int cpu = (unsigned long)hcpu;
412 struct sys_device *sys_dev;
413
414 sys_dev = get_cpu_sysdev(cpu);
415 switch (action) {
416 case CPU_ONLINE:
417 case CPU_ONLINE_FROZEN:
418 microcode_init_cpu(cpu);
419 case CPU_DOWN_FAILED:
420 case CPU_DOWN_FAILED_FROZEN:
421 pr_debug("microcode: CPU%d added\n", cpu);
422 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
423 printk(KERN_ERR "microcode: Failed to create the sysfs "
424 "group for CPU%d\n", cpu);
425 break;
426 case CPU_DOWN_PREPARE:
427 case CPU_DOWN_PREPARE_FROZEN:
428 /* Suspend is in progress, only remove the interface */
429 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
430 pr_debug("microcode: CPU%d removed\n", cpu);
431 break;
432 case CPU_DEAD:
433 case CPU_UP_CANCELED_FROZEN:
434 /* The CPU refused to come up during a system resume */
435 microcode_fini_cpu(cpu);
436 break;
437 }
438 return NOTIFY_OK;
439}
440
441static struct notifier_block __refdata mc_cpu_notifier = {
442 .notifier_call = mc_cpu_callback,
443};
444
445static int __init microcode_init(void)
446{
447 struct cpuinfo_x86 *c = &cpu_data(0);
448 int error;
449
450 if (c->x86_vendor == X86_VENDOR_INTEL)
451 microcode_ops = init_intel_microcode();
452 else if (c->x86_vendor == X86_VENDOR_AMD)
453 microcode_ops = init_amd_microcode();
454
455 if (!microcode_ops) {
456 printk(KERN_ERR "microcode: no support for this CPU vendor\n");
457 return -ENODEV;
458 }
459
460 error = microcode_dev_init();
461 if (error)
462 return error;
463 microcode_pdev = platform_device_register_simple("microcode", -1,
464 NULL, 0);
465 if (IS_ERR(microcode_pdev)) {
466 microcode_dev_exit();
467 return PTR_ERR(microcode_pdev);
468 }
469
470 get_online_cpus();
471 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
472 put_online_cpus();
473 if (error) {
474 microcode_dev_exit();
475 platform_device_unregister(microcode_pdev);
476 return error;
477 }
478
479 register_hotcpu_notifier(&mc_cpu_notifier);
480
481 printk(KERN_INFO
482 "Microcode Update Driver: v" MICROCODE_VERSION
483 " <tigran@aivazian.fsnet.co.uk>"
484 " <peter.oruba@amd.com>\n");
485
486 return 0;
487}
488
489static void __exit microcode_exit(void)
490{
491 microcode_dev_exit();
492
493 unregister_hotcpu_notifier(&mc_cpu_notifier);
494
495 get_online_cpus();
496 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
497 put_online_cpus();
498
499 platform_device_unregister(microcode_pdev);
500
501 microcode_ops = NULL;
502
503 printk(KERN_INFO
504 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
505}
506
507module_init(microcode_init);
508module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
new file mode 100644
index 000000000000..622dc4a21784
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel.c
@@ -0,0 +1,480 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100struct microcode_header_intel {
101 unsigned int hdrver;
102 unsigned int rev;
103 unsigned int date;
104 unsigned int sig;
105 unsigned int cksum;
106 unsigned int ldrver;
107 unsigned int pf;
108 unsigned int datasize;
109 unsigned int totalsize;
110 unsigned int reserved[3];
111};
112
113struct microcode_intel {
114 struct microcode_header_intel hdr;
115 unsigned int bits[0];
116};
117
118/* microcode format is extended from prescott processors */
119struct extended_signature {
120 unsigned int sig;
121 unsigned int pf;
122 unsigned int cksum;
123};
124
125struct extended_sigtable {
126 unsigned int count;
127 unsigned int cksum;
128 unsigned int reserved[3];
129 struct extended_signature sigs[0];
130};
131
132#define DEFAULT_UCODE_DATASIZE (2000)
133#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
134#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
135#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
136#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
137#define DWSIZE (sizeof(u32))
138#define get_totalsize(mc) \
139 (((struct microcode_intel *)mc)->hdr.totalsize ? \
140 ((struct microcode_intel *)mc)->hdr.totalsize : \
141 DEFAULT_UCODE_TOTALSIZE)
142
143#define get_datasize(mc) \
144 (((struct microcode_intel *)mc)->hdr.datasize ? \
145 ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
146
147#define sigmatch(s1, s2, p1, p2) \
148 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
149
150#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
151
152/* serialize access to the physical write to MSR 0x79 */
153static DEFINE_SPINLOCK(microcode_update_lock);
154
155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
156{
157 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
158 unsigned int val[2];
159
160 memset(csig, 0, sizeof(*csig));
161
162 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
163 cpu_has(c, X86_FEATURE_IA64)) {
164 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
165 "processor\n", cpu_num);
166 return -1;
167 }
168
169 csig->sig = cpuid_eax(0x00000001);
170
171 if ((c->x86_model >= 5) || (c->x86 > 6)) {
172 /* get processor flags from MSR 0x17 */
173 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
174 csig->pf = 1 << ((val[1] >> 18) & 7);
175 }
176
177 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
178 /* see notes above for revision 1.07. Apparent chip bug */
179 sync_core();
180 /* get the current revision from MSR 0x8B */
181 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
182 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
183 csig->sig, csig->pf, csig->rev);
184
185 return 0;
186}
187
188static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
189{
190 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
191}
192
193static inline int
194update_match_revision(struct microcode_header_intel *mc_header, int rev)
195{
196 return (mc_header->rev <= rev) ? 0 : 1;
197}
198
199static int microcode_sanity_check(void *mc)
200{
201 struct microcode_header_intel *mc_header = mc;
202 struct extended_sigtable *ext_header = NULL;
203 struct extended_signature *ext_sig;
204 unsigned long total_size, data_size, ext_table_size;
205 int sum, orig_sum, ext_sigcount = 0, i;
206
207 total_size = get_totalsize(mc_header);
208 data_size = get_datasize(mc_header);
209 if (data_size + MC_HEADER_SIZE > total_size) {
210 printk(KERN_ERR "microcode: error! "
211 "Bad data size in microcode data file\n");
212 return -EINVAL;
213 }
214
215 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
216 printk(KERN_ERR "microcode: error! "
217 "Unknown microcode update format\n");
218 return -EINVAL;
219 }
220 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
221 if (ext_table_size) {
222 if ((ext_table_size < EXT_HEADER_SIZE)
223 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
224 printk(KERN_ERR "microcode: error! "
225 "Small exttable size in microcode data file\n");
226 return -EINVAL;
227 }
228 ext_header = mc + MC_HEADER_SIZE + data_size;
229 if (ext_table_size != exttable_size(ext_header)) {
230 printk(KERN_ERR "microcode: error! "
231 "Bad exttable size in microcode data file\n");
232 return -EFAULT;
233 }
234 ext_sigcount = ext_header->count;
235 }
236
237 /* check extended table checksum */
238 if (ext_table_size) {
239 int ext_table_sum = 0;
240 int *ext_tablep = (int *)ext_header;
241
242 i = ext_table_size / DWSIZE;
243 while (i--)
244 ext_table_sum += ext_tablep[i];
245 if (ext_table_sum) {
246 printk(KERN_WARNING "microcode: aborting, "
247 "bad extended signature table checksum\n");
248 return -EINVAL;
249 }
250 }
251
252 /* calculate the checksum */
253 orig_sum = 0;
254 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
255 while (i--)
256 orig_sum += ((int *)mc)[i];
257 if (orig_sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n");
259 return -EINVAL;
260 }
261 if (!ext_table_size)
262 return 0;
263 /* check extended signature checksum */
264 for (i = 0; i < ext_sigcount; i++) {
265 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
266 EXT_SIGNATURE_SIZE * i;
267 sum = orig_sum
268 - (mc_header->sig + mc_header->pf + mc_header->cksum)
269 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
270 if (sum) {
271 printk(KERN_ERR "microcode: aborting, bad checksum\n");
272 return -EINVAL;
273 }
274 }
275 return 0;
276}
277
278/*
279 * return 0 - no update found
280 * return 1 - found update
281 */
282static int
283get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
284{
285 struct microcode_header_intel *mc_header = mc;
286 struct extended_sigtable *ext_header;
287 unsigned long total_size = get_totalsize(mc_header);
288 int ext_sigcount, i;
289 struct extended_signature *ext_sig;
290
291 if (!update_match_revision(mc_header, rev))
292 return 0;
293
294 if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
295 return 1;
296
297 /* Look for ext. headers: */
298 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
299 return 0;
300
301 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
302 ext_sigcount = ext_header->count;
303 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
304
305 for (i = 0; i < ext_sigcount; i++) {
306 if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
307 return 1;
308 ext_sig++;
309 }
310 return 0;
311}
312
313static void apply_microcode(int cpu)
314{
315 unsigned long flags;
316 unsigned int val[2];
317 int cpu_num = raw_smp_processor_id();
318 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
319 struct microcode_intel *mc_intel = uci->mc;
320
321 /* We should bind the task to the CPU */
322 BUG_ON(cpu_num != cpu);
323
324 if (mc_intel == NULL)
325 return;
326
327 /* serialize access to the physical write to MSR 0x79 */
328 spin_lock_irqsave(&microcode_update_lock, flags);
329
330 /* write microcode via MSR 0x79 */
331 wrmsr(MSR_IA32_UCODE_WRITE,
332 (unsigned long) mc_intel->bits,
333 (unsigned long) mc_intel->bits >> 16 >> 16);
334 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
335
336 /* see notes above for revision 1.07. Apparent chip bug */
337 sync_core();
338
339 /* get the current revision from MSR 0x8B */
340 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
341
342 spin_unlock_irqrestore(&microcode_update_lock, flags);
343 if (val[1] != mc_intel->hdr.rev) {
344 printk(KERN_ERR "microcode: CPU%d update from revision "
345 "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]);
346 return;
347 }
348 printk(KERN_INFO "microcode: CPU%d updated from revision "
349 "0x%x to 0x%x, date = %04x-%02x-%02x \n",
350 cpu_num, uci->cpu_sig.rev, val[1],
351 mc_intel->hdr.date & 0xffff,
352 mc_intel->hdr.date >> 24,
353 (mc_intel->hdr.date >> 16) & 0xff);
354 uci->cpu_sig.rev = val[1];
355}
356
357static int generic_load_microcode(int cpu, void *data, size_t size,
358 int (*get_ucode_data)(void *, const void *, size_t))
359{
360 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
361 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
362 int new_rev = uci->cpu_sig.rev;
363 unsigned int leftover = size;
364
365 while (leftover) {
366 struct microcode_header_intel mc_header;
367 unsigned int mc_size;
368
369 if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
370 break;
371
372 mc_size = get_totalsize(&mc_header);
373 if (!mc_size || mc_size > leftover) {
374 printk(KERN_ERR "microcode: error!"
375 "Bad data in microcode data file\n");
376 break;
377 }
378
379 mc = vmalloc(mc_size);
380 if (!mc)
381 break;
382
383 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
384 microcode_sanity_check(mc) < 0) {
385 vfree(mc);
386 break;
387 }
388
389 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
390 if (new_mc)
391 vfree(new_mc);
392 new_rev = mc_header.rev;
393 new_mc = mc;
394 } else
395 vfree(mc);
396
397 ucode_ptr += mc_size;
398 leftover -= mc_size;
399 }
400
401 if (new_mc) {
402 if (!leftover) {
403 if (uci->mc)
404 vfree(uci->mc);
405 uci->mc = (struct microcode_intel *)new_mc;
406 pr_debug("microcode: CPU%d found a matching microcode update with"
407 " version 0x%x (current=0x%x)\n",
408 cpu, new_rev, uci->cpu_sig.rev);
409 } else
410 vfree(new_mc);
411 }
412
413 return (int)leftover;
414}
415
416static int get_ucode_fw(void *to, const void *from, size_t n)
417{
418 memcpy(to, from, n);
419 return 0;
420}
421
422static int request_microcode_fw(int cpu, struct device *device)
423{
424 char name[30];
425 struct cpuinfo_x86 *c = &cpu_data(cpu);
426 const struct firmware *firmware;
427 int ret;
428
429 /* We should bind the task to the CPU */
430 BUG_ON(cpu != raw_smp_processor_id());
431 sprintf(name, "intel-ucode/%02x-%02x-%02x",
432 c->x86, c->x86_model, c->x86_mask);
433 ret = request_firmware(&firmware, name, device);
434 if (ret) {
435 pr_debug("microcode: data file %s load failed\n", name);
436 return ret;
437 }
438
439 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
440 &get_ucode_fw);
441
442 release_firmware(firmware);
443
444 return ret;
445}
446
447static int get_ucode_user(void *to, const void *from, size_t n)
448{
449 return copy_from_user(to, from, n);
450}
451
452static int request_microcode_user(int cpu, const void __user *buf, size_t size)
453{
454 /* We should bind the task to the CPU */
455 BUG_ON(cpu != raw_smp_processor_id());
456
457 return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
458}
459
460static void microcode_fini_cpu(int cpu)
461{
462 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
463
464 vfree(uci->mc);
465 uci->mc = NULL;
466}
467
468struct microcode_ops microcode_intel_ops = {
469 .request_microcode_user = request_microcode_user,
470 .request_microcode_fw = request_microcode_fw,
471 .collect_cpu_info = collect_cpu_info,
472 .apply_microcode = apply_microcode,
473 .microcode_fini_cpu = microcode_fini_cpu,
474};
475
476struct microcode_ops * __init init_intel_microcode(void)
477{
478 return &microcode_intel_ops;
479}
480
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 000000000000..0e9f1982b1dd
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,37 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/spinlock.h>
6#include <linux/module.h>
7
8#include <asm/paravirt.h>
9
10static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
11{
12 __raw_spin_lock(lock);
13}
14
15struct pv_lock_ops pv_lock_ops = {
16#ifdef CONFIG_SMP
17 .spin_is_locked = __ticket_spin_is_locked,
18 .spin_is_contended = __ticket_spin_is_contended,
19
20 .spin_lock = __ticket_spin_lock,
21 .spin_lock_flags = default_spin_lock_flags,
22 .spin_trylock = __ticket_spin_trylock,
23 .spin_unlock = __ticket_spin_unlock,
24#endif
25};
26EXPORT_SYMBOL(pv_lock_ops);
27
28void __init paravirt_use_bytelocks(void)
29{
30#ifdef CONFIG_SMP
31 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
32 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
33 pv_lock_ops.spin_lock = __byte_spin_lock;
34 pv_lock_ops.spin_trylock = __byte_spin_trylock;
35 pv_lock_ops.spin_unlock = __byte_spin_unlock;
36#endif
37}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 6b0bb73998dd..e4c8fb608873 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
268 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
269} 269}
270 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
282struct pv_info pv_info = { 271struct pv_info pv_info = {
283 .name = "bare hardware", 272 .name = "bare hardware",
284 .paravirt_enabled = 0, 273 .paravirt_enabled = 0,
@@ -349,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = {
349 .write_ldt_entry = native_write_ldt_entry, 338 .write_ldt_entry = native_write_ldt_entry,
350 .write_gdt_entry = native_write_gdt_entry, 339 .write_gdt_entry = native_write_gdt_entry,
351 .write_idt_entry = native_write_idt_entry, 340 .write_idt_entry = native_write_idt_entry,
341
342 .alloc_ldt = paravirt_nop,
343 .free_ldt = paravirt_nop,
344
352 .load_sp0 = native_load_sp0, 345 .load_sp0 = native_load_sp0,
353 346
354#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 347#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -460,18 +453,6 @@ struct pv_mmu_ops pv_mmu_ops = {
460 .set_fixmap = native_set_fixmap, 453 .set_fixmap = native_set_fixmap,
461}; 454};
462 455
463struct pv_lock_ops pv_lock_ops = {
464#ifdef CONFIG_SMP
465 .spin_is_locked = __ticket_spin_is_locked,
466 .spin_is_contended = __ticket_spin_is_contended,
467
468 .spin_lock = __ticket_spin_lock,
469 .spin_trylock = __ticket_spin_trylock,
470 .spin_unlock = __ticket_spin_unlock,
471#endif
472};
473EXPORT_SYMBOL(pv_lock_ops);
474
475EXPORT_SYMBOL_GPL(pv_time_ops); 456EXPORT_SYMBOL_GPL(pv_time_ops);
476EXPORT_SYMBOL (pv_cpu_ops); 457EXPORT_SYMBOL (pv_cpu_ops);
477EXPORT_SYMBOL (pv_mmu_ops); 458EXPORT_SYMBOL (pv_mmu_ops);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 205188db9626..922c14058f97 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -76,47 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
76 return ((unsigned long *)tsk->thread.sp)[3]; 76 return ((unsigned long *)tsk->thread.sp)[3];
77} 77}
78 78
79#ifdef CONFIG_HOTPLUG_CPU 79#ifndef CONFIG_SMP
80#include <asm/nmi.h>
81
82static void cpu_exit_clear(void)
83{
84 int cpu = raw_smp_processor_id();
85
86 idle_task_exit();
87
88 cpu_uninit();
89 irq_ctx_exit(cpu);
90
91 cpu_clear(cpu, cpu_callout_map);
92 cpu_clear(cpu, cpu_callin_map);
93
94 numa_remove_cpu(cpu);
95 c1e_remove_cpu(cpu);
96}
97
98/* We don't actually take CPU down, just spin without interrupts. */
99static inline void play_dead(void)
100{
101 /* This must be done before dead CPU ack */
102 cpu_exit_clear();
103 mb();
104 /* Ack it */
105 __get_cpu_var(cpu_state) = CPU_DEAD;
106
107 /*
108 * With physical CPU hotplug, we should halt the cpu
109 */
110 local_irq_disable();
111 /* mask all interrupts, flush any and all caches, and halt */
112 wbinvd_halt();
113}
114#else
115static inline void play_dead(void) 80static inline void play_dead(void)
116{ 81{
117 BUG(); 82 BUG();
118} 83}
119#endif /* CONFIG_HOTPLUG_CPU */ 84#endif
120 85
121/* 86/*
122 * The idle thread. There's no useful work to be 87 * The idle thread. There's no useful work to be
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b6b508ea7110..ca80394ef5b8 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -86,30 +86,12 @@ void exit_idle(void)
86 __exit_idle(); 86 __exit_idle();
87} 87}
88 88
89#ifdef CONFIG_HOTPLUG_CPU 89#ifndef CONFIG_SMP
90DECLARE_PER_CPU(int, cpu_state);
91
92#include <linux/nmi.h>
93/* We halt the CPU with physical CPU hotplug */
94static inline void play_dead(void)
95{
96 idle_task_exit();
97 c1e_remove_cpu(raw_smp_processor_id());
98
99 mb();
100 /* Ack it */
101 __get_cpu_var(cpu_state) = CPU_DEAD;
102
103 local_irq_disable();
104 /* mask all interrupts, flush any and all caches, and halt */
105 wbinvd_halt();
106}
107#else
108static inline void play_dead(void) 90static inline void play_dead(void)
109{ 91{
110 BUG(); 92 BUG();
111} 93}
112#endif /* CONFIG_HOTPLUG_CPU */ 94#endif
113 95
114/* 96/*
115 * The idle thread. There's no useful work to be 97 * The idle thread. There's no useful work to be
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e375b658efc3..0a6d8c12e10d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -40,7 +40,9 @@ enum x86_regset {
40 REGSET_GENERAL, 40 REGSET_GENERAL,
41 REGSET_FP, 41 REGSET_FP,
42 REGSET_XFP, 42 REGSET_XFP,
43 REGSET_IOPERM64 = REGSET_XFP,
43 REGSET_TLS, 44 REGSET_TLS,
45 REGSET_IOPERM32,
44}; 46};
45 47
46/* 48/*
@@ -555,6 +557,29 @@ static int ptrace_set_debugreg(struct task_struct *child,
555 return 0; 557 return 0;
556} 558}
557 559
560/*
561 * These access the current or another (stopped) task's io permission
562 * bitmap for debugging or core dump.
563 */
564static int ioperm_active(struct task_struct *target,
565 const struct user_regset *regset)
566{
567 return target->thread.io_bitmap_max / regset->size;
568}
569
570static int ioperm_get(struct task_struct *target,
571 const struct user_regset *regset,
572 unsigned int pos, unsigned int count,
573 void *kbuf, void __user *ubuf)
574{
575 if (!target->thread.io_bitmap_ptr)
576 return -ENXIO;
577
578 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
579 target->thread.io_bitmap_ptr,
580 0, IO_BITMAP_BYTES);
581}
582
558#ifdef CONFIG_X86_PTRACE_BTS 583#ifdef CONFIG_X86_PTRACE_BTS
559/* 584/*
560 * The configuration for a particular BTS hardware implementation. 585 * The configuration for a particular BTS hardware implementation.
@@ -1385,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = {
1385 .size = sizeof(long), .align = sizeof(long), 1410 .size = sizeof(long), .align = sizeof(long),
1386 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1411 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1387 }, 1412 },
1413 [REGSET_IOPERM64] = {
1414 .core_note_type = NT_386_IOPERM,
1415 .n = IO_BITMAP_LONGS,
1416 .size = sizeof(long), .align = sizeof(long),
1417 .active = ioperm_active, .get = ioperm_get
1418 },
1388}; 1419};
1389 1420
1390static const struct user_regset_view user_x86_64_view = { 1421static const struct user_regset_view user_x86_64_view = {
@@ -1431,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = {
1431 .active = regset_tls_active, 1462 .active = regset_tls_active,
1432 .get = regset_tls_get, .set = regset_tls_set 1463 .get = regset_tls_get, .set = regset_tls_set
1433 }, 1464 },
1465 [REGSET_IOPERM32] = {
1466 .core_note_type = NT_386_IOPERM,
1467 .n = IO_BITMAP_BYTES / sizeof(u32),
1468 .size = sizeof(u32), .align = sizeof(u32),
1469 .active = ioperm_active, .get = ioperm_get
1470 },
1434}; 1471};
1435 1472
1436static const struct user_regset_view user_x86_32_view = { 1473static const struct user_regset_view user_x86_32_view = {
@@ -1452,7 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1452#endif 1489#endif
1453} 1490}
1454 1491
1455void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) 1492void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1493 int error_code, int si_code)
1456{ 1494{
1457 struct siginfo info; 1495 struct siginfo info;
1458 1496
@@ -1461,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1461 1499
1462 memset(&info, 0, sizeof(info)); 1500 memset(&info, 0, sizeof(info));
1463 info.si_signo = SIGTRAP; 1501 info.si_signo = SIGTRAP;
1464 info.si_code = TRAP_BRKPT; 1502 info.si_code = si_code;
1465 1503
1466 /* User-mode ip? */ 1504 /* User-mode ip? */
1467 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1505 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
@@ -1548,5 +1586,5 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1548 */ 1586 */
1549 if (test_thread_flag(TIF_SINGLESTEP) && 1587 if (test_thread_flag(TIF_SINGLESTEP) &&
1550 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) 1588 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
1551 send_sigtrap(current, regs, 0); 1589 send_sigtrap(current, regs, 0, TRAP_BRKPT);
1552} 1590}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 46c98efbbf8d..21b8e0a59780 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -582,6 +582,190 @@ static struct x86_quirks default_x86_quirks __initdata;
582struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; 582struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
583 583
584/* 584/*
585 * Some BIOSes seem to corrupt the low 64k of memory during events
586 * like suspend/resume and unplugging an HDMI cable. Reserve all
587 * remaining free memory in that area and fill it with a distinct
588 * pattern.
589 */
590#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
591#define MAX_SCAN_AREAS 8
592
593static int __read_mostly memory_corruption_check = -1;
594
595static unsigned __read_mostly corruption_check_size = 64*1024;
596static unsigned __read_mostly corruption_check_period = 60; /* seconds */
597
598static struct e820entry scan_areas[MAX_SCAN_AREAS];
599static int num_scan_areas;
600
601
602static int set_corruption_check(char *arg)
603{
604 char *end;
605
606 memory_corruption_check = simple_strtol(arg, &end, 10);
607
608 return (*end == 0) ? 0 : -EINVAL;
609}
610early_param("memory_corruption_check", set_corruption_check);
611
612static int set_corruption_check_period(char *arg)
613{
614 char *end;
615
616 corruption_check_period = simple_strtoul(arg, &end, 10);
617
618 return (*end == 0) ? 0 : -EINVAL;
619}
620early_param("memory_corruption_check_period", set_corruption_check_period);
621
622static int set_corruption_check_size(char *arg)
623{
624 char *end;
625 unsigned size;
626
627 size = memparse(arg, &end);
628
629 if (*end == '\0')
630 corruption_check_size = size;
631
632 return (size == corruption_check_size) ? 0 : -EINVAL;
633}
634early_param("memory_corruption_check_size", set_corruption_check_size);
635
636
637static void __init setup_bios_corruption_check(void)
638{
639 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
640
641 if (memory_corruption_check == -1) {
642 memory_corruption_check =
643#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
644 1
645#else
646 0
647#endif
648 ;
649 }
650
651 if (corruption_check_size == 0)
652 memory_corruption_check = 0;
653
654 if (!memory_corruption_check)
655 return;
656
657 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
658
659 while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
660 u64 size;
661 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
662
663 if (addr == 0)
664 break;
665
666 if ((addr + size) > corruption_check_size)
667 size = corruption_check_size - addr;
668
669 if (size == 0)
670 break;
671
672 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
673 scan_areas[num_scan_areas].addr = addr;
674 scan_areas[num_scan_areas].size = size;
675 num_scan_areas++;
676
677 /* Assume we've already mapped this early memory */
678 memset(__va(addr), 0, size);
679
680 addr += size;
681 }
682
683 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
684 num_scan_areas);
685 update_e820();
686}
687
688static struct timer_list periodic_check_timer;
689
690void check_for_bios_corruption(void)
691{
692 int i;
693 int corruption = 0;
694
695 if (!memory_corruption_check)
696 return;
697
698 for(i = 0; i < num_scan_areas; i++) {
699 unsigned long *addr = __va(scan_areas[i].addr);
700 unsigned long size = scan_areas[i].size;
701
702 for(; size; addr++, size -= sizeof(unsigned long)) {
703 if (!*addr)
704 continue;
705 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
706 addr, __pa(addr), *addr);
707 corruption = 1;
708 *addr = 0;
709 }
710 }
711
712 WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
713}
714
715static void periodic_check_for_corruption(unsigned long data)
716{
717 check_for_bios_corruption();
718 mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
719}
720
721void start_periodic_check_for_corruption(void)
722{
723 if (!memory_corruption_check || corruption_check_period == 0)
724 return;
725
726 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
727 corruption_check_period);
728
729 init_timer(&periodic_check_timer);
730 periodic_check_timer.function = &periodic_check_for_corruption;
731 periodic_check_for_corruption(0);
732}
733#endif
734
735static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
736{
737 printk(KERN_NOTICE
738 "%s detected: BIOS may corrupt low RAM, working it around.\n",
739 d->ident);
740
741 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
742 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
743
744 return 0;
745}
746
747/* List of systems that have known low memory corruption BIOS problems */
748static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
749#ifdef CONFIG_X86_RESERVE_LOW_64K
750 {
751 .callback = dmi_low_memory_corruption,
752 .ident = "AMI BIOS",
753 .matches = {
754 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
755 },
756 },
757 {
758 .callback = dmi_low_memory_corruption,
759 .ident = "Phoenix BIOS",
760 .matches = {
761 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
762 },
763 },
764#endif
765 {}
766};
767
768/*
585 * Determine if we were loaded by an EFI loader. If so, then we have also been 769 * Determine if we were loaded by an EFI loader. If so, then we have also been
586 * passed the efi memmap, systab, etc., so we should use these data structures 770 * passed the efi memmap, systab, etc., so we should use these data structures
587 * for initialization. Note, the efi init code path is determined by the 771 * for initialization. Note, the efi init code path is determined by the
@@ -715,6 +899,10 @@ void __init setup_arch(char **cmdline_p)
715 899
716 finish_e820_parsing(); 900 finish_e820_parsing();
717 901
902 dmi_scan_machine();
903
904 dmi_check_system(bad_bios_dmi_table);
905
718#ifdef CONFIG_X86_32 906#ifdef CONFIG_X86_32
719 probe_roms(); 907 probe_roms();
720#endif 908#endif
@@ -771,6 +959,10 @@ void __init setup_arch(char **cmdline_p)
771 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 959 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
772#endif 960#endif
773 961
962#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
963 setup_bios_corruption_check();
964#endif
965
774 /* max_pfn_mapped is updated here */ 966 /* max_pfn_mapped is updated here */
775 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 967 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
776 max_pfn_mapped = max_low_pfn_mapped; 968 max_pfn_mapped = max_low_pfn_mapped;
@@ -799,8 +991,6 @@ void __init setup_arch(char **cmdline_p)
799 vsmp_init(); 991 vsmp_init();
800#endif 992#endif
801 993
802 dmi_scan_machine();
803
804 io_delay_init(); 994 io_delay_init();
805 995
806 /* 996 /*
@@ -903,3 +1093,5 @@ void __init setup_arch(char **cmdline_p)
903#endif 1093#endif
904#endif 1094#endif
905} 1095}
1096
1097
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index b21070ea33a4..d6dd057d0f22 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -27,6 +27,7 @@
27#include <asm/uaccess.h> 27#include <asm/uaccess.h>
28#include <asm/i387.h> 28#include <asm/i387.h>
29#include <asm/vdso.h> 29#include <asm/vdso.h>
30#include <asm/syscall.h>
30#include <asm/syscalls.h> 31#include <asm/syscalls.h>
31 32
32#include "sigframe.h" 33#include "sigframe.h"
@@ -112,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
112 return do_sigaltstack(uss, uoss, regs->sp); 113 return do_sigaltstack(uss, uoss, regs->sp);
113} 114}
114 115
116#define COPY(x) { \
117 err |= __get_user(regs->x, &sc->x); \
118}
119
120#define COPY_SEG(seg) { \
121 unsigned short tmp; \
122 err |= __get_user(tmp, &sc->seg); \
123 regs->seg = tmp; \
124}
125
126#define COPY_SEG_STRICT(seg) { \
127 unsigned short tmp; \
128 err |= __get_user(tmp, &sc->seg); \
129 regs->seg = tmp | 3; \
130}
131
132#define GET_SEG(seg) { \
133 unsigned short tmp; \
134 err |= __get_user(tmp, &sc->seg); \
135 loadsegment(seg, tmp); \
136}
115 137
116/* 138/*
117 * Do a signal return; undo the signal stack. 139 * Do a signal return; undo the signal stack.
@@ -120,28 +142,13 @@ static int
120restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 142restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
121 unsigned long *pax) 143 unsigned long *pax)
122{ 144{
145 void __user *buf;
146 unsigned int tmpflags;
123 unsigned int err = 0; 147 unsigned int err = 0;
124 148
125 /* Always make any pending restarted system calls return -EINTR */ 149 /* Always make any pending restarted system calls return -EINTR */
126 current_thread_info()->restart_block.fn = do_no_restart_syscall; 150 current_thread_info()->restart_block.fn = do_no_restart_syscall;
127 151
128#define COPY(x) err |= __get_user(regs->x, &sc->x)
129
130#define COPY_SEG(seg) \
131 { unsigned short tmp; \
132 err |= __get_user(tmp, &sc->seg); \
133 regs->seg = tmp; }
134
135#define COPY_SEG_STRICT(seg) \
136 { unsigned short tmp; \
137 err |= __get_user(tmp, &sc->seg); \
138 regs->seg = tmp|3; }
139
140#define GET_SEG(seg) \
141 { unsigned short tmp; \
142 err |= __get_user(tmp, &sc->seg); \
143 loadsegment(seg, tmp); }
144
145 GET_SEG(gs); 152 GET_SEG(gs);
146 COPY_SEG(fs); 153 COPY_SEG(fs);
147 COPY_SEG(es); 154 COPY_SEG(es);
@@ -151,21 +158,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
151 COPY_SEG_STRICT(cs); 158 COPY_SEG_STRICT(cs);
152 COPY_SEG_STRICT(ss); 159 COPY_SEG_STRICT(ss);
153 160
154 { 161 err |= __get_user(tmpflags, &sc->flags);
155 unsigned int tmpflags; 162 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
156 163 regs->orig_ax = -1; /* disable syscall checks */
157 err |= __get_user(tmpflags, &sc->flags);
158 regs->flags = (regs->flags & ~FIX_EFLAGS) |
159 (tmpflags & FIX_EFLAGS);
160 regs->orig_ax = -1; /* disable syscall checks */
161 }
162
163 {
164 void __user *buf;
165 164
166 err |= __get_user(buf, &sc->fpstate); 165 err |= __get_user(buf, &sc->fpstate);
167 err |= restore_i387_xstate(buf); 166 err |= restore_i387_xstate(buf);
168 }
169 167
170 err |= __get_user(*pax, &sc->ax); 168 err |= __get_user(*pax, &sc->ax);
171 return err; 169 return err;
@@ -214,9 +212,8 @@ badframe:
214 return 0; 212 return 0;
215} 213}
216 214
217asmlinkage int sys_rt_sigreturn(unsigned long __unused) 215static long do_rt_sigreturn(struct pt_regs *regs)
218{ 216{
219 struct pt_regs *regs = (struct pt_regs *)&__unused;
220 struct rt_sigframe __user *frame; 217 struct rt_sigframe __user *frame;
221 unsigned long ax; 218 unsigned long ax;
222 sigset_t set; 219 sigset_t set;
@@ -242,10 +239,17 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
242 return ax; 239 return ax;
243 240
244badframe: 241badframe:
245 force_sig(SIGSEGV, current); 242 signal_fault(regs, frame, "rt_sigreturn");
246 return 0; 243 return 0;
247} 244}
248 245
246asmlinkage int sys_rt_sigreturn(unsigned long __unused)
247{
248 struct pt_regs *regs = (struct pt_regs *)&__unused;
249
250 return do_rt_sigreturn(regs);
251}
252
249/* 253/*
250 * Set up a signal frame. 254 * Set up a signal frame.
251 */ 255 */
@@ -337,39 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
337} 341}
338 342
339static int 343static int
340setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, 344__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
341 struct pt_regs *regs) 345 struct pt_regs *regs)
342{ 346{
343 struct sigframe __user *frame; 347 struct sigframe __user *frame;
344 void __user *restorer; 348 void __user *restorer;
345 int err = 0; 349 int err = 0;
346 int usig;
347 void __user *fpstate = NULL; 350 void __user *fpstate = NULL;
348 351
349 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); 352 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
350 353
351 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 354 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
352 goto give_sigsegv; 355 return -EFAULT;
353 356
354 usig = current_thread_info()->exec_domain 357 if (__put_user(sig, &frame->sig))
355 && current_thread_info()->exec_domain->signal_invmap 358 return -EFAULT;
356 && sig < 32
357 ? current_thread_info()->exec_domain->signal_invmap[sig]
358 : sig;
359 359
360 err = __put_user(usig, &frame->sig); 360 if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
361 if (err) 361 return -EFAULT;
362 goto give_sigsegv;
363
364 err = setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]);
365 if (err)
366 goto give_sigsegv;
367 362
368 if (_NSIG_WORDS > 1) { 363 if (_NSIG_WORDS > 1) {
369 err = __copy_to_user(&frame->extramask, &set->sig[1], 364 if (__copy_to_user(&frame->extramask, &set->sig[1],
370 sizeof(frame->extramask)); 365 sizeof(frame->extramask)))
371 if (err) 366 return -EFAULT;
372 goto give_sigsegv;
373 } 367 }
374 368
375 if (current->mm->context.vdso) 369 if (current->mm->context.vdso)
@@ -394,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
394 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); 388 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
395 389
396 if (err) 390 if (err)
397 goto give_sigsegv; 391 return -EFAULT;
398 392
399 /* Set up registers for signal handler */ 393 /* Set up registers for signal handler */
400 regs->sp = (unsigned long)frame; 394 regs->sp = (unsigned long)frame;
@@ -409,38 +403,27 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
409 regs->cs = __USER_CS; 403 regs->cs = __USER_CS;
410 404
411 return 0; 405 return 0;
412
413give_sigsegv:
414 force_sigsegv(sig, current);
415 return -EFAULT;
416} 406}
417 407
418static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 408static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
419 sigset_t *set, struct pt_regs *regs) 409 sigset_t *set, struct pt_regs *regs)
420{ 410{
421 struct rt_sigframe __user *frame; 411 struct rt_sigframe __user *frame;
422 void __user *restorer; 412 void __user *restorer;
423 int err = 0; 413 int err = 0;
424 int usig;
425 void __user *fpstate = NULL; 414 void __user *fpstate = NULL;
426 415
427 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); 416 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
428 417
429 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 418 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
430 goto give_sigsegv; 419 return -EFAULT;
431
432 usig = current_thread_info()->exec_domain
433 && current_thread_info()->exec_domain->signal_invmap
434 && sig < 32
435 ? current_thread_info()->exec_domain->signal_invmap[sig]
436 : sig;
437 420
438 err |= __put_user(usig, &frame->sig); 421 err |= __put_user(sig, &frame->sig);
439 err |= __put_user(&frame->info, &frame->pinfo); 422 err |= __put_user(&frame->info, &frame->pinfo);
440 err |= __put_user(&frame->uc, &frame->puc); 423 err |= __put_user(&frame->uc, &frame->puc);
441 err |= copy_siginfo_to_user(&frame->info, info); 424 err |= copy_siginfo_to_user(&frame->info, info);
442 if (err) 425 if (err)
443 goto give_sigsegv; 426 return -EFAULT;
444 427
445 /* Create the ucontext. */ 428 /* Create the ucontext. */
446 if (cpu_has_xsave) 429 if (cpu_has_xsave)
@@ -456,7 +439,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
456 regs, set->sig[0]); 439 regs, set->sig[0]);
457 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 440 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
458 if (err) 441 if (err)
459 goto give_sigsegv; 442 return -EFAULT;
460 443
461 /* Set up to return from userspace. */ 444 /* Set up to return from userspace. */
462 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 445 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -476,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
476 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); 459 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
477 460
478 if (err) 461 if (err)
479 goto give_sigsegv; 462 return -EFAULT;
480 463
481 /* Set up registers for signal handler */ 464 /* Set up registers for signal handler */
482 regs->sp = (unsigned long)frame; 465 regs->sp = (unsigned long)frame;
483 regs->ip = (unsigned long)ka->sa.sa_handler; 466 regs->ip = (unsigned long)ka->sa.sa_handler;
484 regs->ax = (unsigned long)usig; 467 regs->ax = (unsigned long)sig;
485 regs->dx = (unsigned long)&frame->info; 468 regs->dx = (unsigned long)&frame->info;
486 regs->cx = (unsigned long)&frame->uc; 469 regs->cx = (unsigned long)&frame->uc;
487 470
@@ -491,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
491 regs->cs = __USER_CS; 474 regs->cs = __USER_CS;
492 475
493 return 0; 476 return 0;
494
495give_sigsegv:
496 force_sigsegv(sig, current);
497 return -EFAULT;
498} 477}
499 478
500/* 479/*
501 * OK, we're invoking a handler: 480 * OK, we're invoking a handler:
502 */ 481 */
482static int signr_convert(int sig)
483{
484 struct thread_info *info = current_thread_info();
485
486 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
487 return info->exec_domain->signal_invmap[sig];
488 return sig;
489}
490
491#define is_ia32 1
492#define ia32_setup_frame __setup_frame
493#define ia32_setup_rt_frame __setup_rt_frame
494
495static int
496setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
497 sigset_t *set, struct pt_regs *regs)
498{
499 int usig = signr_convert(sig);
500 int ret;
501
502 /* Set up the stack frame */
503 if (is_ia32) {
504 if (ka->sa.sa_flags & SA_SIGINFO)
505 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
506 else
507 ret = ia32_setup_frame(usig, ka, set, regs);
508 } else
509 ret = __setup_rt_frame(sig, ka, info, set, regs);
510
511 if (ret) {
512 force_sigsegv(sig, current);
513 return -EFAULT;
514 }
515
516 return ret;
517}
518
503static int 519static int
504handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 520handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
505 sigset_t *oldset, struct pt_regs *regs) 521 sigset_t *oldset, struct pt_regs *regs)
@@ -507,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
507 int ret; 523 int ret;
508 524
509 /* Are we from a system call? */ 525 /* Are we from a system call? */
510 if ((long)regs->orig_ax >= 0) { 526 if (syscall_get_nr(current, regs) >= 0) {
511 /* If so, check system call restarting.. */ 527 /* If so, check system call restarting.. */
512 switch (regs->ax) { 528 switch (syscall_get_error(current, regs)) {
513 case -ERESTART_RESTARTBLOCK: 529 case -ERESTART_RESTARTBLOCK:
514 case -ERESTARTNOHAND: 530 case -ERESTARTNOHAND:
515 regs->ax = -EINTR; 531 regs->ax = -EINTR;
@@ -536,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
536 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 552 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
537 regs->flags &= ~X86_EFLAGS_TF; 553 regs->flags &= ~X86_EFLAGS_TF;
538 554
539 /* Set up the stack frame */ 555 ret = setup_rt_frame(sig, ka, info, oldset, regs);
540 if (ka->sa.sa_flags & SA_SIGINFO)
541 ret = setup_rt_frame(sig, ka, info, oldset, regs);
542 else
543 ret = setup_frame(sig, ka, oldset, regs);
544 556
545 if (ret) 557 if (ret)
546 return ret; 558 return ret;
547 559
560#ifdef CONFIG_X86_64
561 /*
562 * This has nothing to do with segment registers,
563 * despite the name. This magic affects uaccess.h
564 * macros' behavior. Reset it to the normal setting.
565 */
566 set_fs(USER_DS);
567#endif
568
548 /* 569 /*
549 * Clear the direction flag as per the ABI for function entry. 570 * Clear the direction flag as per the ABI for function entry.
550 */ 571 */
@@ -571,6 +592,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
571 return 0; 592 return 0;
572} 593}
573 594
595#define NR_restart_syscall __NR_restart_syscall
574/* 596/*
575 * Note that 'init' is a special process: it doesn't get signals it doesn't 597 * Note that 'init' is a special process: it doesn't get signals it doesn't
576 * want to handle. Thus you cannot kill init even with a SIGKILL even by 598 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs)
623 } 645 }
624 646
625 /* Did we come from a system call? */ 647 /* Did we come from a system call? */
626 if ((long)regs->orig_ax >= 0) { 648 if (syscall_get_nr(current, regs) >= 0) {
627 /* Restart the system call - no handlers present */ 649 /* Restart the system call - no handlers present */
628 switch (regs->ax) { 650 switch (syscall_get_error(current, regs)) {
629 case -ERESTARTNOHAND: 651 case -ERESTARTNOHAND:
630 case -ERESTARTSYS: 652 case -ERESTARTSYS:
631 case -ERESTARTNOINTR: 653 case -ERESTARTNOINTR:
@@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs)
634 break; 656 break;
635 657
636 case -ERESTART_RESTARTBLOCK: 658 case -ERESTART_RESTARTBLOCK:
637 regs->ax = __NR_restart_syscall; 659 regs->ax = NR_restart_syscall;
638 regs->ip -= 2; 660 regs->ip -= 2;
639 break; 661 break;
640 } 662 }
@@ -657,6 +679,12 @@ static void do_signal(struct pt_regs *regs)
657void 679void
658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 680do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659{ 681{
682#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
683 /* notify userspace of pending MCEs */
684 if (thread_info_flags & _TIF_MCE_NOTIFY)
685 mce_notify_user();
686#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
687
660 /* deal with pending signal delivery */ 688 /* deal with pending signal delivery */
661 if (thread_info_flags & _TIF_SIGPENDING) 689 if (thread_info_flags & _TIF_SIGPENDING)
662 do_signal(regs); 690 do_signal(regs);
@@ -666,5 +694,23 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
666 tracehook_notify_resume(regs); 694 tracehook_notify_resume(regs);
667 } 695 }
668 696
697#ifdef CONFIG_X86_32
669 clear_thread_flag(TIF_IRET); 698 clear_thread_flag(TIF_IRET);
699#endif /* CONFIG_X86_32 */
700}
701
702void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
703{
704 struct task_struct *me = current;
705
706 if (show_unhandled_signals && printk_ratelimit()) {
707 printk(KERN_INFO
708 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
709 me->comm, me->pid, where, frame,
710 regs->ip, regs->sp, regs->orig_ax);
711 print_vma_addr(" in ", regs->ip);
712 printk(KERN_CONT "\n");
713 }
714
715 force_sig(SIGSEGV, me);
670} 716}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 823a55bf8c39..a5c9627f4db9 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -52,6 +52,16 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
52 return do_sigaltstack(uss, uoss, regs->sp); 52 return do_sigaltstack(uss, uoss, regs->sp);
53} 53}
54 54
55#define COPY(x) { \
56 err |= __get_user(regs->x, &sc->x); \
57}
58
59#define COPY_SEG_STRICT(seg) { \
60 unsigned short tmp; \
61 err |= __get_user(tmp, &sc->seg); \
62 regs->seg = tmp | 3; \
63}
64
55/* 65/*
56 * Do a signal return; undo the signal stack. 66 * Do a signal return; undo the signal stack.
57 */ 67 */
@@ -59,13 +69,13 @@ static int
59restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 69restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
60 unsigned long *pax) 70 unsigned long *pax)
61{ 71{
72 void __user *buf;
73 unsigned int tmpflags;
62 unsigned int err = 0; 74 unsigned int err = 0;
63 75
64 /* Always make any pending restarted system calls return -EINTR */ 76 /* Always make any pending restarted system calls return -EINTR */
65 current_thread_info()->restart_block.fn = do_no_restart_syscall; 77 current_thread_info()->restart_block.fn = do_no_restart_syscall;
66 78
67#define COPY(x) (err |= __get_user(regs->x, &sc->x))
68
69 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 79 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
70 COPY(dx); COPY(cx); COPY(ip); 80 COPY(dx); COPY(cx); COPY(ip);
71 COPY(r8); 81 COPY(r8);
@@ -80,34 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
80 /* Kernel saves and restores only the CS segment register on signals, 90 /* Kernel saves and restores only the CS segment register on signals,
81 * which is the bare minimum needed to allow mixed 32/64-bit code. 91 * which is the bare minimum needed to allow mixed 32/64-bit code.
82 * App's signal handler can save/restore other segments if needed. */ 92 * App's signal handler can save/restore other segments if needed. */
83 { 93 COPY_SEG_STRICT(cs);
84 unsigned cs;
85 err |= __get_user(cs, &sc->cs);
86 regs->cs = cs | 3; /* Force into user mode */
87 }
88 94
89 { 95 err |= __get_user(tmpflags, &sc->flags);
90 unsigned int tmpflags; 96 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
91 err |= __get_user(tmpflags, &sc->flags); 97 regs->orig_ax = -1; /* disable syscall checks */
92 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
93 regs->orig_ax = -1; /* disable syscall checks */
94 }
95 98
96 { 99 err |= __get_user(buf, &sc->fpstate);
97 struct _fpstate __user *buf; 100 err |= restore_i387_xstate(buf);
98 err |= __get_user(buf, &sc->fpstate);
99 err |= restore_i387_xstate(buf);
100 }
101 101
102 err |= __get_user(*pax, &sc->ax); 102 err |= __get_user(*pax, &sc->ax);
103 return err; 103 return err;
104} 104}
105 105
106asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) 106static long do_rt_sigreturn(struct pt_regs *regs)
107{ 107{
108 struct rt_sigframe __user *frame; 108 struct rt_sigframe __user *frame;
109 sigset_t set;
110 unsigned long ax; 109 unsigned long ax;
110 sigset_t set;
111 111
112 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); 112 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
113 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 113 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -130,10 +130,15 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
130 return ax; 130 return ax;
131 131
132badframe: 132badframe:
133 signal_fault(regs, frame, "sigreturn"); 133 signal_fault(regs, frame, "rt_sigreturn");
134 return 0; 134 return 0;
135} 135}
136 136
137asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
138{
139 return do_rt_sigreturn(regs);
140}
141
137/* 142/*
138 * Set up a signal frame. 143 * Set up a signal frame.
139 */ 144 */
@@ -195,8 +200,8 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
195 return (void __user *)round_down(sp - size, 64); 200 return (void __user *)round_down(sp - size, 64);
196} 201}
197 202
198static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 203static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
199 sigset_t *set, struct pt_regs *regs) 204 sigset_t *set, struct pt_regs *regs)
200{ 205{
201 struct rt_sigframe __user *frame; 206 struct rt_sigframe __user *frame;
202 void __user *fp = NULL; 207 void __user *fp = NULL;
@@ -209,17 +214,16 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
209 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; 214 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
210 215
211 if (save_i387_xstate(fp) < 0) 216 if (save_i387_xstate(fp) < 0)
212 err |= -1; 217 return -EFAULT;
213 } else 218 } else
214 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; 219 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
215 220
216 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 221 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
217 goto give_sigsegv; 222 return -EFAULT;
218 223
219 if (ka->sa.sa_flags & SA_SIGINFO) { 224 if (ka->sa.sa_flags & SA_SIGINFO) {
220 err |= copy_siginfo_to_user(&frame->info, info); 225 if (copy_siginfo_to_user(&frame->info, info))
221 if (err) 226 return -EFAULT;
222 goto give_sigsegv;
223 } 227 }
224 228
225 /* Create the ucontext. */ 229 /* Create the ucontext. */
@@ -247,11 +251,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
247 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); 251 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
248 } else { 252 } else {
249 /* could use a vstub here */ 253 /* could use a vstub here */
250 goto give_sigsegv; 254 return -EFAULT;
251 } 255 }
252 256
253 if (err) 257 if (err)
254 goto give_sigsegv; 258 return -EFAULT;
255 259
256 /* Set up registers for signal handler */ 260 /* Set up registers for signal handler */
257 regs->di = sig; 261 regs->di = sig;
@@ -271,15 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
271 regs->cs = __USER_CS; 275 regs->cs = __USER_CS;
272 276
273 return 0; 277 return 0;
274
275give_sigsegv:
276 force_sigsegv(sig, current);
277 return -EFAULT;
278} 278}
279 279
280/* 280/*
281 * OK, we're invoking a handler 281 * OK, we're invoking a handler
282 */ 282 */
283static int signr_convert(int sig)
284{
285 return sig;
286}
287
288#ifdef CONFIG_IA32_EMULATION
289#define is_ia32 test_thread_flag(TIF_IA32)
290#else
291#define is_ia32 0
292#endif
293
294static int
295setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
296 sigset_t *set, struct pt_regs *regs)
297{
298 int usig = signr_convert(sig);
299 int ret;
300
301 /* Set up the stack frame */
302 if (is_ia32) {
303 if (ka->sa.sa_flags & SA_SIGINFO)
304 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
305 else
306 ret = ia32_setup_frame(usig, ka, set, regs);
307 } else
308 ret = __setup_rt_frame(sig, ka, info, set, regs);
309
310 if (ret) {
311 force_sigsegv(sig, current);
312 return -EFAULT;
313 }
314
315 return ret;
316}
283 317
284static int 318static int
285handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 319handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -317,51 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
317 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 351 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
318 regs->flags &= ~X86_EFLAGS_TF; 352 regs->flags &= ~X86_EFLAGS_TF;
319 353
320#ifdef CONFIG_IA32_EMULATION
321 if (test_thread_flag(TIF_IA32)) {
322 if (ka->sa.sa_flags & SA_SIGINFO)
323 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
324 else
325 ret = ia32_setup_frame(sig, ka, oldset, regs);
326 } else
327#endif
328 ret = setup_rt_frame(sig, ka, info, oldset, regs); 354 ret = setup_rt_frame(sig, ka, info, oldset, regs);
329 355
330 if (ret == 0) { 356 if (ret)
331 /* 357 return ret;
332 * This has nothing to do with segment registers,
333 * despite the name. This magic affects uaccess.h
334 * macros' behavior. Reset it to the normal setting.
335 */
336 set_fs(USER_DS);
337 358
338 /* 359#ifdef CONFIG_X86_64
339 * Clear the direction flag as per the ABI for function entry. 360 /*
340 */ 361 * This has nothing to do with segment registers,
341 regs->flags &= ~X86_EFLAGS_DF; 362 * despite the name. This magic affects uaccess.h
363 * macros' behavior. Reset it to the normal setting.
364 */
365 set_fs(USER_DS);
366#endif
342 367
343 /* 368 /*
344 * Clear TF when entering the signal handler, but 369 * Clear the direction flag as per the ABI for function entry.
345 * notify any tracer that was single-stepping it. 370 */
346 * The tracer may want to single-step inside the 371 regs->flags &= ~X86_EFLAGS_DF;
347 * handler too.
348 */
349 regs->flags &= ~X86_EFLAGS_TF;
350 372
351 spin_lock_irq(&current->sighand->siglock); 373 /*
352 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); 374 * Clear TF when entering the signal handler, but
353 if (!(ka->sa.sa_flags & SA_NODEFER)) 375 * notify any tracer that was single-stepping it.
354 sigaddset(&current->blocked, sig); 376 * The tracer may want to single-step inside the
355 recalc_sigpending(); 377 * handler too.
356 spin_unlock_irq(&current->sighand->siglock); 378 */
379 regs->flags &= ~X86_EFLAGS_TF;
357 380
358 tracehook_signal_handler(sig, info, ka, regs, 381 spin_lock_irq(&current->sighand->siglock);
359 test_thread_flag(TIF_SINGLESTEP)); 382 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
360 } 383 if (!(ka->sa.sa_flags & SA_NODEFER))
384 sigaddset(&current->blocked, sig);
385 recalc_sigpending();
386 spin_unlock_irq(&current->sighand->siglock);
361 387
362 return ret; 388 tracehook_signal_handler(sig, info, ka, regs,
389 test_thread_flag(TIF_SINGLESTEP));
390
391 return 0;
363} 392}
364 393
394#define NR_restart_syscall \
395 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
365/* 396/*
366 * Note that 'init' is a special process: it doesn't get signals it doesn't 397 * Note that 'init' is a special process: it doesn't get signals it doesn't
367 * want to handle. Thus you cannot kill init even with a SIGKILL even by 398 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -391,7 +422,8 @@ static void do_signal(struct pt_regs *regs)
391 422
392 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 423 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
393 if (signr > 0) { 424 if (signr > 0) {
394 /* Re-enable any watchpoints before delivering the 425 /*
426 * Re-enable any watchpoints before delivering the
395 * signal to user space. The processor register will 427 * signal to user space. The processor register will
396 * have been cleared if the watchpoint triggered 428 * have been cleared if the watchpoint triggered
397 * inside the kernel. 429 * inside the kernel.
@@ -399,7 +431,7 @@ static void do_signal(struct pt_regs *regs)
399 if (current->thread.debugreg7) 431 if (current->thread.debugreg7)
400 set_debugreg(current->thread.debugreg7, 7); 432 set_debugreg(current->thread.debugreg7, 7);
401 433
402 /* Whee! Actually deliver the signal. */ 434 /* Whee! Actually deliver the signal. */
403 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 435 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
404 /* 436 /*
405 * A signal was successfully delivered; the saved 437 * A signal was successfully delivered; the saved
@@ -422,10 +454,9 @@ static void do_signal(struct pt_regs *regs)
422 regs->ax = regs->orig_ax; 454 regs->ax = regs->orig_ax;
423 regs->ip -= 2; 455 regs->ip -= 2;
424 break; 456 break;
457
425 case -ERESTART_RESTARTBLOCK: 458 case -ERESTART_RESTARTBLOCK:
426 regs->ax = test_thread_flag(TIF_IA32) ? 459 regs->ax = NR_restart_syscall;
427 __NR_ia32_restart_syscall :
428 __NR_restart_syscall;
429 regs->ip -= 2; 460 regs->ip -= 2;
430 break; 461 break;
431 } 462 }
@@ -441,14 +472,18 @@ static void do_signal(struct pt_regs *regs)
441 } 472 }
442} 473}
443 474
444void do_notify_resume(struct pt_regs *regs, void *unused, 475/*
445 __u32 thread_info_flags) 476 * notification of userspace execution resumption
477 * - triggered by the TIF_WORK_MASK flags
478 */
479void
480do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
446{ 481{
447#ifdef CONFIG_X86_MCE 482#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
448 /* notify userspace of pending MCEs */ 483 /* notify userspace of pending MCEs */
449 if (thread_info_flags & _TIF_MCE_NOTIFY) 484 if (thread_info_flags & _TIF_MCE_NOTIFY)
450 mce_notify_user(); 485 mce_notify_user();
451#endif /* CONFIG_X86_MCE */ 486#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
452 487
453 /* deal with pending signal delivery */ 488 /* deal with pending signal delivery */
454 if (thread_info_flags & _TIF_SIGPENDING) 489 if (thread_info_flags & _TIF_SIGPENDING)
@@ -458,17 +493,23 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
458 clear_thread_flag(TIF_NOTIFY_RESUME); 493 clear_thread_flag(TIF_NOTIFY_RESUME);
459 tracehook_notify_resume(regs); 494 tracehook_notify_resume(regs);
460 } 495 }
496
497#ifdef CONFIG_X86_32
498 clear_thread_flag(TIF_IRET);
499#endif /* CONFIG_X86_32 */
461} 500}
462 501
463void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 502void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
464{ 503{
465 struct task_struct *me = current; 504 struct task_struct *me = current;
505
466 if (show_unhandled_signals && printk_ratelimit()) { 506 if (show_unhandled_signals && printk_ratelimit()) {
467 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 507 printk(KERN_INFO
468 me->comm, me->pid, where, frame, regs->ip, 508 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
469 regs->sp, regs->orig_ax); 509 me->comm, me->pid, where, frame,
510 regs->ip, regs->sp, regs->orig_ax);
470 print_vma_addr(" in ", regs->ip); 511 print_vma_addr(" in ", regs->ip);
471 printk("\n"); 512 printk(KERN_CONT "\n");
472 } 513 }
473 514
474 force_sig(SIGSEGV, me); 515 force_sig(SIGSEGV, me);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 361b7a4c640c..18f9b19f5f8f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
214struct smp_ops smp_ops = { 214struct smp_ops smp_ops = {
215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
216 .smp_prepare_cpus = native_smp_prepare_cpus, 216 .smp_prepare_cpus = native_smp_prepare_cpus,
217 .cpu_up = native_cpu_up,
218 .smp_cpus_done = native_smp_cpus_done, 217 .smp_cpus_done = native_smp_cpus_done,
219 218
220 .smp_send_stop = native_smp_send_stop, 219 .smp_send_stop = native_smp_send_stop,
221 .smp_send_reschedule = native_smp_send_reschedule, 220 .smp_send_reschedule = native_smp_send_reschedule,
222 221
222 .cpu_up = native_cpu_up,
223 .cpu_die = native_cpu_die,
224 .cpu_disable = native_cpu_disable,
225 .play_dead = native_play_dead,
226
223 .send_call_func_ipi = native_send_call_func_ipi, 227 .send_call_func_ipi = native_send_call_func_ipi,
224 .send_call_func_single_ipi = native_send_call_func_single_ipi, 228 .send_call_func_single_ipi = native_send_call_func_single_ipi,
225}; 229};
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9056f7e272c0..76b6f50978f7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h>
55#include <asm/smp.h> 56#include <asm/smp.h>
56#include <asm/trampoline.h> 57#include <asm/trampoline.h>
57#include <asm/cpu.h> 58#include <asm/cpu.h>
@@ -1344,25 +1345,9 @@ static void __ref remove_cpu_from_maps(int cpu)
1344 numa_remove_cpu(cpu); 1345 numa_remove_cpu(cpu);
1345} 1346}
1346 1347
1347int __cpu_disable(void) 1348void cpu_disable_common(void)
1348{ 1349{
1349 int cpu = smp_processor_id(); 1350 int cpu = smp_processor_id();
1350
1351 /*
1352 * Perhaps use cpufreq to drop frequency, but that could go
1353 * into generic code.
1354 *
1355 * We won't take down the boot processor on i386 due to some
1356 * interrupts only being able to be serviced by the BSP.
1357 * Especially so if we're not using an IOAPIC -zwane
1358 */
1359 if (cpu == 0)
1360 return -EBUSY;
1361
1362 if (nmi_watchdog == NMI_LOCAL_APIC)
1363 stop_apic_nmi_watchdog(NULL);
1364 clear_local_APIC();
1365
1366 /* 1351 /*
1367 * HACK: 1352 * HACK:
1368 * Allow any queued timer interrupts to get serviced 1353 * Allow any queued timer interrupts to get serviced
@@ -1380,10 +1365,32 @@ int __cpu_disable(void)
1380 remove_cpu_from_maps(cpu); 1365 remove_cpu_from_maps(cpu);
1381 unlock_vector_lock(); 1366 unlock_vector_lock();
1382 fixup_irqs(cpu_online_map); 1367 fixup_irqs(cpu_online_map);
1368}
1369
1370int native_cpu_disable(void)
1371{
1372 int cpu = smp_processor_id();
1373
1374 /*
1375 * Perhaps use cpufreq to drop frequency, but that could go
1376 * into generic code.
1377 *
1378 * We won't take down the boot processor on i386 due to some
1379 * interrupts only being able to be serviced by the BSP.
1380 * Especially so if we're not using an IOAPIC -zwane
1381 */
1382 if (cpu == 0)
1383 return -EBUSY;
1384
1385 if (nmi_watchdog == NMI_LOCAL_APIC)
1386 stop_apic_nmi_watchdog(NULL);
1387 clear_local_APIC();
1388
1389 cpu_disable_common();
1383 return 0; 1390 return 0;
1384} 1391}
1385 1392
1386void __cpu_die(unsigned int cpu) 1393void native_cpu_die(unsigned int cpu)
1387{ 1394{
1388 /* We don't do anything here: idle task is faking death itself. */ 1395 /* We don't do anything here: idle task is faking death itself. */
1389 unsigned int i; 1396 unsigned int i;
@@ -1400,15 +1407,45 @@ void __cpu_die(unsigned int cpu)
1400 } 1407 }
1401 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1408 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1402} 1409}
1410
1411void play_dead_common(void)
1412{
1413 idle_task_exit();
1414 reset_lazy_tlbstate();
1415 irq_ctx_exit(raw_smp_processor_id());
1416 c1e_remove_cpu(raw_smp_processor_id());
1417
1418 mb();
1419 /* Ack it */
1420 __get_cpu_var(cpu_state) = CPU_DEAD;
1421
1422 /*
1423 * With physical CPU hotplug, we should halt the cpu
1424 */
1425 local_irq_disable();
1426}
1427
1428void native_play_dead(void)
1429{
1430 play_dead_common();
1431 wbinvd_halt();
1432}
1433
1403#else /* ... !CONFIG_HOTPLUG_CPU */ 1434#else /* ... !CONFIG_HOTPLUG_CPU */
1404int __cpu_disable(void) 1435int native_cpu_disable(void)
1405{ 1436{
1406 return -ENOSYS; 1437 return -ENOSYS;
1407} 1438}
1408 1439
1409void __cpu_die(unsigned int cpu) 1440void native_cpu_die(unsigned int cpu)
1410{ 1441{
1411 /* We said "no" in __cpu_disable */ 1442 /* We said "no" in __cpu_disable */
1412 BUG(); 1443 BUG();
1413} 1444}
1445
1446void native_play_dead(void)
1447{
1448 BUG();
1449}
1450
1414#endif 1451#endif
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index fec1ecedc9b7..e00534b33534 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -241,3 +241,11 @@ void flush_tlb_all(void)
241 on_each_cpu(do_flush_tlb_all, NULL, 1); 241 on_each_cpu(do_flush_tlb_all, NULL, 1);
242} 242}
243 243
244void reset_lazy_tlbstate(void)
245{
246 int cpu = raw_smp_processor_id();
247
248 per_cpu(cpu_tlbstate, cpu).state = 0;
249 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
250}
251
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index da5a5964fccb..0429c5de5ea9 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -891,6 +891,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
891{ 891{
892 struct task_struct *tsk = current; 892 struct task_struct *tsk = current;
893 unsigned int condition; 893 unsigned int condition;
894 int si_code;
894 895
895 trace_hardirqs_fixup(); 896 trace_hardirqs_fixup();
896 897
@@ -935,8 +936,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
935 goto clear_TF_reenable; 936 goto clear_TF_reenable;
936 } 937 }
937 938
939 si_code = get_si_code((unsigned long)condition);
938 /* Ok, finally something we can handle */ 940 /* Ok, finally something we can handle */
939 send_sigtrap(tsk, regs, error_code); 941 send_sigtrap(tsk, regs, error_code, si_code);
940 942
941 /* 943 /*
942 * Disable additional traps. They'll be re-enabled when 944 * Disable additional traps. They'll be re-enabled when
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 2887a789e38f..9c0ac0cab013 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -940,7 +940,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs *regs,
940 tsk->thread.error_code = error_code; 940 tsk->thread.error_code = error_code;
941 info.si_signo = SIGTRAP; 941 info.si_signo = SIGTRAP;
942 info.si_errno = 0; 942 info.si_errno = 0;
943 info.si_code = TRAP_BRKPT; 943 info.si_code = get_si_code(condition);
944 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; 944 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
945 force_sig_info(SIGTRAP, &info, tsk); 945 force_sig_info(SIGTRAP, &info, tsk);
946 946
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 07713d64debe..9abac8a9d823 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -95,7 +95,9 @@ int save_i387_xstate(void __user *buf)
95 * Start with clearing the user buffer. This will present a 95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave. 96 * clean context for the bytes not touched by the fxsave/xsave.
97 */ 97 */
98 __clear_user(buf, sig_xstate_size); 98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
99 101
100 if (task_thread_info(tsk)->status & TS_XSAVE) 102 if (task_thread_info(tsk)->status & TS_XSAVE)
101 err = xsave_user(buf); 103 err = xsave_user(buf);
@@ -114,6 +116,8 @@ int save_i387_xstate(void __user *buf)
114 116
115 if (task_thread_info(tsk)->status & TS_XSAVE) { 117 if (task_thread_info(tsk)->status & TS_XSAVE) {
116 struct _fpstate __user *fx = buf; 118 struct _fpstate __user *fx = buf;
119 struct _xstate __user *x = buf;
120 u64 xstate_bv;
117 121
118 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved, 122 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
119 sizeof(struct _fpx_sw_bytes)); 123 sizeof(struct _fpx_sw_bytes));
@@ -121,6 +125,31 @@ int save_i387_xstate(void __user *buf)
121 err |= __put_user(FP_XSTATE_MAGIC2, 125 err |= __put_user(FP_XSTATE_MAGIC2,
122 (__u32 __user *) (buf + sig_xstate_size 126 (__u32 __user *) (buf + sig_xstate_size
123 - FP_XSTATE_MAGIC2_SIZE)); 127 - FP_XSTATE_MAGIC2_SIZE));
128
129 /*
130 * Read the xstate_bv which we copied (directly from the cpu or
131 * from the state in task struct) to the user buffers and
132 * set the FP/SSE bits.
133 */
134 err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
135
136 /*
137 * For legacy compatible, we always set FP/SSE bits in the bit
138 * vector while saving the state to the user context. This will
139 * enable us capturing any changes(during sigreturn) to
140 * the FP/SSE bits by the legacy applications which don't touch
141 * xstate_bv in the xsave header.
142 *
143 * xsave aware apps can change the xstate_bv in the xsave
144 * header as well as change any contents in the memory layout.
145 * xrestore as part of sigreturn will capture all the changes.
146 */
147 xstate_bv |= XSTATE_FPSSE;
148
149 err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
150
151 if (err)
152 return err;
124 } 153 }
125 154
126 return 1; 155 return 1;
@@ -272,7 +301,7 @@ void __cpuinit xsave_init(void)
272/* 301/*
273 * setup the xstate image representing the init state 302 * setup the xstate image representing the init state
274 */ 303 */
275void setup_xstate_init(void) 304static void __init setup_xstate_init(void)
276{ 305{
277 init_xstate_buf = alloc_bootmem(xstate_size); 306 init_xstate_buf = alloc_bootmem(xstate_size);
278 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; 307 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8f92cac4e6db..a742d753d5b0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -914,15 +914,15 @@ LIST_HEAD(pgd_list);
914 914
915void vmalloc_sync_all(void) 915void vmalloc_sync_all(void)
916{ 916{
917#ifdef CONFIG_X86_32
918 unsigned long start = VMALLOC_START & PGDIR_MASK;
919 unsigned long address; 917 unsigned long address;
920 918
919#ifdef CONFIG_X86_32
921 if (SHARED_KERNEL_PMD) 920 if (SHARED_KERNEL_PMD)
922 return; 921 return;
923 922
924 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 923 for (address = VMALLOC_START & PMD_MASK;
925 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 924 address >= TASK_SIZE && address < FIXADDR_TOP;
925 address += PMD_SIZE) {
926 unsigned long flags; 926 unsigned long flags;
927 struct page *page; 927 struct page *page;
928 928
@@ -935,10 +935,8 @@ void vmalloc_sync_all(void)
935 spin_unlock_irqrestore(&pgd_lock, flags); 935 spin_unlock_irqrestore(&pgd_lock, flags);
936 } 936 }
937#else /* CONFIG_X86_64 */ 937#else /* CONFIG_X86_64 */
938 unsigned long start = VMALLOC_START & PGDIR_MASK; 938 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
939 unsigned long address; 939 address += PGDIR_SIZE) {
940
941 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
942 const pgd_t *pgd_ref = pgd_offset_k(address); 940 const pgd_t *pgd_ref = pgd_offset_k(address);
943 unsigned long flags; 941 unsigned long flags;
944 struct page *page; 942 struct page *page;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c3789bb19308..bbe044dbe014 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
31#include <linux/cpumask.h> 31#include <linux/cpumask.h>
32 32
33#include <asm/asm.h> 33#include <asm/asm.h>
34#include <asm/bios_ebda.h>
34#include <asm/processor.h> 35#include <asm/processor.h>
35#include <asm/system.h> 36#include <asm/system.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
@@ -969,6 +970,8 @@ void __init mem_init(void)
969 int codesize, reservedpages, datasize, initsize; 970 int codesize, reservedpages, datasize, initsize;
970 int tmp; 971 int tmp;
971 972
973 start_periodic_check_for_corruption();
974
972#ifdef CONFIG_FLATMEM 975#ifdef CONFIG_FLATMEM
973 BUG_ON(!mem_map); 976 BUG_ON(!mem_map);
974#endif 977#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 83e13f2d53d2..3e10054c5731 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -31,6 +31,7 @@
31#include <linux/nmi.h> 31#include <linux/nmi.h>
32 32
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/bios_ebda.h>
34#include <asm/system.h> 35#include <asm/system.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/pgtable.h> 37#include <asm/pgtable.h>
@@ -881,6 +882,8 @@ void __init mem_init(void)
881{ 882{
882 long codesize, reservedpages, datasize, initsize; 883 long codesize, reservedpages, datasize, initsize;
883 884
885 start_periodic_check_for_corruption();
886
884 pci_iommu_alloc(); 887 pci_iommu_alloc();
885 888
886 /* clear_bss() already clear the empty_zero_page */ 889 /* clear_bss() already clear the empty_zero_page */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 10b52309aefd..8cbeda15cd29 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,18 +24,26 @@
24 24
25#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
26 26
27unsigned long __phys_addr(unsigned long x) 27static inline int phys_addr_valid(unsigned long addr)
28{ 28{
29 if (x >= __START_KERNEL_map) 29 return addr < (1UL << boot_cpu_data.x86_phys_bits);
30 return x - __START_KERNEL_map + phys_base;
31 return x - PAGE_OFFSET;
32} 30}
33EXPORT_SYMBOL(__phys_addr);
34 31
35static inline int phys_addr_valid(unsigned long addr) 32unsigned long __phys_addr(unsigned long x)
36{ 33{
37 return addr < (1UL << boot_cpu_data.x86_phys_bits); 34 if (x >= __START_KERNEL_map) {
35 x -= __START_KERNEL_map;
36 VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
37 x += phys_base;
38 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
42 !phys_addr_valid(x));
43 }
44 return x;
38} 45}
46EXPORT_SYMBOL(__phys_addr);
39 47
40#else 48#else
41 49
@@ -44,6 +52,17 @@ static inline int phys_addr_valid(unsigned long addr)
44 return 1; 52 return 1;
45} 53}
46 54
55#ifdef CONFIG_DEBUG_VIRTUAL
56unsigned long __phys_addr(unsigned long x)
57{
58 /* VMALLOC_* aren't constants; not available at the boot time */
59 VIRTUAL_BUG_ON(x < PAGE_OFFSET || (system_state != SYSTEM_BOOTING &&
60 is_vmalloc_addr((void *)x)));
61 return x - PAGE_OFFSET;
62}
63EXPORT_SYMBOL(__phys_addr);
64#endif
65
47#endif 66#endif
48 67
49int page_is_ram(unsigned long pagenr) 68int page_is_ram(unsigned long pagenr)
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 3815e425f470..87b9ab166423 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -26,5 +26,13 @@ config XEN_MAX_DOMAIN_MEMORY
26 26
27config XEN_SAVE_RESTORE 27config XEN_SAVE_RESTORE
28 bool 28 bool
29 depends on PM 29 depends on XEN && PM
30 default y \ No newline at end of file 30 default y
31
32config XEN_DEBUG_FS
33 bool "Enable Xen debug and tuning parameters in debugfs"
34 depends on XEN && DEBUG_FS
35 default n
36 help
37 Enable statistics output and various tuning options in debugfs.
38 Enabling this option may incur a significant performance overhead.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 59c1e539aed2..313947940a1a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,12 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1ifdef CONFIG_FTRACE
2# Do not profile debug and lowlevel utilities
3CFLAGS_REMOVE_spinlock.o = -pg
4CFLAGS_REMOVE_time.o = -pg
5CFLAGS_REMOVE_irq.o = -pg
6endif
7
8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
2 time.o xen-asm_$(BITS).o grant-table.o suspend.o 9 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 10
4obj-$(CONFIG_SMP) += smp.o 11obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 000000000000..b53225d2cac3
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,123 @@
1#include <linux/init.h>
2#include <linux/debugfs.h>
3#include <linux/module.h>
4
5#include "debugfs.h"
6
7static struct dentry *d_xen_debug;
8
9struct dentry * __init xen_init_debugfs(void)
10{
11 if (!d_xen_debug) {
12 d_xen_debug = debugfs_create_dir("xen", NULL);
13
14 if (!d_xen_debug)
15 pr_warning("Could not create 'xen' debugfs directory\n");
16 }
17
18 return d_xen_debug;
19}
20
21struct array_data
22{
23 void *array;
24 unsigned elements;
25};
26
27static int u32_array_open(struct inode *inode, struct file *file)
28{
29 file->private_data = NULL;
30 return nonseekable_open(inode, file);
31}
32
33static size_t format_array(char *buf, size_t bufsize, const char *fmt,
34 u32 *array, unsigned array_size)
35{
36 size_t ret = 0;
37 unsigned i;
38
39 for(i = 0; i < array_size; i++) {
40 size_t len;
41
42 len = snprintf(buf, bufsize, fmt, array[i]);
43 len++; /* ' ' or '\n' */
44 ret += len;
45
46 if (buf) {
47 buf += len;
48 bufsize -= len;
49 buf[-1] = (i == array_size-1) ? '\n' : ' ';
50 }
51 }
52
53 ret++; /* \0 */
54 if (buf)
55 *buf = '\0';
56
57 return ret;
58}
59
60static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
61{
62 size_t len = format_array(NULL, 0, fmt, array, array_size);
63 char *ret;
64
65 ret = kmalloc(len, GFP_KERNEL);
66 if (ret == NULL)
67 return NULL;
68
69 format_array(ret, len, fmt, array, array_size);
70 return ret;
71}
72
73static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
74 loff_t *ppos)
75{
76 struct inode *inode = file->f_path.dentry->d_inode;
77 struct array_data *data = inode->i_private;
78 size_t size;
79
80 if (*ppos == 0) {
81 if (file->private_data) {
82 kfree(file->private_data);
83 file->private_data = NULL;
84 }
85
86 file->private_data = format_array_alloc("%u", data->array, data->elements);
87 }
88
89 size = 0;
90 if (file->private_data)
91 size = strlen(file->private_data);
92
93 return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
94}
95
96static int xen_array_release(struct inode *inode, struct file *file)
97{
98 kfree(file->private_data);
99
100 return 0;
101}
102
103static struct file_operations u32_array_fops = {
104 .owner = THIS_MODULE,
105 .open = u32_array_open,
106 .release= xen_array_release,
107 .read = u32_array_read,
108};
109
110struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
111 struct dentry *parent,
112 u32 *array, unsigned elements)
113{
114 struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
115
116 if (data == NULL)
117 return NULL;
118
119 data->array = array;
120 data->elements = elements;
121
122 return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
123}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 000000000000..e28132084832
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,10 @@
1#ifndef _XEN_DEBUGFS_H
2#define _XEN_DEBUGFS_H
3
4struct dentry * __init xen_init_debugfs(void);
5
6struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
7 struct dentry *parent,
8 u32 *array, unsigned elements);
9
10#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a27d562a9744..0013a729b41d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,7 +30,6 @@
30#include <xen/interface/xen.h> 30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h> 31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h> 32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h> 33#include <xen/features.h>
35#include <xen/page.h> 34#include <xen/page.h>
36#include <xen/hvc-console.h> 35#include <xen/hvc-console.h>
@@ -58,6 +57,9 @@ EXPORT_SYMBOL_GPL(hypercall_page);
58DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); 57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
59DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
60 59
60enum xen_domain_type xen_domain_type = XEN_NATIVE;
61EXPORT_SYMBOL_GPL(xen_domain_type);
62
61/* 63/*
62 * Identity map, in addition to plain kernel map. This needs to be 64 * Identity map, in addition to plain kernel map. This needs to be
63 * large enough to allocate page table pages to allocate the rest. 65 * large enough to allocate page table pages to allocate the rest.
@@ -111,7 +113,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
111 * 113 *
112 * 0: not available, 1: available 114 * 0: not available, 1: available
113 */ 115 */
114static int have_vcpu_info_placement = 1; 116static int have_vcpu_info_placement =
117#ifdef CONFIG_X86_32
118 1
119#else
120 0
121#endif
122 ;
123
115 124
116static void xen_vcpu_setup(int cpu) 125static void xen_vcpu_setup(int cpu)
117{ 126{
@@ -227,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg)
227 return HYPERVISOR_get_debugreg(reg); 236 return HYPERVISOR_get_debugreg(reg);
228} 237}
229 238
230static unsigned long xen_save_fl(void) 239static void xen_leave_lazy(void)
231{ 240{
232 struct vcpu_info *vcpu; 241 paravirt_leave_lazy(paravirt_get_lazy_mode());
233 unsigned long flags; 242 xen_mc_flush();
234
235 vcpu = x86_read_percpu(xen_vcpu);
236
237 /* flag has opposite sense of mask */
238 flags = !vcpu->evtchn_upcall_mask;
239
240 /* convert to IF type flag
241 -0 -> 0x00000000
242 -1 -> 0xffffffff
243 */
244 return (-flags) & X86_EFLAGS_IF;
245} 243}
246 244
247static void xen_restore_fl(unsigned long flags) 245static unsigned long xen_store_tr(void)
248{ 246{
249 struct vcpu_info *vcpu; 247 return 0;
250
251 /* convert from IF type flag */
252 flags = !(flags & X86_EFLAGS_IF);
253
254 /* There's a one instruction preempt window here. We need to
255 make sure we're don't switch CPUs between getting the vcpu
256 pointer and updating the mask. */
257 preempt_disable();
258 vcpu = x86_read_percpu(xen_vcpu);
259 vcpu->evtchn_upcall_mask = flags;
260 preempt_enable_no_resched();
261
262 /* Doesn't matter if we get preempted here, because any
263 pending event will get dealt with anyway. */
264
265 if (flags == 0) {
266 preempt_check_resched();
267 barrier(); /* unmask then check (avoid races) */
268 if (unlikely(vcpu->evtchn_upcall_pending))
269 force_evtchn_callback();
270 }
271} 248}
272 249
273static void xen_irq_disable(void) 250/*
251 * Set the page permissions for a particular virtual address. If the
252 * address is a vmalloc mapping (or other non-linear mapping), then
253 * find the linear mapping of the page and also set its protections to
254 * match.
255 */
256static void set_aliased_prot(void *v, pgprot_t prot)
274{ 257{
275 /* There's a one instruction preempt window here. We need to 258 int level;
276 make sure we're don't switch CPUs between getting the vcpu 259 pte_t *ptep;
277 pointer and updating the mask. */ 260 pte_t pte;
278 preempt_disable(); 261 unsigned long pfn;
279 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; 262 struct page *page;
280 preempt_enable_no_resched();
281}
282 263
283static void xen_irq_enable(void) 264 ptep = lookup_address((unsigned long)v, &level);
284{ 265 BUG_ON(ptep == NULL);
285 struct vcpu_info *vcpu;
286 266
287 /* We don't need to worry about being preempted here, since 267 pfn = pte_pfn(*ptep);
288 either a) interrupts are disabled, so no preemption, or b) 268 page = pfn_to_page(pfn);
289 the caller is confused and is trying to re-enable interrupts
290 on an indeterminate processor. */
291 269
292 vcpu = x86_read_percpu(xen_vcpu); 270 pte = pfn_pte(pfn, prot);
293 vcpu->evtchn_upcall_mask = 0;
294 271
295 /* Doesn't matter if we get preempted here, because any 272 if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
296 pending event will get dealt with anyway. */ 273 BUG();
297 274
298 barrier(); /* unmask then check (avoid races) */ 275 if (!PageHighMem(page)) {
299 if (unlikely(vcpu->evtchn_upcall_pending)) 276 void *av = __va(PFN_PHYS(pfn));
300 force_evtchn_callback();
301}
302 277
303static void xen_safe_halt(void) 278 if (av != v)
304{ 279 if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
305 /* Blocking includes an implicit local_irq_enable(). */ 280 BUG();
306 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) 281 } else
307 BUG(); 282 kmap_flush_unused();
308} 283}
309 284
310static void xen_halt(void) 285static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
311{ 286{
312 if (irqs_disabled()) 287 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
313 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 288 int i;
314 else
315 xen_safe_halt();
316}
317 289
318static void xen_leave_lazy(void) 290 for(i = 0; i < entries; i += entries_per_page)
319{ 291 set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
320 paravirt_leave_lazy(paravirt_get_lazy_mode());
321 xen_mc_flush();
322} 292}
323 293
324static unsigned long xen_store_tr(void) 294static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
325{ 295{
326 return 0; 296 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
297 int i;
298
299 for(i = 0; i < entries; i += entries_per_page)
300 set_aliased_prot(ldt + i, PAGE_KERNEL);
327} 301}
328 302
329static void xen_set_ldt(const void *addr, unsigned entries) 303static void xen_set_ldt(const void *addr, unsigned entries)
@@ -426,8 +400,7 @@ static void xen_load_gs_index(unsigned int idx)
426static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 400static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
427 const void *ptr) 401 const void *ptr)
428{ 402{
429 unsigned long lp = (unsigned long)&dt[entrynum]; 403 xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
430 xmaddr_t mach_lp = virt_to_machine(lp);
431 u64 entry = *(u64 *)ptr; 404 u64 entry = *(u64 *)ptr;
432 405
433 preempt_disable(); 406 preempt_disable();
@@ -560,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
560} 533}
561 534
562static void xen_load_sp0(struct tss_struct *tss, 535static void xen_load_sp0(struct tss_struct *tss,
563 struct thread_struct *thread) 536 struct thread_struct *thread)
564{ 537{
565 struct multicall_space mcs = xen_mc_entry(0); 538 struct multicall_space mcs = xen_mc_entry(0);
566 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 539 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -835,6 +808,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
835 ret = -EFAULT; 808 ret = -EFAULT;
836 break; 809 break;
837#endif 810#endif
811
812 case MSR_STAR:
813 case MSR_CSTAR:
814 case MSR_LSTAR:
815 case MSR_SYSCALL_MASK:
816 case MSR_IA32_SYSENTER_CS:
817 case MSR_IA32_SYSENTER_ESP:
818 case MSR_IA32_SYSENTER_EIP:
819 /* Fast syscall setup is all done in hypercalls, so
820 these are all ignored. Stub them out here to stop
821 Xen console noise. */
822 break;
823
838 default: 824 default:
839 ret = native_write_msr_safe(msr, low, high); 825 ret = native_write_msr_safe(msr, low, high);
840 } 826 }
@@ -878,8 +864,8 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
878 SetPagePinned(page); 864 SetPagePinned(page);
879 865
880 if (!PageHighMem(page)) { 866 if (!PageHighMem(page)) {
881 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 867 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
882 if (level == PT_PTE) 868 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
883 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 869 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
884 } else 870 } else
885 /* make sure there are no stray mappings of 871 /* make sure there are no stray mappings of
@@ -947,7 +933,7 @@ static void xen_release_ptpage(unsigned long pfn, unsigned level)
947 933
948 if (PagePinned(page)) { 934 if (PagePinned(page)) {
949 if (!PageHighMem(page)) { 935 if (!PageHighMem(page)) {
950 if (level == PT_PTE) 936 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
951 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 937 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
952 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 938 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
953 } 939 }
@@ -994,6 +980,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
994} 980}
995#endif 981#endif
996 982
983#ifdef CONFIG_X86_32
997static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 984static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
998{ 985{
999 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 986 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
@@ -1012,6 +999,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1012 999
1013 xen_set_pte(ptep, pte); 1000 xen_set_pte(ptep, pte);
1014} 1001}
1002#endif
1015 1003
1016static __init void xen_pagetable_setup_start(pgd_t *base) 1004static __init void xen_pagetable_setup_start(pgd_t *base)
1017{ 1005{
@@ -1078,7 +1066,6 @@ void xen_setup_vcpu_info_placement(void)
1078 1066
1079 /* xen_vcpu_setup managed to place the vcpu_info within the 1067 /* xen_vcpu_setup managed to place the vcpu_info within the
1080 percpu area for all cpus, so make use of it */ 1068 percpu area for all cpus, so make use of it */
1081#ifdef CONFIG_X86_32
1082 if (have_vcpu_info_placement) { 1069 if (have_vcpu_info_placement) {
1083 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1070 printk(KERN_INFO "Xen: using vcpu_info placement\n");
1084 1071
@@ -1088,7 +1075,6 @@ void xen_setup_vcpu_info_placement(void)
1088 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1075 pv_irq_ops.irq_enable = xen_irq_enable_direct;
1089 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1076 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
1090 } 1077 }
1091#endif
1092} 1078}
1093 1079
1094static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1080static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -1109,12 +1095,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1109 goto patch_site 1095 goto patch_site
1110 1096
1111 switch (type) { 1097 switch (type) {
1112#ifdef CONFIG_X86_32
1113 SITE(pv_irq_ops, irq_enable); 1098 SITE(pv_irq_ops, irq_enable);
1114 SITE(pv_irq_ops, irq_disable); 1099 SITE(pv_irq_ops, irq_disable);
1115 SITE(pv_irq_ops, save_fl); 1100 SITE(pv_irq_ops, save_fl);
1116 SITE(pv_irq_ops, restore_fl); 1101 SITE(pv_irq_ops, restore_fl);
1117#endif /* CONFIG_X86_32 */
1118#undef SITE 1102#undef SITE
1119 1103
1120 patch_site: 1104 patch_site:
@@ -1252,6 +1236,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1252 .load_gs_index = xen_load_gs_index, 1236 .load_gs_index = xen_load_gs_index,
1253#endif 1237#endif
1254 1238
1239 .alloc_ldt = xen_alloc_ldt,
1240 .free_ldt = xen_free_ldt,
1241
1255 .store_gdt = native_store_gdt, 1242 .store_gdt = native_store_gdt,
1256 .store_idt = native_store_idt, 1243 .store_idt = native_store_idt,
1257 .store_tr = xen_store_tr, 1244 .store_tr = xen_store_tr,
@@ -1273,36 +1260,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1273 }, 1260 },
1274}; 1261};
1275 1262
1276static void __init __xen_init_IRQ(void)
1277{
1278#ifdef CONFIG_X86_64
1279 int i;
1280
1281 /* Create identity vector->irq map */
1282 for(i = 0; i < NR_VECTORS; i++) {
1283 int cpu;
1284
1285 for_each_possible_cpu(cpu)
1286 per_cpu(vector_irq, cpu)[i] = i;
1287 }
1288#endif /* CONFIG_X86_64 */
1289
1290 xen_init_IRQ();
1291}
1292
1293static const struct pv_irq_ops xen_irq_ops __initdata = {
1294 .init_IRQ = __xen_init_IRQ,
1295 .save_fl = xen_save_fl,
1296 .restore_fl = xen_restore_fl,
1297 .irq_disable = xen_irq_disable,
1298 .irq_enable = xen_irq_enable,
1299 .safe_halt = xen_safe_halt,
1300 .halt = xen_halt,
1301#ifdef CONFIG_X86_64
1302 .adjust_exception_frame = xen_adjust_exception_frame,
1303#endif
1304};
1305
1306static const struct pv_apic_ops xen_apic_ops __initdata = { 1263static const struct pv_apic_ops xen_apic_ops __initdata = {
1307#ifdef CONFIG_X86_LOCAL_APIC 1264#ifdef CONFIG_X86_LOCAL_APIC
1308 .setup_boot_clock = paravirt_nop, 1265 .setup_boot_clock = paravirt_nop,
@@ -1443,7 +1400,7 @@ static void __init xen_reserve_top(void)
1443 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) 1400 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1444 top = pp.virt_start; 1401 top = pp.virt_start;
1445 1402
1446 reserve_top_address(-top + 2 * PAGE_SIZE); 1403 reserve_top_address(-top);
1447#endif /* CONFIG_X86_32 */ 1404#endif /* CONFIG_X86_32 */
1448} 1405}
1449 1406
@@ -1477,48 +1434,11 @@ static void *m2v(phys_addr_t maddr)
1477 return __ka(m2p(maddr)); 1434 return __ka(m2p(maddr));
1478} 1435}
1479 1436
1480#ifdef CONFIG_X86_64
1481static void walk(pgd_t *pgd, unsigned long addr)
1482{
1483 unsigned l4idx = pgd_index(addr);
1484 unsigned l3idx = pud_index(addr);
1485 unsigned l2idx = pmd_index(addr);
1486 unsigned l1idx = pte_index(addr);
1487 pgd_t l4;
1488 pud_t l3;
1489 pmd_t l2;
1490 pte_t l1;
1491
1492 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1493 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1494
1495 l4 = pgd[l4idx];
1496 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1497 xen_raw_printk(" %016lx\n", pgd_val(l4));
1498
1499 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1500 xen_raw_printk(" l3: %016lx\n", l3.pud);
1501 xen_raw_printk(" %016lx\n", pud_val(l3));
1502
1503 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1504 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1505 xen_raw_printk(" %016lx\n", pmd_val(l2));
1506
1507 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1508 xen_raw_printk(" l1: %016lx\n", l1.pte);
1509 xen_raw_printk(" %016lx\n", pte_val(l1));
1510}
1511#endif
1512
1513static void set_page_prot(void *addr, pgprot_t prot) 1437static void set_page_prot(void *addr, pgprot_t prot)
1514{ 1438{
1515 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1439 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1516 pte_t pte = pfn_pte(pfn, prot); 1440 pte_t pte = pfn_pte(pfn, prot);
1517 1441
1518 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1519 addr, pfn, get_phys_to_machine(pfn),
1520 pgprot_val(prot), pte.pte);
1521
1522 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) 1442 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1523 BUG(); 1443 BUG();
1524} 1444}
@@ -1694,6 +1614,8 @@ asmlinkage void __init xen_start_kernel(void)
1694 if (!xen_start_info) 1614 if (!xen_start_info)
1695 return; 1615 return;
1696 1616
1617 xen_domain_type = XEN_PV_DOMAIN;
1618
1697 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); 1619 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
1698 1620
1699 xen_setup_features(); 1621 xen_setup_features();
@@ -1703,10 +1625,11 @@ asmlinkage void __init xen_start_kernel(void)
1703 pv_init_ops = xen_init_ops; 1625 pv_init_ops = xen_init_ops;
1704 pv_time_ops = xen_time_ops; 1626 pv_time_ops = xen_time_ops;
1705 pv_cpu_ops = xen_cpu_ops; 1627 pv_cpu_ops = xen_cpu_ops;
1706 pv_irq_ops = xen_irq_ops;
1707 pv_apic_ops = xen_apic_ops; 1628 pv_apic_ops = xen_apic_ops;
1708 pv_mmu_ops = xen_mmu_ops; 1629 pv_mmu_ops = xen_mmu_ops;
1709 1630
1631 xen_init_irq_ops();
1632
1710#ifdef CONFIG_X86_LOCAL_APIC 1633#ifdef CONFIG_X86_LOCAL_APIC
1711 /* 1634 /*
1712 * set up the basic apic ops. 1635 * set up the basic apic ops.
@@ -1737,7 +1660,7 @@ asmlinkage void __init xen_start_kernel(void)
1737 1660
1738 /* Prevent unwanted bits from being set in PTEs. */ 1661 /* Prevent unwanted bits from being set in PTEs. */
1739 __supported_pte_mask &= ~_PAGE_GLOBAL; 1662 __supported_pte_mask &= ~_PAGE_GLOBAL;
1740 if (!is_initial_xendomain()) 1663 if (!xen_initial_domain())
1741 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); 1664 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1742 1665
1743 /* Don't do the full vcpu_info placement stuff until we have a 1666 /* Don't do the full vcpu_info placement stuff until we have a
@@ -1772,7 +1695,7 @@ asmlinkage void __init xen_start_kernel(void)
1772 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1695 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1773 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); 1696 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1774 1697
1775 if (!is_initial_xendomain()) { 1698 if (!xen_initial_domain()) {
1776 add_preferred_console("xenboot", 0, NULL); 1699 add_preferred_console("xenboot", 0, NULL);
1777 add_preferred_console("tty", 0, NULL); 1700 add_preferred_console("tty", 0, NULL);
1778 add_preferred_console("hvc", 0, NULL); 1701 add_preferred_console("hvc", 0, NULL);
@@ -1780,15 +1703,6 @@ asmlinkage void __init xen_start_kernel(void)
1780 1703
1781 xen_raw_console_write("about to get started...\n"); 1704 xen_raw_console_write("about to get started...\n");
1782 1705
1783#if 0
1784 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1785 &boot_params, __pa_symbol(&boot_params),
1786 __va(__pa_symbol(&boot_params)));
1787
1788 walk(pgd, &boot_params);
1789 walk(pgd, __va(__pa(&boot_params)));
1790#endif
1791
1792 /* Start the world */ 1706 /* Start the world */
1793#ifdef CONFIG_X86_32 1707#ifdef CONFIG_X86_32
1794 i386_start_kernel(); 1708 i386_start_kernel();
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 000000000000..28b85ab8422e
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,143 @@
1#include <linux/hardirq.h>
2
3#include <xen/interface/xen.h>
4#include <xen/interface/sched.h>
5#include <xen/interface/vcpu.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/hypervisor.h>
9
10#include "xen-ops.h"
11
12/*
13 * Force a proper event-channel callback from Xen after clearing the
14 * callback mask. We do this in a very simple manner, by making a call
15 * down into Xen. The pending flag will be checked by Xen on return.
16 */
17void xen_force_evtchn_callback(void)
18{
19 (void)HYPERVISOR_xen_version(0, NULL);
20}
21
22static void __init __xen_init_IRQ(void)
23{
24#ifdef CONFIG_X86_64
25 int i;
26
27 /* Create identity vector->irq map */
28 for(i = 0; i < NR_VECTORS; i++) {
29 int cpu;
30
31 for_each_possible_cpu(cpu)
32 per_cpu(vector_irq, cpu)[i] = i;
33 }
34#endif /* CONFIG_X86_64 */
35
36 xen_init_IRQ();
37}
38
39static unsigned long xen_save_fl(void)
40{
41 struct vcpu_info *vcpu;
42 unsigned long flags;
43
44 vcpu = x86_read_percpu(xen_vcpu);
45
46 /* flag has opposite sense of mask */
47 flags = !vcpu->evtchn_upcall_mask;
48
49 /* convert to IF type flag
50 -0 -> 0x00000000
51 -1 -> 0xffffffff
52 */
53 return (-flags) & X86_EFLAGS_IF;
54}
55
56static void xen_restore_fl(unsigned long flags)
57{
58 struct vcpu_info *vcpu;
59
60 /* convert from IF type flag */
61 flags = !(flags & X86_EFLAGS_IF);
62
63 /* There's a one instruction preempt window here. We need to
64 make sure we're don't switch CPUs between getting the vcpu
65 pointer and updating the mask. */
66 preempt_disable();
67 vcpu = x86_read_percpu(xen_vcpu);
68 vcpu->evtchn_upcall_mask = flags;
69 preempt_enable_no_resched();
70
71 /* Doesn't matter if we get preempted here, because any
72 pending event will get dealt with anyway. */
73
74 if (flags == 0) {
75 preempt_check_resched();
76 barrier(); /* unmask then check (avoid races) */
77 if (unlikely(vcpu->evtchn_upcall_pending))
78 xen_force_evtchn_callback();
79 }
80}
81
82static void xen_irq_disable(void)
83{
84 /* There's a one instruction preempt window here. We need to
85 make sure we're don't switch CPUs between getting the vcpu
86 pointer and updating the mask. */
87 preempt_disable();
88 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
89 preempt_enable_no_resched();
90}
91
92static void xen_irq_enable(void)
93{
94 struct vcpu_info *vcpu;
95
96 /* We don't need to worry about being preempted here, since
97 either a) interrupts are disabled, so no preemption, or b)
98 the caller is confused and is trying to re-enable interrupts
99 on an indeterminate processor. */
100
101 vcpu = x86_read_percpu(xen_vcpu);
102 vcpu->evtchn_upcall_mask = 0;
103
104 /* Doesn't matter if we get preempted here, because any
105 pending event will get dealt with anyway. */
106
107 barrier(); /* unmask then check (avoid races) */
108 if (unlikely(vcpu->evtchn_upcall_pending))
109 xen_force_evtchn_callback();
110}
111
112static void xen_safe_halt(void)
113{
114 /* Blocking includes an implicit local_irq_enable(). */
115 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
116 BUG();
117}
118
119static void xen_halt(void)
120{
121 if (irqs_disabled())
122 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
123 else
124 xen_safe_halt();
125}
126
127static const struct pv_irq_ops xen_irq_ops __initdata = {
128 .init_IRQ = __xen_init_IRQ,
129 .save_fl = xen_save_fl,
130 .restore_fl = xen_restore_fl,
131 .irq_disable = xen_irq_disable,
132 .irq_enable = xen_irq_enable,
133 .safe_halt = xen_safe_halt,
134 .halt = xen_halt,
135#ifdef CONFIG_X86_64
136 .adjust_exception_frame = xen_adjust_exception_frame,
137#endif
138};
139
140void __init xen_init_irq_ops()
141{
142 pv_irq_ops = xen_irq_ops;
143}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aa37469da696..ae173f6edd8b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,6 +40,7 @@
40 */ 40 */
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h>
43#include <linux/bug.h> 44#include <linux/bug.h>
44 45
45#include <asm/pgtable.h> 46#include <asm/pgtable.h>
@@ -57,6 +58,61 @@
57 58
58#include "multicalls.h" 59#include "multicalls.h"
59#include "mmu.h" 60#include "mmu.h"
61#include "debugfs.h"
62
63#define MMU_UPDATE_HISTO 30
64
65#ifdef CONFIG_XEN_DEBUG_FS
66
67static struct {
68 u32 pgd_update;
69 u32 pgd_update_pinned;
70 u32 pgd_update_batched;
71
72 u32 pud_update;
73 u32 pud_update_pinned;
74 u32 pud_update_batched;
75
76 u32 pmd_update;
77 u32 pmd_update_pinned;
78 u32 pmd_update_batched;
79
80 u32 pte_update;
81 u32 pte_update_pinned;
82 u32 pte_update_batched;
83
84 u32 mmu_update;
85 u32 mmu_update_extended;
86 u32 mmu_update_histo[MMU_UPDATE_HISTO];
87
88 u32 prot_commit;
89 u32 prot_commit_batched;
90
91 u32 set_pte_at;
92 u32 set_pte_at_batched;
93 u32 set_pte_at_pinned;
94 u32 set_pte_at_current;
95 u32 set_pte_at_kernel;
96} mmu_stats;
97
98static u8 zero_stats;
99
100static inline void check_zero(void)
101{
102 if (unlikely(zero_stats)) {
103 memset(&mmu_stats, 0, sizeof(mmu_stats));
104 zero_stats = 0;
105 }
106}
107
108#define ADD_STATS(elem, val) \
109 do { check_zero(); mmu_stats.elem += (val); } while(0)
110
111#else /* !CONFIG_XEN_DEBUG_FS */
112
113#define ADD_STATS(elem, val) do { (void)(val); } while(0)
114
115#endif /* CONFIG_XEN_DEBUG_FS */
60 116
61/* 117/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a 118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
229} 285}
230 286
231 287
232static bool page_pinned(void *ptr) 288static bool xen_page_pinned(void *ptr)
233{ 289{
234 struct page *page = virt_to_page(ptr); 290 struct page *page = virt_to_page(ptr);
235 291
236 return PagePinned(page); 292 return PagePinned(page);
237} 293}
238 294
239static void extend_mmu_update(const struct mmu_update *update) 295static void xen_extend_mmu_update(const struct mmu_update *update)
240{ 296{
241 struct multicall_space mcs; 297 struct multicall_space mcs;
242 struct mmu_update *u; 298 struct mmu_update *u;
243 299
244 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 300 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
245 301
246 if (mcs.mc != NULL) 302 if (mcs.mc != NULL) {
303 ADD_STATS(mmu_update_extended, 1);
304 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
305
247 mcs.mc->args[1]++; 306 mcs.mc->args[1]++;
248 else { 307
308 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
309 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
310 else
311 ADD_STATS(mmu_update_histo[0], 1);
312 } else {
313 ADD_STATS(mmu_update, 1);
249 mcs = __xen_mc_entry(sizeof(*u)); 314 mcs = __xen_mc_entry(sizeof(*u));
250 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 315 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
316 ADD_STATS(mmu_update_histo[1], 1);
251 } 317 }
252 318
253 u = mcs.args; 319 u = mcs.args;
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
265 /* ptr may be ioremapped for 64-bit pagetable setup */ 331 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 332 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
267 u.val = pmd_val_ma(val); 333 u.val = pmd_val_ma(val);
268 extend_mmu_update(&u); 334 xen_extend_mmu_update(&u);
335
336 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
269 337
270 xen_mc_issue(PARAVIRT_LAZY_MMU); 338 xen_mc_issue(PARAVIRT_LAZY_MMU);
271 339
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
274 342
275void xen_set_pmd(pmd_t *ptr, pmd_t val) 343void xen_set_pmd(pmd_t *ptr, pmd_t val)
276{ 344{
345 ADD_STATS(pmd_update, 1);
346
277 /* If page is not pinned, we can just update the entry 347 /* If page is not pinned, we can just update the entry
278 directly */ 348 directly */
279 if (!page_pinned(ptr)) { 349 if (!xen_page_pinned(ptr)) {
280 *ptr = val; 350 *ptr = val;
281 return; 351 return;
282 } 352 }
283 353
354 ADD_STATS(pmd_update_pinned, 1);
355
284 xen_set_pmd_hyper(ptr, val); 356 xen_set_pmd_hyper(ptr, val);
285} 357}
286 358
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
300 if (mm == &init_mm) 372 if (mm == &init_mm)
301 preempt_disable(); 373 preempt_disable();
302 374
375 ADD_STATS(set_pte_at, 1);
376// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
377 ADD_STATS(set_pte_at_current, mm == current->mm);
378 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
379
303 if (mm == current->mm || mm == &init_mm) { 380 if (mm == current->mm || mm == &init_mm) {
304 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 381 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
305 struct multicall_space mcs; 382 struct multicall_space mcs;
306 mcs = xen_mc_entry(0); 383 mcs = xen_mc_entry(0);
307 384
308 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 385 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
386 ADD_STATS(set_pte_at_batched, 1);
309 xen_mc_issue(PARAVIRT_LAZY_MMU); 387 xen_mc_issue(PARAVIRT_LAZY_MMU);
310 goto out; 388 goto out;
311 } else 389 } else
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
334 412
335 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 413 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
336 u.val = pte_val_ma(pte); 414 u.val = pte_val_ma(pte);
337 extend_mmu_update(&u); 415 xen_extend_mmu_update(&u);
416
417 ADD_STATS(prot_commit, 1);
418 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
338 419
339 xen_mc_issue(PARAVIRT_LAZY_MMU); 420 xen_mc_issue(PARAVIRT_LAZY_MMU);
340} 421}
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
400 /* ptr may be ioremapped for 64-bit pagetable setup */ 481 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 482 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
402 u.val = pud_val_ma(val); 483 u.val = pud_val_ma(val);
403 extend_mmu_update(&u); 484 xen_extend_mmu_update(&u);
485
486 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
404 487
405 xen_mc_issue(PARAVIRT_LAZY_MMU); 488 xen_mc_issue(PARAVIRT_LAZY_MMU);
406 489
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
409 492
410void xen_set_pud(pud_t *ptr, pud_t val) 493void xen_set_pud(pud_t *ptr, pud_t val)
411{ 494{
495 ADD_STATS(pud_update, 1);
496
412 /* If page is not pinned, we can just update the entry 497 /* If page is not pinned, we can just update the entry
413 directly */ 498 directly */
414 if (!page_pinned(ptr)) { 499 if (!xen_page_pinned(ptr)) {
415 *ptr = val; 500 *ptr = val;
416 return; 501 return;
417 } 502 }
418 503
504 ADD_STATS(pud_update_pinned, 1);
505
419 xen_set_pud_hyper(ptr, val); 506 xen_set_pud_hyper(ptr, val);
420} 507}
421 508
422void xen_set_pte(pte_t *ptep, pte_t pte) 509void xen_set_pte(pte_t *ptep, pte_t pte)
423{ 510{
511 ADD_STATS(pte_update, 1);
512// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
513 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
514
424#ifdef CONFIG_X86_PAE 515#ifdef CONFIG_X86_PAE
425 ptep->pte_high = pte.pte_high; 516 ptep->pte_high = pte.pte_high;
426 smp_wmb(); 517 smp_wmb();
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
490 581
491 u.ptr = virt_to_machine(ptr).maddr; 582 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val); 583 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u); 584 xen_extend_mmu_update(&u);
494} 585}
495 586
496/* 587/*
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{ 608{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr); 609 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519 610
611 ADD_STATS(pgd_update, 1);
612
520 /* If page is not pinned, we can just update the entry 613 /* If page is not pinned, we can just update the entry
521 directly */ 614 directly */
522 if (!page_pinned(ptr)) { 615 if (!xen_page_pinned(ptr)) {
523 *ptr = val; 616 *ptr = val;
524 if (user_ptr) { 617 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr)); 618 WARN_ON(xen_page_pinned(user_ptr));
526 *user_ptr = val; 619 *user_ptr = val;
527 } 620 }
528 return; 621 return;
529 } 622 }
530 623
624 ADD_STATS(pgd_update_pinned, 1);
625 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
626
531 /* If it's pinned, then we can at least batch the kernel and 627 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */ 628 user updates together. */
533 xen_mc_batch(); 629 xen_mc_batch();
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
555 * For 64-bit, we must skip the Xen hole in the middle of the address 651 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole. 652 * space, just after the big x86-64 virtual hole.
557 */ 653 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), 654static int xen_pgd_walk(struct mm_struct *mm,
559 unsigned long limit) 655 int (*func)(struct mm_struct *mm, struct page *,
656 enum pt_level),
657 unsigned long limit)
560{ 658{
659 pgd_t *pgd = mm->pgd;
561 int flush = 0; 660 int flush = 0;
562 unsigned hole_low, hole_high; 661 unsigned hole_low, hole_high;
563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; 662 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
590 pmdidx_limit = 0; 689 pmdidx_limit = 0;
591#endif 690#endif
592 691
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { 692 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
596 pud_t *pud; 693 pud_t *pud;
597 694
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
604 pud = pud_offset(&pgd[pgdidx], 0); 701 pud = pud_offset(&pgd[pgdidx], 0);
605 702
606 if (PTRS_PER_PUD > 1) /* not folded */ 703 if (PTRS_PER_PUD > 1) /* not folded */
607 flush |= (*func)(virt_to_page(pud), PT_PUD); 704 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
608 705
609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { 706 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
610 pmd_t *pmd; 707 pmd_t *pmd;
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
619 pmd = pmd_offset(&pud[pudidx], 0); 716 pmd = pmd_offset(&pud[pudidx], 0);
620 717
621 if (PTRS_PER_PMD > 1) /* not folded */ 718 if (PTRS_PER_PMD > 1) /* not folded */
622 flush |= (*func)(virt_to_page(pmd), PT_PMD); 719 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
623 720
624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { 721 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
625 struct page *pte; 722 struct page *pte;
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
633 continue; 730 continue;
634 731
635 pte = pmd_page(pmd[pmdidx]); 732 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE); 733 flush |= (*func)(mm, pte, PT_PTE);
637 } 734 }
638 } 735 }
639 } 736 }
737
640out: 738out:
739 /* Do the top level last, so that the callbacks can use it as
740 a cue to do final things like tlb flushes. */
741 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
641 742
642 return flush; 743 return flush;
643} 744}
644 745
645static spinlock_t *lock_pte(struct page *page) 746/* If we're using split pte locks, then take the page's lock and
747 return a pointer to it. Otherwise return NULL. */
748static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
646{ 749{
647 spinlock_t *ptl = NULL; 750 spinlock_t *ptl = NULL;
648 751
649#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 752#if USE_SPLIT_PTLOCKS
650 ptl = __pte_lockptr(page); 753 ptl = __pte_lockptr(page);
651 spin_lock(ptl); 754 spin_lock_nest_lock(ptl, &mm->page_table_lock);
652#endif 755#endif
653 756
654 return ptl; 757 return ptl;
655} 758}
656 759
657static void do_unlock(void *v) 760static void xen_pte_unlock(void *v)
658{ 761{
659 spinlock_t *ptl = v; 762 spinlock_t *ptl = v;
660 spin_unlock(ptl); 763 spin_unlock(ptl);
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
672 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 775 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
673} 776}
674 777
675static int pin_page(struct page *page, enum pt_level level) 778static int xen_pin_page(struct mm_struct *mm, struct page *page,
779 enum pt_level level)
676{ 780{
677 unsigned pgfl = TestSetPagePinned(page); 781 unsigned pgfl = TestSetPagePinned(page);
678 int flush; 782 int flush;
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
691 795
692 flush = 0; 796 flush = 0;
693 797
798 /*
799 * We need to hold the pagetable lock between the time
800 * we make the pagetable RO and when we actually pin
801 * it. If we don't, then other users may come in and
802 * attempt to update the pagetable by writing it,
803 * which will fail because the memory is RO but not
804 * pinned, so Xen won't do the trap'n'emulate.
805 *
806 * If we're using split pte locks, we can't hold the
807 * entire pagetable's worth of locks during the
808 * traverse, because we may wrap the preempt count (8
809 * bits). The solution is to mark RO and pin each PTE
810 * page while holding the lock. This means the number
811 * of locks we end up holding is never more than a
812 * batch size (~32 entries, at present).
813 *
814 * If we're not using split pte locks, we needn't pin
815 * the PTE pages independently, because we're
816 * protected by the overall pagetable lock.
817 */
694 ptl = NULL; 818 ptl = NULL;
695 if (level == PT_PTE) 819 if (level == PT_PTE)
696 ptl = lock_pte(page); 820 ptl = xen_pte_lock(page, mm);
697 821
698 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 822 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
699 pfn_pte(pfn, PAGE_KERNEL_RO), 823 pfn_pte(pfn, PAGE_KERNEL_RO),
700 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 824 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
701 825
702 if (level == PT_PTE) 826 if (ptl) {
703 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 827 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
704 828
705 if (ptl) {
706 /* Queue a deferred unlock for when this batch 829 /* Queue a deferred unlock for when this batch
707 is completed. */ 830 is completed. */
708 xen_mc_callback(do_unlock, ptl); 831 xen_mc_callback(xen_pte_unlock, ptl);
709 } 832 }
710 } 833 }
711 834
@@ -715,11 +838,11 @@ static int pin_page(struct page *page, enum pt_level level)
715/* This is called just after a mm has been created, but it has not 838/* This is called just after a mm has been created, but it has not
716 been used yet. We need to make sure that its pagetable is all 839 been used yet. We need to make sure that its pagetable is all
717 read-only, and can be pinned. */ 840 read-only, and can be pinned. */
718void xen_pgd_pin(pgd_t *pgd) 841static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
719{ 842{
720 xen_mc_batch(); 843 xen_mc_batch();
721 844
722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) { 845 if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
723 /* re-enable interrupts for kmap_flush_unused */ 846 /* re-enable interrupts for kmap_flush_unused */
724 xen_mc_issue(0); 847 xen_mc_issue(0);
725 kmap_flush_unused(); 848 kmap_flush_unused();
@@ -733,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd)
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 856 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734 857
735 if (user_pgd) { 858 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD); 859 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); 860 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 } 861 }
739 } 862 }
740#else /* CONFIG_X86_32 */ 863#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE 864#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */ 865 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 866 xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
867 PT_PMD);
744#endif 868#endif
745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 869 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */ 870#endif /* CONFIG_X86_64 */
747 xen_mc_issue(0); 871 xen_mc_issue(0);
748} 872}
749 873
874static void xen_pgd_pin(struct mm_struct *mm)
875{
876 __xen_pgd_pin(mm, mm->pgd);
877}
878
750/* 879/*
751 * On save, we need to pin all pagetables to make sure they get their 880 * On save, we need to pin all pagetables to make sure they get their
752 * mfns turned into pfns. Search the list for any unpinned pgds and pin 881 * mfns turned into pfns. Search the list for any unpinned pgds and pin
753 * them (unpinned pgds are not currently in use, probably because the 882 * them (unpinned pgds are not currently in use, probably because the
754 * process is under construction or destruction). 883 * process is under construction or destruction).
884 *
885 * Expected to be called in stop_machine() ("equivalent to taking
886 * every spinlock in the system"), so the locking doesn't really
887 * matter all that much.
755 */ 888 */
756void xen_mm_pin_all(void) 889void xen_mm_pin_all(void)
757{ 890{
@@ -762,7 +895,7 @@ void xen_mm_pin_all(void)
762 895
763 list_for_each_entry(page, &pgd_list, lru) { 896 list_for_each_entry(page, &pgd_list, lru) {
764 if (!PagePinned(page)) { 897 if (!PagePinned(page)) {
765 xen_pgd_pin((pgd_t *)page_address(page)); 898 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
766 SetPageSavePinned(page); 899 SetPageSavePinned(page);
767 } 900 }
768 } 901 }
@@ -775,7 +908,8 @@ void xen_mm_pin_all(void)
775 * that's before we have page structures to store the bits. So do all 908 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now. 909 * the book-keeping now.
777 */ 910 */
778static __init int mark_pinned(struct page *page, enum pt_level level) 911static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
912 enum pt_level level)
779{ 913{
780 SetPagePinned(page); 914 SetPagePinned(page);
781 return 0; 915 return 0;
@@ -783,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
783 917
784void __init xen_mark_init_mm_pinned(void) 918void __init xen_mark_init_mm_pinned(void)
785{ 919{
786 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 920 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
787} 921}
788 922
789static int unpin_page(struct page *page, enum pt_level level) 923static int xen_unpin_page(struct mm_struct *mm, struct page *page,
924 enum pt_level level)
790{ 925{
791 unsigned pgfl = TestClearPagePinned(page); 926 unsigned pgfl = TestClearPagePinned(page);
792 927
@@ -796,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level)
796 spinlock_t *ptl = NULL; 931 spinlock_t *ptl = NULL;
797 struct multicall_space mcs; 932 struct multicall_space mcs;
798 933
934 /*
935 * Do the converse to pin_page. If we're using split
936 * pte locks, we must be holding the lock for while
937 * the pte page is unpinned but still RO to prevent
938 * concurrent updates from seeing it in this
939 * partially-pinned state.
940 */
799 if (level == PT_PTE) { 941 if (level == PT_PTE) {
800 ptl = lock_pte(page); 942 ptl = xen_pte_lock(page, mm);
801 943
802 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 944 if (ptl)
945 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
803 } 946 }
804 947
805 mcs = __xen_mc_entry(0); 948 mcs = __xen_mc_entry(0);
@@ -810,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level)
810 953
811 if (ptl) { 954 if (ptl) {
812 /* unlock when batch completed */ 955 /* unlock when batch completed */
813 xen_mc_callback(do_unlock, ptl); 956 xen_mc_callback(xen_pte_unlock, ptl);
814 } 957 }
815 } 958 }
816 959
@@ -818,7 +961,7 @@ static int unpin_page(struct page *page, enum pt_level level)
818} 961}
819 962
820/* Release a pagetables pages back as normal RW */ 963/* Release a pagetables pages back as normal RW */
821static void xen_pgd_unpin(pgd_t *pgd) 964static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
822{ 965{
823 xen_mc_batch(); 966 xen_mc_batch();
824 967
@@ -830,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd)
830 973
831 if (user_pgd) { 974 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); 975 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD); 976 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
834 } 977 }
835 } 978 }
836#endif 979#endif
837 980
838#ifdef CONFIG_X86_PAE 981#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */ 982 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 983 xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
984 PT_PMD);
841#endif 985#endif
842 986
843 pgd_walk(pgd, unpin_page, USER_LIMIT); 987 xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
844 988
845 xen_mc_issue(0); 989 xen_mc_issue(0);
846} 990}
847 991
992static void xen_pgd_unpin(struct mm_struct *mm)
993{
994 __xen_pgd_unpin(mm, mm->pgd);
995}
996
848/* 997/*
849 * On resume, undo any pinning done at save, so that the rest of the 998 * On resume, undo any pinning done at save, so that the rest of the
850 * kernel doesn't see any unexpected pinned pagetables. 999 * kernel doesn't see any unexpected pinned pagetables.
@@ -859,7 +1008,7 @@ void xen_mm_unpin_all(void)
859 list_for_each_entry(page, &pgd_list, lru) { 1008 list_for_each_entry(page, &pgd_list, lru) {
860 if (PageSavePinned(page)) { 1009 if (PageSavePinned(page)) {
861 BUG_ON(!PagePinned(page)); 1010 BUG_ON(!PagePinned(page));
862 xen_pgd_unpin((pgd_t *)page_address(page)); 1011 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
863 ClearPageSavePinned(page); 1012 ClearPageSavePinned(page);
864 } 1013 }
865 } 1014 }
@@ -870,14 +1019,14 @@ void xen_mm_unpin_all(void)
870void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1019void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
871{ 1020{
872 spin_lock(&next->page_table_lock); 1021 spin_lock(&next->page_table_lock);
873 xen_pgd_pin(next->pgd); 1022 xen_pgd_pin(next);
874 spin_unlock(&next->page_table_lock); 1023 spin_unlock(&next->page_table_lock);
875} 1024}
876 1025
877void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1026void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
878{ 1027{
879 spin_lock(&mm->page_table_lock); 1028 spin_lock(&mm->page_table_lock);
880 xen_pgd_pin(mm->pgd); 1029 xen_pgd_pin(mm);
881 spin_unlock(&mm->page_table_lock); 1030 spin_unlock(&mm->page_table_lock);
882} 1031}
883 1032
@@ -907,7 +1056,7 @@ static void drop_other_mm_ref(void *info)
907 } 1056 }
908} 1057}
909 1058
910static void drop_mm_ref(struct mm_struct *mm) 1059static void xen_drop_mm_ref(struct mm_struct *mm)
911{ 1060{
912 cpumask_t mask; 1061 cpumask_t mask;
913 unsigned cpu; 1062 unsigned cpu;
@@ -937,7 +1086,7 @@ static void drop_mm_ref(struct mm_struct *mm)
937 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1086 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
938} 1087}
939#else 1088#else
940static void drop_mm_ref(struct mm_struct *mm) 1089static void xen_drop_mm_ref(struct mm_struct *mm)
941{ 1090{
942 if (current->active_mm == mm) 1091 if (current->active_mm == mm)
943 load_cr3(swapper_pg_dir); 1092 load_cr3(swapper_pg_dir);
@@ -961,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm)
961void xen_exit_mmap(struct mm_struct *mm) 1110void xen_exit_mmap(struct mm_struct *mm)
962{ 1111{
963 get_cpu(); /* make sure we don't move around */ 1112 get_cpu(); /* make sure we don't move around */
964 drop_mm_ref(mm); 1113 xen_drop_mm_ref(mm);
965 put_cpu(); 1114 put_cpu();
966 1115
967 spin_lock(&mm->page_table_lock); 1116 spin_lock(&mm->page_table_lock);
968 1117
969 /* pgd may not be pinned in the error exit path of execve */ 1118 /* pgd may not be pinned in the error exit path of execve */
970 if (page_pinned(mm->pgd)) 1119 if (xen_page_pinned(mm->pgd))
971 xen_pgd_unpin(mm->pgd); 1120 xen_pgd_unpin(mm);
972 1121
973 spin_unlock(&mm->page_table_lock); 1122 spin_unlock(&mm->page_table_lock);
974} 1123}
1124
1125#ifdef CONFIG_XEN_DEBUG_FS
1126
1127static struct dentry *d_mmu_debug;
1128
1129static int __init xen_mmu_debugfs(void)
1130{
1131 struct dentry *d_xen = xen_init_debugfs();
1132
1133 if (d_xen == NULL)
1134 return -ENOMEM;
1135
1136 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1137
1138 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1139
1140 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1141 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1142 &mmu_stats.pgd_update_pinned);
1143 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1144 &mmu_stats.pgd_update_pinned);
1145
1146 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1147 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1148 &mmu_stats.pud_update_pinned);
1149 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1150 &mmu_stats.pud_update_pinned);
1151
1152 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1153 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1154 &mmu_stats.pmd_update_pinned);
1155 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1156 &mmu_stats.pmd_update_pinned);
1157
1158 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1159// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1160// &mmu_stats.pte_update_pinned);
1161 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1162 &mmu_stats.pte_update_pinned);
1163
1164 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1165 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1166 &mmu_stats.mmu_update_extended);
1167 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1168 mmu_stats.mmu_update_histo, 20);
1169
1170 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1171 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1172 &mmu_stats.set_pte_at_batched);
1173 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1174 &mmu_stats.set_pte_at_current);
1175 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1176 &mmu_stats.set_pte_at_kernel);
1177
1178 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1179 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1180 &mmu_stats.prot_commit_batched);
1181
1182 return 0;
1183}
1184fs_initcall(xen_mmu_debugfs);
1185
1186#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 0f59bd03f9e3..98d71659da5a 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); 18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
19void xen_exit_mmap(struct mm_struct *mm); 19void xen_exit_mmap(struct mm_struct *mm);
20 20
21void xen_pgd_pin(pgd_t *pgd);
22//void xen_pgd_unpin(pgd_t *pgd);
23
24pteval_t xen_pte_val(pte_t); 21pteval_t xen_pte_val(pte_t);
25pmdval_t xen_pmd_val(pmd_t); 22pmdval_t xen_pmd_val(pmd_t);
26pgdval_t xen_pgd_val(pgd_t); 23pgdval_t xen_pgd_val(pgd_t);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 9efd1c6c9776..8ea8a0d0b0de 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,16 +21,20 @@
21 */ 21 */
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/hardirq.h> 23#include <linux/hardirq.h>
24#include <linux/debugfs.h>
24 25
25#include <asm/xen/hypercall.h> 26#include <asm/xen/hypercall.h>
26 27
27#include "multicalls.h" 28#include "multicalls.h"
29#include "debugfs.h"
30
31#define MC_BATCH 32
28 32
29#define MC_DEBUG 1 33#define MC_DEBUG 1
30 34
31#define MC_BATCH 32
32#define MC_ARGS (MC_BATCH * 16) 35#define MC_ARGS (MC_BATCH * 16)
33 36
37
34struct mc_buffer { 38struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 39 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 40#if MC_DEBUG
@@ -47,6 +51,76 @@ struct mc_buffer {
47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 51static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
48DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); 52DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
49 53
54/* flush reasons 0- slots, 1- args, 2- callbacks */
55enum flush_reasons
56{
57 FL_SLOTS,
58 FL_ARGS,
59 FL_CALLBACKS,
60
61 FL_N_REASONS
62};
63
64#ifdef CONFIG_XEN_DEBUG_FS
65#define NHYPERCALLS 40 /* not really */
66
67static struct {
68 unsigned histo[MC_BATCH+1];
69
70 unsigned issued;
71 unsigned arg_total;
72 unsigned hypercalls;
73 unsigned histo_hypercalls[NHYPERCALLS];
74
75 unsigned flush[FL_N_REASONS];
76} mc_stats;
77
78static u8 zero_stats;
79
80static inline void check_zero(void)
81{
82 if (unlikely(zero_stats)) {
83 memset(&mc_stats, 0, sizeof(mc_stats));
84 zero_stats = 0;
85 }
86}
87
88static void mc_add_stats(const struct mc_buffer *mc)
89{
90 int i;
91
92 check_zero();
93
94 mc_stats.issued++;
95 mc_stats.hypercalls += mc->mcidx;
96 mc_stats.arg_total += mc->argidx;
97
98 mc_stats.histo[mc->mcidx]++;
99 for(i = 0; i < mc->mcidx; i++) {
100 unsigned op = mc->entries[i].op;
101 if (op < NHYPERCALLS)
102 mc_stats.histo_hypercalls[op]++;
103 }
104}
105
106static void mc_stats_flush(enum flush_reasons idx)
107{
108 check_zero();
109
110 mc_stats.flush[idx]++;
111}
112
113#else /* !CONFIG_XEN_DEBUG_FS */
114
115static inline void mc_add_stats(const struct mc_buffer *mc)
116{
117}
118
119static inline void mc_stats_flush(enum flush_reasons idx)
120{
121}
122#endif /* CONFIG_XEN_DEBUG_FS */
123
50void xen_mc_flush(void) 124void xen_mc_flush(void)
51{ 125{
52 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 126 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
60 something in the middle */ 134 something in the middle */
61 local_irq_save(flags); 135 local_irq_save(flags);
62 136
137 mc_add_stats(b);
138
63 if (b->mcidx) { 139 if (b->mcidx) {
64#if MC_DEBUG 140#if MC_DEBUG
65 memcpy(b->debug, b->entries, 141 memcpy(b->debug, b->entries,
@@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args)
115 191
116 if (b->mcidx == MC_BATCH || 192 if (b->mcidx == MC_BATCH ||
117 (argidx + args) > MC_ARGS) { 193 (argidx + args) > MC_ARGS) {
194 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
118 xen_mc_flush(); 195 xen_mc_flush();
119 argidx = roundup(b->argidx, sizeof(u64)); 196 argidx = roundup(b->argidx, sizeof(u64));
120 } 197 }
@@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
158 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 235 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
159 struct callback *cb; 236 struct callback *cb;
160 237
161 if (b->cbidx == MC_BATCH) 238 if (b->cbidx == MC_BATCH) {
239 mc_stats_flush(FL_CALLBACKS);
162 xen_mc_flush(); 240 xen_mc_flush();
241 }
163 242
164 cb = &b->callbacks[b->cbidx++]; 243 cb = &b->callbacks[b->cbidx++];
165 cb->fn = fn; 244 cb->fn = fn;
166 cb->data = data; 245 cb->data = data;
167} 246}
247
248#ifdef CONFIG_XEN_DEBUG_FS
249
250static struct dentry *d_mc_debug;
251
252static int __init xen_mc_debugfs(void)
253{
254 struct dentry *d_xen = xen_init_debugfs();
255
256 if (d_xen == NULL)
257 return -ENOMEM;
258
259 d_mc_debug = debugfs_create_dir("multicalls", d_xen);
260
261 debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
262
263 debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
264 debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
265 debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
266
267 xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
268 mc_stats.histo, MC_BATCH);
269 xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
270 mc_stats.histo_hypercalls, NHYPERCALLS);
271 xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
272 mc_stats.flush, FL_N_REASONS);
273
274 return 0;
275}
276fs_initcall(xen_mc_debugfs);
277
278#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d8faf79a0a1d..d77da613b1d2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,11 +11,8 @@
11 * useful topology information for the kernel to make use of. As a 11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and 12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded. 13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */ 14 */
17#include <linux/sched.h> 15#include <linux/sched.h>
18#include <linux/kernel_stat.h>
19#include <linux/err.h> 16#include <linux/err.h>
20#include <linux/smp.h> 17#include <linux/smp.h>
21 18
@@ -36,8 +33,6 @@
36#include "xen-ops.h" 33#include "xen-ops.h"
37#include "mmu.h" 34#include "mmu.h"
38 35
39static void __cpuinit xen_init_lock_cpu(int cpu);
40
41cpumask_t xen_cpu_initialized_map; 36cpumask_t xen_cpu_initialized_map;
42 37
43static DEFINE_PER_CPU(int, resched_irq); 38static DEFINE_PER_CPU(int, resched_irq);
@@ -64,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
64 return IRQ_HANDLED; 59 return IRQ_HANDLED;
65} 60}
66 61
67static __cpuinit void cpu_bringup_and_idle(void) 62static __cpuinit void cpu_bringup(void)
68{ 63{
69 int cpu = smp_processor_id(); 64 int cpu = smp_processor_id();
70 65
71 cpu_init(); 66 cpu_init();
67 touch_softlockup_watchdog();
72 preempt_disable(); 68 preempt_disable();
73 69
74 xen_enable_sysenter(); 70 xen_enable_sysenter();
@@ -89,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void)
89 local_irq_enable(); 85 local_irq_enable();
90 86
91 wmb(); /* make sure everything is out */ 87 wmb(); /* make sure everything is out */
88}
89
90static __cpuinit void cpu_bringup_and_idle(void)
91{
92 cpu_bringup();
92 cpu_idle(); 93 cpu_idle();
93} 94}
94 95
@@ -212,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
212 213
213 cpu_set(cpu, cpu_present_map); 214 cpu_set(cpu, cpu_present_map);
214 } 215 }
215
216 //init_xenbus_allowed_cpumask();
217} 216}
218 217
219static __cpuinit int 218static __cpuinit int
@@ -281,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
281 struct task_struct *idle = idle_task(cpu); 280 struct task_struct *idle = idle_task(cpu);
282 int rc; 281 int rc;
283 282
284#if 0
285 rc = cpu_up_check(cpu);
286 if (rc)
287 return rc;
288#endif
289
290#ifdef CONFIG_X86_64 283#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */ 284 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0); 285 WARN_ON(cpu == 0);
@@ -339,6 +332,60 @@ static void xen_smp_cpus_done(unsigned int max_cpus)
339{ 332{
340} 333}
341 334
335#ifdef CONFIG_HOTPLUG_CPU
336static int xen_cpu_disable(void)
337{
338 unsigned int cpu = smp_processor_id();
339 if (cpu == 0)
340 return -EBUSY;
341
342 cpu_disable_common();
343
344 load_cr3(swapper_pg_dir);
345 return 0;
346}
347
348static void xen_cpu_die(unsigned int cpu)
349{
350 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
351 current->state = TASK_UNINTERRUPTIBLE;
352 schedule_timeout(HZ/10);
353 }
354 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
355 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
356 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
357 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
358 xen_uninit_lock_cpu(cpu);
359 xen_teardown_timer(cpu);
360
361 if (num_online_cpus() == 1)
362 alternatives_smp_switch(0);
363}
364
365static void xen_play_dead(void)
366{
367 play_dead_common();
368 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
369 cpu_bringup();
370}
371
372#else /* !CONFIG_HOTPLUG_CPU */
373static int xen_cpu_disable(void)
374{
375 return -ENOSYS;
376}
377
378static void xen_cpu_die(unsigned int cpu)
379{
380 BUG();
381}
382
383static void xen_play_dead(void)
384{
385 BUG();
386}
387
388#endif
342static void stop_self(void *v) 389static void stop_self(void *v)
343{ 390{
344 int cpu = smp_processor_id(); 391 int cpu = smp_processor_id();
@@ -419,176 +466,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
419 return IRQ_HANDLED; 466 return IRQ_HANDLED;
420} 467}
421 468
422struct xen_spinlock {
423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
428{
429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
430
431 return xl->lock != 0;
432}
433
434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
437
438 /* Not strictly true; this is only the count of contended
439 lock-takers entering the slow path. */
440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
489 /* check again make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
539 }
540}
541
542static void xen_spin_unlock(struct raw_spinlock *lock)
543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
545
546 smp_wmb(); /* make sure no writes get moved after unlock */
547 xl->lock = 0; /* release lock */
548
549 /* make sure unlock happens before kick */
550 barrier();
551
552 if (unlikely(xl->spinners))
553 xen_spin_unlock_slow(xl);
554}
555
556static __cpuinit void xen_init_lock_cpu(int cpu)
557{
558 int irq;
559 const char *name;
560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
573
574 printk("cpu %d spinlock event irq %d\n", cpu, irq);
575}
576
577static void __init xen_init_spinlocks(void)
578{
579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
585
586static const struct smp_ops xen_smp_ops __initdata = { 469static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 470 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus, 471 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done, 472 .smp_cpus_done = xen_smp_cpus_done,
591 473
474 .cpu_up = xen_cpu_up,
475 .cpu_die = xen_cpu_die,
476 .cpu_disable = xen_cpu_disable,
477 .play_dead = xen_play_dead,
478
592 .smp_send_stop = xen_smp_send_stop, 479 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule, 480 .smp_send_reschedule = xen_smp_send_reschedule,
594 481
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 000000000000..dd71e3a021cd
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,428 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/kernel_stat.h>
6#include <linux/spinlock.h>
7#include <linux/debugfs.h>
8#include <linux/log2.h>
9
10#include <asm/paravirt.h>
11
12#include <xen/interface/xen.h>
13#include <xen/events.h>
14
15#include "xen-ops.h"
16#include "debugfs.h"
17
18#ifdef CONFIG_XEN_DEBUG_FS
19static struct xen_spinlock_stats
20{
21 u64 taken;
22 u32 taken_slow;
23 u32 taken_slow_nested;
24 u32 taken_slow_pickup;
25 u32 taken_slow_spurious;
26 u32 taken_slow_irqenable;
27
28 u64 released;
29 u32 released_slow;
30 u32 released_slow_kicked;
31
32#define HISTO_BUCKETS 30
33 u32 histo_spin_total[HISTO_BUCKETS+1];
34 u32 histo_spin_spinning[HISTO_BUCKETS+1];
35 u32 histo_spin_blocked[HISTO_BUCKETS+1];
36
37 u64 time_total;
38 u64 time_spinning;
39 u64 time_blocked;
40} spinlock_stats;
41
42static u8 zero_stats;
43
44static unsigned lock_timeout = 1 << 10;
45#define TIMEOUT lock_timeout
46
47static inline void check_zero(void)
48{
49 if (unlikely(zero_stats)) {
50 memset(&spinlock_stats, 0, sizeof(spinlock_stats));
51 zero_stats = 0;
52 }
53}
54
55#define ADD_STATS(elem, val) \
56 do { check_zero(); spinlock_stats.elem += (val); } while(0)
57
58static inline u64 spin_time_start(void)
59{
60 return xen_clocksource_read();
61}
62
63static void __spin_time_accum(u64 delta, u32 *array)
64{
65 unsigned index = ilog2(delta);
66
67 check_zero();
68
69 if (index < HISTO_BUCKETS)
70 array[index]++;
71 else
72 array[HISTO_BUCKETS]++;
73}
74
75static inline void spin_time_accum_spinning(u64 start)
76{
77 u32 delta = xen_clocksource_read() - start;
78
79 __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
80 spinlock_stats.time_spinning += delta;
81}
82
83static inline void spin_time_accum_total(u64 start)
84{
85 u32 delta = xen_clocksource_read() - start;
86
87 __spin_time_accum(delta, spinlock_stats.histo_spin_total);
88 spinlock_stats.time_total += delta;
89}
90
91static inline void spin_time_accum_blocked(u64 start)
92{
93 u32 delta = xen_clocksource_read() - start;
94
95 __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
96 spinlock_stats.time_blocked += delta;
97}
98#else /* !CONFIG_XEN_DEBUG_FS */
99#define TIMEOUT (1 << 10)
100#define ADD_STATS(elem, val) do { (void)(val); } while(0)
101
102static inline u64 spin_time_start(void)
103{
104 return 0;
105}
106
107static inline void spin_time_accum_total(u64 start)
108{
109}
110static inline void spin_time_accum_spinning(u64 start)
111{
112}
113static inline void spin_time_accum_blocked(u64 start)
114{
115}
116#endif /* CONFIG_XEN_DEBUG_FS */
117
118struct xen_spinlock {
119 unsigned char lock; /* 0 -> free; 1 -> locked */
120 unsigned short spinners; /* count of waiting cpus */
121};
122
123static int xen_spin_is_locked(struct raw_spinlock *lock)
124{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126
127 return xl->lock != 0;
128}
129
130static int xen_spin_is_contended(struct raw_spinlock *lock)
131{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133
134 /* Not strictly true; this is only the count of contended
135 lock-takers entering the slow path. */
136 return xl->spinners != 0;
137}
138
139static int xen_spin_trylock(struct raw_spinlock *lock)
140{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1;
143
144 asm("xchgb %b0,%1"
145 : "+q" (old), "+m" (xl->lock) : : "memory");
146
147 return old == 0;
148}
149
150static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
151static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
152
153/*
154 * Mark a cpu as interested in a lock. Returns the CPU's previous
155 * lock of interest, in case we got preempted by an interrupt.
156 */
157static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
158{
159 struct xen_spinlock *prev;
160
161 prev = __get_cpu_var(lock_spinners);
162 __get_cpu_var(lock_spinners) = xl;
163
164 wmb(); /* set lock of interest before count */
165
166 asm(LOCK_PREFIX " incw %0"
167 : "+m" (xl->spinners) : : "memory");
168
169 return prev;
170}
171
172/*
173 * Mark a cpu as no longer interested in a lock. Restores previous
174 * lock of interest (NULL for none).
175 */
176static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
177{
178 asm(LOCK_PREFIX " decw %0"
179 : "+m" (xl->spinners) : : "memory");
180 wmb(); /* decrement count before restoring lock */
181 __get_cpu_var(lock_spinners) = prev;
182}
183
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
185{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev;
188 int irq = __get_cpu_var(lock_kicker_irq);
189 int ret;
190 unsigned long flags;
191 u64 start;
192
193 /* If kicker interrupts not initialized yet, just spin */
194 if (irq == -1)
195 return 0;
196
197 start = spin_time_start();
198
199 /* announce we're spinning */
200 prev = spinning_lock(xl);
201
202 flags = __raw_local_save_flags();
203 if (irq_enable) {
204 ADD_STATS(taken_slow_irqenable, 1);
205 raw_local_irq_enable();
206 }
207
208 ADD_STATS(taken_slow, 1);
209 ADD_STATS(taken_slow_nested, prev != NULL);
210
211 do {
212 /* clear pending */
213 xen_clear_irq_pending(irq);
214
215 /* check again make sure it didn't become free while
216 we weren't looking */
217 ret = xen_spin_trylock(lock);
218 if (ret) {
219 ADD_STATS(taken_slow_pickup, 1);
220
221 /*
222 * If we interrupted another spinlock while it
223 * was blocking, make sure it doesn't block
224 * without rechecking the lock.
225 */
226 if (prev != NULL)
227 xen_set_irq_pending(irq);
228 goto out;
229 }
230
231 /*
232 * Block until irq becomes pending. If we're
233 * interrupted at this point (after the trylock but
234 * before entering the block), then the nested lock
235 * handler guarantees that the irq will be left
236 * pending if there's any chance the lock became free;
237 * xen_poll_irq() returns immediately if the irq is
238 * pending.
239 */
240 xen_poll_irq(irq);
241 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
242 } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
243
244 kstat_this_cpu.irqs[irq]++;
245
246out:
247 raw_local_irq_restore(flags);
248 unspinning_lock(xl, prev);
249 spin_time_accum_blocked(start);
250
251 return ret;
252}
253
254static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
255{
256 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
257 unsigned timeout;
258 u8 oldval;
259 u64 start_spin;
260
261 ADD_STATS(taken, 1);
262
263 start_spin = spin_time_start();
264
265 do {
266 u64 start_spin_fast = spin_time_start();
267
268 timeout = TIMEOUT;
269
270 asm("1: xchgb %1,%0\n"
271 " testb %1,%1\n"
272 " jz 3f\n"
273 "2: rep;nop\n"
274 " cmpb $0,%0\n"
275 " je 1b\n"
276 " dec %2\n"
277 " jnz 2b\n"
278 "3:\n"
279 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
280 : "1" (1)
281 : "memory");
282
283 spin_time_accum_spinning(start_spin_fast);
284
285 } while (unlikely(oldval != 0 &&
286 (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
287
288 spin_time_accum_total(start_spin);
289}
290
291static void xen_spin_lock(struct raw_spinlock *lock)
292{
293 __xen_spin_lock(lock, false);
294}
295
296static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
297{
298 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
299}
300
301static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
302{
303 int cpu;
304
305 ADD_STATS(released_slow, 1);
306
307 for_each_online_cpu(cpu) {
308 /* XXX should mix up next cpu selection */
309 if (per_cpu(lock_spinners, cpu) == xl) {
310 ADD_STATS(released_slow_kicked, 1);
311 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
312 break;
313 }
314 }
315}
316
317static void xen_spin_unlock(struct raw_spinlock *lock)
318{
319 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
320
321 ADD_STATS(released, 1);
322
323 smp_wmb(); /* make sure no writes get moved after unlock */
324 xl->lock = 0; /* release lock */
325
326 /* make sure unlock happens before kick */
327 barrier();
328
329 if (unlikely(xl->spinners))
330 xen_spin_unlock_slow(xl);
331}
332
333static irqreturn_t dummy_handler(int irq, void *dev_id)
334{
335 BUG();
336 return IRQ_HANDLED;
337}
338
339void __cpuinit xen_init_lock_cpu(int cpu)
340{
341 int irq;
342 const char *name;
343
344 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
345 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
346 cpu,
347 dummy_handler,
348 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
349 name,
350 NULL);
351
352 if (irq >= 0) {
353 disable_irq(irq); /* make sure it's never delivered */
354 per_cpu(lock_kicker_irq, cpu) = irq;
355 }
356
357 printk("cpu %d spinlock event irq %d\n", cpu, irq);
358}
359
360void xen_uninit_lock_cpu(int cpu)
361{
362 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
363}
364
365void __init xen_init_spinlocks(void)
366{
367 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
368 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
369 pv_lock_ops.spin_lock = xen_spin_lock;
370 pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
371 pv_lock_ops.spin_trylock = xen_spin_trylock;
372 pv_lock_ops.spin_unlock = xen_spin_unlock;
373}
374
375#ifdef CONFIG_XEN_DEBUG_FS
376
377static struct dentry *d_spin_debug;
378
379static int __init xen_spinlock_debugfs(void)
380{
381 struct dentry *d_xen = xen_init_debugfs();
382
383 if (d_xen == NULL)
384 return -ENOMEM;
385
386 d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
387
388 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
389
390 debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
391
392 debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
393 debugfs_create_u32("taken_slow", 0444, d_spin_debug,
394 &spinlock_stats.taken_slow);
395 debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
396 &spinlock_stats.taken_slow_nested);
397 debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
398 &spinlock_stats.taken_slow_pickup);
399 debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
400 &spinlock_stats.taken_slow_spurious);
401 debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
402 &spinlock_stats.taken_slow_irqenable);
403
404 debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
405 debugfs_create_u32("released_slow", 0444, d_spin_debug,
406 &spinlock_stats.released_slow);
407 debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
408 &spinlock_stats.released_slow_kicked);
409
410 debugfs_create_u64("time_spinning", 0444, d_spin_debug,
411 &spinlock_stats.time_spinning);
412 debugfs_create_u64("time_blocked", 0444, d_spin_debug,
413 &spinlock_stats.time_blocked);
414 debugfs_create_u64("time_total", 0444, d_spin_debug,
415 &spinlock_stats.time_total);
416
417 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
418 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
419 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
420 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
421 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
422 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
423
424 return 0;
425}
426fs_initcall(xen_spinlock_debugfs);
427
428#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 685b77470fc3..004ba86326ae 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,8 +30,6 @@
30#define TIMER_SLOP 100000 30#define TIMER_SLOP 100000
31#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
32 32
33static cycle_t xen_clocksource_read(void);
34
35/* runstate info updated by Xen */ 33/* runstate info updated by Xen */
36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
37 35
@@ -213,7 +211,7 @@ unsigned long xen_tsc_khz(void)
213 return xen_khz; 211 return xen_khz;
214} 212}
215 213
216static cycle_t xen_clocksource_read(void) 214cycle_t xen_clocksource_read(void)
217{ 215{
218 struct pvclock_vcpu_time_info *src; 216 struct pvclock_vcpu_time_info *src;
219 cycle_t ret; 217 cycle_t ret;
@@ -452,6 +450,14 @@ void xen_setup_timer(int cpu)
452 setup_runstate_info(cpu); 450 setup_runstate_info(cpu);
453} 451}
454 452
453void xen_teardown_timer(int cpu)
454{
455 struct clock_event_device *evt;
456 BUG_ON(cpu == 0);
457 evt = &per_cpu(xen_clock_events, cpu);
458 unbind_from_irqhandler(evt->irq, NULL);
459}
460
455void xen_setup_cpu_clockevents(void) 461void xen_setup_cpu_clockevents(void)
456{ 462{
457 BUG_ON(preemptible()); 463 BUG_ON(preemptible());
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..42786f59d9c0 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -298,7 +298,7 @@ check_events:
298 push %eax 298 push %eax
299 push %ecx 299 push %ecx
300 push %edx 300 push %edx
301 call force_evtchn_callback 301 call xen_force_evtchn_callback
302 pop %edx 302 pop %edx
303 pop %ecx 303 pop %ecx
304 pop %eax 304 pop %eax
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 7f58304fafb3..05794c566e87 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -26,8 +26,15 @@
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */ 26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000 27#define XEN_EFLAGS_NMI 0x80000000
28 28
29#if 0 29#if 1
30#include <asm/percpu.h> 30/*
31 x86-64 does not yet support direct access to percpu variables
32 via a segment override, so we just need to make sure this code
33 never gets used
34 */
35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif
31 38
32/* 39/*
33 Enable events. This clears the event mask and tests the pending 40 Enable events. This clears the event mask and tests the pending
@@ -35,6 +42,8 @@
35 events, then enter the hypervisor to get them handled. 42 events, then enter the hypervisor to get them handled.
36 */ 43 */
37ENTRY(xen_irq_enable_direct) 44ENTRY(xen_irq_enable_direct)
45 BUG
46
38 /* Unmask events */ 47 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40 49
@@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct)
58 non-zero. 67 non-zero.
59 */ 68 */
60ENTRY(xen_irq_disable_direct) 69ENTRY(xen_irq_disable_direct)
70 BUG
71
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct) 73ENDPATCH(xen_irq_disable_direct)
63 ret 74 ret
@@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct)
74 Xen and x86 use opposite senses (mask vs enable). 85 Xen and x86 use opposite senses (mask vs enable).
75 */ 86 */
76ENTRY(xen_save_fl_direct) 87ENTRY(xen_save_fl_direct)
88 BUG
89
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah 91 setz %ah
79 addb %ah,%ah 92 addb %ah,%ah
@@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct)
91 if so. 104 if so.
92 */ 105 */
93ENTRY(xen_restore_fl_direct) 106ENTRY(xen_restore_fl_direct)
107 BUG
108
94 testb $X86_EFLAGS_IF>>8, %ah 109 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with 111 /* Preempt here doesn't matter because that will deal with
@@ -122,7 +137,7 @@ check_events:
122 push %r9 137 push %r9
123 push %r10 138 push %r10
124 push %r11 139 push %r11
125 call force_evtchn_callback 140 call xen_force_evtchn_callback
126 pop %r11 141 pop %r11
127 pop %r10 142 pop %r10
128 pop %r9 143 pop %r9
@@ -133,7 +148,6 @@ check_events:
133 pop %rcx 148 pop %rcx
134 pop %rax 149 pop %rax
135 ret 150 ret
136#endif
137 151
138ENTRY(xen_adjust_exception_frame) 152ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx 153 mov 8+0(%rsp),%rcx
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index dd3c23152a2e..d7422dc2a55c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
2#define XEN_OPS_H 2#define XEN_OPS_H
3 3
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/clocksource.h>
5#include <linux/irqreturn.h> 6#include <linux/irqreturn.h>
6#include <xen/xen-ops.h> 7#include <xen/xen-ops.h>
7 8
@@ -31,7 +32,10 @@ void xen_vcpu_restore(void);
31 32
32void __init xen_build_dynamic_phys_to_machine(void); 33void __init xen_build_dynamic_phys_to_machine(void);
33 34
35void xen_init_irq_ops(void);
34void xen_setup_timer(int cpu); 36void xen_setup_timer(int cpu);
37void xen_teardown_timer(int cpu);
38cycle_t xen_clocksource_read(void);
35void xen_setup_cpu_clockevents(void); 39void xen_setup_cpu_clockevents(void);
36unsigned long xen_tsc_khz(void); 40unsigned long xen_tsc_khz(void);
37void __init xen_time_init(void); 41void __init xen_time_init(void);
@@ -50,6 +54,10 @@ void __init xen_setup_vcpu_info_placement(void);
50#ifdef CONFIG_SMP 54#ifdef CONFIG_SMP
51void xen_smp_init(void); 55void xen_smp_init(void);
52 56
57void __init xen_init_spinlocks(void);
58__cpuinit void xen_init_lock_cpu(int cpu);
59void xen_uninit_lock_cpu(int cpu);
60
53extern cpumask_t xen_cpu_initialized_map; 61extern cpumask_t xen_cpu_initialized_map;
54#else 62#else
55static inline void xen_smp_init(void) {} 63static inline void xen_smp_init(void) {}