aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorLen Brown <len.brown@intel.com>2011-03-23 02:34:54 -0400
committerLen Brown <len.brown@intel.com>2011-03-23 02:34:54 -0400
commit02e2407858fd62053bf60349c0e72cd1c7a4a60e (patch)
tree0ebdbddc97d3abbc675916010e7771065b70c137 /arch/x86
parent96e1c408ea8a556c5b51e0e7d56bd2afbfbf5fe9 (diff)
parent6447f55da90b77faec1697d499ed7986bb4f6de6 (diff)
Merge branch 'linus' into release
Conflicts: arch/x86/kernel/acpi/sleep.c Signed-off-by: Len Brown <len.brown@intel.com>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig51
-rw-r--r--arch/x86/Kconfig.cpu9
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S6
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c24
-rw-r--r--arch/x86/ia32/ia32entry.S33
-rw-r--r--arch/x86/include/asm/acpi.h14
-rw-r--r--arch/x86/include/asm/amd_nb.h24
-rw-r--r--arch/x86/include/asm/apic.h42
-rw-r--r--arch/x86/include/asm/apicdef.h12
-rw-r--r--arch/x86/include/asm/bootparam.h1
-rw-r--r--arch/x86/include/asm/cacheflush.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h2
-rw-r--r--arch/x86/include/asm/dma.h6
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h5
-rw-r--r--arch/x86/include/asm/frame.h6
-rw-r--r--arch/x86/include/asm/futex.h22
-rw-r--r--arch/x86/include/asm/hw_irq.h24
-rw-r--r--arch/x86/include/asm/init.h6
-rw-r--r--arch/x86/include/asm/io_apic.h44
-rw-r--r--arch/x86/include/asm/ipi.h8
-rw-r--r--arch/x86/include/asm/irq.h3
-rw-r--r--arch/x86/include/asm/irq_controller.h12
-rw-r--r--arch/x86/include/asm/irq_vectors.h45
-rw-r--r--arch/x86/include/asm/kdebug.h3
-rw-r--r--arch/x86/include/asm/kvm_emulate.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h12
-rw-r--r--arch/x86/include/asm/mpspec.h3
-rw-r--r--arch/x86/include/asm/msr-index.h4
-rw-r--r--arch/x86/include/asm/nmi.h5
-rw-r--r--arch/x86/include/asm/nops.h2
-rw-r--r--arch/x86/include/asm/numa.h52
-rw-r--r--arch/x86/include/asm/numa_32.h7
-rw-r--r--arch/x86/include/asm/numa_64.h23
-rw-r--r--arch/x86/include/asm/olpc.h2
-rw-r--r--arch/x86/include/asm/olpc_ofw.h14
-rw-r--r--arch/x86/include/asm/page_types.h9
-rw-r--r--arch/x86/include/asm/percpu.h48
-rw-r--r--arch/x86/include/asm/perf_event_p4.h4
-rw-r--r--arch/x86/include/asm/pgtable-3level.h11
-rw-r--r--arch/x86/include/asm/processor-flags.h2
-rw-r--r--arch/x86/include/asm/processor.h4
-rw-r--r--arch/x86/include/asm/prom.h70
-rw-r--r--arch/x86/include/asm/ptrace-abi.h2
-rw-r--r--arch/x86/include/asm/ptrace.h4
-rw-r--r--arch/x86/include/asm/reboot.h5
-rw-r--r--arch/x86/include/asm/rwsem.h80
-rw-r--r--arch/x86/include/asm/segment.h12
-rw-r--r--arch/x86/include/asm/smp.h20
-rw-r--r--arch/x86/include/asm/stacktrace.h6
-rw-r--r--arch/x86/include/asm/system.h2
-rw-r--r--arch/x86/include/asm/thread_info.h10
-rw-r--r--arch/x86/include/asm/topology.h19
-rw-r--r--arch/x86/include/asm/trampoline.h33
-rw-r--r--arch/x86/include/asm/tsc.h2
-rw-r--r--arch/x86/include/asm/types.h8
-rw-r--r--arch/x86/include/asm/unistd_32.h6
-rw-r--r--arch/x86/include/asm/unistd_64.h8
-rw-r--r--arch/x86/include/asm/x86_init.h2
-rw-r--r--arch/x86/include/asm/xen/hypercall.h15
-rw-r--r--arch/x86/include/asm/xen/interface.h2
-rw-r--r--arch/x86/include/asm/xen/page.h47
-rw-r--r--arch/x86/include/asm/xen/pci.h8
-rw-r--r--arch/x86/kernel/Makefile12
-rw-r--r--arch/x86/kernel/acpi/boot.c8
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S21
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h5
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S28
-rw-r--r--arch/x86/kernel/acpi/sleep.c64
-rw-r--r--arch/x86/kernel/acpi/sleep.h3
-rw-r--r--arch/x86/kernel/acpi/wakeup_rm.S12
-rw-r--r--arch/x86/kernel/alternative.c9
-rw-r--r--arch/x86/kernel/amd_nb.c102
-rw-r--r--arch/x86/kernel/apb_timer.c60
-rw-r--r--arch/x86/kernel/aperture_64.c35
-rw-r--r--arch/x86/kernel/apic/apic.c150
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/apic_noop.c26
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c34
-rw-r--r--arch/x86/kernel/apic/es7000_32.c35
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c1
-rw-r--r--arch/x86/kernel/apic/io_apic.c394
-rw-r--r--arch/x86/kernel/apic/ipi.c12
-rw-r--r--arch/x86/kernel/apic/numaq_32.c21
-rw-r--r--arch/x86/kernel/apic/probe_32.c10
-rw-r--r--arch/x86/kernel/apic/summit_32.c47
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c2
-rw-r--r--arch/x86/kernel/apm_32.c19
-rw-r--r--arch/x86/kernel/asm-offsets.c65
-rw-r--r--arch/x86/kernel/asm-offsets_32.c69
-rw-r--r--arch/x86/kernel/asm-offsets_64.c90
-rw-r--r--arch/x86/kernel/cpu/amd.c65
-rw-r--r--arch/x86/kernel/cpu/common.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c5
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c4
-rw-r--r--arch/x86/kernel/cpu/intel.c5
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c80
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c172
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c175
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c417
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c83
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c16
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c4
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/devicetree.c441
-rw-r--r--arch/x86/kernel/dumpstack.c49
-rw-r--r--arch/x86/kernel/dumpstack_32.c15
-rw-r--r--arch/x86/kernel/dumpstack_64.c14
-rw-r--r--arch/x86/kernel/e820.c18
-rw-r--r--arch/x86/kernel/early-quirks.c7
-rw-r--r--arch/x86/kernel/entry_32.S13
-rw-r--r--arch/x86/kernel/entry_64.S17
-rw-r--r--arch/x86/kernel/ftrace.c15
-rw-r--r--arch/x86/kernel/head32.c9
-rw-r--r--arch/x86/kernel/head64.c3
-rw-r--r--arch/x86/kernel/head_32.S10
-rw-r--r--arch/x86/kernel/head_64.S3
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/i8259.c2
-rw-r--r--arch/x86/kernel/ioport.c20
-rw-r--r--arch/x86/kernel/irq.c91
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irqinit.c92
-rw-r--r--arch/x86/kernel/kgdb.c11
-rw-r--r--arch/x86/kernel/kprobes.c8
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/mca_32.c2
-rw-r--r--arch/x86/kernel/microcode_amd.c188
-rw-r--r--arch/x86/kernel/microcode_core.c6
-rw-r--r--arch/x86/kernel/mpparse.c4
-rw-r--r--arch/x86/kernel/pci-calgary_64.c4
-rw-r--r--arch/x86/kernel/process.c11
-rw-r--r--arch/x86/kernel/reboot.c120
-rw-r--r--arch/x86/kernel/reboot_32.S135
-rw-r--r--arch/x86/kernel/rtc.c3
-rw-r--r--arch/x86/kernel/setup.c66
-rw-r--r--arch/x86/kernel/setup_percpu.c11
-rw-r--r--arch/x86/kernel/smpboot.c134
-rw-r--r--arch/x86/kernel/stacktrace.c6
-rw-r--r--arch/x86/kernel/step.c2
-rw-r--r--arch/x86/kernel/syscall_table_32.S4
-rw-r--r--arch/x86/kernel/topology.c2
-rw-r--r--arch/x86/kernel/trampoline.c42
-rw-r--r--arch/x86/kernel/trampoline_32.S15
-rw-r--r--arch/x86/kernel/trampoline_64.S28
-rw-r--r--arch/x86/kernel/tsc.c4
-rw-r--r--arch/x86/kernel/verify_cpu.S2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S18
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c1
-rw-r--r--arch/x86/kernel/x86_init.c1
-rw-r--r--arch/x86/kernel/xsave.c2
-rw-r--r--arch/x86/kvm/emulate.c52
-rw-r--r--arch/x86/kvm/i8259.c25
-rw-r--r--arch/x86/kvm/lapic.c13
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c150
-rw-r--r--arch/x86/kvm/paging_tmpl.h19
-rw-r--r--arch/x86/kvm/svm.c27
-rw-r--r--arch/x86/kvm/timer.c2
-rw-r--r--arch/x86/kvm/trace.h8
-rw-r--r--arch/x86/kvm/vmx.c128
-rw-r--r--arch/x86/kvm/x86.c155
-rw-r--r--arch/x86/lguest/boot.c6
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_386_32.S6
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S6
-rw-r--r--arch/x86/lib/checksum_32.S63
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S59
-rw-r--r--arch/x86/lib/copy_user_64.S2
-rw-r--r--arch/x86/lib/csum-copy_64.S242
-rw-r--r--arch/x86/lib/csum-partial_64.c2
-rw-r--r--arch/x86/lib/memmove_64.S197
-rw-r--r--arch/x86/lib/memmove_64.c192
-rw-r--r--arch/x86/lib/rwsem_64.S56
-rw-r--r--arch/x86/lib/semaphore_32.S38
-rw-r--r--arch/x86/lib/thunk_32.S18
-rw-r--r--arch/x86/lib/thunk_64.S27
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/amdtopology_64.c142
-rw-r--r--arch/x86/mm/hugetlbpage.c2
-rw-r--r--arch/x86/mm/init.c56
-rw-r--r--arch/x86/mm/init_32.c13
-rw-r--r--arch/x86/mm/init_64.c97
-rw-r--r--arch/x86/mm/numa.c212
-rw-r--r--arch/x86/mm/numa_32.c10
-rw-r--r--arch/x86/mm/numa_64.c984
-rw-r--r--arch/x86/mm/numa_emulation.c494
-rw-r--r--arch/x86/mm/numa_internal.h31
-rw-r--r--arch/x86/mm/pageattr.c2
-rw-r--r--arch/x86/mm/pgtable.c3
-rw-r--r--arch/x86/mm/srat_32.c6
-rw-r--r--arch/x86/mm/srat_64.c367
-rw-r--r--arch/x86/mm/tlb.c14
-rw-r--r--arch/x86/oprofile/backtrace.c2
-rw-r--r--arch/x86/oprofile/op_model_p4.c2
-rw-r--r--arch/x86/pci/amd_bus.c2
-rw-r--r--arch/x86/pci/ce4100.c2
-rw-r--r--arch/x86/pci/i386.c4
-rw-r--r--arch/x86/pci/irq.c15
-rw-r--r--arch/x86/pci/xen.c192
-rw-r--r--arch/x86/platform/ce4100/ce4100.c24
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts428
-rw-r--r--arch/x86/platform/mrst/mrst.c2
-rw-r--r--arch/x86/platform/mrst/vrtc.c16
-rw-r--r--arch/x86/platform/olpc/Makefile4
-rw-r--r--arch/x86/platform/uv/uv_irq.c4
-rw-r--r--arch/x86/platform/visws/visws_quirks.c4
-rw-r--r--arch/x86/xen/Kconfig10
-rw-r--r--arch/x86/xen/enlighten.c8
-rw-r--r--arch/x86/xen/mmu.c98
-rw-r--r--arch/x86/xen/p2m.c330
-rw-r--r--arch/x86/xen/setup.c68
-rw-r--r--arch/x86/xen/smp.c38
-rw-r--r--arch/x86/xen/suspend.c8
-rw-r--r--arch/x86/xen/time.c4
-rw-r--r--arch/x86/xen/xen-head.S4
-rw-r--r--arch/x86/xen/xen-ops.h2
227 files changed, 6249 insertions, 3981 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d5ed94d30aad..d57ddd7573cc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -64,8 +64,12 @@ config X86
64 select HAVE_TEXT_POKE_SMP 64 select HAVE_TEXT_POKE_SMP
65 select HAVE_GENERIC_HARDIRQS 65 select HAVE_GENERIC_HARDIRQS
66 select HAVE_SPARSE_IRQ 66 select HAVE_SPARSE_IRQ
67 select GENERIC_FIND_FIRST_BIT
68 select GENERIC_FIND_NEXT_BIT
67 select GENERIC_IRQ_PROBE 69 select GENERIC_IRQ_PROBE
68 select GENERIC_PENDING_IRQ if SMP 70 select GENERIC_PENDING_IRQ if SMP
71 select GENERIC_IRQ_SHOW
72 select IRQ_FORCED_THREADING
69 select USE_GENERIC_SMP_HELPERS if SMP 73 select USE_GENERIC_SMP_HELPERS if SMP
70 74
71config INSTRUCTION_DECODER 75config INSTRUCTION_DECODER
@@ -119,7 +123,7 @@ config NEED_SG_DMA_LENGTH
119 def_bool y 123 def_bool y
120 124
121config GENERIC_ISA_DMA 125config GENERIC_ISA_DMA
122 def_bool y 126 def_bool ISA_DMA_API
123 127
124config GENERIC_IOMAP 128config GENERIC_IOMAP
125 def_bool y 129 def_bool y
@@ -139,7 +143,7 @@ config GENERIC_GPIO
139 bool 143 bool
140 144
141config ARCH_MAY_HAVE_PC_FDC 145config ARCH_MAY_HAVE_PC_FDC
142 def_bool y 146 def_bool ISA_DMA_API
143 147
144config RWSEM_GENERIC_SPINLOCK 148config RWSEM_GENERIC_SPINLOCK
145 def_bool !X86_XADD 149 def_bool !X86_XADD
@@ -217,10 +221,6 @@ config X86_HT
217 def_bool y 221 def_bool y
218 depends on SMP 222 depends on SMP
219 223
220config X86_TRAMPOLINE
221 def_bool y
222 depends on SMP || (64BIT && ACPI_SLEEP)
223
224config X86_32_LAZY_GS 224config X86_32_LAZY_GS
225 def_bool y 225 def_bool y
226 depends on X86_32 && !CC_STACKPROTECTOR 226 depends on X86_32 && !CC_STACKPROTECTOR
@@ -382,6 +382,8 @@ config X86_INTEL_CE
382 depends on X86_32 382 depends on X86_32
383 depends on X86_EXTENDED_PLATFORM 383 depends on X86_EXTENDED_PLATFORM
384 select X86_REBOOTFIXUPS 384 select X86_REBOOTFIXUPS
385 select OF
386 select OF_EARLY_FLATTREE
385 ---help--- 387 ---help---
386 Select for the Intel CE media processor (CE4100) SOC. 388 Select for the Intel CE media processor (CE4100) SOC.
387 This option compiles in support for the CE4100 SOC for settop 389 This option compiles in support for the CE4100 SOC for settop
@@ -811,7 +813,7 @@ config X86_LOCAL_APIC
811 813
812config X86_IO_APIC 814config X86_IO_APIC
813 def_bool y 815 def_bool y
814 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 816 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
815 817
816config X86_VISWS_APIC 818config X86_VISWS_APIC
817 def_bool y 819 def_bool y
@@ -1705,7 +1707,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
1705 depends on NUMA 1707 depends on NUMA
1706 1708
1707config USE_PERCPU_NUMA_NODE_ID 1709config USE_PERCPU_NUMA_NODE_ID
1708 def_bool X86_64 1710 def_bool y
1709 depends on NUMA 1711 depends on NUMA
1710 1712
1711menu "Power management and ACPI options" 1713menu "Power management and ACPI options"
@@ -2000,9 +2002,13 @@ source "drivers/pci/pcie/Kconfig"
2000 2002
2001source "drivers/pci/Kconfig" 2003source "drivers/pci/Kconfig"
2002 2004
2003# x86_64 have no ISA slots, but do have ISA-style DMA. 2005# x86_64 have no ISA slots, but can have ISA-style DMA.
2004config ISA_DMA_API 2006config ISA_DMA_API
2005 def_bool y 2007 bool "ISA-style DMA support" if (X86_64 && EXPERT)
2008 default y
2009 help
2010 Enables ISA-style DMA support for devices requiring such controllers.
2011 If unsure, say Y.
2006 2012
2007if X86_32 2013if X86_32
2008 2014
@@ -2066,9 +2072,10 @@ config SCx200HR_TIMER
2066 2072
2067config OLPC 2073config OLPC
2068 bool "One Laptop Per Child support" 2074 bool "One Laptop Per Child support"
2075 depends on !X86_PAE
2069 select GPIOLIB 2076 select GPIOLIB
2070 select OLPC_OPENFIRMWARE 2077 select OF
2071 depends on !X86_64 && !X86_PAE 2078 select OF_PROMTREE if PROC_DEVICETREE
2072 ---help--- 2079 ---help---
2073 Add support for detecting the unique features of the OLPC 2080 Add support for detecting the unique features of the OLPC
2074 XO hardware. 2081 XO hardware.
@@ -2079,21 +2086,6 @@ config OLPC_XO1
2079 ---help--- 2086 ---help---
2080 Add support for non-essential features of the OLPC XO-1 laptop. 2087 Add support for non-essential features of the OLPC XO-1 laptop.
2081 2088
2082config OLPC_OPENFIRMWARE
2083 bool "Support for OLPC's Open Firmware"
2084 depends on !X86_64 && !X86_PAE
2085 default n
2086 select OF
2087 help
2088 This option adds support for the implementation of Open Firmware
2089 that is used on the OLPC XO-1 Children's Machine.
2090 If unsure, say N here.
2091
2092config OLPC_OPENFIRMWARE_DT
2093 bool
2094 default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE
2095 select OF_PROMTREE
2096
2097endif # X86_32 2089endif # X86_32
2098 2090
2099config AMD_NB 2091config AMD_NB
@@ -2138,6 +2130,11 @@ config SYSVIPC_COMPAT
2138 def_bool y 2130 def_bool y
2139 depends on COMPAT && SYSVIPC 2131 depends on COMPAT && SYSVIPC
2140 2132
2133config KEYS_COMPAT
2134 bool
2135 depends on COMPAT && KEYS
2136 default y
2137
2141endmenu 2138endmenu
2142 2139
2143 2140
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 283c5a6a03a6..d161e939df62 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -294,11 +294,6 @@ config X86_GENERIC
294 294
295endif 295endif
296 296
297config X86_CPU
298 def_bool y
299 select GENERIC_FIND_FIRST_BIT
300 select GENERIC_FIND_NEXT_BIT
301
302# 297#
303# Define implied options from the CPU selection here 298# Define implied options from the CPU selection here
304config X86_INTERNODE_CACHE_SHIFT 299config X86_INTERNODE_CACHE_SHIFT
@@ -331,7 +326,7 @@ config X86_PPRO_FENCE
331 Old PentiumPro multiprocessor systems had errata that could cause 326 Old PentiumPro multiprocessor systems had errata that could cause
332 memory operations to violate the x86 ordering standard in rare cases. 327 memory operations to violate the x86 ordering standard in rare cases.
333 Enabling this option will attempt to work around some (but not all) 328 Enabling this option will attempt to work around some (but not all)
334 occurances of this problem, at the cost of much heavier spinlock and 329 occurrences of this problem, at the cost of much heavier spinlock and
335 memory barrier operations. 330 memory barrier operations.
336 331
337 If unsure, say n here. Even distro kernels should think twice before 332 If unsure, say n here. Even distro kernels should think twice before
@@ -371,7 +366,7 @@ config X86_INTEL_USERCOPY
371 366
372config X86_USE_PPRO_CHECKSUM 367config X86_USE_PPRO_CHECKSUM
373 def_bool y 368 def_bool y
374 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM 369 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
375 370
376config X86_USE_3DNOW 371config X86_USE_3DNOW
377 def_bool y 372 def_bool y
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 8fe2a4966b7a..adcf794b22e2 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1346,7 +1346,7 @@ _zero_cipher_left_decrypt:
1346 and $15, %r13 # %r13 = arg4 (mod 16) 1346 and $15, %r13 # %r13 = arg4 (mod 16)
1347 je _multiple_of_16_bytes_decrypt 1347 je _multiple_of_16_bytes_decrypt
1348 1348
1349 # Handle the last <16 byte block seperately 1349 # Handle the last <16 byte block separately
1350 1350
1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn 1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352 movdqa SHUF_MASK(%rip), %xmm10 1352 movdqa SHUF_MASK(%rip), %xmm10
@@ -1355,7 +1355,7 @@ _zero_cipher_left_decrypt:
1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) 1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356 sub $16, %r11 1356 sub $16, %r11
1357 add %r13, %r11 1357 add %r13, %r11
1358 movdqu (%arg3,%r11,1), %xmm1 # recieve the last <16 byte block 1358 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1359 lea SHIFT_MASK+16(%rip), %r12 1359 lea SHIFT_MASK+16(%rip), %r12
1360 sub %r13, %r12 1360 sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes 1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
@@ -1607,7 +1607,7 @@ _zero_cipher_left_encrypt:
1607 and $15, %r13 # %r13 = arg4 (mod 16) 1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt 1608 je _multiple_of_16_bytes_encrypt
1609 1609
1610 # Handle the last <16 Byte block seperately 1610 # Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10 1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0 1613 PSHUFB_XMM %xmm10, %xmm0
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index e1e60c7d5813..e0e6340c8dad 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -873,22 +873,18 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
873 crypto_ablkcipher_clear_flags(ctr_tfm, ~0); 873 crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
874 874
875 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); 875 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
876 if (ret) { 876 if (ret)
877 crypto_free_ablkcipher(ctr_tfm); 877 goto out_free_ablkcipher;
878 return ret;
879 }
880 878
879 ret = -ENOMEM;
881 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); 880 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
882 if (!req) { 881 if (!req)
883 crypto_free_ablkcipher(ctr_tfm); 882 goto out_free_ablkcipher;
884 return -EINVAL;
885 }
886 883
887 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); 884 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
888 if (!req_data) { 885 if (!req_data)
889 crypto_free_ablkcipher(ctr_tfm); 886 goto out_free_request;
890 return -ENOMEM; 887
891 }
892 memset(req_data->iv, 0, sizeof(req_data->iv)); 888 memset(req_data->iv, 0, sizeof(req_data->iv));
893 889
894 /* Clear the data in the hash sub key container to zero.*/ 890 /* Clear the data in the hash sub key container to zero.*/
@@ -913,8 +909,10 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
913 if (!ret) 909 if (!ret)
914 ret = req_data->result.err; 910 ret = req_data->result.err;
915 } 911 }
916 ablkcipher_request_free(req);
917 kfree(req_data); 912 kfree(req_data);
913out_free_request:
914 ablkcipher_request_free(req);
915out_free_ablkcipher:
918 crypto_free_ablkcipher(ctr_tfm); 916 crypto_free_ablkcipher(ctr_tfm);
919 return ret; 917 return ret;
920} 918}
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 518bb99c3394..849a9d23c71d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
25#define sysretl_audit ia32_ret_from_sys_call 25#define sysretl_audit ia32_ret_from_sys_call
26#endif 26#endif
27 27
28 .section .entry.text, "ax"
29
28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 30#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
29 31
30 .macro IA32_ARG_FIXUP noebp=0 32 .macro IA32_ARG_FIXUP noebp=0
@@ -126,26 +128,20 @@ ENTRY(ia32_sysenter_target)
126 */ 128 */
127 ENABLE_INTERRUPTS(CLBR_NONE) 129 ENABLE_INTERRUPTS(CLBR_NONE)
128 movl %ebp,%ebp /* zero extension */ 130 movl %ebp,%ebp /* zero extension */
129 pushq $__USER32_DS 131 pushq_cfi $__USER32_DS
130 CFI_ADJUST_CFA_OFFSET 8
131 /*CFI_REL_OFFSET ss,0*/ 132 /*CFI_REL_OFFSET ss,0*/
132 pushq %rbp 133 pushq_cfi %rbp
133 CFI_ADJUST_CFA_OFFSET 8
134 CFI_REL_OFFSET rsp,0 134 CFI_REL_OFFSET rsp,0
135 pushfq 135 pushfq_cfi
136 CFI_ADJUST_CFA_OFFSET 8
137 /*CFI_REL_OFFSET rflags,0*/ 136 /*CFI_REL_OFFSET rflags,0*/
138 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d 137 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
139 CFI_REGISTER rip,r10 138 CFI_REGISTER rip,r10
140 pushq $__USER32_CS 139 pushq_cfi $__USER32_CS
141 CFI_ADJUST_CFA_OFFSET 8
142 /*CFI_REL_OFFSET cs,0*/ 140 /*CFI_REL_OFFSET cs,0*/
143 movl %eax, %eax 141 movl %eax, %eax
144 pushq %r10 142 pushq_cfi %r10
145 CFI_ADJUST_CFA_OFFSET 8
146 CFI_REL_OFFSET rip,0 143 CFI_REL_OFFSET rip,0
147 pushq %rax 144 pushq_cfi %rax
148 CFI_ADJUST_CFA_OFFSET 8
149 cld 145 cld
150 SAVE_ARGS 0,0,1 146 SAVE_ARGS 0,0,1
151 /* no need to do an access_ok check here because rbp has been 147 /* no need to do an access_ok check here because rbp has been
@@ -182,11 +178,9 @@ sysexit_from_sys_call:
182 xorq %r9,%r9 178 xorq %r9,%r9
183 xorq %r10,%r10 179 xorq %r10,%r10
184 xorq %r11,%r11 180 xorq %r11,%r11
185 popfq 181 popfq_cfi
186 CFI_ADJUST_CFA_OFFSET -8
187 /*CFI_RESTORE rflags*/ 182 /*CFI_RESTORE rflags*/
188 popq %rcx /* User %esp */ 183 popq_cfi %rcx /* User %esp */
189 CFI_ADJUST_CFA_OFFSET -8
190 CFI_REGISTER rsp,rcx 184 CFI_REGISTER rsp,rcx
191 TRACE_IRQS_ON 185 TRACE_IRQS_ON
192 ENABLE_INTERRUPTS_SYSEXIT32 186 ENABLE_INTERRUPTS_SYSEXIT32
@@ -421,8 +415,7 @@ ENTRY(ia32_syscall)
421 */ 415 */
422 ENABLE_INTERRUPTS(CLBR_NONE) 416 ENABLE_INTERRUPTS(CLBR_NONE)
423 movl %eax,%eax 417 movl %eax,%eax
424 pushq %rax 418 pushq_cfi %rax
425 CFI_ADJUST_CFA_OFFSET 8
426 cld 419 cld
427 /* note the registers are not zero extended to the sf. 420 /* note the registers are not zero extended to the sf.
428 this could be a problem. */ 421 this could be a problem. */
@@ -851,4 +844,8 @@ ia32_sys_call_table:
851 .quad sys_fanotify_init 844 .quad sys_fanotify_init
852 .quad sys32_fanotify_mark 845 .quad sys32_fanotify_mark
853 .quad sys_prlimit64 /* 340 */ 846 .quad sys_prlimit64 /* 340 */
847 .quad sys_name_to_handle_at
848 .quad compat_sys_open_by_handle_at
849 .quad compat_sys_clock_adjtime
850 .quad sys_syncfs
854ia32_syscall_end: 851ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index ef14da1f4ec5..12e0e7dd869c 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -29,6 +29,7 @@
29#include <asm/processor.h> 29#include <asm/processor.h>
30#include <asm/mmu.h> 30#include <asm/mmu.h>
31#include <asm/mpspec.h> 31#include <asm/mpspec.h>
32#include <asm/trampoline.h>
32 33
33#define COMPILER_DEPENDENT_INT64 long long 34#define COMPILER_DEPENDENT_INT64 long long
34#define COMPILER_DEPENDENT_UINT64 unsigned long long 35#define COMPILER_DEPENDENT_UINT64 unsigned long long
@@ -116,7 +117,8 @@ static inline void acpi_disable_pci(void)
116/* Low-level suspend routine. */ 117/* Low-level suspend routine. */
117extern int acpi_suspend_lowlevel(void); 118extern int acpi_suspend_lowlevel(void);
118 119
119extern unsigned long acpi_wakeup_address; 120extern const unsigned char acpi_wakeup_code[];
121#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))
120 122
121/* early initialization routine */ 123/* early initialization routine */
122extern void acpi_reserve_wakeup_memory(void); 124extern void acpi_reserve_wakeup_memory(void);
@@ -185,15 +187,7 @@ struct bootnode;
185 187
186#ifdef CONFIG_ACPI_NUMA 188#ifdef CONFIG_ACPI_NUMA
187extern int acpi_numa; 189extern int acpi_numa;
188extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start, 190extern int x86_acpi_numa_init(void);
189 unsigned long end);
190extern int acpi_scan_nodes(unsigned long start, unsigned long end);
191#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
192
193#ifdef CONFIG_NUMA_EMU
194extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
195 int num_nodes);
196#endif
197#endif /* CONFIG_ACPI_NUMA */ 191#endif /* CONFIG_ACPI_NUMA */
198 192
199#define acpi_unlazy_tlb(x) leave_mm(x) 193#define acpi_unlazy_tlb(x) leave_mm(x)
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 64dc82ee19f0..331682231bb4 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -9,23 +9,20 @@ struct amd_nb_bus_dev_range {
9 u8 dev_limit; 9 u8 dev_limit;
10}; 10};
11 11
12extern struct pci_device_id amd_nb_misc_ids[]; 12extern const struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; 13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
14struct bootnode; 14struct bootnode;
15 15
16extern int early_is_amd_nb(u32 value); 16extern bool early_is_amd_nb(u32 value);
17extern int amd_cache_northbridges(void); 17extern int amd_cache_northbridges(void);
18extern void amd_flush_garts(void); 18extern void amd_flush_garts(void);
19extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); 19extern int amd_numa_init(void);
20extern int amd_scan_nodes(void); 20extern int amd_get_subcaches(int);
21 21extern int amd_set_subcaches(int, int);
22#ifdef CONFIG_NUMA_EMU
23extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
24extern void amd_get_nodes(struct bootnode *nodes);
25#endif
26 22
27struct amd_northbridge { 23struct amd_northbridge {
28 struct pci_dev *misc; 24 struct pci_dev *misc;
25 struct pci_dev *link;
29}; 26};
30 27
31struct amd_northbridge_info { 28struct amd_northbridge_info {
@@ -35,17 +32,18 @@ struct amd_northbridge_info {
35}; 32};
36extern struct amd_northbridge_info amd_northbridges; 33extern struct amd_northbridge_info amd_northbridges;
37 34
38#define AMD_NB_GART 0x1 35#define AMD_NB_GART BIT(0)
39#define AMD_NB_L3_INDEX_DISABLE 0x2 36#define AMD_NB_L3_INDEX_DISABLE BIT(1)
37#define AMD_NB_L3_PARTITIONING BIT(2)
40 38
41#ifdef CONFIG_AMD_NB 39#ifdef CONFIG_AMD_NB
42 40
43static inline int amd_nb_num(void) 41static inline u16 amd_nb_num(void)
44{ 42{
45 return amd_northbridges.num; 43 return amd_northbridges.num;
46} 44}
47 45
48static inline int amd_nb_has_feature(int feature) 46static inline bool amd_nb_has_feature(unsigned feature)
49{ 47{
50 return ((amd_northbridges.flags & feature) == feature); 48 return ((amd_northbridges.flags & feature) == feature);
51} 49}
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3c896946f4cc..a279d98ea95e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -220,7 +220,6 @@ extern void enable_IR_x2apic(void);
220 220
221extern int get_physical_broadcast(void); 221extern int get_physical_broadcast(void);
222 222
223extern void apic_disable(void);
224extern int lapic_get_maxlvt(void); 223extern int lapic_get_maxlvt(void);
225extern void clear_local_APIC(void); 224extern void clear_local_APIC(void);
226extern void connect_bsp_APIC(void); 225extern void connect_bsp_APIC(void);
@@ -228,7 +227,6 @@ extern void disconnect_bsp_APIC(int virt_wire_setup);
228extern void disable_local_APIC(void); 227extern void disable_local_APIC(void);
229extern void lapic_shutdown(void); 228extern void lapic_shutdown(void);
230extern int verify_local_APIC(void); 229extern int verify_local_APIC(void);
231extern void cache_APIC_registers(void);
232extern void sync_Arb_IDs(void); 230extern void sync_Arb_IDs(void);
233extern void init_bsp_APIC(void); 231extern void init_bsp_APIC(void);
234extern void setup_local_APIC(void); 232extern void setup_local_APIC(void);
@@ -239,8 +237,7 @@ void register_lapic_address(unsigned long address);
239extern void setup_boot_APIC_clock(void); 237extern void setup_boot_APIC_clock(void);
240extern void setup_secondary_APIC_clock(void); 238extern void setup_secondary_APIC_clock(void);
241extern int APIC_init_uniprocessor(void); 239extern int APIC_init_uniprocessor(void);
242extern void enable_NMI_through_LVT0(void); 240extern int apic_force_enable(unsigned long addr);
243extern int apic_force_enable(void);
244 241
245/* 242/*
246 * On 32bit this is mach-xxx local 243 * On 32bit this is mach-xxx local
@@ -261,7 +258,6 @@ static inline void lapic_shutdown(void) { }
261#define local_apic_timer_c2_ok 1 258#define local_apic_timer_c2_ok 1
262static inline void init_apic_mappings(void) { } 259static inline void init_apic_mappings(void) { }
263static inline void disable_local_APIC(void) { } 260static inline void disable_local_APIC(void) { }
264static inline void apic_disable(void) { }
265# define setup_boot_APIC_clock x86_init_noop 261# define setup_boot_APIC_clock x86_init_noop
266# define setup_secondary_APIC_clock x86_init_noop 262# define setup_secondary_APIC_clock x86_init_noop
267#endif /* !CONFIG_X86_LOCAL_APIC */ 263#endif /* !CONFIG_X86_LOCAL_APIC */
@@ -307,8 +303,6 @@ struct apic {
307 303
308 void (*setup_apic_routing)(void); 304 void (*setup_apic_routing)(void);
309 int (*multi_timer_check)(int apic, int irq); 305 int (*multi_timer_check)(int apic, int irq);
310 int (*apicid_to_node)(int logical_apicid);
311 int (*cpu_to_logical_apicid)(int cpu);
312 int (*cpu_present_to_apicid)(int mps_cpu); 306 int (*cpu_present_to_apicid)(int mps_cpu);
313 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); 307 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
314 void (*setup_portio_remap)(void); 308 void (*setup_portio_remap)(void);
@@ -356,6 +350,23 @@ struct apic {
356 void (*icr_write)(u32 low, u32 high); 350 void (*icr_write)(u32 low, u32 high);
357 void (*wait_icr_idle)(void); 351 void (*wait_icr_idle)(void);
358 u32 (*safe_wait_icr_idle)(void); 352 u32 (*safe_wait_icr_idle)(void);
353
354#ifdef CONFIG_X86_32
355 /*
356 * Called very early during boot from get_smp_config(). It should
357 * return the logical apicid. x86_[bios]_cpu_to_apicid is
358 * initialized before this function is called.
359 *
360 * If logical apicid can't be determined that early, the function
361 * may return BAD_APICID. Logical apicid will be configured after
362 * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity
363 * won't be applied properly during early boot in this case.
364 */
365 int (*x86_32_early_logical_apicid)(int cpu);
366
367 /* determine CPU -> NUMA node mapping */
368 int (*x86_32_numa_cpu_node)(int cpu);
369#endif
359}; 370};
360 371
361/* 372/*
@@ -503,6 +514,11 @@ extern struct apic apic_noop;
503 514
504extern struct apic apic_default; 515extern struct apic apic_default;
505 516
517static inline int noop_x86_32_early_logical_apicid(int cpu)
518{
519 return BAD_APICID;
520}
521
506/* 522/*
507 * Set up the logical destination ID. 523 * Set up the logical destination ID.
508 * 524 *
@@ -522,7 +538,7 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
522 return cpuid_apic >> index_msb; 538 return cpuid_apic >> index_msb;
523} 539}
524 540
525extern int default_apicid_to_node(int logical_apicid); 541extern int default_x86_32_numa_cpu_node(int cpu);
526 542
527#endif 543#endif
528 544
@@ -558,12 +574,6 @@ static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_ma
558 *retmap = *phys_map; 574 *retmap = *phys_map;
559} 575}
560 576
561/* Mapping from cpu number to logical apicid */
562static inline int default_cpu_to_logical_apicid(int cpu)
563{
564 return 1 << cpu;
565}
566
567static inline int __default_cpu_present_to_apicid(int mps_cpu) 577static inline int __default_cpu_present_to_apicid(int mps_cpu)
568{ 578{
569 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) 579 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
@@ -596,8 +606,4 @@ extern int default_check_phys_apicid_present(int phys_apicid);
596 606
597#endif /* CONFIG_X86_LOCAL_APIC */ 607#endif /* CONFIG_X86_LOCAL_APIC */
598 608
599#ifdef CONFIG_X86_32
600extern u8 cpu_2_logical_apicid[NR_CPUS];
601#endif
602
603#endif /* _ASM_X86_APIC_H */ 609#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 47a30ff8e517..d87988bacf3e 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -426,4 +426,16 @@ struct local_apic {
426#else 426#else
427 #define BAD_APICID 0xFFFFu 427 #define BAD_APICID 0xFFFFu
428#endif 428#endif
429
430enum ioapic_irq_destination_types {
431 dest_Fixed = 0,
432 dest_LowestPrio = 1,
433 dest_SMI = 2,
434 dest__reserved_1 = 3,
435 dest_NMI = 4,
436 dest_INIT = 5,
437 dest__reserved_2 = 6,
438 dest_ExtINT = 7
439};
440
429#endif /* _ASM_X86_APICDEF_H */ 441#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index c8bfe63a06de..e020d88ec02d 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -12,6 +12,7 @@
12/* setup data types */ 12/* setup data types */
13#define SETUP_NONE 0 13#define SETUP_NONE 0
14#define SETUP_E820_EXT 1 14#define SETUP_E820_EXT 1
15#define SETUP_DTB 2
15 16
16/* extensible setup data list node */ 17/* extensible setup data list node */
17struct setup_data { 18struct setup_data {
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 62f084478f7e..4e12668711e5 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -71,7 +71,7 @@ static inline void set_page_memtype(struct page *pg, unsigned long memtype) { }
71 * Read/Write : ReadOnly, ReadWrite 71 * Read/Write : ReadOnly, ReadWrite
72 * Presence : NotPresent 72 * Presence : NotPresent
73 * 73 *
74 * Within a catagory, the attributes are mutually exclusive. 74 * Within a category, the attributes are mutually exclusive.
75 * 75 *
76 * The implementation of this API will take care of various aspects that 76 * The implementation of this API will take care of various aspects that
77 * are associated with changing such attributes, such as: 77 * are associated with changing such attributes, such as:
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 220e2ea08e80..91f3e087cf21 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -160,6 +160,7 @@
160#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ 160#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
161#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ 161#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
162#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ 162#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
163#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */
163 164
164/* 165/*
165 * Auxiliary flags: Linux defined - For features scattered in various 166 * Auxiliary flags: Linux defined - For features scattered in various
@@ -279,6 +280,7 @@ extern const char * const x86_power_flags[32];
279#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 280#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
280#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 281#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
281#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) 282#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
283#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
282 284
283#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 285#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
284# define cpu_has_invlpg 1 286# define cpu_has_invlpg 1
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index ca1098a7e580..97b6d8114a43 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -151,6 +151,7 @@
151#define DMA_AUTOINIT 0x10 151#define DMA_AUTOINIT 0x10
152 152
153 153
154#ifdef CONFIG_ISA_DMA_API
154extern spinlock_t dma_spin_lock; 155extern spinlock_t dma_spin_lock;
155 156
156static inline unsigned long claim_dma_lock(void) 157static inline unsigned long claim_dma_lock(void)
@@ -164,6 +165,7 @@ static inline void release_dma_lock(unsigned long flags)
164{ 165{
165 spin_unlock_irqrestore(&dma_spin_lock, flags); 166 spin_unlock_irqrestore(&dma_spin_lock, flags);
166} 167}
168#endif /* CONFIG_ISA_DMA_API */
167 169
168/* enable/disable a specific DMA channel */ 170/* enable/disable a specific DMA channel */
169static inline void enable_dma(unsigned int dmanr) 171static inline void enable_dma(unsigned int dmanr)
@@ -303,9 +305,11 @@ static inline int get_dma_residue(unsigned int dmanr)
303} 305}
304 306
305 307
306/* These are in kernel/dma.c: */ 308/* These are in kernel/dma.c because x86 uses CONFIG_GENERIC_ISA_DMA */
309#ifdef CONFIG_ISA_DMA_API
307extern int request_dma(unsigned int dmanr, const char *device_id); 310extern int request_dma(unsigned int dmanr, const char *device_id);
308extern void free_dma(unsigned int dmanr); 311extern void free_dma(unsigned int dmanr);
312#endif
309 313
310/* From PCI */ 314/* From PCI */
311 315
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index e99d55d74df5..908b96957d88 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -96,7 +96,7 @@ extern void e820_setup_gap(void);
96extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, 96extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
97 unsigned long start_addr, unsigned long long end_addr); 97 unsigned long start_addr, unsigned long long end_addr);
98struct setup_data; 98struct setup_data;
99extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); 99extern void parse_e820_ext(struct setup_data *data);
100 100
101#if defined(CONFIG_X86_64) || \ 101#if defined(CONFIG_X86_64) || \
102 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) 102 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 57650ab4a5f5..1cd6d26a0a8d 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -16,10 +16,13 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) 17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
18 18
19.irpc idx, "01234567" 19.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
20 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
21.if NUM_INVALIDATE_TLB_VECTORS > \idx
20BUILD_INTERRUPT3(invalidate_interrupt\idx, 22BUILD_INTERRUPT3(invalidate_interrupt\idx,
21 (INVALIDATE_TLB_VECTOR_START)+\idx, 23 (INVALIDATE_TLB_VECTOR_START)+\idx,
22 smp_invalidate_interrupt) 24 smp_invalidate_interrupt)
25.endif
23.endr 26.endr
24#endif 27#endif
25 28
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 06850a7194e1..2c6fc9e62812 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -7,14 +7,12 @@
7 frame pointer later */ 7 frame pointer later */
8#ifdef CONFIG_FRAME_POINTER 8#ifdef CONFIG_FRAME_POINTER
9 .macro FRAME 9 .macro FRAME
10 pushl %ebp 10 pushl_cfi %ebp
11 CFI_ADJUST_CFA_OFFSET 4
12 CFI_REL_OFFSET ebp,0 11 CFI_REL_OFFSET ebp,0
13 movl %esp,%ebp 12 movl %esp,%ebp
14 .endm 13 .endm
15 .macro ENDFRAME 14 .macro ENDFRAME
16 popl %ebp 15 popl_cfi %ebp
17 CFI_ADJUST_CFA_OFFSET -4
18 CFI_RESTORE ebp 16 CFI_RESTORE ebp
19 .endm 17 .endm
20#else 18#else
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 1f11ce44e956..d09bb03653f0 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -37,7 +37,7 @@
37 "+m" (*uaddr), "=&r" (tem) \ 37 "+m" (*uaddr), "=&r" (tem) \
38 : "r" (oparg), "i" (-EFAULT), "1" (0)) 38 : "r" (oparg), "i" (-EFAULT), "1" (0))
39 39
40static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 40static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
41{ 41{
42 int op = (encoded_op >> 28) & 7; 42 int op = (encoded_op >> 28) & 7;
43 int cmp = (encoded_op >> 24) & 15; 43 int cmp = (encoded_op >> 24) & 15;
@@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
49 oparg = 1 << oparg; 49 oparg = 1 << oparg;
50 50
51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
52 return -EFAULT; 52 return -EFAULT;
53 53
54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) 54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
@@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
109 return ret; 109 return ret;
110} 110}
111 111
112static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, 112static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
113 int newval) 113 u32 oldval, u32 newval)
114{ 114{
115 int ret = 0;
115 116
116#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) 117#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
117 /* Real i386 machines have no cmpxchg instruction */ 118 /* Real i386 machines have no cmpxchg instruction */
@@ -119,21 +120,22 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
119 return -ENOSYS; 120 return -ENOSYS;
120#endif 121#endif
121 122
122 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 123 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
123 return -EFAULT; 124 return -EFAULT;
124 125
125 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" 126 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
126 "2:\t.section .fixup, \"ax\"\n" 127 "2:\t.section .fixup, \"ax\"\n"
127 "3:\tmov %2, %0\n" 128 "3:\tmov %3, %0\n"
128 "\tjmp 2b\n" 129 "\tjmp 2b\n"
129 "\t.previous\n" 130 "\t.previous\n"
130 _ASM_EXTABLE(1b, 3b) 131 _ASM_EXTABLE(1b, 3b)
131 : "=a" (oldval), "+m" (*uaddr) 132 : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
132 : "i" (-EFAULT), "r" (newval), "0" (oldval) 133 : "i" (-EFAULT), "r" (newval), "1" (oldval)
133 : "memory" 134 : "memory"
134 ); 135 );
135 136
136 return oldval; 137 *uval = oldval;
138 return ret;
137} 139}
138 140
139#endif 141#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 0274ec5a7e62..bb9efe8706e2 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -45,6 +45,30 @@ extern void invalidate_interrupt4(void);
45extern void invalidate_interrupt5(void); 45extern void invalidate_interrupt5(void);
46extern void invalidate_interrupt6(void); 46extern void invalidate_interrupt6(void);
47extern void invalidate_interrupt7(void); 47extern void invalidate_interrupt7(void);
48extern void invalidate_interrupt8(void);
49extern void invalidate_interrupt9(void);
50extern void invalidate_interrupt10(void);
51extern void invalidate_interrupt11(void);
52extern void invalidate_interrupt12(void);
53extern void invalidate_interrupt13(void);
54extern void invalidate_interrupt14(void);
55extern void invalidate_interrupt15(void);
56extern void invalidate_interrupt16(void);
57extern void invalidate_interrupt17(void);
58extern void invalidate_interrupt18(void);
59extern void invalidate_interrupt19(void);
60extern void invalidate_interrupt20(void);
61extern void invalidate_interrupt21(void);
62extern void invalidate_interrupt22(void);
63extern void invalidate_interrupt23(void);
64extern void invalidate_interrupt24(void);
65extern void invalidate_interrupt25(void);
66extern void invalidate_interrupt26(void);
67extern void invalidate_interrupt27(void);
68extern void invalidate_interrupt28(void);
69extern void invalidate_interrupt29(void);
70extern void invalidate_interrupt30(void);
71extern void invalidate_interrupt31(void);
48 72
49extern void irq_move_cleanup_interrupt(void); 73extern void irq_move_cleanup_interrupt(void);
50extern void reboot_interrupt(void); 74extern void reboot_interrupt(void);
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 36fb1a6a5109..8dbe353e41e1 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -11,8 +11,8 @@ kernel_physical_mapping_init(unsigned long start,
11 unsigned long page_size_mask); 11 unsigned long page_size_mask);
12 12
13 13
14extern unsigned long __initdata e820_table_start; 14extern unsigned long __initdata pgt_buf_start;
15extern unsigned long __meminitdata e820_table_end; 15extern unsigned long __meminitdata pgt_buf_end;
16extern unsigned long __meminitdata e820_table_top; 16extern unsigned long __meminitdata pgt_buf_top;
17 17
18#endif /* _ASM_X86_INIT_32_H */ 18#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index f327d386d6cc..c4bd267dfc50 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -63,17 +63,6 @@ union IO_APIC_reg_03 {
63 } __attribute__ ((packed)) bits; 63 } __attribute__ ((packed)) bits;
64}; 64};
65 65
66enum ioapic_irq_destination_types {
67 dest_Fixed = 0,
68 dest_LowestPrio = 1,
69 dest_SMI = 2,
70 dest__reserved_1 = 3,
71 dest_NMI = 4,
72 dest_INIT = 5,
73 dest__reserved_2 = 6,
74 dest_ExtINT = 7
75};
76
77struct IO_APIC_route_entry { 66struct IO_APIC_route_entry {
78 __u32 vector : 8, 67 __u32 vector : 8,
79 delivery_mode : 3, /* 000: FIXED 68 delivery_mode : 3, /* 000: FIXED
@@ -106,6 +95,10 @@ struct IR_IO_APIC_route_entry {
106 index : 15; 95 index : 15;
107} __attribute__ ((packed)); 96} __attribute__ ((packed));
108 97
98#define IOAPIC_AUTO -1
99#define IOAPIC_EDGE 0
100#define IOAPIC_LEVEL 1
101
109#ifdef CONFIG_X86_IO_APIC 102#ifdef CONFIG_X86_IO_APIC
110 103
111/* 104/*
@@ -150,11 +143,6 @@ extern int timer_through_8259;
150#define io_apic_assign_pci_irqs \ 143#define io_apic_assign_pci_irqs \
151 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) 144 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
152 145
153extern u8 io_apic_unique_id(u8 id);
154extern int io_apic_get_unique_id(int ioapic, int apic_id);
155extern int io_apic_get_version(int ioapic);
156extern int io_apic_get_redir_entries(int ioapic);
157
158struct io_apic_irq_attr; 146struct io_apic_irq_attr;
159extern int io_apic_set_pci_routing(struct device *dev, int irq, 147extern int io_apic_set_pci_routing(struct device *dev, int irq,
160 struct io_apic_irq_attr *irq_attr); 148 struct io_apic_irq_attr *irq_attr);
@@ -162,6 +150,8 @@ void setup_IO_APIC_irq_extra(u32 gsi);
162extern void ioapic_and_gsi_init(void); 150extern void ioapic_and_gsi_init(void);
163extern void ioapic_insert_resources(void); 151extern void ioapic_insert_resources(void);
164 152
153int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr);
154
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 155extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 156extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
167extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 157extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
@@ -186,6 +176,8 @@ extern void __init pre_init_apic_IRQ0(void);
186 176
187extern void mp_save_irq(struct mpc_intsrc *m); 177extern void mp_save_irq(struct mpc_intsrc *m);
188 178
179extern void disable_ioapic_support(void);
180
189#else /* !CONFIG_X86_IO_APIC */ 181#else /* !CONFIG_X86_IO_APIC */
190 182
191#define io_apic_assign_pci_irqs 0 183#define io_apic_assign_pci_irqs 0
@@ -199,6 +191,26 @@ static inline int mp_find_ioapic(u32 gsi) { return 0; }
199struct io_apic_irq_attr; 191struct io_apic_irq_attr;
200static inline int io_apic_set_pci_routing(struct device *dev, int irq, 192static inline int io_apic_set_pci_routing(struct device *dev, int irq,
201 struct io_apic_irq_attr *irq_attr) { return 0; } 193 struct io_apic_irq_attr *irq_attr) { return 0; }
194
195static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void)
196{
197 return NULL;
198}
199
200static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { }
201static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent)
202{
203 return -ENOMEM;
204}
205
206static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { }
207static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent)
208{
209 return -ENOMEM;
210}
211
212static inline void mp_save_irq(struct mpc_intsrc *m) { };
213static inline void disable_ioapic_support(void) { }
202#endif 214#endif
203 215
204#endif /* _ASM_X86_IO_APIC_H */ 216#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index 0b7228268a63..615fa9061b57 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -123,10 +123,6 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
123 int vector); 123 int vector);
124extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, 124extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
125 int vector); 125 int vector);
126extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
127 int vector);
128extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
129 int vector);
130 126
131/* Avoid include hell */ 127/* Avoid include hell */
132#define NMI_VECTOR 0x02 128#define NMI_VECTOR 0x02
@@ -150,6 +146,10 @@ static inline void __default_local_send_IPI_all(int vector)
150} 146}
151 147
152#ifdef CONFIG_X86_32 148#ifdef CONFIG_X86_32
149extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
150 int vector);
151extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
152 int vector);
153extern void default_send_IPI_mask_logical(const struct cpumask *mask, 153extern void default_send_IPI_mask_logical(const struct cpumask *mask,
154 int vector); 154 int vector);
155extern void default_send_IPI_allbutself(int vector); 155extern void default_send_IPI_allbutself(int vector);
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index c704b38c57a2..ba870bb6dd8e 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -10,9 +10,6 @@
10#include <asm/apicdef.h> 10#include <asm/apicdef.h>
11#include <asm/irq_vectors.h> 11#include <asm/irq_vectors.h>
12 12
13/* Even though we don't support this, supply it to appease OF */
14static inline void irq_dispose_mapping(unsigned int virq) { }
15
16static inline int irq_canonicalize(int irq) 13static inline int irq_canonicalize(int irq)
17{ 14{
18 return ((irq == 2) ? 9 : irq); 15 return ((irq == 2) ? 9 : irq);
diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h
new file mode 100644
index 000000000000..423bbbddf36d
--- /dev/null
+++ b/arch/x86/include/asm/irq_controller.h
@@ -0,0 +1,12 @@
1#ifndef __IRQ_CONTROLLER__
2#define __IRQ_CONTROLLER__
3
4struct irq_domain {
5 int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize,
6 u32 *out_hwirq, u32 *out_type);
7 void *priv;
8 struct device_node *controller;
9 struct list_head l;
10};
11
12#endif
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6af0894dafb4..6e976ee3b3ef 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_IRQ_VECTORS_H 1#ifndef _ASM_X86_IRQ_VECTORS_H
2#define _ASM_X86_IRQ_VECTORS_H 2#define _ASM_X86_IRQ_VECTORS_H
3 3
4#include <linux/threads.h>
4/* 5/*
5 * Linux IRQ vector layout. 6 * Linux IRQ vector layout.
6 * 7 *
@@ -16,8 +17,8 @@
16 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events 17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
17 * Vectors 32 ... 127 : device interrupts 18 * Vectors 32 ... 127 : device interrupts
18 * Vector 128 : legacy int80 syscall interface 19 * Vector 128 : legacy int80 syscall interface
19 * Vectors 129 ... 237 : device interrupts 20 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
20 * Vectors 238 ... 255 : special interrupts 21 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
21 * 22 *
22 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. 23 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
23 * 24 *
@@ -96,37 +97,43 @@
96#define THRESHOLD_APIC_VECTOR 0xf9 97#define THRESHOLD_APIC_VECTOR 0xf9
97#define REBOOT_VECTOR 0xf8 98#define REBOOT_VECTOR 0xf8
98 99
99/* f0-f7 used for spreading out TLB flushes: */
100#define INVALIDATE_TLB_VECTOR_END 0xf7
101#define INVALIDATE_TLB_VECTOR_START 0xf0
102#define NUM_INVALIDATE_TLB_VECTORS 8
103
104/*
105 * Local APIC timer IRQ vector is on a different priority level,
106 * to work around the 'lost local interrupt if more than 2 IRQ
107 * sources per level' errata.
108 */
109#define LOCAL_TIMER_VECTOR 0xef
110
111/* 100/*
112 * Generic system vector for platform specific use 101 * Generic system vector for platform specific use
113 */ 102 */
114#define X86_PLATFORM_IPI_VECTOR 0xed 103#define X86_PLATFORM_IPI_VECTOR 0xf7
115 104
116/* 105/*
117 * IRQ work vector: 106 * IRQ work vector:
118 */ 107 */
119#define IRQ_WORK_VECTOR 0xec 108#define IRQ_WORK_VECTOR 0xf6
120 109
121#define UV_BAU_MESSAGE 0xea 110#define UV_BAU_MESSAGE 0xf5
122 111
123/* 112/*
124 * Self IPI vector for machine checks 113 * Self IPI vector for machine checks
125 */ 114 */
126#define MCE_SELF_VECTOR 0xeb 115#define MCE_SELF_VECTOR 0xf4
127 116
128/* Xen vector callback to receive events in a HVM domain */ 117/* Xen vector callback to receive events in a HVM domain */
129#define XEN_HVM_EVTCHN_CALLBACK 0xe9 118#define XEN_HVM_EVTCHN_CALLBACK 0xf3
119
120/*
121 * Local APIC timer IRQ vector is on a different priority level,
122 * to work around the 'lost local interrupt if more than 2 IRQ
123 * sources per level' errata.
124 */
125#define LOCAL_TIMER_VECTOR 0xef
126
127/* up to 32 vectors used for spreading out TLB flushes: */
128#if NR_CPUS <= 32
129# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS)
130#else
131# define NUM_INVALIDATE_TLB_VECTORS (32)
132#endif
133
134#define INVALIDATE_TLB_VECTOR_END (0xee)
135#define INVALIDATE_TLB_VECTOR_START \
136 (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
130 137
131#define NR_VECTORS 256 138#define NR_VECTORS 256
132 139
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index ca242d35e873..fe2cc6e105fa 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -13,7 +13,6 @@ enum die_val {
13 DIE_PANIC, 13 DIE_PANIC,
14 DIE_NMI, 14 DIE_NMI,
15 DIE_DIE, 15 DIE_DIE,
16 DIE_NMIWATCHDOG,
17 DIE_KERNELDEBUG, 16 DIE_KERNELDEBUG,
18 DIE_TRAP, 17 DIE_TRAP,
19 DIE_GPF, 18 DIE_GPF,
@@ -27,7 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
27extern int __must_check __die(const char *, struct pt_regs *, long); 26extern int __must_check __die(const char *, struct pt_regs *, long);
28extern void show_registers(struct pt_regs *regs); 27extern void show_registers(struct pt_regs *regs);
29extern void show_trace(struct task_struct *t, struct pt_regs *regs, 28extern void show_trace(struct task_struct *t, struct pt_regs *regs,
30 unsigned long *sp); 29 unsigned long *sp, unsigned long bp);
31extern void __show_regs(struct pt_regs *regs, int all); 30extern void __show_regs(struct pt_regs *regs, int all);
32extern void show_regs(struct pt_regs *regs); 31extern void show_regs(struct pt_regs *regs);
33extern unsigned long oops_begin(void); 32extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 8e37deb1eb38..0f5213564326 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -142,9 +142,9 @@ struct x86_emulate_ops {
142 int (*pio_out_emulated)(int size, unsigned short port, const void *val, 142 int (*pio_out_emulated)(int size, unsigned short port, const void *val,
143 unsigned int count, struct kvm_vcpu *vcpu); 143 unsigned int count, struct kvm_vcpu *vcpu);
144 144
145 bool (*get_cached_descriptor)(struct desc_struct *desc, 145 bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3,
146 int seg, struct kvm_vcpu *vcpu); 146 int seg, struct kvm_vcpu *vcpu);
147 void (*set_cached_descriptor)(struct desc_struct *desc, 147 void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3,
148 int seg, struct kvm_vcpu *vcpu); 148 int seg, struct kvm_vcpu *vcpu);
149 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 149 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
150 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 150 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
@@ -239,6 +239,7 @@ struct x86_emulate_ctxt {
239 int interruptibility; 239 int interruptibility;
240 240
241 bool perm_ok; /* do not check permissions if true */ 241 bool perm_ok; /* do not check permissions if true */
242 bool only_vendor_specific_insn;
242 243
243 bool have_exception; 244 bool have_exception;
244 struct x86_exception exception; 245 struct x86_exception exception;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ffd7f8d29187..c8af0991fdf0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -85,7 +85,7 @@
85 85
86#define ASYNC_PF_PER_VCPU 64 86#define ASYNC_PF_PER_VCPU 64
87 87
88extern spinlock_t kvm_lock; 88extern raw_spinlock_t kvm_lock;
89extern struct list_head vm_list; 89extern struct list_head vm_list;
90 90
91struct kvm_vcpu; 91struct kvm_vcpu;
@@ -255,6 +255,8 @@ struct kvm_mmu {
255 int (*sync_page)(struct kvm_vcpu *vcpu, 255 int (*sync_page)(struct kvm_vcpu *vcpu,
256 struct kvm_mmu_page *sp); 256 struct kvm_mmu_page *sp);
257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
258 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
259 u64 *spte, const void *pte, unsigned long mmu_seq);
258 hpa_t root_hpa; 260 hpa_t root_hpa;
259 int root_level; 261 int root_level;
260 int shadow_root_level; 262 int shadow_root_level;
@@ -335,12 +337,6 @@ struct kvm_vcpu_arch {
335 u64 *last_pte_updated; 337 u64 *last_pte_updated;
336 gfn_t last_pte_gfn; 338 gfn_t last_pte_gfn;
337 339
338 struct {
339 gfn_t gfn; /* presumed gfn during guest pte update */
340 pfn_t pfn; /* pfn corresponding to that gfn */
341 unsigned long mmu_seq;
342 } update_pte;
343
344 struct fpu guest_fpu; 340 struct fpu guest_fpu;
345 u64 xcr0; 341 u64 xcr0;
346 342
@@ -448,7 +444,7 @@ struct kvm_arch {
448 444
449 unsigned long irq_sources_bitmap; 445 unsigned long irq_sources_bitmap;
450 s64 kvmclock_offset; 446 s64 kvmclock_offset;
451 spinlock_t tsc_write_lock; 447 raw_spinlock_t tsc_write_lock;
452 u64 last_tsc_nsec; 448 u64 last_tsc_nsec;
453 u64 last_tsc_offset; 449 u64 last_tsc_offset;
454 u64 last_tsc_write; 450 u64 last_tsc_write;
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 0c90dd9f0505..9c7d95f6174b 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -25,7 +25,6 @@ extern int pic_mode;
25#define MAX_IRQ_SOURCES 256 25#define MAX_IRQ_SOURCES 256
26 26
27extern unsigned int def_to_bigsmp; 27extern unsigned int def_to_bigsmp;
28extern u8 apicid_2_node[];
29 28
30#ifdef CONFIG_X86_NUMAQ 29#ifdef CONFIG_X86_NUMAQ
31extern int mp_bus_id_to_node[MAX_MP_BUSSES]; 30extern int mp_bus_id_to_node[MAX_MP_BUSSES];
@@ -33,8 +32,6 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES];
33extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; 32extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
34#endif 33#endif
35 34
36#define MAX_APICID 256
37
38#else /* CONFIG_X86_64: */ 35#else /* CONFIG_X86_64: */
39 36
40#define MAX_MP_BUSSES 256 37#define MAX_MP_BUSSES 256
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 43a18c77676d..fd5a1f365c95 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -43,6 +43,7 @@
43 43
44#define MSR_MTRRcap 0x000000fe 44#define MSR_MTRRcap 0x000000fe
45#define MSR_IA32_BBL_CR_CTL 0x00000119 45#define MSR_IA32_BBL_CR_CTL 0x00000119
46#define MSR_IA32_BBL_CR_CTL3 0x0000011e
46 47
47#define MSR_IA32_SYSENTER_CS 0x00000174 48#define MSR_IA32_SYSENTER_CS 0x00000174
48#define MSR_IA32_SYSENTER_ESP 0x00000175 49#define MSR_IA32_SYSENTER_ESP 0x00000175
@@ -52,6 +53,9 @@
52#define MSR_IA32_MCG_STATUS 0x0000017a 53#define MSR_IA32_MCG_STATUS 0x0000017a
53#define MSR_IA32_MCG_CTL 0x0000017b 54#define MSR_IA32_MCG_CTL 0x0000017b
54 55
56#define MSR_OFFCORE_RSP_0 0x000001a6
57#define MSR_OFFCORE_RSP_1 0x000001a7
58
55#define MSR_IA32_PEBS_ENABLE 0x000003f1 59#define MSR_IA32_PEBS_ENABLE 0x000003f1
56#define MSR_IA32_DS_AREA 0x00000600 60#define MSR_IA32_DS_AREA 0x00000600
57#define MSR_IA32_PERF_CAPABILITIES 0x00000345 61#define MSR_IA32_PERF_CAPABILITIES 0x00000345
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c76f5b92b840..4886a68f267e 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -7,7 +7,6 @@
7 7
8#ifdef CONFIG_X86_LOCAL_APIC 8#ifdef CONFIG_X86_LOCAL_APIC
9 9
10extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
11extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 10extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
12extern int reserve_perfctr_nmi(unsigned int); 11extern int reserve_perfctr_nmi(unsigned int);
13extern void release_perfctr_nmi(unsigned int); 12extern void release_perfctr_nmi(unsigned int);
@@ -30,8 +29,8 @@ void arch_trigger_all_cpu_backtrace(void);
30 * external nmis, because the local ones are more frequent. 29 * external nmis, because the local ones are more frequent.
31 * 30 *
32 * Also setup some default high/normal/low settings for 31 * Also setup some default high/normal/low settings for
33 * subsystems to registers with. Using 4 bits to seperate 32 * subsystems to registers with. Using 4 bits to separate
34 * the priorities. This can go alot higher if needed be. 33 * the priorities. This can go a lot higher if needed be.
35 */ 34 */
36 35
37#define NMI_LOCAL_SHIFT 16 /* randomly picked */ 36#define NMI_LOCAL_SHIFT 16 /* randomly picked */
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index 6d8723a766cc..af788496020b 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -38,7 +38,7 @@
38#define K8_NOP8 K8_NOP4 K8_NOP4 38#define K8_NOP8 K8_NOP4 K8_NOP4
39 39
40/* K7 nops 40/* K7 nops
41 uses eax dependencies (arbitary choice) 41 uses eax dependencies (arbitrary choice)
42 1: nop 42 1: nop
43 2: movl %eax,%eax 43 2: movl %eax,%eax
44 3: leal (,%eax,1),%eax 44 3: leal (,%eax,1),%eax
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 27da400d3138..3d4dab43c994 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,5 +1,57 @@
1#ifndef _ASM_X86_NUMA_H
2#define _ASM_X86_NUMA_H
3
4#include <asm/topology.h>
5#include <asm/apicdef.h>
6
7#ifdef CONFIG_NUMA
8
9#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
10
11/*
12 * __apicid_to_node[] stores the raw mapping between physical apicid and
13 * node and is used to initialize cpu_to_node mapping.
14 *
15 * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
16 * should be accessed by the accessors - set_apicid_to_node() and
17 * numa_cpu_node().
18 */
19extern s16 __apicid_to_node[MAX_LOCAL_APIC];
20
21static inline void set_apicid_to_node(int apicid, s16 node)
22{
23 __apicid_to_node[apicid] = node;
24}
25#else /* CONFIG_NUMA */
26static inline void set_apicid_to_node(int apicid, s16 node)
27{
28}
29#endif /* CONFIG_NUMA */
30
1#ifdef CONFIG_X86_32 31#ifdef CONFIG_X86_32
2# include "numa_32.h" 32# include "numa_32.h"
3#else 33#else
4# include "numa_64.h" 34# include "numa_64.h"
5#endif 35#endif
36
37#ifdef CONFIG_NUMA
38extern void __cpuinit numa_set_node(int cpu, int node);
39extern void __cpuinit numa_clear_node(int cpu);
40extern void __init numa_init_array(void);
41extern void __init init_cpu_to_node(void);
42extern void __cpuinit numa_add_cpu(int cpu);
43extern void __cpuinit numa_remove_cpu(int cpu);
44#else /* CONFIG_NUMA */
45static inline void numa_set_node(int cpu, int node) { }
46static inline void numa_clear_node(int cpu) { }
47static inline void numa_init_array(void) { }
48static inline void init_cpu_to_node(void) { }
49static inline void numa_add_cpu(int cpu) { }
50static inline void numa_remove_cpu(int cpu) { }
51#endif /* CONFIG_NUMA */
52
53#ifdef CONFIG_DEBUG_PER_CPU_MAPS
54struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable);
55#endif
56
57#endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index b0ef2b449a9d..c6beed1ef103 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -4,7 +4,12 @@
4extern int numa_off; 4extern int numa_off;
5 5
6extern int pxm_to_nid(int pxm); 6extern int pxm_to_nid(int pxm);
7extern void numa_remove_cpu(int cpu); 7
8#ifdef CONFIG_NUMA
9extern int __cpuinit numa_cpu_node(int cpu);
10#else /* CONFIG_NUMA */
11static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
12#endif /* CONFIG_NUMA */
8 13
9#ifdef CONFIG_HIGHMEM 14#ifdef CONFIG_HIGHMEM
10extern void set_highmem_pages_init(void); 15extern void set_highmem_pages_init(void);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 0493be39607c..344eb1790b46 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -2,23 +2,16 @@
2#define _ASM_X86_NUMA_64_H 2#define _ASM_X86_NUMA_64_H
3 3
4#include <linux/nodemask.h> 4#include <linux/nodemask.h>
5#include <asm/apicdef.h>
6 5
7struct bootnode { 6struct bootnode {
8 u64 start; 7 u64 start;
9 u64 end; 8 u64 end;
10}; 9};
11 10
12extern int compute_hash_shift(struct bootnode *nodes, int numblks,
13 int *nodeids);
14
15#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) 11#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
16 12
17extern void numa_init_array(void);
18extern int numa_off; 13extern int numa_off;
19 14
20extern s16 apicid_to_node[MAX_LOCAL_APIC];
21
22extern unsigned long numa_free_all_bootmem(void); 15extern unsigned long numa_free_all_bootmem(void);
23extern void setup_node_bootmem(int nodeid, unsigned long start, 16extern void setup_node_bootmem(int nodeid, unsigned long start,
24 unsigned long end); 17 unsigned long end);
@@ -31,11 +24,11 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
31 */ 24 */
32#define NODE_MIN_SIZE (4*1024*1024) 25#define NODE_MIN_SIZE (4*1024*1024)
33 26
34extern void __init init_cpu_to_node(void); 27extern nodemask_t numa_nodes_parsed __initdata;
35extern void __cpuinit numa_set_node(int cpu, int node); 28
36extern void __cpuinit numa_clear_node(int cpu); 29extern int __cpuinit numa_cpu_node(int cpu);
37extern void __cpuinit numa_add_cpu(int cpu); 30extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
38extern void __cpuinit numa_remove_cpu(int cpu); 31extern void __init numa_set_distance(int from, int to, int distance);
39 32
40#ifdef CONFIG_NUMA_EMU 33#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) 34#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
@@ -43,11 +36,7 @@ extern void __cpuinit numa_remove_cpu(int cpu);
43void numa_emu_cmdline(char *); 36void numa_emu_cmdline(char *);
44#endif /* CONFIG_NUMA_EMU */ 37#endif /* CONFIG_NUMA_EMU */
45#else 38#else
46static inline void init_cpu_to_node(void) { } 39static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
47static inline void numa_set_node(int cpu, int node) { }
48static inline void numa_clear_node(int cpu) { }
49static inline void numa_add_cpu(int cpu, int node) { }
50static inline void numa_remove_cpu(int cpu) { }
51#endif 40#endif
52 41
53#endif /* _ASM_X86_NUMA_64_H */ 42#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index f482010350fb..5ca6801b75f3 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -20,7 +20,7 @@ extern struct olpc_platform_t olpc_platform_info;
20 20
21/* 21/*
22 * OLPC board IDs contain the major build number within the mask 0x0ff0, 22 * OLPC board IDs contain the major build number within the mask 0x0ff0,
23 * and the minor build number withing 0x000f. Pre-builds have a minor 23 * and the minor build number within 0x000f. Pre-builds have a minor
24 * number less than 8, and normal builds start at 8. For example, 0x0B10 24 * number less than 8, and normal builds start at 8. For example, 0x0B10
25 * is a PreB1, and 0x0C18 is a C1. 25 * is a PreB1, and 0x0C18 is a C1.
26 */ 26 */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 641988efe063..c5d3a5abbb9f 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -6,7 +6,7 @@
6 6
7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ 7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */
8 8
9#ifdef CONFIG_OLPC_OPENFIRMWARE 9#ifdef CONFIG_OLPC
10 10
11extern bool olpc_ofw_is_installed(void); 11extern bool olpc_ofw_is_installed(void);
12 12
@@ -26,19 +26,15 @@ extern void setup_olpc_ofw_pgd(void);
26/* check if OFW was detected during boot */ 26/* check if OFW was detected during boot */
27extern bool olpc_ofw_present(void); 27extern bool olpc_ofw_present(void);
28 28
29#else /* !CONFIG_OLPC_OPENFIRMWARE */ 29#else /* !CONFIG_OLPC */
30
31static inline bool olpc_ofw_is_installed(void) { return false; }
32static inline void olpc_ofw_detect(void) { } 30static inline void olpc_ofw_detect(void) { }
33static inline void setup_olpc_ofw_pgd(void) { } 31static inline void setup_olpc_ofw_pgd(void) { }
34static inline bool olpc_ofw_present(void) { return false; } 32#endif /* !CONFIG_OLPC */
35
36#endif /* !CONFIG_OLPC_OPENFIRMWARE */
37 33
38#ifdef CONFIG_OLPC_OPENFIRMWARE_DT 34#ifdef CONFIG_OF_PROMTREE
39extern void olpc_dt_build_devicetree(void); 35extern void olpc_dt_build_devicetree(void);
40#else 36#else
41static inline void olpc_dt_build_devicetree(void) { } 37static inline void olpc_dt_build_devicetree(void) { }
42#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */ 38#endif
43 39
44#endif /* _ASM_X86_OLPC_OFW_H */ 40#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 1df66211fd1b..bce688d54c12 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_PAGE_DEFS_H 2#define _ASM_X86_PAGE_DEFS_H
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5#include <linux/types.h>
5 6
6/* PAGE_SHIFT determines the page size */ 7/* PAGE_SHIFT determines the page size */
7#define PAGE_SHIFT 12 8#define PAGE_SHIFT 12
@@ -45,11 +46,15 @@ extern int devmem_is_allowed(unsigned long pagenr);
45extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
46extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
47 48
49static inline phys_addr_t get_max_mapped(void)
50{
51 return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
52}
53
48extern unsigned long init_memory_mapping(unsigned long start, 54extern unsigned long init_memory_mapping(unsigned long start,
49 unsigned long end); 55 unsigned long end);
50 56
51extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn, 57extern void initmem_init(void);
52 int acpi, int k8);
53extern void free_initmem(void); 58extern void free_initmem(void);
54 59
55#endif /* !__ASSEMBLY__ */ 60#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 7e172955ee57..a09e1f052d84 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -451,6 +451,26 @@ do { \
451#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 451#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
452#endif /* !CONFIG_M386 */ 452#endif /* !CONFIG_M386 */
453 453
454#ifdef CONFIG_X86_CMPXCHG64
455#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \
456({ \
457 char __ret; \
458 typeof(o1) __o1 = o1; \
459 typeof(o1) __n1 = n1; \
460 typeof(o2) __o2 = o2; \
461 typeof(o2) __n2 = n2; \
462 typeof(o2) __dummy = n2; \
463 asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \
464 : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \
465 : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \
466 __ret; \
467})
468
469#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
470#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
471#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
472#endif /* CONFIG_X86_CMPXCHG64 */
473
454/* 474/*
455 * Per cpu atomic 64 bit operations are only available under 64 bit. 475 * Per cpu atomic 64 bit operations are only available under 64 bit.
456 * 32 bit must fall back to generic operations. 476 * 32 bit must fall back to generic operations.
@@ -480,6 +500,34 @@ do { \
480#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 500#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
481#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) 501#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
482#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 502#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
503
504/*
505 * Pretty complex macro to generate cmpxchg16 instruction. The instruction
506 * is not supported on early AMD64 processors so we must be able to emulate
507 * it in software. The address used in the cmpxchg16 instruction must be
508 * aligned to a 16 byte boundary.
509 */
510#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
511({ \
512 char __ret; \
513 typeof(o1) __o1 = o1; \
514 typeof(o1) __n1 = n1; \
515 typeof(o2) __o2 = o2; \
516 typeof(o2) __n2 = n2; \
517 typeof(o2) __dummy; \
518 alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \
519 "cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t", \
520 X86_FEATURE_CX16, \
521 ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
522 "S" (&pcp1), "b"(__n1), "c"(__n2), \
523 "a"(__o1), "d"(__o2)); \
524 __ret; \
525})
526
527#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
528#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
529#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
530
483#endif 531#endif
484 532
485/* This is not atomic against other CPUs -- CPU preemption needs to be off */ 533/* This is not atomic against other CPUs -- CPU preemption needs to be off */
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index cc29086e30cd..56fd9e3abbda 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Netburst Perfomance Events (P4, old Xeon) 2 * Netburst Performance Events (P4, old Xeon)
3 */ 3 */
4 4
5#ifndef PERF_EVENT_P4_H 5#ifndef PERF_EVENT_P4_H
@@ -9,7 +9,7 @@
9#include <linux/bitops.h> 9#include <linux/bitops.h>
10 10
11/* 11/*
12 * NetBurst has perfomance MSRs shared between 12 * NetBurst has performance MSRs shared between
13 * threads if HT is turned on, ie for both logical 13 * threads if HT is turned on, ie for both logical
14 * processors (mem: in turn in Atom with HT support 14 * processors (mem: in turn in Atom with HT support
15 * perf-MSRs are not shared and every thread has its 15 * perf-MSRs are not shared and every thread has its
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 94b979d1b58d..effff47a3c82 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -69,8 +69,6 @@ static inline void native_pmd_clear(pmd_t *pmd)
69 69
70static inline void pud_clear(pud_t *pudp) 70static inline void pud_clear(pud_t *pudp)
71{ 71{
72 unsigned long pgd;
73
74 set_pud(pudp, __pud(0)); 72 set_pud(pudp, __pud(0));
75 73
76 /* 74 /*
@@ -79,13 +77,10 @@ static inline void pud_clear(pud_t *pudp)
79 * section 8.1: in PAE mode we explicitly have to flush the 77 * section 8.1: in PAE mode we explicitly have to flush the
80 * TLB via cr3 if the top-level pgd is changed... 78 * TLB via cr3 if the top-level pgd is changed...
81 * 79 *
82 * Make sure the pud entry we're updating is within the 80 * Currently all places where pud_clear() is called either have
83 * current pgd to avoid unnecessary TLB flushes. 81 * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or
82 * pud_clear_bad()), so we don't need TLB flush here.
84 */ 83 */
85 pgd = read_cr3();
86 if (__pa(pudp) >= pgd && __pa(pudp) <
87 (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
88 write_cr3(pgd);
89} 84}
90 85
91#ifdef CONFIG_SMP 86#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 7a3e836eb2a9..a898a2b6e10c 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -7,7 +7,7 @@
7 */ 7 */
8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ 8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ 9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
10#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ 10#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ 11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ 12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ 13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 45636cefa186..4c25ab48257b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -94,10 +94,6 @@ struct cpuinfo_x86 {
94 int x86_cache_alignment; /* In bytes */ 94 int x86_cache_alignment; /* In bytes */
95 int x86_power; 95 int x86_power;
96 unsigned long loops_per_jiffy; 96 unsigned long loops_per_jiffy;
97#ifdef CONFIG_SMP
98 /* cpus sharing the last level cache: */
99 cpumask_var_t llc_shared_map;
100#endif
101 /* cpuid returned max cores value: */ 97 /* cpuid returned max cores value: */
102 u16 x86_max_cores; 98 u16 x86_max_cores;
103 u16 apicid; 99 u16 apicid;
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index b4ec95f07518..971e0b46446e 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -1 +1,69 @@
1/* dummy prom.h; here to make linux/of.h's #includes happy */ 1/*
2 * Definitions for Device tree / OpenFirmware handling on X86
3 *
4 * based on arch/powerpc/include/asm/prom.h which is
5 * Copyright (C) 1996-2005 Paul Mackerras.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#ifndef _ASM_X86_PROM_H
14#define _ASM_X86_PROM_H
15#ifndef __ASSEMBLY__
16
17#include <linux/of.h>
18#include <linux/types.h>
19#include <linux/pci.h>
20
21#include <asm/irq.h>
22#include <asm/atomic.h>
23#include <asm/setup.h>
24#include <asm/irq_controller.h>
25
26#ifdef CONFIG_OF
27extern int of_ioapic;
28extern u64 initial_dtb;
29extern void add_dtb(u64 data);
30extern void x86_add_irq_domains(void);
31void __cpuinit x86_of_pci_init(void);
32void x86_dtb_init(void);
33
34static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
35{
36 return pdev ? pdev->dev.of_node : NULL;
37}
38
39static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
40{
41 return pci_device_to_OF_node(bus->self);
42}
43
44#else
45static inline void add_dtb(u64 data) { }
46static inline void x86_add_irq_domains(void) { }
47static inline void x86_of_pci_init(void) { }
48static inline void x86_dtb_init(void) { }
49#define of_ioapic 0
50#endif
51
52extern char cmd_line[COMMAND_LINE_SIZE];
53
54#define pci_address_to_pio pci_address_to_pio
55unsigned long pci_address_to_pio(phys_addr_t addr);
56
57/**
58 * irq_dispose_mapping - Unmap an interrupt
59 * @virq: linux virq number of the interrupt to unmap
60 *
61 * FIXME: We really should implement proper virq handling like power,
62 * but that's going to be major surgery.
63 */
64static inline void irq_dispose_mapping(unsigned int virq) { }
65
66#define HAVE_ARCH_DEVTREE_FIXUPS
67
68#endif /* __ASSEMBLY__ */
69#endif
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 52b098a6eebb..7b0a55a88851 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -31,7 +31,7 @@
31#define R12 24 31#define R12 24
32#define RBP 32 32#define RBP 32
33#define RBX 40 33#define RBX 40
34/* arguments: interrupts/non tracing syscalls only save upto here*/ 34/* arguments: interrupts/non tracing syscalls only save up to here*/
35#define R11 48 35#define R11 48
36#define R10 56 36#define R10 56
37#define R9 64 37#define R9 64
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 78cd1ea94500..1babf8adecdf 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -73,7 +73,7 @@ struct pt_regs {
73 unsigned long r12; 73 unsigned long r12;
74 unsigned long rbp; 74 unsigned long rbp;
75 unsigned long rbx; 75 unsigned long rbx;
76/* arguments: non interrupts/non tracing syscalls only save upto here*/ 76/* arguments: non interrupts/non tracing syscalls only save up to here*/
77 unsigned long r11; 77 unsigned long r11;
78 unsigned long r10; 78 unsigned long r10;
79 unsigned long r9; 79 unsigned long r9;
@@ -103,7 +103,7 @@ struct pt_regs {
103 unsigned long r12; 103 unsigned long r12;
104 unsigned long bp; 104 unsigned long bp;
105 unsigned long bx; 105 unsigned long bx;
106/* arguments: non interrupts/non tracing syscalls only save upto here*/ 106/* arguments: non interrupts/non tracing syscalls only save up to here*/
107 unsigned long r11; 107 unsigned long r11;
108 unsigned long r10; 108 unsigned long r10;
109 unsigned long r9; 109 unsigned long r9;
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 562d4fd31ba8..3250e3d605d9 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -18,7 +18,10 @@ extern struct machine_ops machine_ops;
18 18
19void native_machine_crash_shutdown(struct pt_regs *regs); 19void native_machine_crash_shutdown(struct pt_regs *regs);
20void native_machine_shutdown(void); 20void native_machine_shutdown(void);
21void machine_real_restart(const unsigned char *code, int length); 21void machine_real_restart(unsigned int type);
22/* These must match dispatch_table in reboot_32.S */
23#define MRR_BIOS 0
24#define MRR_APM 1
22 25
23typedef void (*nmi_shootdown_cb)(int, struct die_args*); 26typedef void (*nmi_shootdown_cb)(int, struct die_args*);
24void nmi_shootdown_cpus(nmi_shootdown_cb callback); 27void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index d1e41b0f9b60..df4cd32b4cc6 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -37,26 +37,9 @@
37#endif 37#endif
38 38
39#ifdef __KERNEL__ 39#ifdef __KERNEL__
40
41#include <linux/list.h>
42#include <linux/spinlock.h>
43#include <linux/lockdep.h>
44#include <asm/asm.h> 40#include <asm/asm.h>
45 41
46struct rwsem_waiter;
47
48extern asmregparm struct rw_semaphore *
49 rwsem_down_read_failed(struct rw_semaphore *sem);
50extern asmregparm struct rw_semaphore *
51 rwsem_down_write_failed(struct rw_semaphore *sem);
52extern asmregparm struct rw_semaphore *
53 rwsem_wake(struct rw_semaphore *);
54extern asmregparm struct rw_semaphore *
55 rwsem_downgrade_wake(struct rw_semaphore *sem);
56
57/* 42/*
58 * the semaphore definition
59 *
60 * The bias values and the counter type limits the number of 43 * The bias values and the counter type limits the number of
61 * potential readers/writers to 32767 for 32 bits and 2147483647 44 * potential readers/writers to 32767 for 32 bits and 2147483647
62 * for 64 bits. 45 * for 64 bits.
@@ -74,43 +57,6 @@ extern asmregparm struct rw_semaphore *
74#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 57#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
75#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 58#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
76 59
77typedef signed long rwsem_count_t;
78
79struct rw_semaphore {
80 rwsem_count_t count;
81 spinlock_t wait_lock;
82 struct list_head wait_list;
83#ifdef CONFIG_DEBUG_LOCK_ALLOC
84 struct lockdep_map dep_map;
85#endif
86};
87
88#ifdef CONFIG_DEBUG_LOCK_ALLOC
89# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
90#else
91# define __RWSEM_DEP_MAP_INIT(lockname)
92#endif
93
94
95#define __RWSEM_INITIALIZER(name) \
96{ \
97 RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
98 LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
99}
100
101#define DECLARE_RWSEM(name) \
102 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
103
104extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
105 struct lock_class_key *key);
106
107#define init_rwsem(sem) \
108do { \
109 static struct lock_class_key __key; \
110 \
111 __init_rwsem((sem), #sem, &__key); \
112} while (0)
113
114/* 60/*
115 * lock for reading 61 * lock for reading
116 */ 62 */
@@ -133,7 +79,7 @@ static inline void __down_read(struct rw_semaphore *sem)
133 */ 79 */
134static inline int __down_read_trylock(struct rw_semaphore *sem) 80static inline int __down_read_trylock(struct rw_semaphore *sem)
135{ 81{
136 rwsem_count_t result, tmp; 82 long result, tmp;
137 asm volatile("# beginning __down_read_trylock\n\t" 83 asm volatile("# beginning __down_read_trylock\n\t"
138 " mov %0,%1\n\t" 84 " mov %0,%1\n\t"
139 "1:\n\t" 85 "1:\n\t"
@@ -155,7 +101,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
155 */ 101 */
156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 102static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
157{ 103{
158 rwsem_count_t tmp; 104 long tmp;
159 asm volatile("# beginning down_write\n\t" 105 asm volatile("# beginning down_write\n\t"
160 LOCK_PREFIX " xadd %1,(%2)\n\t" 106 LOCK_PREFIX " xadd %1,(%2)\n\t"
161 /* adds 0xffff0001, returns the old value */ 107 /* adds 0xffff0001, returns the old value */
@@ -180,9 +126,8 @@ static inline void __down_write(struct rw_semaphore *sem)
180 */ 126 */
181static inline int __down_write_trylock(struct rw_semaphore *sem) 127static inline int __down_write_trylock(struct rw_semaphore *sem)
182{ 128{
183 rwsem_count_t ret = cmpxchg(&sem->count, 129 long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
184 RWSEM_UNLOCKED_VALUE, 130 RWSEM_ACTIVE_WRITE_BIAS);
185 RWSEM_ACTIVE_WRITE_BIAS);
186 if (ret == RWSEM_UNLOCKED_VALUE) 131 if (ret == RWSEM_UNLOCKED_VALUE)
187 return 1; 132 return 1;
188 return 0; 133 return 0;
@@ -193,7 +138,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
193 */ 138 */
194static inline void __up_read(struct rw_semaphore *sem) 139static inline void __up_read(struct rw_semaphore *sem)
195{ 140{
196 rwsem_count_t tmp; 141 long tmp;
197 asm volatile("# beginning __up_read\n\t" 142 asm volatile("# beginning __up_read\n\t"
198 LOCK_PREFIX " xadd %1,(%2)\n\t" 143 LOCK_PREFIX " xadd %1,(%2)\n\t"
199 /* subtracts 1, returns the old value */ 144 /* subtracts 1, returns the old value */
@@ -211,7 +156,7 @@ static inline void __up_read(struct rw_semaphore *sem)
211 */ 156 */
212static inline void __up_write(struct rw_semaphore *sem) 157static inline void __up_write(struct rw_semaphore *sem)
213{ 158{
214 rwsem_count_t tmp; 159 long tmp;
215 asm volatile("# beginning __up_write\n\t" 160 asm volatile("# beginning __up_write\n\t"
216 LOCK_PREFIX " xadd %1,(%2)\n\t" 161 LOCK_PREFIX " xadd %1,(%2)\n\t"
217 /* subtracts 0xffff0001, returns the old value */ 162 /* subtracts 0xffff0001, returns the old value */
@@ -247,8 +192,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
247/* 192/*
248 * implement atomic add functionality 193 * implement atomic add functionality
249 */ 194 */
250static inline void rwsem_atomic_add(rwsem_count_t delta, 195static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem)
251 struct rw_semaphore *sem)
252{ 196{
253 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" 197 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
254 : "+m" (sem->count) 198 : "+m" (sem->count)
@@ -258,10 +202,9 @@ static inline void rwsem_atomic_add(rwsem_count_t delta,
258/* 202/*
259 * implement exchange and add functionality 203 * implement exchange and add functionality
260 */ 204 */
261static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, 205static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
262 struct rw_semaphore *sem)
263{ 206{
264 rwsem_count_t tmp = delta; 207 long tmp = delta;
265 208
266 asm volatile(LOCK_PREFIX "xadd %0,%1" 209 asm volatile(LOCK_PREFIX "xadd %0,%1"
267 : "+r" (tmp), "+m" (sem->count) 210 : "+r" (tmp), "+m" (sem->count)
@@ -270,10 +213,5 @@ static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
270 return tmp + delta; 213 return tmp + delta;
271} 214}
272 215
273static inline int rwsem_is_locked(struct rw_semaphore *sem)
274{
275 return (sem->count != 0);
276}
277
278#endif /* __KERNEL__ */ 216#endif /* __KERNEL__ */
279#endif /* _ASM_X86_RWSEM_H */ 217#endif /* _ASM_X86_RWSEM_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 231f1c1d6607..cd84f7208f76 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -1,14 +1,16 @@
1#ifndef _ASM_X86_SEGMENT_H 1#ifndef _ASM_X86_SEGMENT_H
2#define _ASM_X86_SEGMENT_H 2#define _ASM_X86_SEGMENT_H
3 3
4#include <linux/const.h>
5
4/* Constructor for a conventional segment GDT (or LDT) entry */ 6/* Constructor for a conventional segment GDT (or LDT) entry */
5/* This is a macro so it can be used in initializers */ 7/* This is a macro so it can be used in initializers */
6#define GDT_ENTRY(flags, base, limit) \ 8#define GDT_ENTRY(flags, base, limit) \
7 ((((base) & 0xff000000ULL) << (56-24)) | \ 9 ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \
8 (((flags) & 0x0000f0ffULL) << 40) | \ 10 (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \
9 (((limit) & 0x000f0000ULL) << (48-16)) | \ 11 (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \
10 (((base) & 0x00ffffffULL) << 16) | \ 12 (((base) & _AC(0x00ffffff,ULL)) << 16) | \
11 (((limit) & 0x0000ffffULL))) 13 (((limit) & _AC(0x0000ffff,ULL))))
12 14
13/* Simple and small GDT entries for booting only */ 15/* Simple and small GDT entries for booting only */
14 16
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 1f4695136776..73b11bc0ae6f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -17,12 +17,24 @@
17#endif 17#endif
18#include <asm/thread_info.h> 18#include <asm/thread_info.h>
19#include <asm/cpumask.h> 19#include <asm/cpumask.h>
20#include <asm/cpufeature.h>
20 21
21extern int smp_num_siblings; 22extern int smp_num_siblings;
22extern unsigned int num_processors; 23extern unsigned int num_processors;
23 24
25static inline bool cpu_has_ht_siblings(void)
26{
27 bool has_siblings = false;
28#ifdef CONFIG_SMP
29 has_siblings = cpu_has_ht && smp_num_siblings > 1;
30#endif
31 return has_siblings;
32}
33
24DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); 34DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
25DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); 35DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
36/* cpus sharing the last level cache: */
37DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
26DECLARE_PER_CPU(u16, cpu_llc_id); 38DECLARE_PER_CPU(u16, cpu_llc_id);
27DECLARE_PER_CPU(int, cpu_number); 39DECLARE_PER_CPU(int, cpu_number);
28 40
@@ -36,8 +48,16 @@ static inline struct cpumask *cpu_core_mask(int cpu)
36 return per_cpu(cpu_core_map, cpu); 48 return per_cpu(cpu_core_map, cpu);
37} 49}
38 50
51static inline struct cpumask *cpu_llc_shared_mask(int cpu)
52{
53 return per_cpu(cpu_llc_shared_map, cpu);
54}
55
39DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); 56DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
40DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 57DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
58#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
59DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
60#endif
41 61
42/* Static state in head.S used to set up a CPU */ 62/* Static state in head.S used to set up a CPU */
43extern unsigned long stack_start; /* Initial stack pointer address */ 63extern unsigned long stack_start; /* Initial stack pointer address */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 52b5c7ed3608..d7e89c83645d 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -47,7 +47,7 @@ struct stacktrace_ops {
47}; 47};
48 48
49void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 49void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
50 unsigned long *stack, 50 unsigned long *stack, unsigned long bp,
51 const struct stacktrace_ops *ops, void *data); 51 const struct stacktrace_ops *ops, void *data);
52 52
53#ifdef CONFIG_X86_32 53#ifdef CONFIG_X86_32
@@ -86,11 +86,11 @@ stack_frame(struct task_struct *task, struct pt_regs *regs)
86 86
87extern void 87extern void
88show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 88show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
89 unsigned long *stack, char *log_lvl); 89 unsigned long *stack, unsigned long bp, char *log_lvl);
90 90
91extern void 91extern void
92show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 92show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
93 unsigned long *sp, char *log_lvl); 93 unsigned long *sp, unsigned long bp, char *log_lvl);
94 94
95extern unsigned int code_bytes; 95extern unsigned int code_bytes;
96 96
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 33ecc3ea8782..12569e691ce3 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -98,8 +98,6 @@ do { \
98 */ 98 */
99#define HAVE_DISABLE_HLT 99#define HAVE_DISABLE_HLT
100#else 100#else
101#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
102#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
103 101
104/* frame pointer must be last for get_wchan */ 102/* frame pointer must be last for get_wchan */
105#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" 103#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f0b6e5dbc5a0..1f2e61e28981 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -161,8 +161,14 @@ struct thread_info {
161 161
162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR 162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
163 163
164#define alloc_thread_info(tsk) \ 164#define alloc_thread_info_node(tsk, node) \
165 ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) 165({ \
166 struct page *page = alloc_pages_node(node, THREAD_FLAGS, \
167 THREAD_ORDER); \
168 struct thread_info *ret = page ? page_address(page) : NULL; \
169 \
170 ret; \
171})
166 172
167#ifdef CONFIG_X86_32 173#ifdef CONFIG_X86_32
168 174
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 21899cc31e52..910a7084f7f2 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -47,21 +47,6 @@
47 47
48#include <asm/mpspec.h> 48#include <asm/mpspec.h>
49 49
50#ifdef CONFIG_X86_32
51
52/* Mappings between logical cpu number and node number */
53extern int cpu_to_node_map[];
54
55/* Returns the number of the node containing CPU 'cpu' */
56static inline int __cpu_to_node(int cpu)
57{
58 return cpu_to_node_map[cpu];
59}
60#define early_cpu_to_node __cpu_to_node
61#define cpu_to_node __cpu_to_node
62
63#else /* CONFIG_X86_64 */
64
65/* Mappings between logical cpu number and node number */ 50/* Mappings between logical cpu number and node number */
66DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); 51DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
67 52
@@ -84,8 +69,6 @@ static inline int early_cpu_to_node(int cpu)
84 69
85#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 70#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
86 71
87#endif /* CONFIG_X86_64 */
88
89/* Mappings between node number and cpus on that node. */ 72/* Mappings between node number and cpus on that node. */
90extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 73extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
91 74
@@ -155,7 +138,7 @@ extern unsigned long node_remap_size[];
155 .balance_interval = 1, \ 138 .balance_interval = 1, \
156} 139}
157 140
158#ifdef CONFIG_X86_64_ACPI_NUMA 141#ifdef CONFIG_X86_64
159extern int __node_distance(int, int); 142extern int __node_distance(int, int);
160#define node_distance(a, b) __node_distance(a, b) 143#define node_distance(a, b) __node_distance(a, b)
161#endif 144#endif
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index f4500fb3b485..feca3118a73b 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -3,25 +3,36 @@
3 3
4#ifndef __ASSEMBLY__ 4#ifndef __ASSEMBLY__
5 5
6#ifdef CONFIG_X86_TRAMPOLINE 6#include <linux/types.h>
7#include <asm/io.h>
8
7/* 9/*
8 * Trampoline 80x86 program as an array. 10 * Trampoline 80x86 program as an array. These are in the init rodata
11 * segment, but that's okay, because we only care about the relative
12 * addresses of the symbols.
9 */ 13 */
10extern const unsigned char trampoline_data []; 14extern const unsigned char x86_trampoline_start [];
11extern const unsigned char trampoline_end []; 15extern const unsigned char x86_trampoline_end [];
12extern unsigned char *trampoline_base; 16extern unsigned char *x86_trampoline_base;
13 17
14extern unsigned long init_rsp; 18extern unsigned long init_rsp;
15extern unsigned long initial_code; 19extern unsigned long initial_code;
16extern unsigned long initial_gs; 20extern unsigned long initial_gs;
17 21
18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) 22extern void __init setup_trampolines(void);
23
24extern const unsigned char trampoline_data[];
25extern const unsigned char trampoline_status[];
26
27#define TRAMPOLINE_SYM(x) \
28 ((void *)(x86_trampoline_base + \
29 ((const unsigned char *)(x) - x86_trampoline_start)))
19 30
20extern unsigned long setup_trampoline(void); 31/* Address of the SMP trampoline */
21extern void __init reserve_trampoline_memory(void); 32static inline unsigned long trampoline_address(void)
22#else 33{
23static inline void reserve_trampoline_memory(void) {} 34 return virt_to_phys(TRAMPOLINE_SYM(trampoline_data));
24#endif /* CONFIG_X86_TRAMPOLINE */ 35}
25 36
26#endif /* __ASSEMBLY__ */ 37#endif /* __ASSEMBLY__ */
27 38
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 1ca132fc0d03..83e2efd181e2 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -35,7 +35,7 @@ static inline cycles_t get_cycles(void)
35static __always_inline cycles_t vget_cycles(void) 35static __always_inline cycles_t vget_cycles(void)
36{ 36{
37 /* 37 /*
38 * We only do VDSOs on TSC capable CPUs, so this shouldnt 38 * We only do VDSOs on TSC capable CPUs, so this shouldn't
39 * access boot_cpu_data (which is not VDSO-safe): 39 * access boot_cpu_data (which is not VDSO-safe):
40 */ 40 */
41#ifndef CONFIG_X86_TSC 41#ifndef CONFIG_X86_TSC
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
index df1da20f4534..88102055a4b8 100644
--- a/arch/x86/include/asm/types.h
+++ b/arch/x86/include/asm/types.h
@@ -1,20 +1,12 @@
1#ifndef _ASM_X86_TYPES_H 1#ifndef _ASM_X86_TYPES_H
2#define _ASM_X86_TYPES_H 2#define _ASM_X86_TYPES_H
3 3
4#define dma_addr_t dma_addr_t
5
6#include <asm-generic/types.h> 4#include <asm-generic/types.h>
7 5
8#ifdef __KERNEL__ 6#ifdef __KERNEL__
9#ifndef __ASSEMBLY__ 7#ifndef __ASSEMBLY__
10 8
11typedef u64 dma64_addr_t; 9typedef u64 dma64_addr_t;
12#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G)
13/* DMA addresses come in 32-bit and 64-bit flavours. */
14typedef u64 dma_addr_t;
15#else
16typedef u32 dma_addr_t;
17#endif
18 10
19#endif /* __ASSEMBLY__ */ 11#endif /* __ASSEMBLY__ */
20#endif /* __KERNEL__ */ 12#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index b766a5e8ba0e..a755ef5e5977 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -346,10 +346,14 @@
346#define __NR_fanotify_init 338 346#define __NR_fanotify_init 338
347#define __NR_fanotify_mark 339 347#define __NR_fanotify_mark 339
348#define __NR_prlimit64 340 348#define __NR_prlimit64 340
349#define __NR_name_to_handle_at 341
350#define __NR_open_by_handle_at 342
351#define __NR_clock_adjtime 343
352#define __NR_syncfs 344
349 353
350#ifdef __KERNEL__ 354#ifdef __KERNEL__
351 355
352#define NR_syscalls 341 356#define NR_syscalls 345
353 357
354#define __ARCH_WANT_IPC_PARSE_VERSION 358#define __ARCH_WANT_IPC_PARSE_VERSION
355#define __ARCH_WANT_OLD_READDIR 359#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 363e9b8a715b..160fa76bd578 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -669,6 +669,14 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) 669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
670#define __NR_prlimit64 302 670#define __NR_prlimit64 302
671__SYSCALL(__NR_prlimit64, sys_prlimit64) 671__SYSCALL(__NR_prlimit64, sys_prlimit64)
672#define __NR_name_to_handle_at 303
673__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
674#define __NR_open_by_handle_at 304
675__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
676#define __NR_clock_adjtime 305
677__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
678#define __NR_syncfs 306
679__SYSCALL(__NR_syncfs, sys_syncfs)
672 680
673#ifndef __NO_STUBS 681#ifndef __NO_STUBS
674#define __ARCH_WANT_OLD_READDIR 682#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 64642ad019fb..643ebf2e2ad8 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -83,11 +83,13 @@ struct x86_init_paging {
83 * boot cpu 83 * boot cpu
84 * @tsc_pre_init: platform function called before TSC init 84 * @tsc_pre_init: platform function called before TSC init
85 * @timer_init: initialize the platform timer (default PIT/HPET) 85 * @timer_init: initialize the platform timer (default PIT/HPET)
86 * @wallclock_init: init the wallclock device
86 */ 87 */
87struct x86_init_timers { 88struct x86_init_timers {
88 void (*setup_percpu_clockev)(void); 89 void (*setup_percpu_clockev)(void);
89 void (*tsc_pre_init)(void); 90 void (*tsc_pre_init)(void);
90 void (*timer_init)(void); 91 void (*timer_init)(void);
92 void (*wallclock_init)(void);
91}; 93};
92 94
93/** 95/**
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a3c28ae4025b..8508bfe52296 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -287,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set)
287static inline int 287static inline int
288HYPERVISOR_sched_op(int cmd, void *arg) 288HYPERVISOR_sched_op(int cmd, void *arg)
289{ 289{
290 return _hypercall2(int, sched_op_new, cmd, arg); 290 return _hypercall2(int, sched_op, cmd, arg);
291} 291}
292 292
293static inline long 293static inline long
@@ -422,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value)
422#endif 422#endif
423 423
424static inline int 424static inline int
425HYPERVISOR_suspend(unsigned long srec) 425HYPERVISOR_suspend(unsigned long start_info_mfn)
426{ 426{
427 return _hypercall3(int, sched_op, SCHEDOP_shutdown, 427 struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
428 SHUTDOWN_suspend, srec); 428
429 /*
430 * For a PV guest the tools require that the start_info mfn be
431 * present in rdx/edx when the hypercall is made. Per the
432 * hypercall calling convention this is the third hypercall
433 * argument, which is start_info_mfn here.
434 */
435 return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
429} 436}
430 437
431static inline int 438static inline int
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 1c10c88ee4e1..5d4922ad4b9b 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -86,7 +86,7 @@ DEFINE_GUEST_HANDLE(void);
86 * The privilege level specifies which modes may enter a trap via a software 86 * The privilege level specifies which modes may enter a trap via a software
87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate 87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
88 * privilege levels as follows: 88 * privilege levels as follows:
89 * Level == 0: Noone may enter 89 * Level == 0: No one may enter
90 * Level == 1: Kernel may enter 90 * Level == 1: Kernel may enter
91 * Level == 2: Kernel may enter 91 * Level == 2: Kernel may enter
92 * Level == 3: Everyone may enter 92 * Level == 3: Everyone may enter
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index f25bdf238a33..c61934fbf22a 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -29,8 +29,10 @@ typedef struct xpaddr {
29 29
30/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ 30/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
31#define INVALID_P2M_ENTRY (~0UL) 31#define INVALID_P2M_ENTRY (~0UL)
32#define FOREIGN_FRAME_BIT (1UL<<31) 32#define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1))
33#define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2))
33#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) 34#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
35#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
34 36
35/* Maximum amount of memory we can handle in a domain in pages */ 37/* Maximum amount of memory we can handle in a domain in pages */
36#define MAX_DOMAIN_PAGES \ 38#define MAX_DOMAIN_PAGES \
@@ -41,12 +43,18 @@ extern unsigned int machine_to_phys_order;
41 43
42extern unsigned long get_phys_to_machine(unsigned long pfn); 44extern unsigned long get_phys_to_machine(unsigned long pfn);
43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 45extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
46extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
47extern unsigned long set_phys_range_identity(unsigned long pfn_s,
48 unsigned long pfn_e);
44 49
45extern int m2p_add_override(unsigned long mfn, struct page *page); 50extern int m2p_add_override(unsigned long mfn, struct page *page);
46extern int m2p_remove_override(struct page *page); 51extern int m2p_remove_override(struct page *page);
47extern struct page *m2p_find_override(unsigned long mfn); 52extern struct page *m2p_find_override(unsigned long mfn);
48extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); 53extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
49 54
55#ifdef CONFIG_XEN_DEBUG_FS
56extern int p2m_dump_show(struct seq_file *m, void *v);
57#endif
50static inline unsigned long pfn_to_mfn(unsigned long pfn) 58static inline unsigned long pfn_to_mfn(unsigned long pfn)
51{ 59{
52 unsigned long mfn; 60 unsigned long mfn;
@@ -57,7 +65,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
57 mfn = get_phys_to_machine(pfn); 65 mfn = get_phys_to_machine(pfn);
58 66
59 if (mfn != INVALID_P2M_ENTRY) 67 if (mfn != INVALID_P2M_ENTRY)
60 mfn &= ~FOREIGN_FRAME_BIT; 68 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
61 69
62 return mfn; 70 return mfn;
63} 71}
@@ -73,25 +81,44 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
73static inline unsigned long mfn_to_pfn(unsigned long mfn) 81static inline unsigned long mfn_to_pfn(unsigned long mfn)
74{ 82{
75 unsigned long pfn; 83 unsigned long pfn;
84 int ret = 0;
76 85
77 if (xen_feature(XENFEAT_auto_translated_physmap)) 86 if (xen_feature(XENFEAT_auto_translated_physmap))
78 return mfn; 87 return mfn;
79 88
89 if (unlikely((mfn >> machine_to_phys_order) != 0)) {
90 pfn = ~0;
91 goto try_override;
92 }
80 pfn = 0; 93 pfn = 0;
81 /* 94 /*
82 * The array access can fail (e.g., device space beyond end of RAM). 95 * The array access can fail (e.g., device space beyond end of RAM).
83 * In such cases it doesn't matter what we return (we return garbage), 96 * In such cases it doesn't matter what we return (we return garbage),
84 * but we must handle the fault without crashing! 97 * but we must handle the fault without crashing!
85 */ 98 */
86 __get_user(pfn, &machine_to_phys_mapping[mfn]); 99 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
87 100try_override:
88 /* 101 /* ret might be < 0 if there are no entries in the m2p for mfn */
89 * If this appears to be a foreign mfn (because the pfn 102 if (ret < 0)
90 * doesn't map back to the mfn), then check the local override 103 pfn = ~0;
91 * table to see if there's a better pfn to use. 104 else if (get_phys_to_machine(pfn) != mfn)
105 /*
106 * If this appears to be a foreign mfn (because the pfn
107 * doesn't map back to the mfn), then check the local override
108 * table to see if there's a better pfn to use.
109 *
110 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
111 */
112 pfn = m2p_find_override_pfn(mfn, ~0);
113
114 /*
115 * pfn is ~0 if there are no entries in the m2p for mfn or if the
116 * entry doesn't map back to the mfn and m2p_override doesn't have a
117 * valid entry for it.
92 */ 118 */
93 if (get_phys_to_machine(pfn) != mfn) 119 if (pfn == ~0 &&
94 pfn = m2p_find_override_pfn(mfn, pfn); 120 get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
121 pfn = mfn;
95 122
96 return pfn; 123 return pfn;
97} 124}
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 2329b3eaf8d3..aa8620989162 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -27,16 +27,16 @@ static inline void __init xen_setup_pirqs(void)
27 * its own functions. 27 * its own functions.
28 */ 28 */
29struct xen_pci_frontend_ops { 29struct xen_pci_frontend_ops {
30 int (*enable_msi)(struct pci_dev *dev, int **vectors); 30 int (*enable_msi)(struct pci_dev *dev, int vectors[]);
31 void (*disable_msi)(struct pci_dev *dev); 31 void (*disable_msi)(struct pci_dev *dev);
32 int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec); 32 int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec);
33 void (*disable_msix)(struct pci_dev *dev); 33 void (*disable_msix)(struct pci_dev *dev);
34}; 34};
35 35
36extern struct xen_pci_frontend_ops *xen_pci_frontend; 36extern struct xen_pci_frontend_ops *xen_pci_frontend;
37 37
38static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev, 38static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
39 int **vectors) 39 int vectors[])
40{ 40{
41 if (xen_pci_frontend && xen_pci_frontend->enable_msi) 41 if (xen_pci_frontend && xen_pci_frontend->enable_msi)
42 return xen_pci_frontend->enable_msi(dev, vectors); 42 return xen_pci_frontend->enable_msi(dev, vectors);
@@ -48,7 +48,7 @@ static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
48 xen_pci_frontend->disable_msi(dev); 48 xen_pci_frontend->disable_msi(dev);
49} 49}
50static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev, 50static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
51 int **vectors, int nvec) 51 int vectors[], int nvec)
52{ 52{
53 if (xen_pci_frontend && xen_pci_frontend->enable_msix) 53 if (xen_pci_frontend && xen_pci_frontend->enable_msix)
54 return xen_pci_frontend->enable_msix(dev, vectors, nvec); 54 return xen_pci_frontend->enable_msix(dev, vectors, nvec);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34244b2cd880..7338ef2218bc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -41,13 +41,13 @@ obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
43obj-y += bootflag.o e820.o 43obj-y += bootflag.o e820.o
44obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 44obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
46obj-y += tsc.o io_delay.o rtc.o 46obj-y += tsc.o io_delay.o rtc.o
47obj-y += pci-iommu_table.o 47obj-y += pci-iommu_table.o
48obj-y += resource.o 48obj-y += resource.o
49 49
50obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 50obj-y += trampoline.o trampoline_$(BITS).o
51obj-y += process.o 51obj-y += process.o
52obj-y += i387.o xsave.o 52obj-y += i387.o xsave.o
53obj-y += ptrace.o 53obj-y += ptrace.o
@@ -55,10 +55,12 @@ obj-$(CONFIG_X86_32) += tls.o
55obj-$(CONFIG_IA32_EMULATION) += tls.o 55obj-$(CONFIG_IA32_EMULATION) += tls.o
56obj-y += step.o 56obj-y += step.o
57obj-$(CONFIG_INTEL_TXT) += tboot.o 57obj-$(CONFIG_INTEL_TXT) += tboot.o
58obj-$(CONFIG_ISA_DMA_API) += i8237.o
58obj-$(CONFIG_STACKTRACE) += stacktrace.o 59obj-$(CONFIG_STACKTRACE) += stacktrace.o
59obj-y += cpu/ 60obj-y += cpu/
60obj-y += acpi/ 61obj-y += acpi/
61obj-y += reboot.o 62obj-y += reboot.o
63obj-$(CONFIG_X86_32) += reboot_32.o
62obj-$(CONFIG_MCA) += mca_32.o 64obj-$(CONFIG_MCA) += mca_32.o
63obj-$(CONFIG_X86_MSR) += msr.o 65obj-$(CONFIG_X86_MSR) += msr.o
64obj-$(CONFIG_X86_CPUID) += cpuid.o 66obj-$(CONFIG_X86_CPUID) += cpuid.o
@@ -66,10 +68,9 @@ obj-$(CONFIG_PCI) += early-quirks.o
66apm-y := apm_32.o 68apm-y := apm_32.o
67obj-$(CONFIG_APM) += apm.o 69obj-$(CONFIG_APM) += apm.o
68obj-$(CONFIG_SMP) += smp.o 70obj-$(CONFIG_SMP) += smp.o
69obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o 71obj-$(CONFIG_SMP) += smpboot.o
72obj-$(CONFIG_SMP) += tsc_sync.o
70obj-$(CONFIG_SMP) += setup_percpu.o 73obj-$(CONFIG_SMP) += setup_percpu.o
71obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
72obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
73obj-$(CONFIG_X86_MPPARSE) += mpparse.o 74obj-$(CONFIG_X86_MPPARSE) += mpparse.o
74obj-y += apic/ 75obj-y += apic/
75obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 76obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
@@ -109,6 +110,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o
109obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 110obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
110 111
111obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 112obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
113obj-$(CONFIG_OF) += devicetree.o
112 114
113### 115###
114# 64 bit specific files 116# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3e6e2d68f761..9a966c579af5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -595,14 +595,8 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
595 nid = acpi_get_node(handle); 595 nid = acpi_get_node(handle);
596 if (nid == -1 || !node_online(nid)) 596 if (nid == -1 || !node_online(nid))
597 return; 597 return;
598#ifdef CONFIG_X86_64 598 set_apicid_to_node(physid, nid);
599 apicid_to_node[physid] = nid;
600 numa_set_node(cpu, nid); 599 numa_set_node(cpu, nid);
601#else /* CONFIG_X86_32 */
602 apicid_2_node[physid] = nid;
603 cpu_to_node_map[cpu] = nid;
604#endif
605
606#endif 600#endif
607} 601}
608 602
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 28595d6df47c..ead21b663117 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -6,11 +6,17 @@
6#include <asm/page_types.h> 6#include <asm/page_types.h>
7#include <asm/pgtable_types.h> 7#include <asm/pgtable_types.h>
8#include <asm/processor-flags.h> 8#include <asm/processor-flags.h>
9#include "wakeup.h"
9 10
10 .code16 11 .code16
11 .section ".header", "a" 12 .section ".jump", "ax"
13 .globl _start
14_start:
15 cli
16 jmp wakeup_code
12 17
13/* This should match the structure in wakeup.h */ 18/* This should match the structure in wakeup.h */
19 .section ".header", "a"
14 .globl wakeup_header 20 .globl wakeup_header
15wakeup_header: 21wakeup_header:
16video_mode: .short 0 /* Video mode number */ 22video_mode: .short 0 /* Video mode number */
@@ -30,14 +36,11 @@ wakeup_jmp: .byte 0xea /* ljmpw */
30wakeup_jmp_off: .word 3f 36wakeup_jmp_off: .word 3f
31wakeup_jmp_seg: .word 0 37wakeup_jmp_seg: .word 0
32wakeup_gdt: .quad 0, 0, 0 38wakeup_gdt: .quad 0, 0, 0
33signature: .long 0x51ee1111 39signature: .long WAKEUP_HEADER_SIGNATURE
34 40
35 .text 41 .text
36 .globl _start
37 .code16 42 .code16
38wakeup_code: 43wakeup_code:
39_start:
40 cli
41 cld 44 cld
42 45
43 /* Apparently some dimwit BIOS programmers don't know how to 46 /* Apparently some dimwit BIOS programmers don't know how to
@@ -77,12 +80,12 @@ _start:
77 80
78 /* Check header signature... */ 81 /* Check header signature... */
79 movl signature, %eax 82 movl signature, %eax
80 cmpl $0x51ee1111, %eax 83 cmpl $WAKEUP_HEADER_SIGNATURE, %eax
81 jne bogus_real_magic 84 jne bogus_real_magic
82 85
83 /* Check we really have everything... */ 86 /* Check we really have everything... */
84 movl end_signature, %eax 87 movl end_signature, %eax
85 cmpl $0x65a22c82, %eax 88 cmpl $WAKEUP_END_SIGNATURE, %eax
86 jne bogus_real_magic 89 jne bogus_real_magic
87 90
88 /* Call the C code */ 91 /* Call the C code */
@@ -147,3 +150,7 @@ wakeup_heap:
147wakeup_stack: 150wakeup_stack:
148 .space 2048 151 .space 2048
149wakeup_stack_end: 152wakeup_stack_end:
153
154 .section ".signature","a"
155end_signature:
156 .long WAKEUP_END_SIGNATURE
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index 69d38d0b2b64..e1828c07e79c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -35,7 +35,8 @@ struct wakeup_header {
35extern struct wakeup_header wakeup_header; 35extern struct wakeup_header wakeup_header;
36#endif 36#endif
37 37
38#define HEADER_OFFSET 0x3f00 38#define WAKEUP_HEADER_OFFSET 8
39#define WAKEUP_SIZE 0x4000 39#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
40#define WAKEUP_END_SIGNATURE 0x65a22c82
40 41
41#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ 42#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 060fff8f5c5b..d4f8010a5b1b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -13,9 +13,19 @@ ENTRY(_start)
13SECTIONS 13SECTIONS
14{ 14{
15 . = 0; 15 . = 0;
16 .jump : {
17 *(.jump)
18 } = 0x90909090
19
20 . = WAKEUP_HEADER_OFFSET;
21 .header : {
22 *(.header)
23 }
24
25 . = ALIGN(16);
16 .text : { 26 .text : {
17 *(.text*) 27 *(.text*)
18 } 28 } = 0x90909090
19 29
20 . = ALIGN(16); 30 . = ALIGN(16);
21 .rodata : { 31 .rodata : {
@@ -33,11 +43,6 @@ SECTIONS
33 *(.data*) 43 *(.data*)
34 } 44 }
35 45
36 .signature : {
37 end_signature = .;
38 LONG(0x65a22c82)
39 }
40
41 . = ALIGN(16); 46 . = ALIGN(16);
42 .bss : { 47 .bss : {
43 __bss_start = .; 48 __bss_start = .;
@@ -45,20 +50,13 @@ SECTIONS
45 __bss_end = .; 50 __bss_end = .;
46 } 51 }
47 52
48 . = HEADER_OFFSET; 53 .signature : {
49 .header : { 54 *(.signature)
50 *(.header)
51 } 55 }
52 56
53 . = ALIGN(16);
54 _end = .; 57 _end = .;
55 58
56 /DISCARD/ : { 59 /DISCARD/ : {
57 *(.note*) 60 *(.note*)
58 } 61 }
59
60 /*
61 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
62 */
63 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
64} 62}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 5f1b747f6ef1..ff93bc1b09c3 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -18,12 +18,8 @@
18#include "realmode/wakeup.h" 18#include "realmode/wakeup.h"
19#include "sleep.h" 19#include "sleep.h"
20 20
21unsigned long acpi_wakeup_address;
22unsigned long acpi_realmode_flags; 21unsigned long acpi_realmode_flags;
23 22
24/* address in low memory of the wakeup routine. */
25static unsigned long acpi_realmode;
26
27#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) 23#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
28static char temp_stack[4096]; 24static char temp_stack[4096];
29#endif 25#endif
@@ -33,22 +29,17 @@ static char temp_stack[4096];
33 * 29 *
34 * Create an identity mapped page table and copy the wakeup routine to 30 * Create an identity mapped page table and copy the wakeup routine to
35 * low memory. 31 * low memory.
36 *
37 * Note that this is too late to change acpi_wakeup_address.
38 */ 32 */
39int acpi_suspend_lowlevel(void) 33int acpi_suspend_lowlevel(void)
40{ 34{
41 struct wakeup_header *header; 35 struct wakeup_header *header;
36 /* address in low memory of the wakeup routine. */
37 char *acpi_realmode;
42 38
43 if (!acpi_realmode) { 39 acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
44 printk(KERN_ERR "Could not allocate memory during boot, "
45 "S3 disabled\n");
46 return -ENOMEM;
47 }
48 memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
49 40
50 header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); 41 header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
51 if (header->signature != 0x51ee1111) { 42 if (header->signature != WAKEUP_HEADER_SIGNATURE) {
52 printk(KERN_ERR "wakeup header does not match\n"); 43 printk(KERN_ERR "wakeup header does not match\n");
53 return -EINVAL; 44 return -EINVAL;
54 } 45 }
@@ -68,9 +59,7 @@ int acpi_suspend_lowlevel(void)
68 /* GDT[0]: GDT self-pointer */ 59 /* GDT[0]: GDT self-pointer */
69 header->wakeup_gdt[0] = 60 header->wakeup_gdt[0] =
70 (u64)(sizeof(header->wakeup_gdt) - 1) + 61 (u64)(sizeof(header->wakeup_gdt) - 1) +
71 ((u64)(acpi_wakeup_address + 62 ((u64)__pa(&header->wakeup_gdt) << 16);
72 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
73 << 16);
74 /* GDT[1]: big real mode-like code segment */ 63 /* GDT[1]: big real mode-like code segment */
75 header->wakeup_gdt[1] = 64 header->wakeup_gdt[1] =
76 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); 65 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
@@ -96,7 +85,7 @@ int acpi_suspend_lowlevel(void)
96 header->pmode_cr3 = (u32)__pa(&initial_page_table); 85 header->pmode_cr3 = (u32)__pa(&initial_page_table);
97 saved_magic = 0x12345678; 86 saved_magic = 0x12345678;
98#else /* CONFIG_64BIT */ 87#else /* CONFIG_64BIT */
99 header->trampoline_segment = setup_trampoline() >> 4; 88 header->trampoline_segment = trampoline_address() >> 4;
100#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
101 stack_start = (unsigned long)temp_stack + sizeof(temp_stack); 90 stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
102 early_gdt_descr.address = 91 early_gdt_descr.address =
@@ -111,45 +100,6 @@ int acpi_suspend_lowlevel(void)
111 return 0; 100 return 0;
112} 101}
113 102
114/**
115 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
116 *
117 * We allocate a page from the first 1MB of memory for the wakeup
118 * routine for when we come back from a sleep state. The
119 * runtime allocator allows specification of <16MB pages, but not
120 * <1MB pages.
121 */
122void __init acpi_reserve_wakeup_memory(void)
123{
124 phys_addr_t mem;
125
126 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
127 printk(KERN_ERR
128 "ACPI: Wakeup code way too big, S3 disabled.\n");
129 return;
130 }
131
132 mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
133
134 if (mem == MEMBLOCK_ERROR) {
135 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
136 return;
137 }
138 acpi_realmode = (unsigned long) phys_to_virt(mem);
139 acpi_wakeup_address = mem;
140 memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
141}
142
143int __init acpi_configure_wakeup_memory(void)
144{
145 if (acpi_realmode)
146 set_memory_x(acpi_realmode, WAKEUP_SIZE >> PAGE_SHIFT);
147
148 return 0;
149}
150arch_initcall(acpi_configure_wakeup_memory);
151
152
153static int __init acpi_sleep_setup(char *str) 103static int __init acpi_sleep_setup(char *str)
154{ 104{
155 while ((str != NULL) && (*str != '\0')) { 105 while ((str != NULL) && (*str != '\0')) {
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index 31ce13f20297..416d4be13fef 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -4,13 +4,10 @@
4 4
5#include <asm/trampoline.h> 5#include <asm/trampoline.h>
6 6
7extern char wakeup_code_start, wakeup_code_end;
8
9extern unsigned long saved_video_mode; 7extern unsigned long saved_video_mode;
10extern long saved_magic; 8extern long saved_magic;
11 9
12extern int wakeup_pmode_return; 10extern int wakeup_pmode_return;
13extern char swsusp_pg_dir[PAGE_SIZE];
14 11
15extern unsigned long acpi_copy_wakeup_routine(unsigned long); 12extern unsigned long acpi_copy_wakeup_routine(unsigned long);
16extern void wakeup_long64(void); 13extern void wakeup_long64(void);
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
index 6ff3b5730575..63b8ab524f2c 100644
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ b/arch/x86/kernel/acpi/wakeup_rm.S
@@ -2,9 +2,11 @@
2 * Wrapper script for the realmode binary as a transport object 2 * Wrapper script for the realmode binary as a transport object
3 * before copying to low memory. 3 * before copying to low memory.
4 */ 4 */
5 .section ".rodata","a" 5#include <asm/page_types.h>
6 .globl wakeup_code_start, wakeup_code_end 6
7wakeup_code_start: 7 .section ".x86_trampoline","a"
8 .balign PAGE_SIZE
9 .globl acpi_wakeup_code
10acpi_wakeup_code:
8 .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" 11 .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
9wakeup_code_end: 12 .size acpi_wakeup_code, .-acpi_wakeup_code
10 .size wakeup_code_start, .-wakeup_code_start
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7038b95d363f..4a234677e213 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -199,7 +199,7 @@ void *text_poke_early(void *addr, const void *opcode, size_t len);
199 199
200/* Replace instructions with better alternatives for this CPU type. 200/* Replace instructions with better alternatives for this CPU type.
201 This runs before SMP is initialized to avoid SMP problems with 201 This runs before SMP is initialized to avoid SMP problems with
202 self modifying code. This implies that assymetric systems where 202 self modifying code. This implies that asymmetric systems where
203 APs have less capabilities than the boot processor are not handled. 203 APs have less capabilities than the boot processor are not handled.
204 Tough. Make sure you disable such features by hand. */ 204 Tough. Make sure you disable such features by hand. */
205 205
@@ -620,7 +620,12 @@ static int __kprobes stop_machine_text_poke(void *data)
620 flush_icache_range((unsigned long)p->addr, 620 flush_icache_range((unsigned long)p->addr,
621 (unsigned long)p->addr + p->len); 621 (unsigned long)p->addr + p->len);
622 } 622 }
623 623 /*
624 * Intel Archiecture Software Developer's Manual section 7.1.3 specifies
625 * that a core serializing instruction such as "cpuid" should be
626 * executed on _each_ core before the new instruction is made visible.
627 */
628 sync_core();
624 return 0; 629 return 0;
625} 630}
626 631
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 0a99f7198bc3..6801959a8b2a 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -12,14 +12,19 @@
12 12
13static u32 *flush_words; 13static u32 *flush_words;
14 14
15struct pci_device_id amd_nb_misc_ids[] = { 15const struct pci_device_id amd_nb_misc_ids[] = {
16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, 16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, 17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, 18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
19 {} 19 {}
20}; 20};
21EXPORT_SYMBOL(amd_nb_misc_ids); 21EXPORT_SYMBOL(amd_nb_misc_ids);
22 22
23static struct pci_device_id amd_nb_link_ids[] = {
24 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_LINK) },
25 {}
26};
27
23const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { 28const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
24 { 0x00, 0x18, 0x20 }, 29 { 0x00, 0x18, 0x20 },
25 { 0xff, 0x00, 0x20 }, 30 { 0xff, 0x00, 0x20 },
@@ -31,7 +36,7 @@ struct amd_northbridge_info amd_northbridges;
31EXPORT_SYMBOL(amd_northbridges); 36EXPORT_SYMBOL(amd_northbridges);
32 37
33static struct pci_dev *next_northbridge(struct pci_dev *dev, 38static struct pci_dev *next_northbridge(struct pci_dev *dev,
34 struct pci_device_id *ids) 39 const struct pci_device_id *ids)
35{ 40{
36 do { 41 do {
37 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); 42 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
@@ -43,9 +48,9 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev,
43 48
44int amd_cache_northbridges(void) 49int amd_cache_northbridges(void)
45{ 50{
46 int i = 0; 51 u16 i = 0;
47 struct amd_northbridge *nb; 52 struct amd_northbridge *nb;
48 struct pci_dev *misc; 53 struct pci_dev *misc, *link;
49 54
50 if (amd_nb_num()) 55 if (amd_nb_num())
51 return 0; 56 return 0;
@@ -64,10 +69,12 @@ int amd_cache_northbridges(void)
64 amd_northbridges.nb = nb; 69 amd_northbridges.nb = nb;
65 amd_northbridges.num = i; 70 amd_northbridges.num = i;
66 71
67 misc = NULL; 72 link = misc = NULL;
68 for (i = 0; i != amd_nb_num(); i++) { 73 for (i = 0; i != amd_nb_num(); i++) {
69 node_to_amd_nb(i)->misc = misc = 74 node_to_amd_nb(i)->misc = misc =
70 next_northbridge(misc, amd_nb_misc_ids); 75 next_northbridge(misc, amd_nb_misc_ids);
76 node_to_amd_nb(i)->link = link =
77 next_northbridge(link, amd_nb_link_ids);
71 } 78 }
72 79
73 /* some CPU families (e.g. family 0x11) do not support GART */ 80 /* some CPU families (e.g. family 0x11) do not support GART */
@@ -85,26 +92,95 @@ int amd_cache_northbridges(void)
85 boot_cpu_data.x86_mask >= 0x1)) 92 boot_cpu_data.x86_mask >= 0x1))
86 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; 93 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
87 94
95 if (boot_cpu_data.x86 == 0x15)
96 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
97
98 /* L3 cache partitioning is supported on family 0x15 */
99 if (boot_cpu_data.x86 == 0x15)
100 amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
101
88 return 0; 102 return 0;
89} 103}
90EXPORT_SYMBOL_GPL(amd_cache_northbridges); 104EXPORT_SYMBOL_GPL(amd_cache_northbridges);
91 105
92/* Ignores subdevice/subvendor but as far as I can figure out 106/*
93 they're useless anyways */ 107 * Ignores subdevice/subvendor but as far as I can figure out
94int __init early_is_amd_nb(u32 device) 108 * they're useless anyways
109 */
110bool __init early_is_amd_nb(u32 device)
95{ 111{
96 struct pci_device_id *id; 112 const struct pci_device_id *id;
97 u32 vendor = device & 0xffff; 113 u32 vendor = device & 0xffff;
114
98 device >>= 16; 115 device >>= 16;
99 for (id = amd_nb_misc_ids; id->vendor; id++) 116 for (id = amd_nb_misc_ids; id->vendor; id++)
100 if (vendor == id->vendor && device == id->device) 117 if (vendor == id->vendor && device == id->device)
101 return 1; 118 return true;
119 return false;
120}
121
122int amd_get_subcaches(int cpu)
123{
124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
125 unsigned int mask;
126 int cuid = 0;
127
128 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
129 return 0;
130
131 pci_read_config_dword(link, 0x1d4, &mask);
132
133#ifdef CONFIG_SMP
134 cuid = cpu_data(cpu).compute_unit_id;
135#endif
136 return (mask >> (4 * cuid)) & 0xf;
137}
138
139int amd_set_subcaches(int cpu, int mask)
140{
141 static unsigned int reset, ban;
142 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
143 unsigned int reg;
144 int cuid = 0;
145
146 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
147 return -EINVAL;
148
149 /* if necessary, collect reset state of L3 partitioning and BAN mode */
150 if (reset == 0) {
151 pci_read_config_dword(nb->link, 0x1d4, &reset);
152 pci_read_config_dword(nb->misc, 0x1b8, &ban);
153 ban &= 0x180000;
154 }
155
156 /* deactivate BAN mode if any subcaches are to be disabled */
157 if (mask != 0xf) {
158 pci_read_config_dword(nb->misc, 0x1b8, &reg);
159 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
160 }
161
162#ifdef CONFIG_SMP
163 cuid = cpu_data(cpu).compute_unit_id;
164#endif
165 mask <<= 4 * cuid;
166 mask |= (0xf ^ (1 << cuid)) << 26;
167
168 pci_write_config_dword(nb->link, 0x1d4, mask);
169
170 /* reset BAN mode if L3 partitioning returned to reset state */
171 pci_read_config_dword(nb->link, 0x1d4, &reg);
172 if (reg == reset) {
173 pci_read_config_dword(nb->misc, 0x1b8, &reg);
174 reg &= ~0x180000;
175 pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
176 }
177
102 return 0; 178 return 0;
103} 179}
104 180
105int amd_cache_gart(void) 181static int amd_cache_gart(void)
106{ 182{
107 int i; 183 u16 i;
108 184
109 if (!amd_nb_has_feature(AMD_NB_GART)) 185 if (!amd_nb_has_feature(AMD_NB_GART))
110 return 0; 186 return 0;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 51d4e1663066..1293c709ee85 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -508,64 +508,12 @@ static int apbt_next_event(unsigned long delta,
508 return 0; 508 return 0;
509} 509}
510 510
511/*
512 * APB timer clock is not in sync with pclk on Langwell, which translates to
513 * unreliable read value caused by sampling error. the error does not add up
514 * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
515 * would go backwards. the following code is trying to prevent time traveling
516 * backwards. little bit paranoid.
517 */
518static cycle_t apbt_read_clocksource(struct clocksource *cs) 511static cycle_t apbt_read_clocksource(struct clocksource *cs)
519{ 512{
520 unsigned long t0, t1, t2; 513 unsigned long current_count;
521 static unsigned long last_read; 514
522 515 current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
523bad_count: 516 return (cycle_t)~current_count;
524 t1 = apbt_readl(phy_cs_timer_id,
525 APBTMR_N_CURRENT_VALUE);
526 t2 = apbt_readl(phy_cs_timer_id,
527 APBTMR_N_CURRENT_VALUE);
528 if (unlikely(t1 < t2)) {
529 pr_debug("APBT: read current count error %lx:%lx:%lx\n",
530 t1, t2, t2 - t1);
531 goto bad_count;
532 }
533 /*
534 * check against cached last read, makes sure time does not go back.
535 * it could be a normal rollover but we will do tripple check anyway
536 */
537 if (unlikely(t2 > last_read)) {
538 /* check if we have a normal rollover */
539 unsigned long raw_intr_status =
540 apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
541 /*
542 * cs timer interrupt is masked but raw intr bit is set if
543 * rollover occurs. then we read EOI reg to clear it.
544 */
545 if (raw_intr_status & (1 << phy_cs_timer_id)) {
546 apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
547 goto out;
548 }
549 pr_debug("APB CS going back %lx:%lx:%lx ",
550 t2, last_read, t2 - last_read);
551bad_count_x3:
552 pr_debug("triple check enforced\n");
553 t0 = apbt_readl(phy_cs_timer_id,
554 APBTMR_N_CURRENT_VALUE);
555 udelay(1);
556 t1 = apbt_readl(phy_cs_timer_id,
557 APBTMR_N_CURRENT_VALUE);
558 udelay(1);
559 t2 = apbt_readl(phy_cs_timer_id,
560 APBTMR_N_CURRENT_VALUE);
561 if ((t2 > t1) || (t1 > t0)) {
562 printk(KERN_ERR "Error: APB CS tripple check failed\n");
563 goto bad_count_x3;
564 }
565 }
566out:
567 last_read = t2;
568 return (cycle_t)~t2;
569} 517}
570 518
571static int apbt_clocksource_register(void) 519static int apbt_clocksource_register(void)
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 5955a7800a96..86d1ad4962a7 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -13,7 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/bootmem.h> 16#include <linux/memblock.h>
17#include <linux/mmzone.h> 17#include <linux/mmzone.h>
18#include <linux/pci_ids.h> 18#include <linux/pci_ids.h>
19#include <linux/pci.h> 19#include <linux/pci.h>
@@ -57,7 +57,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
57static u32 __init allocate_aperture(void) 57static u32 __init allocate_aperture(void)
58{ 58{
59 u32 aper_size; 59 u32 aper_size;
60 void *p; 60 unsigned long addr;
61 61
62 /* aper_size should <= 1G */ 62 /* aper_size should <= 1G */
63 if (fallback_aper_order > 5) 63 if (fallback_aper_order > 5)
@@ -73,7 +73,7 @@ static u32 __init allocate_aperture(void)
73 /* 73 /*
74 * using 512M as goal, in case kexec will load kernel_big 74 * using 512M as goal, in case kexec will load kernel_big
75 * that will do the on position decompress, and could overlap with 75 * that will do the on position decompress, and could overlap with
76 * that positon with gart that is used. 76 * that position with gart that is used.
77 * sequende: 77 * sequende:
78 * kernel_small 78 * kernel_small
79 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) 79 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
@@ -83,27 +83,26 @@ static u32 __init allocate_aperture(void)
83 * so don't use 512M below as gart iommu, leave the space for kernel 83 * so don't use 512M below as gart iommu, leave the space for kernel
84 * code for safe 84 * code for safe
85 */ 85 */
86 p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); 86 addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
87 if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
88 printk(KERN_ERR
89 "Cannot allocate aperture memory hole (%lx,%uK)\n",
90 addr, aper_size>>10);
91 return 0;
92 }
93 memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
87 /* 94 /*
88 * Kmemleak should not scan this block as it may not be mapped via the 95 * Kmemleak should not scan this block as it may not be mapped via the
89 * kernel direct mapping. 96 * kernel direct mapping.
90 */ 97 */
91 kmemleak_ignore(p); 98 kmemleak_ignore(phys_to_virt(addr));
92 if (!p || __pa(p)+aper_size > 0xffffffff) {
93 printk(KERN_ERR
94 "Cannot allocate aperture memory hole (%p,%uK)\n",
95 p, aper_size>>10);
96 if (p)
97 free_bootmem(__pa(p), aper_size);
98 return 0;
99 }
100 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", 99 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
101 aper_size >> 10, __pa(p)); 100 aper_size >> 10, addr);
102 insert_aperture_resource((u32)__pa(p), aper_size); 101 insert_aperture_resource((u32)addr, aper_size);
103 register_nosave_region((u32)__pa(p) >> PAGE_SHIFT, 102 register_nosave_region(addr >> PAGE_SHIFT,
104 (u32)__pa(p+aper_size) >> PAGE_SHIFT); 103 (addr+aper_size) >> PAGE_SHIFT);
105 104
106 return (u32)__pa(p); 105 return (u32)addr;
107} 106}
108 107
109 108
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 76b96d74978a..966673f44141 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -43,6 +43,7 @@
43#include <asm/i8259.h> 43#include <asm/i8259.h>
44#include <asm/proto.h> 44#include <asm/proto.h>
45#include <asm/apic.h> 45#include <asm/apic.h>
46#include <asm/io_apic.h>
46#include <asm/desc.h> 47#include <asm/desc.h>
47#include <asm/hpet.h> 48#include <asm/hpet.h>
48#include <asm/idle.h> 49#include <asm/idle.h>
@@ -78,12 +79,21 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
78EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); 79EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
79 80
80#ifdef CONFIG_X86_32 81#ifdef CONFIG_X86_32
82
83/*
84 * On x86_32, the mapping between cpu and logical apicid may vary
85 * depending on apic in use. The following early percpu variable is
86 * used for the mapping. This is where the behaviors of x86_64 and 32
87 * actually diverge. Let's keep it ugly for now.
88 */
89DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
90
81/* 91/*
82 * Knob to control our willingness to enable the local APIC. 92 * Knob to control our willingness to enable the local APIC.
83 * 93 *
84 * +1=force-enable 94 * +1=force-enable
85 */ 95 */
86static int force_enable_local_apic; 96static int force_enable_local_apic __initdata;
87/* 97/*
88 * APIC command line parameters 98 * APIC command line parameters
89 */ 99 */
@@ -153,7 +163,7 @@ early_param("nox2apic", setup_nox2apic);
153unsigned long mp_lapic_addr; 163unsigned long mp_lapic_addr;
154int disable_apic; 164int disable_apic;
155/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 165/* Disable local APIC timer from the kernel commandline or via dmi quirk */
156static int disable_apic_timer __cpuinitdata; 166static int disable_apic_timer __initdata;
157/* Local APIC timer works in C2 */ 167/* Local APIC timer works in C2 */
158int local_apic_timer_c2_ok; 168int local_apic_timer_c2_ok;
159EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 169EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -177,29 +187,8 @@ static struct resource lapic_resource = {
177 187
178static unsigned int calibration_result; 188static unsigned int calibration_result;
179 189
180static int lapic_next_event(unsigned long delta,
181 struct clock_event_device *evt);
182static void lapic_timer_setup(enum clock_event_mode mode,
183 struct clock_event_device *evt);
184static void lapic_timer_broadcast(const struct cpumask *mask);
185static void apic_pm_activate(void); 190static void apic_pm_activate(void);
186 191
187/*
188 * The local apic timer can be used for any function which is CPU local.
189 */
190static struct clock_event_device lapic_clockevent = {
191 .name = "lapic",
192 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
193 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
194 .shift = 32,
195 .set_mode = lapic_timer_setup,
196 .set_next_event = lapic_next_event,
197 .broadcast = lapic_timer_broadcast,
198 .rating = 100,
199 .irq = -1,
200};
201static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
202
203static unsigned long apic_phys; 192static unsigned long apic_phys;
204 193
205/* 194/*
@@ -238,7 +227,7 @@ static int modern_apic(void)
238 * right after this call apic become NOOP driven 227 * right after this call apic become NOOP driven
239 * so apic->write/read doesn't do anything 228 * so apic->write/read doesn't do anything
240 */ 229 */
241void apic_disable(void) 230static void __init apic_disable(void)
242{ 231{
243 pr_info("APIC: switched to apic NOOP\n"); 232 pr_info("APIC: switched to apic NOOP\n");
244 apic = &apic_noop; 233 apic = &apic_noop;
@@ -282,23 +271,6 @@ u64 native_apic_icr_read(void)
282 return icr1 | ((u64)icr2 << 32); 271 return icr1 | ((u64)icr2 << 32);
283} 272}
284 273
285/**
286 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
287 */
288void __cpuinit enable_NMI_through_LVT0(void)
289{
290 unsigned int v;
291
292 /* unmask and set to NMI */
293 v = APIC_DM_NMI;
294
295 /* Level triggered for 82489DX (32bit mode) */
296 if (!lapic_is_integrated())
297 v |= APIC_LVT_LEVEL_TRIGGER;
298
299 apic_write(APIC_LVT0, v);
300}
301
302#ifdef CONFIG_X86_32 274#ifdef CONFIG_X86_32
303/** 275/**
304 * get_physical_broadcast - Get number of physical broadcast IDs 276 * get_physical_broadcast - Get number of physical broadcast IDs
@@ -508,6 +480,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
508#endif 480#endif
509} 481}
510 482
483
484/*
485 * The local apic timer can be used for any function which is CPU local.
486 */
487static struct clock_event_device lapic_clockevent = {
488 .name = "lapic",
489 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
490 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
491 .shift = 32,
492 .set_mode = lapic_timer_setup,
493 .set_next_event = lapic_next_event,
494 .broadcast = lapic_timer_broadcast,
495 .rating = 100,
496 .irq = -1,
497};
498static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
499
511/* 500/*
512 * Setup the local APIC timer for this CPU. Copy the initialized values 501 * Setup the local APIC timer for this CPU. Copy the initialized values
513 * of the boot CPU and register the clock event in the framework. 502 * of the boot CPU and register the clock event in the framework.
@@ -1209,7 +1198,7 @@ void __cpuinit setup_local_APIC(void)
1209 rdtscll(tsc); 1198 rdtscll(tsc);
1210 1199
1211 if (disable_apic) { 1200 if (disable_apic) {
1212 arch_disable_smp_support(); 1201 disable_ioapic_support();
1213 return; 1202 return;
1214 } 1203 }
1215 1204
@@ -1237,6 +1226,19 @@ void __cpuinit setup_local_APIC(void)
1237 */ 1226 */
1238 apic->init_apic_ldr(); 1227 apic->init_apic_ldr();
1239 1228
1229#ifdef CONFIG_X86_32
1230 /*
1231 * APIC LDR is initialized. If logical_apicid mapping was
1232 * initialized during get_smp_config(), make sure it matches the
1233 * actual value.
1234 */
1235 i = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
1236 WARN_ON(i != BAD_APICID && i != logical_smp_processor_id());
1237 /* always use the value from LDR */
1238 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1239 logical_smp_processor_id();
1240#endif
1241
1240 /* 1242 /*
1241 * Set Task Priority to 'accept all'. We never change this 1243 * Set Task Priority to 'accept all'. We never change this
1242 * later on. 1244 * later on.
@@ -1448,7 +1450,7 @@ int __init enable_IR(void)
1448void __init enable_IR_x2apic(void) 1450void __init enable_IR_x2apic(void)
1449{ 1451{
1450 unsigned long flags; 1452 unsigned long flags;
1451 struct IO_APIC_route_entry **ioapic_entries = NULL; 1453 struct IO_APIC_route_entry **ioapic_entries;
1452 int ret, x2apic_enabled = 0; 1454 int ret, x2apic_enabled = 0;
1453 int dmar_table_init_ret; 1455 int dmar_table_init_ret;
1454 1456
@@ -1537,7 +1539,7 @@ static int __init detect_init_APIC(void)
1537} 1539}
1538#else 1540#else
1539 1541
1540static int apic_verify(void) 1542static int __init apic_verify(void)
1541{ 1543{
1542 u32 features, h, l; 1544 u32 features, h, l;
1543 1545
@@ -1562,7 +1564,7 @@ static int apic_verify(void)
1562 return 0; 1564 return 0;
1563} 1565}
1564 1566
1565int apic_force_enable(void) 1567int __init apic_force_enable(unsigned long addr)
1566{ 1568{
1567 u32 h, l; 1569 u32 h, l;
1568 1570
@@ -1578,7 +1580,7 @@ int apic_force_enable(void)
1578 if (!(l & MSR_IA32_APICBASE_ENABLE)) { 1580 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1579 pr_info("Local APIC disabled by BIOS -- reenabling.\n"); 1581 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1580 l &= ~MSR_IA32_APICBASE_BASE; 1582 l &= ~MSR_IA32_APICBASE_BASE;
1581 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; 1583 l |= MSR_IA32_APICBASE_ENABLE | addr;
1582 wrmsr(MSR_IA32_APICBASE, l, h); 1584 wrmsr(MSR_IA32_APICBASE, l, h);
1583 enabled_via_apicbase = 1; 1585 enabled_via_apicbase = 1;
1584 } 1586 }
@@ -1619,7 +1621,7 @@ static int __init detect_init_APIC(void)
1619 "you can enable it with \"lapic\"\n"); 1621 "you can enable it with \"lapic\"\n");
1620 return -1; 1622 return -1;
1621 } 1623 }
1622 if (apic_force_enable()) 1624 if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
1623 return -1; 1625 return -1;
1624 } else { 1626 } else {
1625 if (apic_verify()) 1627 if (apic_verify())
@@ -1930,17 +1932,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1930{ 1932{
1931 int cpu; 1933 int cpu;
1932 1934
1933 /*
1934 * Validate version
1935 */
1936 if (version == 0x0) {
1937 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
1938 "fixing up to 0x10. (tell your hw vendor)\n",
1939 version);
1940 version = 0x10;
1941 }
1942 apic_version[apicid] = version;
1943
1944 if (num_processors >= nr_cpu_ids) { 1935 if (num_processors >= nr_cpu_ids) {
1945 int max = nr_cpu_ids; 1936 int max = nr_cpu_ids;
1946 int thiscpu = max + disabled_cpus; 1937 int thiscpu = max + disabled_cpus;
@@ -1954,22 +1945,34 @@ void __cpuinit generic_processor_info(int apicid, int version)
1954 } 1945 }
1955 1946
1956 num_processors++; 1947 num_processors++;
1957 cpu = cpumask_next_zero(-1, cpu_present_mask);
1958
1959 if (version != apic_version[boot_cpu_physical_apicid])
1960 WARN_ONCE(1,
1961 "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
1962 apic_version[boot_cpu_physical_apicid], cpu, version);
1963
1964 physid_set(apicid, phys_cpu_present_map);
1965 if (apicid == boot_cpu_physical_apicid) { 1948 if (apicid == boot_cpu_physical_apicid) {
1966 /* 1949 /*
1967 * x86_bios_cpu_apicid is required to have processors listed 1950 * x86_bios_cpu_apicid is required to have processors listed
1968 * in same order as logical cpu numbers. Hence the first 1951 * in same order as logical cpu numbers. Hence the first
1969 * entry is BSP, and so on. 1952 * entry is BSP, and so on.
1953 * boot_cpu_init() already hold bit 0 in cpu_present_mask
1954 * for BSP.
1970 */ 1955 */
1971 cpu = 0; 1956 cpu = 0;
1957 } else
1958 cpu = cpumask_next_zero(-1, cpu_present_mask);
1959
1960 /*
1961 * Validate version
1962 */
1963 if (version == 0x0) {
1964 pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
1965 cpu, apicid);
1966 version = 0x10;
1972 } 1967 }
1968 apic_version[apicid] = version;
1969
1970 if (version != apic_version[boot_cpu_physical_apicid]) {
1971 pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
1972 apic_version[boot_cpu_physical_apicid], cpu, version);
1973 }
1974
1975 physid_set(apicid, phys_cpu_present_map);
1973 if (apicid > max_physical_apicid) 1976 if (apicid > max_physical_apicid)
1974 max_physical_apicid = apicid; 1977 max_physical_apicid = apicid;
1975 1978
@@ -1977,7 +1980,10 @@ void __cpuinit generic_processor_info(int apicid, int version)
1977 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1980 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1978 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1981 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1979#endif 1982#endif
1980 1983#ifdef CONFIG_X86_32
1984 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1985 apic->x86_32_early_logical_apicid(cpu);
1986#endif
1981 set_cpu_possible(cpu, true); 1987 set_cpu_possible(cpu, true);
1982 set_cpu_present(cpu, true); 1988 set_cpu_present(cpu, true);
1983} 1989}
@@ -1998,10 +2004,14 @@ void default_init_apic_ldr(void)
1998} 2004}
1999 2005
2000#ifdef CONFIG_X86_32 2006#ifdef CONFIG_X86_32
2001int default_apicid_to_node(int logical_apicid) 2007int default_x86_32_numa_cpu_node(int cpu)
2002{ 2008{
2003#ifdef CONFIG_SMP 2009#ifdef CONFIG_NUMA
2004 return apicid_2_node[hard_smp_processor_id()]; 2010 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
2011
2012 if (apicid != BAD_APICID)
2013 return __apicid_to_node[apicid];
2014 return NUMA_NO_NODE;
2005#else 2015#else
2006 return 0; 2016 return 0;
2007#endif 2017#endif
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 09d3b17ce0c2..5652d31fe108 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -185,8 +185,6 @@ struct apic apic_flat = {
185 .ioapic_phys_id_map = NULL, 185 .ioapic_phys_id_map = NULL,
186 .setup_apic_routing = NULL, 186 .setup_apic_routing = NULL,
187 .multi_timer_check = NULL, 187 .multi_timer_check = NULL,
188 .apicid_to_node = NULL,
189 .cpu_to_logical_apicid = NULL,
190 .cpu_present_to_apicid = default_cpu_present_to_apicid, 188 .cpu_present_to_apicid = default_cpu_present_to_apicid,
191 .apicid_to_cpu_present = NULL, 189 .apicid_to_cpu_present = NULL,
192 .setup_portio_remap = NULL, 190 .setup_portio_remap = NULL,
@@ -337,8 +335,6 @@ struct apic apic_physflat = {
337 .ioapic_phys_id_map = NULL, 335 .ioapic_phys_id_map = NULL,
338 .setup_apic_routing = NULL, 336 .setup_apic_routing = NULL,
339 .multi_timer_check = NULL, 337 .multi_timer_check = NULL,
340 .apicid_to_node = NULL,
341 .cpu_to_logical_apicid = NULL,
342 .cpu_present_to_apicid = default_cpu_present_to_apicid, 338 .cpu_present_to_apicid = default_cpu_present_to_apicid,
343 .apicid_to_cpu_present = NULL, 339 .apicid_to_cpu_present = NULL,
344 .setup_portio_remap = NULL, 340 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e31b9ffe25f5..f1baa2dc087a 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void)
54 return 0; 54 return 0;
55} 55}
56 56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb) 57static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{ 58{
64 return 0; 59 return 0;
@@ -113,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
113 cpumask_set_cpu(cpu, retmask); 108 cpumask_set_cpu(cpu, retmask);
114} 109}
115 110
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg) 111static u32 noop_apic_read(u32 reg)
123{ 112{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic)); 113 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
@@ -130,6 +119,14 @@ static void noop_apic_write(u32 reg, u32 v)
130 WARN_ON_ONCE(cpu_has_apic && !disable_apic); 119 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
131} 120}
132 121
122#ifdef CONFIG_X86_32
123static int noop_x86_32_numa_cpu_node(int cpu)
124{
125 /* we're always on node 0 */
126 return 0;
127}
128#endif
129
133struct apic apic_noop = { 130struct apic apic_noop = {
134 .name = "noop", 131 .name = "noop",
135 .probe = noop_probe, 132 .probe = noop_probe,
@@ -153,9 +150,7 @@ struct apic apic_noop = {
153 .ioapic_phys_id_map = default_ioapic_phys_id_map, 150 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL, 151 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL, 152 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157 153
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid, 154 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid, 155 .apicid_to_cpu_present = physid_set_mask_of_physid,
161 156
@@ -197,4 +192,9 @@ struct apic apic_noop = {
197 .icr_write = noop_apic_icr_write, 192 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle, 193 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, 194 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
195
196#ifdef CONFIG_X86_32
197 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
198 .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node,
199#endif
200}; 200};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index cb804c5091b9..541a2e431659 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit)
45 return 1; 45 return 1;
46} 46}
47 47
48static int bigsmp_early_logical_apicid(int cpu)
49{
50 /* on bigsmp, logical apicid is the same as physical */
51 return early_per_cpu(x86_cpu_to_apicid, cpu);
52}
53
48static inline unsigned long calculate_ldr(int cpu) 54static inline unsigned long calculate_ldr(int cpu)
49{ 55{
50 unsigned long val, id; 56 unsigned long val, id;
@@ -80,11 +86,6 @@ static void bigsmp_setup_apic_routing(void)
80 nr_ioapics); 86 nr_ioapics);
81} 87}
82 88
83static int bigsmp_apicid_to_node(int logical_apicid)
84{
85 return apicid_2_node[hard_smp_processor_id()];
86}
87
88static int bigsmp_cpu_present_to_apicid(int mps_cpu) 89static int bigsmp_cpu_present_to_apicid(int mps_cpu)
89{ 90{
90 if (mps_cpu < nr_cpu_ids) 91 if (mps_cpu < nr_cpu_ids)
@@ -93,14 +94,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 94 return BAD_APICID;
94} 95}
95 96
96/* Mapping from cpu number to logical apicid */
97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
98{
99 if (cpu >= nr_cpu_ids)
100 return BAD_APICID;
101 return cpu_physical_id(cpu);
102}
103
104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 97static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
105{ 98{
106 /* For clustered we don't have a good way to do this yet - hack */ 99 /* For clustered we don't have a good way to do this yet - hack */
@@ -115,7 +108,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
115/* As we are using single CPU as destination, pick only one CPU here */ 108/* As we are using single CPU as destination, pick only one CPU here */
116static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) 109static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
117{ 110{
118 return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); 111 int cpu = cpumask_first(cpumask);
112
113 if (cpu < nr_cpu_ids)
114 return cpu_physical_id(cpu);
115 return BAD_APICID;
119} 116}
120 117
121static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 118static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
@@ -129,9 +126,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
129 */ 126 */
130 for_each_cpu_and(cpu, cpumask, andmask) { 127 for_each_cpu_and(cpu, cpumask, andmask) {
131 if (cpumask_test_cpu(cpu, cpu_online_mask)) 128 if (cpumask_test_cpu(cpu, cpu_online_mask))
132 break; 129 return cpu_physical_id(cpu);
133 } 130 }
134 return bigsmp_cpu_to_logical_apicid(cpu); 131 return BAD_APICID;
135} 132}
136 133
137static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 134static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -219,8 +216,6 @@ struct apic apic_bigsmp = {
219 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, 216 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
220 .setup_apic_routing = bigsmp_setup_apic_routing, 217 .setup_apic_routing = bigsmp_setup_apic_routing,
221 .multi_timer_check = NULL, 218 .multi_timer_check = NULL,
222 .apicid_to_node = bigsmp_apicid_to_node,
223 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
224 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 219 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
225 .apicid_to_cpu_present = physid_set_mask_of_physid, 220 .apicid_to_cpu_present = physid_set_mask_of_physid,
226 .setup_portio_remap = NULL, 221 .setup_portio_remap = NULL,
@@ -256,4 +251,7 @@ struct apic apic_bigsmp = {
256 .icr_write = native_apic_icr_write, 251 .icr_write = native_apic_icr_write,
257 .wait_icr_idle = native_apic_wait_icr_idle, 252 .wait_icr_idle = native_apic_wait_icr_idle,
258 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
254
255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
256 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
259}; 257};
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 8593582d8022..3e9de4854c5b 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit)
460 return physid_isset(bit, phys_cpu_present_map); 460 return physid_isset(bit, phys_cpu_present_map);
461} 461}
462 462
463static int es7000_early_logical_apicid(int cpu)
464{
465 /* on es7000, logical apicid is the same as physical */
466 return early_per_cpu(x86_bios_cpu_apicid, cpu);
467}
468
463static unsigned long calculate_ldr(int cpu) 469static unsigned long calculate_ldr(int cpu)
464{ 470{
465 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); 471 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
@@ -504,12 +510,11 @@ static void es7000_setup_apic_routing(void)
504 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
505} 511}
506 512
507static int es7000_apicid_to_node(int logical_apicid) 513static int es7000_numa_cpu_node(int cpu)
508{ 514{
509 return 0; 515 return 0;
510} 516}
511 517
512
513static int es7000_cpu_present_to_apicid(int mps_cpu) 518static int es7000_cpu_present_to_apicid(int mps_cpu)
514{ 519{
515 if (!mps_cpu) 520 if (!mps_cpu)
@@ -528,18 +533,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
528 ++cpu_id; 533 ++cpu_id;
529} 534}
530 535
531/* Mapping from cpu number to logical apicid */
532static int es7000_cpu_to_logical_apicid(int cpu)
533{
534#ifdef CONFIG_SMP
535 if (cpu >= nr_cpu_ids)
536 return BAD_APICID;
537 return cpu_2_logical_apicid[cpu];
538#else
539 return logical_smp_processor_id();
540#endif
541}
542
543static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 536static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
544{ 537{
545 /* For clustered we don't have a good way to do this yet - hack */ 538 /* For clustered we don't have a good way to do this yet - hack */
@@ -561,7 +554,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
561 * The cpus in the mask must all be on the apic cluster. 554 * The cpus in the mask must all be on the apic cluster.
562 */ 555 */
563 for_each_cpu(cpu, cpumask) { 556 for_each_cpu(cpu, cpumask) {
564 int new_apicid = es7000_cpu_to_logical_apicid(cpu); 557 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
565 558
566 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 559 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
567 WARN(1, "Not a valid mask!"); 560 WARN(1, "Not a valid mask!");
@@ -578,7 +571,7 @@ static unsigned int
578es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, 571es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
579 const struct cpumask *andmask) 572 const struct cpumask *andmask)
580{ 573{
581 int apicid = es7000_cpu_to_logical_apicid(0); 574 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
582 cpumask_var_t cpumask; 575 cpumask_var_t cpumask;
583 576
584 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 577 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -655,8 +648,6 @@ struct apic __refdata apic_es7000_cluster = {
655 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 648 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
656 .setup_apic_routing = es7000_setup_apic_routing, 649 .setup_apic_routing = es7000_setup_apic_routing,
657 .multi_timer_check = NULL, 650 .multi_timer_check = NULL,
658 .apicid_to_node = es7000_apicid_to_node,
659 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
660 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 651 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
661 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 652 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
662 .setup_portio_remap = NULL, 653 .setup_portio_remap = NULL,
@@ -695,6 +686,9 @@ struct apic __refdata apic_es7000_cluster = {
695 .icr_write = native_apic_icr_write, 686 .icr_write = native_apic_icr_write,
696 .wait_icr_idle = native_apic_wait_icr_idle, 687 .wait_icr_idle = native_apic_wait_icr_idle,
697 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 688 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
689
690 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
691 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
698}; 692};
699 693
700struct apic __refdata apic_es7000 = { 694struct apic __refdata apic_es7000 = {
@@ -720,8 +714,6 @@ struct apic __refdata apic_es7000 = {
720 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 714 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
721 .setup_apic_routing = es7000_setup_apic_routing, 715 .setup_apic_routing = es7000_setup_apic_routing,
722 .multi_timer_check = NULL, 716 .multi_timer_check = NULL,
723 .apicid_to_node = es7000_apicid_to_node,
724 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
725 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 717 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
726 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 718 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
727 .setup_portio_remap = NULL, 719 .setup_portio_remap = NULL,
@@ -758,4 +750,7 @@ struct apic __refdata apic_es7000 = {
758 .icr_write = native_apic_icr_write, 750 .icr_write = native_apic_icr_write,
759 .wait_icr_idle = native_apic_wait_icr_idle, 751 .wait_icr_idle = native_apic_wait_icr_idle,
760 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 752 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
753
754 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
755 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
761}; 756};
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 79fd43ca6f96..c4e557a1ebb6 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -83,7 +83,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
83 arch_spin_lock(&lock); 83 arch_spin_lock(&lock);
84 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 84 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
85 show_regs(regs); 85 show_regs(regs);
86 dump_stack();
87 arch_spin_unlock(&lock); 86 arch_spin_unlock(&lock);
88 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); 87 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
89 return NOTIFY_STOP; 88 return NOTIFY_STOP;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ca9e2a3545a9..180ca240e03c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -108,7 +108,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
108 108
109int skip_ioapic_setup; 109int skip_ioapic_setup;
110 110
111void arch_disable_smp_support(void) 111/**
112 * disable_ioapic_support() - disables ioapic support at runtime
113 */
114void disable_ioapic_support(void)
112{ 115{
113#ifdef CONFIG_PCI 116#ifdef CONFIG_PCI
114 noioapicquirk = 1; 117 noioapicquirk = 1;
@@ -120,11 +123,14 @@ void arch_disable_smp_support(void)
120static int __init parse_noapic(char *str) 123static int __init parse_noapic(char *str)
121{ 124{
122 /* disable IO-APIC */ 125 /* disable IO-APIC */
123 arch_disable_smp_support(); 126 disable_ioapic_support();
124 return 0; 127 return 0;
125} 128}
126early_param("noapic", parse_noapic); 129early_param("noapic", parse_noapic);
127 130
131static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
132 struct io_apic_irq_attr *attr);
133
128/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ 134/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
129void mp_save_irq(struct mpc_intsrc *m) 135void mp_save_irq(struct mpc_intsrc *m)
130{ 136{
@@ -181,7 +187,7 @@ int __init arch_early_irq_init(void)
181 irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); 187 irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
182 188
183 for (i = 0; i < count; i++) { 189 for (i = 0; i < count; i++) {
184 set_irq_chip_data(i, &cfg[i]); 190 irq_set_chip_data(i, &cfg[i]);
185 zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); 191 zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
186 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); 192 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
187 /* 193 /*
@@ -200,7 +206,7 @@ int __init arch_early_irq_init(void)
200#ifdef CONFIG_SPARSE_IRQ 206#ifdef CONFIG_SPARSE_IRQ
201static struct irq_cfg *irq_cfg(unsigned int irq) 207static struct irq_cfg *irq_cfg(unsigned int irq)
202{ 208{
203 return get_irq_chip_data(irq); 209 return irq_get_chip_data(irq);
204} 210}
205 211
206static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) 212static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
@@ -226,7 +232,7 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
226{ 232{
227 if (!cfg) 233 if (!cfg)
228 return; 234 return;
229 set_irq_chip_data(at, NULL); 235 irq_set_chip_data(at, NULL);
230 free_cpumask_var(cfg->domain); 236 free_cpumask_var(cfg->domain);
231 free_cpumask_var(cfg->old_domain); 237 free_cpumask_var(cfg->old_domain);
232 kfree(cfg); 238 kfree(cfg);
@@ -256,14 +262,14 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
256 if (res < 0) { 262 if (res < 0) {
257 if (res != -EEXIST) 263 if (res != -EEXIST)
258 return NULL; 264 return NULL;
259 cfg = get_irq_chip_data(at); 265 cfg = irq_get_chip_data(at);
260 if (cfg) 266 if (cfg)
261 return cfg; 267 return cfg;
262 } 268 }
263 269
264 cfg = alloc_irq_cfg(at, node); 270 cfg = alloc_irq_cfg(at, node);
265 if (cfg) 271 if (cfg)
266 set_irq_chip_data(at, cfg); 272 irq_set_chip_data(at, cfg);
267 else 273 else
268 irq_free_desc(at); 274 irq_free_desc(at);
269 return cfg; 275 return cfg;
@@ -818,7 +824,7 @@ static int EISA_ELCR(unsigned int irq)
818#define default_MCA_trigger(idx) (1) 824#define default_MCA_trigger(idx) (1)
819#define default_MCA_polarity(idx) default_ISA_polarity(idx) 825#define default_MCA_polarity(idx) default_ISA_polarity(idx)
820 826
821static int MPBIOS_polarity(int idx) 827static int irq_polarity(int idx)
822{ 828{
823 int bus = mp_irqs[idx].srcbus; 829 int bus = mp_irqs[idx].srcbus;
824 int polarity; 830 int polarity;
@@ -860,7 +866,7 @@ static int MPBIOS_polarity(int idx)
860 return polarity; 866 return polarity;
861} 867}
862 868
863static int MPBIOS_trigger(int idx) 869static int irq_trigger(int idx)
864{ 870{
865 int bus = mp_irqs[idx].srcbus; 871 int bus = mp_irqs[idx].srcbus;
866 int trigger; 872 int trigger;
@@ -932,16 +938,6 @@ static int MPBIOS_trigger(int idx)
932 return trigger; 938 return trigger;
933} 939}
934 940
935static inline int irq_polarity(int idx)
936{
937 return MPBIOS_polarity(idx);
938}
939
940static inline int irq_trigger(int idx)
941{
942 return MPBIOS_trigger(idx);
943}
944
945static int pin_2_irq(int idx, int apic, int pin) 941static int pin_2_irq(int idx, int apic, int pin)
946{ 942{
947 int irq; 943 int irq;
@@ -1189,7 +1185,7 @@ void __setup_vector_irq(int cpu)
1189 raw_spin_lock(&vector_lock); 1185 raw_spin_lock(&vector_lock);
1190 /* Mark the inuse vectors */ 1186 /* Mark the inuse vectors */
1191 for_each_active_irq(irq) { 1187 for_each_active_irq(irq) {
1192 cfg = get_irq_chip_data(irq); 1188 cfg = irq_get_chip_data(irq);
1193 if (!cfg) 1189 if (!cfg)
1194 continue; 1190 continue;
1195 /* 1191 /*
@@ -1220,10 +1216,6 @@ void __setup_vector_irq(int cpu)
1220static struct irq_chip ioapic_chip; 1216static struct irq_chip ioapic_chip;
1221static struct irq_chip ir_ioapic_chip; 1217static struct irq_chip ir_ioapic_chip;
1222 1218
1223#define IOAPIC_AUTO -1
1224#define IOAPIC_EDGE 0
1225#define IOAPIC_LEVEL 1
1226
1227#ifdef CONFIG_X86_32 1219#ifdef CONFIG_X86_32
1228static inline int IO_APIC_irq_trigger(int irq) 1220static inline int IO_APIC_irq_trigger(int irq)
1229{ 1221{
@@ -1248,35 +1240,31 @@ static inline int IO_APIC_irq_trigger(int irq)
1248} 1240}
1249#endif 1241#endif
1250 1242
1251static void ioapic_register_intr(unsigned int irq, unsigned long trigger) 1243static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
1244 unsigned long trigger)
1252{ 1245{
1246 struct irq_chip *chip = &ioapic_chip;
1247 irq_flow_handler_t hdl;
1248 bool fasteoi;
1253 1249
1254 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1250 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1255 trigger == IOAPIC_LEVEL) 1251 trigger == IOAPIC_LEVEL) {
1256 irq_set_status_flags(irq, IRQ_LEVEL); 1252 irq_set_status_flags(irq, IRQ_LEVEL);
1257 else 1253 fasteoi = true;
1254 } else {
1258 irq_clear_status_flags(irq, IRQ_LEVEL); 1255 irq_clear_status_flags(irq, IRQ_LEVEL);
1256 fasteoi = false;
1257 }
1259 1258
1260 if (irq_remapped(get_irq_chip_data(irq))) { 1259 if (irq_remapped(cfg)) {
1261 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 1260 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
1262 if (trigger) 1261 chip = &ir_ioapic_chip;
1263 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, 1262 fasteoi = trigger != 0;
1264 handle_fasteoi_irq,
1265 "fasteoi");
1266 else
1267 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1268 handle_edge_irq, "edge");
1269 return;
1270 } 1263 }
1271 1264
1272 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1265 hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
1273 trigger == IOAPIC_LEVEL) 1266 irq_set_chip_and_handler_name(irq, chip, hdl,
1274 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1267 fasteoi ? "fasteoi" : "edge");
1275 handle_fasteoi_irq,
1276 "fasteoi");
1277 else
1278 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1279 handle_edge_irq, "edge");
1280} 1268}
1281 1269
1282static int setup_ioapic_entry(int apic_id, int irq, 1270static int setup_ioapic_entry(int apic_id, int irq,
@@ -1374,7 +1362,7 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1374 return; 1362 return;
1375 } 1363 }
1376 1364
1377 ioapic_register_intr(irq, trigger); 1365 ioapic_register_intr(irq, cfg, trigger);
1378 if (irq < legacy_pic->nr_legacy_irqs) 1366 if (irq < legacy_pic->nr_legacy_irqs)
1379 legacy_pic->mask(irq); 1367 legacy_pic->mask(irq);
1380 1368
@@ -1385,33 +1373,26 @@ static struct {
1385 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); 1373 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1386} mp_ioapic_routing[MAX_IO_APICS]; 1374} mp_ioapic_routing[MAX_IO_APICS];
1387 1375
1388static void __init setup_IO_APIC_irqs(void) 1376static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
1389{ 1377{
1390 int apic_id, pin, idx, irq, notcon = 0; 1378 if (idx != -1)
1391 int node = cpu_to_node(0); 1379 return false;
1392 struct irq_cfg *cfg;
1393 1380
1394 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1381 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
1382 mp_ioapics[apic_id].apicid, pin);
1383 return true;
1384}
1385
1386static void __init __io_apic_setup_irqs(unsigned int apic_id)
1387{
1388 int idx, node = cpu_to_node(0);
1389 struct io_apic_irq_attr attr;
1390 unsigned int pin, irq;
1395 1391
1396 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1397 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1392 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1398 idx = find_irq_entry(apic_id, pin, mp_INT); 1393 idx = find_irq_entry(apic_id, pin, mp_INT);
1399 if (idx == -1) { 1394 if (io_apic_pin_not_connected(idx, apic_id, pin))
1400 if (!notcon) {
1401 notcon = 1;
1402 apic_printk(APIC_VERBOSE,
1403 KERN_DEBUG " %d-%d",
1404 mp_ioapics[apic_id].apicid, pin);
1405 } else
1406 apic_printk(APIC_VERBOSE, " %d-%d",
1407 mp_ioapics[apic_id].apicid, pin);
1408 continue; 1395 continue;
1409 }
1410 if (notcon) {
1411 apic_printk(APIC_VERBOSE,
1412 " (apicid-pin) not connected\n");
1413 notcon = 0;
1414 }
1415 1396
1416 irq = pin_2_irq(idx, apic_id, pin); 1397 irq = pin_2_irq(idx, apic_id, pin);
1417 1398
@@ -1423,25 +1404,24 @@ static void __init setup_IO_APIC_irqs(void)
1423 * installed and if it returns 1: 1404 * installed and if it returns 1:
1424 */ 1405 */
1425 if (apic->multi_timer_check && 1406 if (apic->multi_timer_check &&
1426 apic->multi_timer_check(apic_id, irq)) 1407 apic->multi_timer_check(apic_id, irq))
1427 continue; 1408 continue;
1428 1409
1429 cfg = alloc_irq_and_cfg_at(irq, node); 1410 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1430 if (!cfg) 1411 irq_polarity(idx));
1431 continue;
1432 1412
1433 add_pin_to_irq_node(cfg, node, apic_id, pin); 1413 io_apic_setup_irq_pin(irq, node, &attr);
1434 /*
1435 * don't mark it in pin_programmed, so later acpi could
1436 * set it correctly when irq < 16
1437 */
1438 setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx),
1439 irq_polarity(idx));
1440 } 1414 }
1415}
1441 1416
1442 if (notcon) 1417static void __init setup_IO_APIC_irqs(void)
1443 apic_printk(APIC_VERBOSE, 1418{
1444 " (apicid-pin) not connected\n"); 1419 unsigned int apic_id;
1420
1421 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1422
1423 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1424 __io_apic_setup_irqs(apic_id);
1445} 1425}
1446 1426
1447/* 1427/*
@@ -1452,7 +1432,7 @@ static void __init setup_IO_APIC_irqs(void)
1452void setup_IO_APIC_irq_extra(u32 gsi) 1432void setup_IO_APIC_irq_extra(u32 gsi)
1453{ 1433{
1454 int apic_id = 0, pin, idx, irq, node = cpu_to_node(0); 1434 int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
1455 struct irq_cfg *cfg; 1435 struct io_apic_irq_attr attr;
1456 1436
1457 /* 1437 /*
1458 * Convert 'gsi' to 'ioapic.pin'. 1438 * Convert 'gsi' to 'ioapic.pin'.
@@ -1472,21 +1452,10 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1472 if (apic_id == 0 || irq < NR_IRQS_LEGACY) 1452 if (apic_id == 0 || irq < NR_IRQS_LEGACY)
1473 return; 1453 return;
1474 1454
1475 cfg = alloc_irq_and_cfg_at(irq, node); 1455 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1476 if (!cfg) 1456 irq_polarity(idx));
1477 return;
1478
1479 add_pin_to_irq_node(cfg, node, apic_id, pin);
1480
1481 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
1482 pr_debug("Pin %d-%d already programmed\n",
1483 mp_ioapics[apic_id].apicid, pin);
1484 return;
1485 }
1486 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1487 1457
1488 setup_ioapic_irq(apic_id, pin, irq, cfg, 1458 io_apic_setup_irq_pin_once(irq, node, &attr);
1489 irq_trigger(idx), irq_polarity(idx));
1490} 1459}
1491 1460
1492/* 1461/*
@@ -1518,7 +1487,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1518 * The timer IRQ doesn't have to know that behind the 1487 * The timer IRQ doesn't have to know that behind the
1519 * scene we may have a 8259A-master in AEOI mode ... 1488 * scene we may have a 8259A-master in AEOI mode ...
1520 */ 1489 */
1521 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 1490 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
1491 "edge");
1522 1492
1523 /* 1493 /*
1524 * Add it to the IO-APIC irq-routing table: 1494 * Add it to the IO-APIC irq-routing table:
@@ -1625,7 +1595,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1625 for_each_active_irq(irq) { 1595 for_each_active_irq(irq) {
1626 struct irq_pin_list *entry; 1596 struct irq_pin_list *entry;
1627 1597
1628 cfg = get_irq_chip_data(irq); 1598 cfg = irq_get_chip_data(irq);
1629 if (!cfg) 1599 if (!cfg)
1630 continue; 1600 continue;
1631 entry = cfg->irq_2_pin; 1601 entry = cfg->irq_2_pin;
@@ -1916,7 +1886,7 @@ void disable_IO_APIC(void)
1916 * 1886 *
1917 * With interrupt-remapping, for now we will use virtual wire A mode, 1887 * With interrupt-remapping, for now we will use virtual wire A mode,
1918 * as virtual wire B is little complex (need to configure both 1888 * as virtual wire B is little complex (need to configure both
1919 * IOAPIC RTE aswell as interrupt-remapping table entry). 1889 * IOAPIC RTE as well as interrupt-remapping table entry).
1920 * As this gets called during crash dump, keep this simple for now. 1890 * As this gets called during crash dump, keep this simple for now.
1921 */ 1891 */
1922 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { 1892 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
@@ -2391,7 +2361,7 @@ static void irq_complete_move(struct irq_cfg *cfg)
2391 2361
2392void irq_force_complete_move(int irq) 2362void irq_force_complete_move(int irq)
2393{ 2363{
2394 struct irq_cfg *cfg = get_irq_chip_data(irq); 2364 struct irq_cfg *cfg = irq_get_chip_data(irq);
2395 2365
2396 if (!cfg) 2366 if (!cfg)
2397 return; 2367 return;
@@ -2405,7 +2375,7 @@ static inline void irq_complete_move(struct irq_cfg *cfg) { }
2405static void ack_apic_edge(struct irq_data *data) 2375static void ack_apic_edge(struct irq_data *data)
2406{ 2376{
2407 irq_complete_move(data->chip_data); 2377 irq_complete_move(data->chip_data);
2408 move_native_irq(data->irq); 2378 irq_move_irq(data);
2409 ack_APIC_irq(); 2379 ack_APIC_irq();
2410} 2380}
2411 2381
@@ -2462,7 +2432,7 @@ static void ack_apic_level(struct irq_data *data)
2462 irq_complete_move(cfg); 2432 irq_complete_move(cfg);
2463#ifdef CONFIG_GENERIC_PENDING_IRQ 2433#ifdef CONFIG_GENERIC_PENDING_IRQ
2464 /* If we are moving the irq we need to mask it */ 2434 /* If we are moving the irq we need to mask it */
2465 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { 2435 if (unlikely(irqd_is_setaffinity_pending(data))) {
2466 do_unmask_irq = 1; 2436 do_unmask_irq = 1;
2467 mask_ioapic(cfg); 2437 mask_ioapic(cfg);
2468 } 2438 }
@@ -2551,7 +2521,7 @@ static void ack_apic_level(struct irq_data *data)
2551 * and you can go talk to the chipset vendor about it. 2521 * and you can go talk to the chipset vendor about it.
2552 */ 2522 */
2553 if (!io_apic_level_ack_pending(cfg)) 2523 if (!io_apic_level_ack_pending(cfg))
2554 move_masked_irq(irq); 2524 irq_move_masked_irq(data);
2555 unmask_ioapic(cfg); 2525 unmask_ioapic(cfg);
2556 } 2526 }
2557} 2527}
@@ -2614,7 +2584,7 @@ static inline void init_IO_APIC_traps(void)
2614 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2584 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2615 */ 2585 */
2616 for_each_active_irq(irq) { 2586 for_each_active_irq(irq) {
2617 cfg = get_irq_chip_data(irq); 2587 cfg = irq_get_chip_data(irq);
2618 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { 2588 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2619 /* 2589 /*
2620 * Hmm.. We don't have an entry for this, 2590 * Hmm.. We don't have an entry for this,
@@ -2625,7 +2595,7 @@ static inline void init_IO_APIC_traps(void)
2625 legacy_pic->make_irq(irq); 2595 legacy_pic->make_irq(irq);
2626 else 2596 else
2627 /* Strange. Oh, well.. */ 2597 /* Strange. Oh, well.. */
2628 set_irq_chip(irq, &no_irq_chip); 2598 irq_set_chip(irq, &no_irq_chip);
2629 } 2599 }
2630 } 2600 }
2631} 2601}
@@ -2665,7 +2635,7 @@ static struct irq_chip lapic_chip __read_mostly = {
2665static void lapic_register_intr(int irq) 2635static void lapic_register_intr(int irq)
2666{ 2636{
2667 irq_clear_status_flags(irq, IRQ_LEVEL); 2637 irq_clear_status_flags(irq, IRQ_LEVEL);
2668 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2638 irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2669 "edge"); 2639 "edge");
2670} 2640}
2671 2641
@@ -2749,7 +2719,7 @@ int timer_through_8259 __initdata;
2749 */ 2719 */
2750static inline void __init check_timer(void) 2720static inline void __init check_timer(void)
2751{ 2721{
2752 struct irq_cfg *cfg = get_irq_chip_data(0); 2722 struct irq_cfg *cfg = irq_get_chip_data(0);
2753 int node = cpu_to_node(0); 2723 int node = cpu_to_node(0);
2754 int apic1, pin1, apic2, pin2; 2724 int apic1, pin1, apic2, pin2;
2755 unsigned long flags; 2725 unsigned long flags;
@@ -2935,7 +2905,7 @@ void __init setup_IO_APIC(void)
2935} 2905}
2936 2906
2937/* 2907/*
2938 * Called after all the initialization is done. If we didnt find any 2908 * Called after all the initialization is done. If we didn't find any
2939 * APIC bugs then we can allow the modify fast path 2909 * APIC bugs then we can allow the modify fast path
2940 */ 2910 */
2941 2911
@@ -3060,7 +3030,7 @@ unsigned int create_irq_nr(unsigned int from, int node)
3060 raw_spin_unlock_irqrestore(&vector_lock, flags); 3030 raw_spin_unlock_irqrestore(&vector_lock, flags);
3061 3031
3062 if (ret) { 3032 if (ret) {
3063 set_irq_chip_data(irq, cfg); 3033 irq_set_chip_data(irq, cfg);
3064 irq_clear_status_flags(irq, IRQ_NOREQUEST); 3034 irq_clear_status_flags(irq, IRQ_NOREQUEST);
3065 } else { 3035 } else {
3066 free_irq_at(irq, cfg); 3036 free_irq_at(irq, cfg);
@@ -3085,7 +3055,7 @@ int create_irq(void)
3085 3055
3086void destroy_irq(unsigned int irq) 3056void destroy_irq(unsigned int irq)
3087{ 3057{
3088 struct irq_cfg *cfg = get_irq_chip_data(irq); 3058 struct irq_cfg *cfg = irq_get_chip_data(irq);
3089 unsigned long flags; 3059 unsigned long flags;
3090 3060
3091 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); 3061 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
@@ -3119,7 +3089,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3119 3089
3120 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3090 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3121 3091
3122 if (irq_remapped(get_irq_chip_data(irq))) { 3092 if (irq_remapped(cfg)) {
3123 struct irte irte; 3093 struct irte irte;
3124 int ir_index; 3094 int ir_index;
3125 u16 sub_handle; 3095 u16 sub_handle;
@@ -3291,6 +3261,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3291 3261
3292static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3262static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3293{ 3263{
3264 struct irq_chip *chip = &msi_chip;
3294 struct msi_msg msg; 3265 struct msi_msg msg;
3295 int ret; 3266 int ret;
3296 3267
@@ -3298,14 +3269,15 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3298 if (ret < 0) 3269 if (ret < 0)
3299 return ret; 3270 return ret;
3300 3271
3301 set_irq_msi(irq, msidesc); 3272 irq_set_msi_desc(irq, msidesc);
3302 write_msi_msg(irq, &msg); 3273 write_msi_msg(irq, &msg);
3303 3274
3304 if (irq_remapped(get_irq_chip_data(irq))) { 3275 if (irq_remapped(irq_get_chip_data(irq))) {
3305 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 3276 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3306 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); 3277 chip = &msi_ir_chip;
3307 } else 3278 }
3308 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3279
3280 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3309 3281
3310 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); 3282 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
3311 3283
@@ -3423,8 +3395,8 @@ int arch_setup_dmar_msi(unsigned int irq)
3423 if (ret < 0) 3395 if (ret < 0)
3424 return ret; 3396 return ret;
3425 dmar_msi_write(irq, &msg); 3397 dmar_msi_write(irq, &msg);
3426 set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, 3398 irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
3427 "edge"); 3399 "edge");
3428 return 0; 3400 return 0;
3429} 3401}
3430#endif 3402#endif
@@ -3482,6 +3454,7 @@ static struct irq_chip hpet_msi_type = {
3482 3454
3483int arch_setup_hpet_msi(unsigned int irq, unsigned int id) 3455int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3484{ 3456{
3457 struct irq_chip *chip = &hpet_msi_type;
3485 struct msi_msg msg; 3458 struct msi_msg msg;
3486 int ret; 3459 int ret;
3487 3460
@@ -3501,15 +3474,12 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3501 if (ret < 0) 3474 if (ret < 0)
3502 return ret; 3475 return ret;
3503 3476
3504 hpet_msi_write(get_irq_data(irq), &msg); 3477 hpet_msi_write(irq_get_handler_data(irq), &msg);
3505 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 3478 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3506 if (irq_remapped(get_irq_chip_data(irq))) 3479 if (irq_remapped(irq_get_chip_data(irq)))
3507 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, 3480 chip = &ir_hpet_msi_type;
3508 handle_edge_irq, "edge");
3509 else
3510 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3511 handle_edge_irq, "edge");
3512 3481
3482 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3513 return 0; 3483 return 0;
3514} 3484}
3515#endif 3485#endif
@@ -3596,7 +3566,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3596 3566
3597 write_ht_irq_msg(irq, &msg); 3567 write_ht_irq_msg(irq, &msg);
3598 3568
3599 set_irq_chip_and_handler_name(irq, &ht_irq_chip, 3569 irq_set_chip_and_handler_name(irq, &ht_irq_chip,
3600 handle_edge_irq, "edge"); 3570 handle_edge_irq, "edge");
3601 3571
3602 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); 3572 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
@@ -3605,7 +3575,40 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3605} 3575}
3606#endif /* CONFIG_HT_IRQ */ 3576#endif /* CONFIG_HT_IRQ */
3607 3577
3608int __init io_apic_get_redir_entries (int ioapic) 3578int
3579io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
3580{
3581 struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
3582 int ret;
3583
3584 if (!cfg)
3585 return -EINVAL;
3586 ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
3587 if (!ret)
3588 setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg,
3589 attr->trigger, attr->polarity);
3590 return ret;
3591}
3592
3593static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
3594 struct io_apic_irq_attr *attr)
3595{
3596 unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
3597 int ret;
3598
3599 /* Avoid redundant programming */
3600 if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) {
3601 pr_debug("Pin %d-%d already programmed\n",
3602 mp_ioapics[id].apicid, pin);
3603 return 0;
3604 }
3605 ret = io_apic_setup_irq_pin(irq, node, attr);
3606 if (!ret)
3607 set_bit(pin, mp_ioapic_routing[id].pin_programmed);
3608 return ret;
3609}
3610
3611static int __init io_apic_get_redir_entries(int ioapic)
3609{ 3612{
3610 union IO_APIC_reg_01 reg_01; 3613 union IO_APIC_reg_01 reg_01;
3611 unsigned long flags; 3614 unsigned long flags;
@@ -3659,96 +3662,24 @@ int __init arch_probe_nr_irqs(void)
3659} 3662}
3660#endif 3663#endif
3661 3664
3662static int __io_apic_set_pci_routing(struct device *dev, int irq, 3665int io_apic_set_pci_routing(struct device *dev, int irq,
3663 struct io_apic_irq_attr *irq_attr) 3666 struct io_apic_irq_attr *irq_attr)
3664{ 3667{
3665 struct irq_cfg *cfg;
3666 int node; 3668 int node;
3667 int ioapic, pin;
3668 int trigger, polarity;
3669 3669
3670 ioapic = irq_attr->ioapic;
3671 if (!IO_APIC_IRQ(irq)) { 3670 if (!IO_APIC_IRQ(irq)) {
3672 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3671 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3673 ioapic); 3672 irq_attr->ioapic);
3674 return -EINVAL; 3673 return -EINVAL;
3675 } 3674 }
3676 3675
3677 if (dev) 3676 node = dev ? dev_to_node(dev) : cpu_to_node(0);
3678 node = dev_to_node(dev);
3679 else
3680 node = cpu_to_node(0);
3681
3682 cfg = alloc_irq_and_cfg_at(irq, node);
3683 if (!cfg)
3684 return 0;
3685
3686 pin = irq_attr->ioapic_pin;
3687 trigger = irq_attr->trigger;
3688 polarity = irq_attr->polarity;
3689 3677
3690 /* 3678 return io_apic_setup_irq_pin_once(irq, node, irq_attr);
3691 * IRQs < 16 are already in the irq_2_pin[] map
3692 */
3693 if (irq >= legacy_pic->nr_legacy_irqs) {
3694 if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) {
3695 printk(KERN_INFO "can not add pin %d for irq %d\n",
3696 pin, irq);
3697 return 0;
3698 }
3699 }
3700
3701 setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity);
3702
3703 return 0;
3704} 3679}
3705 3680
3706int io_apic_set_pci_routing(struct device *dev, int irq,
3707 struct io_apic_irq_attr *irq_attr)
3708{
3709 int ioapic, pin;
3710 /*
3711 * Avoid pin reprogramming. PRTs typically include entries
3712 * with redundant pin->gsi mappings (but unique PCI devices);
3713 * we only program the IOAPIC on the first.
3714 */
3715 ioapic = irq_attr->ioapic;
3716 pin = irq_attr->ioapic_pin;
3717 if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3718 pr_debug("Pin %d-%d already programmed\n",
3719 mp_ioapics[ioapic].apicid, pin);
3720 return 0;
3721 }
3722 set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
3723
3724 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3725}
3726
3727u8 __init io_apic_unique_id(u8 id)
3728{
3729#ifdef CONFIG_X86_32 3681#ifdef CONFIG_X86_32
3730 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 3682static int __init io_apic_get_unique_id(int ioapic, int apic_id)
3731 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3732 return io_apic_get_unique_id(nr_ioapics, id);
3733 else
3734 return id;
3735#else
3736 int i;
3737 DECLARE_BITMAP(used, 256);
3738
3739 bitmap_zero(used, 256);
3740 for (i = 0; i < nr_ioapics; i++) {
3741 struct mpc_ioapic *ia = &mp_ioapics[i];
3742 __set_bit(ia->apicid, used);
3743 }
3744 if (!test_bit(id, used))
3745 return id;
3746 return find_first_zero_bit(used, 256);
3747#endif
3748}
3749
3750#ifdef CONFIG_X86_32
3751int __init io_apic_get_unique_id(int ioapic, int apic_id)
3752{ 3683{
3753 union IO_APIC_reg_00 reg_00; 3684 union IO_APIC_reg_00 reg_00;
3754 static physid_mask_t apic_id_map = PHYSID_MASK_NONE; 3685 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -3821,9 +3752,33 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3821 3752
3822 return apic_id; 3753 return apic_id;
3823} 3754}
3755
3756static u8 __init io_apic_unique_id(u8 id)
3757{
3758 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3759 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3760 return io_apic_get_unique_id(nr_ioapics, id);
3761 else
3762 return id;
3763}
3764#else
3765static u8 __init io_apic_unique_id(u8 id)
3766{
3767 int i;
3768 DECLARE_BITMAP(used, 256);
3769
3770 bitmap_zero(used, 256);
3771 for (i = 0; i < nr_ioapics; i++) {
3772 struct mpc_ioapic *ia = &mp_ioapics[i];
3773 __set_bit(ia->apicid, used);
3774 }
3775 if (!test_bit(id, used))
3776 return id;
3777 return find_first_zero_bit(used, 256);
3778}
3824#endif 3779#endif
3825 3780
3826int __init io_apic_get_version(int ioapic) 3781static int __init io_apic_get_version(int ioapic)
3827{ 3782{
3828 union IO_APIC_reg_01 reg_01; 3783 union IO_APIC_reg_01 reg_01;
3829 unsigned long flags; 3784 unsigned long flags;
@@ -3868,8 +3823,8 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
3868void __init setup_ioapic_dest(void) 3823void __init setup_ioapic_dest(void)
3869{ 3824{
3870 int pin, ioapic, irq, irq_entry; 3825 int pin, ioapic, irq, irq_entry;
3871 struct irq_desc *desc;
3872 const struct cpumask *mask; 3826 const struct cpumask *mask;
3827 struct irq_data *idata;
3873 3828
3874 if (skip_ioapic_setup == 1) 3829 if (skip_ioapic_setup == 1)
3875 return; 3830 return;
@@ -3884,21 +3839,20 @@ void __init setup_ioapic_dest(void)
3884 if ((ioapic > 0) && (irq > 16)) 3839 if ((ioapic > 0) && (irq > 16))
3885 continue; 3840 continue;
3886 3841
3887 desc = irq_to_desc(irq); 3842 idata = irq_get_irq_data(irq);
3888 3843
3889 /* 3844 /*
3890 * Honour affinities which have been set in early boot 3845 * Honour affinities which have been set in early boot
3891 */ 3846 */
3892 if (desc->status & 3847 if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
3893 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 3848 mask = idata->affinity;
3894 mask = desc->irq_data.affinity;
3895 else 3849 else
3896 mask = apic->target_cpus(); 3850 mask = apic->target_cpus();
3897 3851
3898 if (intr_remapping_enabled) 3852 if (intr_remapping_enabled)
3899 ir_ioapic_set_affinity(&desc->irq_data, mask, false); 3853 ir_ioapic_set_affinity(idata, mask, false);
3900 else 3854 else
3901 ioapic_set_affinity(&desc->irq_data, mask, false); 3855 ioapic_set_affinity(idata, mask, false);
3902 } 3856 }
3903 3857
3904} 3858}
@@ -4026,10 +3980,10 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
4026 return gsi - mp_gsi_routing[ioapic].gsi_base; 3980 return gsi - mp_gsi_routing[ioapic].gsi_base;
4027} 3981}
4028 3982
4029static int bad_ioapic(unsigned long address) 3983static __init int bad_ioapic(unsigned long address)
4030{ 3984{
4031 if (nr_ioapics >= MAX_IO_APICS) { 3985 if (nr_ioapics >= MAX_IO_APICS) {
4032 printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " 3986 printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
4033 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); 3987 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
4034 return 1; 3988 return 1;
4035 } 3989 }
@@ -4086,20 +4040,16 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4086/* Enable IOAPIC early just for system timer */ 4040/* Enable IOAPIC early just for system timer */
4087void __init pre_init_apic_IRQ0(void) 4041void __init pre_init_apic_IRQ0(void)
4088{ 4042{
4089 struct irq_cfg *cfg; 4043 struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
4090 4044
4091 printk(KERN_INFO "Early APIC setup for system timer0\n"); 4045 printk(KERN_INFO "Early APIC setup for system timer0\n");
4092#ifndef CONFIG_SMP 4046#ifndef CONFIG_SMP
4093 physid_set_mask_of_physid(boot_cpu_physical_apicid, 4047 physid_set_mask_of_physid(boot_cpu_physical_apicid,
4094 &phys_cpu_present_map); 4048 &phys_cpu_present_map);
4095#endif 4049#endif
4096 /* Make sure the irq descriptor is set up */
4097 cfg = alloc_irq_and_cfg_at(0, 0);
4098
4099 setup_local_APIC(); 4050 setup_local_APIC();
4100 4051
4101 add_pin_to_irq_node(cfg, 0, 0, 0); 4052 io_apic_setup_irq_pin(0, 0, &attr);
4102 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 4053 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
4103 4054 "edge");
4104 setup_ioapic_irq(0, 0, 0, cfg, 0, 0);
4105} 4055}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 08385e090a6f..cce91bf26676 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
56 local_irq_restore(flags); 56 local_irq_restore(flags);
57} 57}
58 58
59#ifdef CONFIG_X86_32
60
59void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, 61void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
60 int vector) 62 int vector)
61{ 63{
@@ -71,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
71 local_irq_save(flags); 73 local_irq_save(flags);
72 for_each_cpu(query_cpu, mask) 74 for_each_cpu(query_cpu, mask)
73 __default_send_IPI_dest_field( 75 __default_send_IPI_dest_field(
74 apic->cpu_to_logical_apicid(query_cpu), vector, 76 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
75 apic->dest_logical); 77 vector, apic->dest_logical);
76 local_irq_restore(flags); 78 local_irq_restore(flags);
77} 79}
78 80
@@ -90,14 +92,12 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
90 if (query_cpu == this_cpu) 92 if (query_cpu == this_cpu)
91 continue; 93 continue;
92 __default_send_IPI_dest_field( 94 __default_send_IPI_dest_field(
93 apic->cpu_to_logical_apicid(query_cpu), vector, 95 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
94 apic->dest_logical); 96 vector, apic->dest_logical);
95 } 97 }
96 local_irq_restore(flags); 98 local_irq_restore(flags);
97} 99}
98 100
99#ifdef CONFIG_X86_32
100
101/* 101/*
102 * This is only used on smaller machines. 102 * This is only used on smaller machines.
103 */ 103 */
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 960f26ab5c9f..6273eee5134b 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -373,13 +373,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask
373 return physids_promote(0xFUL, retmap); 373 return physids_promote(0xFUL, retmap);
374} 374}
375 375
376static inline int numaq_cpu_to_logical_apicid(int cpu)
377{
378 if (cpu >= nr_cpu_ids)
379 return BAD_APICID;
380 return cpu_2_logical_apicid[cpu];
381}
382
383/* 376/*
384 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent 377 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
385 * cpu to APIC ID relation to properly interact with the intelligent 378 * cpu to APIC ID relation to properly interact with the intelligent
@@ -398,6 +391,15 @@ static inline int numaq_apicid_to_node(int logical_apicid)
398 return logical_apicid >> 4; 391 return logical_apicid >> 4;
399} 392}
400 393
394static int numaq_numa_cpu_node(int cpu)
395{
396 int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
397
398 if (logical_apicid != BAD_APICID)
399 return numaq_apicid_to_node(logical_apicid);
400 return NUMA_NO_NODE;
401}
402
401static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) 403static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
402{ 404{
403 int node = numaq_apicid_to_node(logical_apicid); 405 int node = numaq_apicid_to_node(logical_apicid);
@@ -508,8 +510,6 @@ struct apic __refdata apic_numaq = {
508 .ioapic_phys_id_map = numaq_ioapic_phys_id_map, 510 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
509 .setup_apic_routing = numaq_setup_apic_routing, 511 .setup_apic_routing = numaq_setup_apic_routing,
510 .multi_timer_check = numaq_multi_timer_check, 512 .multi_timer_check = numaq_multi_timer_check,
511 .apicid_to_node = numaq_apicid_to_node,
512 .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
513 .cpu_present_to_apicid = numaq_cpu_present_to_apicid, 513 .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
514 .apicid_to_cpu_present = numaq_apicid_to_cpu_present, 514 .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
515 .setup_portio_remap = numaq_setup_portio_remap, 515 .setup_portio_remap = numaq_setup_portio_remap,
@@ -547,4 +547,7 @@ struct apic __refdata apic_numaq = {
547 .icr_write = native_apic_icr_write, 547 .icr_write = native_apic_icr_write,
548 .wait_icr_idle = native_apic_wait_icr_idle, 548 .wait_icr_idle = native_apic_wait_icr_idle,
549 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 549 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
550
551 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
552 .x86_32_numa_cpu_node = numaq_numa_cpu_node,
550}; 553};
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 99d2fe016084..fc84c7b61108 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -77,6 +77,11 @@ void __init default_setup_apic_routing(void)
77 apic->setup_apic_routing(); 77 apic->setup_apic_routing();
78} 78}
79 79
80static int default_x86_32_early_logical_apicid(int cpu)
81{
82 return 1 << cpu;
83}
84
80static void setup_apic_flat_routing(void) 85static void setup_apic_flat_routing(void)
81{ 86{
82#ifdef CONFIG_X86_IO_APIC 87#ifdef CONFIG_X86_IO_APIC
@@ -130,8 +135,6 @@ struct apic apic_default = {
130 .ioapic_phys_id_map = default_ioapic_phys_id_map, 135 .ioapic_phys_id_map = default_ioapic_phys_id_map,
131 .setup_apic_routing = setup_apic_flat_routing, 136 .setup_apic_routing = setup_apic_flat_routing,
132 .multi_timer_check = NULL, 137 .multi_timer_check = NULL,
133 .apicid_to_node = default_apicid_to_node,
134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
135 .cpu_present_to_apicid = default_cpu_present_to_apicid, 138 .cpu_present_to_apicid = default_cpu_present_to_apicid,
136 .apicid_to_cpu_present = physid_set_mask_of_physid, 139 .apicid_to_cpu_present = physid_set_mask_of_physid,
137 .setup_portio_remap = NULL, 140 .setup_portio_remap = NULL,
@@ -167,6 +170,9 @@ struct apic apic_default = {
167 .icr_write = native_apic_icr_write, 170 .icr_write = native_apic_icr_write,
168 .wait_icr_idle = native_apic_wait_icr_idle, 171 .wait_icr_idle = native_apic_wait_icr_idle,
169 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 172 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
173
174 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
175 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
170}; 176};
171 177
172extern struct apic apic_numaq; 178extern struct apic apic_numaq;
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9b419263d90d..e4b8059b414a 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit)
194 return 1; 194 return 1;
195} 195}
196 196
197static void summit_init_apic_ldr(void) 197static int summit_early_logical_apicid(int cpu)
198{ 198{
199 unsigned long val, id;
200 int count = 0; 199 int count = 0;
201 u8 my_id = (u8)hard_smp_processor_id(); 200 u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
202 u8 my_cluster = APIC_CLUSTER(my_id); 201 u8 my_cluster = APIC_CLUSTER(my_id);
203#ifdef CONFIG_SMP 202#ifdef CONFIG_SMP
204 u8 lid; 203 u8 lid;
@@ -206,7 +205,7 @@ static void summit_init_apic_ldr(void)
206 205
207 /* Create logical APIC IDs by counting CPUs already in cluster. */ 206 /* Create logical APIC IDs by counting CPUs already in cluster. */
208 for (count = 0, i = nr_cpu_ids; --i >= 0; ) { 207 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
209 lid = cpu_2_logical_apicid[i]; 208 lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
210 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) 209 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
211 ++count; 210 ++count;
212 } 211 }
@@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void)
214 /* We only have a 4 wide bitmap in cluster mode. If a deranged 213 /* We only have a 4 wide bitmap in cluster mode. If a deranged
215 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ 214 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
216 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); 215 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
217 id = my_cluster | (1UL << count); 216 return my_cluster | (1UL << count);
217}
218
219static void summit_init_apic_ldr(void)
220{
221 int cpu = smp_processor_id();
222 unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
223 unsigned long val;
224
218 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); 225 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
219 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; 226 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
220 val |= SET_APIC_LOGICAL_ID(id); 227 val |= SET_APIC_LOGICAL_ID(id);
@@ -232,27 +239,6 @@ static void summit_setup_apic_routing(void)
232 nr_ioapics); 239 nr_ioapics);
233} 240}
234 241
235static int summit_apicid_to_node(int logical_apicid)
236{
237#ifdef CONFIG_SMP
238 return apicid_2_node[hard_smp_processor_id()];
239#else
240 return 0;
241#endif
242}
243
244/* Mapping from cpu number to logical apicid */
245static inline int summit_cpu_to_logical_apicid(int cpu)
246{
247#ifdef CONFIG_SMP
248 if (cpu >= nr_cpu_ids)
249 return BAD_APICID;
250 return cpu_2_logical_apicid[cpu];
251#else
252 return logical_smp_processor_id();
253#endif
254}
255
256static int summit_cpu_present_to_apicid(int mps_cpu) 242static int summit_cpu_present_to_apicid(int mps_cpu)
257{ 243{
258 if (mps_cpu < nr_cpu_ids) 244 if (mps_cpu < nr_cpu_ids)
@@ -286,7 +272,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
286 * The cpus in the mask must all be on the apic cluster. 272 * The cpus in the mask must all be on the apic cluster.
287 */ 273 */
288 for_each_cpu(cpu, cpumask) { 274 for_each_cpu(cpu, cpumask) {
289 int new_apicid = summit_cpu_to_logical_apicid(cpu); 275 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
290 276
291 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 277 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
292 printk("%s: Not a valid mask!\n", __func__); 278 printk("%s: Not a valid mask!\n", __func__);
@@ -301,7 +287,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
301static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, 287static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
302 const struct cpumask *andmask) 288 const struct cpumask *andmask)
303{ 289{
304 int apicid = summit_cpu_to_logical_apicid(0); 290 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
305 cpumask_var_t cpumask; 291 cpumask_var_t cpumask;
306 292
307 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 293 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -528,8 +514,6 @@ struct apic apic_summit = {
528 .ioapic_phys_id_map = summit_ioapic_phys_id_map, 514 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
529 .setup_apic_routing = summit_setup_apic_routing, 515 .setup_apic_routing = summit_setup_apic_routing,
530 .multi_timer_check = NULL, 516 .multi_timer_check = NULL,
531 .apicid_to_node = summit_apicid_to_node,
532 .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
533 .cpu_present_to_apicid = summit_cpu_present_to_apicid, 517 .cpu_present_to_apicid = summit_cpu_present_to_apicid,
534 .apicid_to_cpu_present = summit_apicid_to_cpu_present, 518 .apicid_to_cpu_present = summit_apicid_to_cpu_present,
535 .setup_portio_remap = NULL, 519 .setup_portio_remap = NULL,
@@ -565,4 +549,7 @@ struct apic apic_summit = {
565 .icr_write = native_apic_icr_write, 549 .icr_write = native_apic_icr_write,
566 .wait_icr_idle = native_apic_wait_icr_idle, 550 .wait_icr_idle = native_apic_wait_icr_idle,
567 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
552
553 .x86_32_early_logical_apicid = summit_early_logical_apicid,
554 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
568}; 555};
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cf69c59f4910..90949bbd566d 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -206,8 +206,6 @@ struct apic apic_x2apic_cluster = {
206 .ioapic_phys_id_map = NULL, 206 .ioapic_phys_id_map = NULL,
207 .setup_apic_routing = NULL, 207 .setup_apic_routing = NULL,
208 .multi_timer_check = NULL, 208 .multi_timer_check = NULL,
209 .apicid_to_node = NULL,
210 .cpu_to_logical_apicid = NULL,
211 .cpu_present_to_apicid = default_cpu_present_to_apicid, 209 .cpu_present_to_apicid = default_cpu_present_to_apicid,
212 .apicid_to_cpu_present = NULL, 210 .apicid_to_cpu_present = NULL,
213 .setup_portio_remap = NULL, 211 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 8972f38c5ced..c7e6d6645bf4 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -195,8 +195,6 @@ struct apic apic_x2apic_phys = {
195 .ioapic_phys_id_map = NULL, 195 .ioapic_phys_id_map = NULL,
196 .setup_apic_routing = NULL, 196 .setup_apic_routing = NULL,
197 .multi_timer_check = NULL, 197 .multi_timer_check = NULL,
198 .apicid_to_node = NULL,
199 .cpu_to_logical_apicid = NULL,
200 .cpu_present_to_apicid = default_cpu_present_to_apicid, 198 .cpu_present_to_apicid = default_cpu_present_to_apicid,
201 .apicid_to_cpu_present = NULL, 199 .apicid_to_cpu_present = NULL,
202 .setup_portio_remap = NULL, 200 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index bd16b58b8850..3c289281394c 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -338,8 +338,6 @@ struct apic __refdata apic_x2apic_uv_x = {
338 .ioapic_phys_id_map = NULL, 338 .ioapic_phys_id_map = NULL,
339 .setup_apic_routing = NULL, 339 .setup_apic_routing = NULL,
340 .multi_timer_check = NULL, 340 .multi_timer_check = NULL,
341 .apicid_to_node = NULL,
342 .cpu_to_logical_apicid = NULL,
343 .cpu_present_to_apicid = default_cpu_present_to_apicid, 341 .cpu_present_to_apicid = default_cpu_present_to_apicid,
344 .apicid_to_cpu_present = NULL, 342 .apicid_to_cpu_present = NULL,
345 .setup_portio_remap = NULL, 343 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 0e4f24c2a746..0b4be431c620 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -66,7 +66,7 @@
66 * 1.5: Fix segment register reloading (in case of bad segments saved 66 * 1.5: Fix segment register reloading (in case of bad segments saved
67 * across BIOS call). 67 * across BIOS call).
68 * Stephen Rothwell 68 * Stephen Rothwell
69 * 1.6: Cope with complier/assembler differences. 69 * 1.6: Cope with compiler/assembler differences.
70 * Only try to turn off the first display device. 70 * Only try to turn off the first display device.
71 * Fix OOPS at power off with no APM BIOS by Jan Echternach 71 * Fix OOPS at power off with no APM BIOS by Jan Echternach
72 * <echter@informatik.uni-rostock.de> 72 * <echter@informatik.uni-rostock.de>
@@ -227,6 +227,7 @@
227#include <linux/suspend.h> 227#include <linux/suspend.h>
228#include <linux/kthread.h> 228#include <linux/kthread.h>
229#include <linux/jiffies.h> 229#include <linux/jiffies.h>
230#include <linux/acpi.h>
230 231
231#include <asm/system.h> 232#include <asm/system.h>
232#include <asm/uaccess.h> 233#include <asm/uaccess.h>
@@ -975,20 +976,10 @@ recalc:
975 976
976static void apm_power_off(void) 977static void apm_power_off(void)
977{ 978{
978 unsigned char po_bios_call[] = {
979 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
980 0x8e, 0xd0, /* movw ax,ss */
981 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
982 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
983 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
984 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
985 0xcd, 0x15 /* int $0x15 */
986 };
987
988 /* Some bioses don't like being called from CPU != 0 */ 979 /* Some bioses don't like being called from CPU != 0 */
989 if (apm_info.realmode_power_off) { 980 if (apm_info.realmode_power_off) {
990 set_cpus_allowed_ptr(current, cpumask_of(0)); 981 set_cpus_allowed_ptr(current, cpumask_of(0));
991 machine_real_restart(po_bios_call, sizeof(po_bios_call)); 982 machine_real_restart(MRR_APM);
992 } else { 983 } else {
993 (void)set_system_power_state(APM_STATE_OFF); 984 (void)set_system_power_state(APM_STATE_OFF);
994 } 985 }
@@ -2331,12 +2322,11 @@ static int __init apm_init(void)
2331 apm_info.disabled = 1; 2322 apm_info.disabled = 1;
2332 return -ENODEV; 2323 return -ENODEV;
2333 } 2324 }
2334 if (pm_flags & PM_ACPI) { 2325 if (!acpi_disabled) {
2335 printk(KERN_NOTICE "apm: overridden by ACPI.\n"); 2326 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2336 apm_info.disabled = 1; 2327 apm_info.disabled = 1;
2337 return -ENODEV; 2328 return -ENODEV;
2338 } 2329 }
2339 pm_flags |= PM_APM;
2340 2330
2341 /* 2331 /*
2342 * Set up the long jump entry point to the APM BIOS, which is called 2332 * Set up the long jump entry point to the APM BIOS, which is called
@@ -2428,7 +2418,6 @@ static void __exit apm_exit(void)
2428 kthread_stop(kapmd_task); 2418 kthread_stop(kapmd_task);
2429 kapmd_task = NULL; 2419 kapmd_task = NULL;
2430 } 2420 }
2431 pm_flags &= ~PM_APM;
2432} 2421}
2433 2422
2434module_init(apm_init); 2423module_init(apm_init);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cfa82c899f47..4f13fafc5264 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -1,5 +1,70 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/hardirq.h>
12#include <linux/suspend.h>
13#include <linux/kbuild.h>
14#include <asm/processor.h>
15#include <asm/thread_info.h>
16#include <asm/sigframe.h>
17#include <asm/bootparam.h>
18#include <asm/suspend.h>
19
20#ifdef CONFIG_XEN
21#include <xen/interface/xen.h>
22#endif
23
1#ifdef CONFIG_X86_32 24#ifdef CONFIG_X86_32
2# include "asm-offsets_32.c" 25# include "asm-offsets_32.c"
3#else 26#else
4# include "asm-offsets_64.c" 27# include "asm-offsets_64.c"
5#endif 28#endif
29
30void common(void) {
31 BLANK();
32 OFFSET(TI_flags, thread_info, flags);
33 OFFSET(TI_status, thread_info, status);
34 OFFSET(TI_addr_limit, thread_info, addr_limit);
35 OFFSET(TI_preempt_count, thread_info, preempt_count);
36
37 BLANK();
38 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
39
40 BLANK();
41 OFFSET(pbe_address, pbe, address);
42 OFFSET(pbe_orig_address, pbe, orig_address);
43 OFFSET(pbe_next, pbe, next);
44
45#ifdef CONFIG_PARAVIRT
46 BLANK();
47 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
48 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
49 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
50 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
51 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
52 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
53 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
54 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
55 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
56#endif
57
58#ifdef CONFIG_XEN
59 BLANK();
60 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
61 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
62#endif
63
64 BLANK();
65 OFFSET(BP_scratch, boot_params, scratch);
66 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
68 OFFSET(BP_version, boot_params, hdr.version);
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a4088dda37a..c29d631af6fc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,26 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/signal.h>
10#include <linux/personality.h>
11#include <linux/suspend.h>
12#include <linux/kbuild.h>
13#include <asm/ucontext.h> 1#include <asm/ucontext.h>
14#include <asm/sigframe.h>
15#include <asm/pgtable.h>
16#include <asm/fixmap.h>
17#include <asm/processor.h>
18#include <asm/thread_info.h>
19#include <asm/bootparam.h>
20#include <asm/elf.h>
21#include <asm/suspend.h>
22
23#include <xen/interface/xen.h>
24 2
25#include <linux/lguest.h> 3#include <linux/lguest.h>
26#include "../../../drivers/lguest/lg.h" 4#include "../../../drivers/lguest/lg.h"
@@ -51,21 +29,10 @@ void foo(void)
51 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); 29 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
52 BLANK(); 30 BLANK();
53 31
54 OFFSET(TI_task, thread_info, task);
55 OFFSET(TI_exec_domain, thread_info, exec_domain);
56 OFFSET(TI_flags, thread_info, flags);
57 OFFSET(TI_status, thread_info, status);
58 OFFSET(TI_preempt_count, thread_info, preempt_count);
59 OFFSET(TI_addr_limit, thread_info, addr_limit);
60 OFFSET(TI_restart_block, thread_info, restart_block);
61 OFFSET(TI_sysenter_return, thread_info, sysenter_return); 32 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
62 OFFSET(TI_cpu, thread_info, cpu); 33 OFFSET(TI_cpu, thread_info, cpu);
63 BLANK(); 34 BLANK();
64 35
65 OFFSET(GDS_size, desc_ptr, size);
66 OFFSET(GDS_address, desc_ptr, address);
67 BLANK();
68
69 OFFSET(PT_EBX, pt_regs, bx); 36 OFFSET(PT_EBX, pt_regs, bx);
70 OFFSET(PT_ECX, pt_regs, cx); 37 OFFSET(PT_ECX, pt_regs, cx);
71 OFFSET(PT_EDX, pt_regs, dx); 38 OFFSET(PT_EDX, pt_regs, dx);
@@ -85,42 +52,13 @@ void foo(void)
85 OFFSET(PT_OLDSS, pt_regs, ss); 52 OFFSET(PT_OLDSS, pt_regs, ss);
86 BLANK(); 53 BLANK();
87 54
88 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
89 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 55 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
90 BLANK(); 56 BLANK();
91 57
92 OFFSET(pbe_address, pbe, address);
93 OFFSET(pbe_orig_address, pbe, orig_address);
94 OFFSET(pbe_next, pbe, next);
95
96 /* Offset from the sysenter stack to tss.sp0 */ 58 /* Offset from the sysenter stack to tss.sp0 */
97 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 59 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
98 sizeof(struct tss_struct)); 60 sizeof(struct tss_struct));
99 61
100 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
101 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
102 DEFINE(THREAD_SIZE_asm, THREAD_SIZE);
103
104 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
105
106#ifdef CONFIG_PARAVIRT
107 BLANK();
108 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
109 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
110 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
111 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
112 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
113 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
114 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
115 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
116#endif
117
118#ifdef CONFIG_XEN
119 BLANK();
120 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
121 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
122#endif
123
124#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 62#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
125 BLANK(); 63 BLANK();
126 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
@@ -139,11 +77,4 @@ void foo(void)
139 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 77 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
140 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 78 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
141#endif 79#endif
142
143 BLANK();
144 OFFSET(BP_scratch, boot_params, scratch);
145 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
146 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
147 OFFSET(BP_version, boot_params, hdr.version);
148 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
149} 80}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4a6aeedcd965..e72a1194af22 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,27 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/errno.h>
12#include <linux/hardirq.h>
13#include <linux/suspend.h>
14#include <linux/kbuild.h>
15#include <asm/processor.h>
16#include <asm/segment.h>
17#include <asm/thread_info.h>
18#include <asm/ia32.h> 1#include <asm/ia32.h>
19#include <asm/bootparam.h>
20#include <asm/suspend.h>
21
22#include <xen/interface/xen.h>
23
24#include <asm/sigframe.h>
25 2
26#define __NO_STUBS 1 3#define __NO_STUBS 1
27#undef __SYSCALL 4#undef __SYSCALL
@@ -33,41 +10,19 @@ static char syscalls[] = {
33 10
34int main(void) 11int main(void)
35{ 12{
36#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
37 ENTRY(state);
38 ENTRY(flags);
39 ENTRY(pid);
40 BLANK();
41#undef ENTRY
42#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
43 ENTRY(flags);
44 ENTRY(addr_limit);
45 ENTRY(preempt_count);
46 ENTRY(status);
47#ifdef CONFIG_IA32_EMULATION
48 ENTRY(sysenter_return);
49#endif
50 BLANK();
51#undef ENTRY
52#ifdef CONFIG_PARAVIRT 13#ifdef CONFIG_PARAVIRT
53 BLANK();
54 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
55 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
56 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
57 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
58 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
59 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); 14 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
60 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
61 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); 15 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
62 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); 16 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
63 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
64 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); 17 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
65 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); 18 BLANK();
66#endif 19#endif
67 20
68
69#ifdef CONFIG_IA32_EMULATION 21#ifdef CONFIG_IA32_EMULATION
70#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) 22 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
23 BLANK();
24
25#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
71 ENTRY(ax); 26 ENTRY(ax);
72 ENTRY(bx); 27 ENTRY(bx);
73 ENTRY(cx); 28 ENTRY(cx);
@@ -79,15 +34,12 @@ int main(void)
79 ENTRY(ip); 34 ENTRY(ip);
80 BLANK(); 35 BLANK();
81#undef ENTRY 36#undef ENTRY
82 DEFINE(IA32_RT_SIGFRAME_sigcontext, 37
83 offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); 38 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
84 BLANK(); 39 BLANK();
85#endif 40#endif
86 DEFINE(pbe_address, offsetof(struct pbe, address)); 41
87 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); 42#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
88 DEFINE(pbe_next, offsetof(struct pbe, next));
89 BLANK();
90#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
91 ENTRY(bx); 43 ENTRY(bx);
92 ENTRY(bx); 44 ENTRY(bx);
93 ENTRY(cx); 45 ENTRY(cx);
@@ -107,7 +59,8 @@ int main(void)
107 ENTRY(flags); 59 ENTRY(flags);
108 BLANK(); 60 BLANK();
109#undef ENTRY 61#undef ENTRY
110#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) 62
63#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
111 ENTRY(cr0); 64 ENTRY(cr0);
112 ENTRY(cr2); 65 ENTRY(cr2);
113 ENTRY(cr3); 66 ENTRY(cr3);
@@ -115,26 +68,11 @@ int main(void)
115 ENTRY(cr8); 68 ENTRY(cr8);
116 BLANK(); 69 BLANK();
117#undef ENTRY 70#undef ENTRY
118 DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
119 BLANK();
120 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
121 BLANK();
122 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
123 71
72 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
124 BLANK(); 73 BLANK();
125 OFFSET(BP_scratch, boot_params, scratch);
126 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
127 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
128 OFFSET(BP_version, boot_params, hdr.version);
129 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
130 74
131 BLANK(); 75 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
132 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 76
133#ifdef CONFIG_XEN
134 BLANK();
135 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
136 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
137#undef ENTRY
138#endif
139 return 0; 77 return 0;
140} 78}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7c7bedb83c5a..3ecece0217ef 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -233,18 +233,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
233} 233}
234#endif 234#endif
235 235
236#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 236#ifdef CONFIG_NUMA
237/*
238 * To workaround broken NUMA config. Read the comment in
239 * srat_detect_node().
240 */
237static int __cpuinit nearby_node(int apicid) 241static int __cpuinit nearby_node(int apicid)
238{ 242{
239 int i, node; 243 int i, node;
240 244
241 for (i = apicid - 1; i >= 0; i--) { 245 for (i = apicid - 1; i >= 0; i--) {
242 node = apicid_to_node[i]; 246 node = __apicid_to_node[i];
243 if (node != NUMA_NO_NODE && node_online(node)) 247 if (node != NUMA_NO_NODE && node_online(node))
244 return node; 248 return node;
245 } 249 }
246 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { 250 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
247 node = apicid_to_node[i]; 251 node = __apicid_to_node[i];
248 if (node != NUMA_NO_NODE && node_online(node)) 252 if (node != NUMA_NO_NODE && node_online(node))
249 return node; 253 return node;
250 } 254 }
@@ -261,7 +265,7 @@ static int __cpuinit nearby_node(int apicid)
261#ifdef CONFIG_X86_HT 265#ifdef CONFIG_X86_HT
262static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) 266static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
263{ 267{
264 u32 nodes; 268 u32 nodes, cores_per_cu = 1;
265 u8 node_id; 269 u8 node_id;
266 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
267 271
@@ -276,6 +280,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
276 /* get compute unit information */ 280 /* get compute unit information */
277 smp_num_siblings = ((ebx >> 8) & 3) + 1; 281 smp_num_siblings = ((ebx >> 8) & 3) + 1;
278 c->compute_unit_id = ebx & 0xff; 282 c->compute_unit_id = ebx & 0xff;
283 cores_per_cu += ((ebx >> 8) & 3);
279 } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { 284 } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
280 u64 value; 285 u64 value;
281 286
@@ -288,15 +293,18 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
288 /* fixup multi-node processor information */ 293 /* fixup multi-node processor information */
289 if (nodes > 1) { 294 if (nodes > 1) {
290 u32 cores_per_node; 295 u32 cores_per_node;
296 u32 cus_per_node;
291 297
292 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 298 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
293 cores_per_node = c->x86_max_cores / nodes; 299 cores_per_node = c->x86_max_cores / nodes;
300 cus_per_node = cores_per_node / cores_per_cu;
294 301
295 /* store NodeID, use llc_shared_map to store sibling info */ 302 /* store NodeID, use llc_shared_map to store sibling info */
296 per_cpu(cpu_llc_id, cpu) = node_id; 303 per_cpu(cpu_llc_id, cpu) = node_id;
297 304
298 /* core id to be in range from 0 to (cores_per_node - 1) */ 305 /* core id has to be in the [0 .. cores_per_node - 1] range */
299 c->cpu_core_id = c->cpu_core_id % cores_per_node; 306 c->cpu_core_id %= cores_per_node;
307 c->compute_unit_id %= cus_per_node;
300 } 308 }
301} 309}
302#endif 310#endif
@@ -334,31 +342,40 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id);
334 342
335static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 343static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
336{ 344{
337#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 345#ifdef CONFIG_NUMA
338 int cpu = smp_processor_id(); 346 int cpu = smp_processor_id();
339 int node; 347 int node;
340 unsigned apicid = c->apicid; 348 unsigned apicid = c->apicid;
341 349
342 node = per_cpu(cpu_llc_id, cpu); 350 node = numa_cpu_node(cpu);
351 if (node == NUMA_NO_NODE)
352 node = per_cpu(cpu_llc_id, cpu);
343 353
344 if (apicid_to_node[apicid] != NUMA_NO_NODE)
345 node = apicid_to_node[apicid];
346 if (!node_online(node)) { 354 if (!node_online(node)) {
347 /* Two possibilities here: 355 /*
348 - The CPU is missing memory and no node was created. 356 * Two possibilities here:
349 In that case try picking one from a nearby CPU 357 *
350 - The APIC IDs differ from the HyperTransport node IDs 358 * - The CPU is missing memory and no node was created. In
351 which the K8 northbridge parsing fills in. 359 * that case try picking one from a nearby CPU.
352 Assume they are all increased by a constant offset, 360 *
353 but in the same order as the HT nodeids. 361 * - The APIC IDs differ from the HyperTransport node IDs
354 If that doesn't result in a usable node fall back to the 362 * which the K8 northbridge parsing fills in. Assume
355 path for the previous case. */ 363 * they are all increased by a constant offset, but in
356 364 * the same order as the HT nodeids. If that doesn't
365 * result in a usable node fall back to the path for the
366 * previous case.
367 *
368 * This workaround operates directly on the mapping between
369 * APIC ID and NUMA node, assuming certain relationship
370 * between APIC ID, HT node ID and NUMA topology. As going
371 * through CPU mapping may alter the outcome, directly
372 * access __apicid_to_node[].
373 */
357 int ht_nodeid = c->initial_apicid; 374 int ht_nodeid = c->initial_apicid;
358 375
359 if (ht_nodeid >= 0 && 376 if (ht_nodeid >= 0 &&
360 apicid_to_node[ht_nodeid] != NUMA_NO_NODE) 377 __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
361 node = apicid_to_node[ht_nodeid]; 378 node = __apicid_to_node[ht_nodeid];
362 /* Pick a nearby node */ 379 /* Pick a nearby node */
363 if (!node_online(node)) 380 if (!node_online(node))
364 node = nearby_node(apicid); 381 node = nearby_node(apicid);
@@ -594,6 +611,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
594 } 611 }
595 } 612 }
596#endif 613#endif
614
615 /* As a rule processors have APIC timer running in deep C states */
616 if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400))
617 set_cpu_cap(c, X86_FEATURE_ARAT);
597} 618}
598 619
599#ifdef CONFIG_X86_32 620#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1d59834396bd..e2ced0074a45 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -675,7 +675,7 @@ void __init early_cpu_init(void)
675 const struct cpu_dev *const *cdev; 675 const struct cpu_dev *const *cdev;
676 int count = 0; 676 int count = 0;
677 677
678#ifdef PROCESSOR_SELECT 678#ifdef CONFIG_PROCESSOR_SELECT
679 printk(KERN_INFO "KERNEL supported cpus:\n"); 679 printk(KERN_INFO "KERNEL supported cpus:\n");
680#endif 680#endif
681 681
@@ -687,7 +687,7 @@ void __init early_cpu_init(void)
687 cpu_devs[count] = cpudev; 687 cpu_devs[count] = cpudev;
688 count++; 688 count++;
689 689
690#ifdef PROCESSOR_SELECT 690#ifdef CONFIG_PROCESSOR_SELECT
691 { 691 {
692 unsigned int j; 692 unsigned int j;
693 693
@@ -869,7 +869,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
869 869
870 select_idle_routine(c); 870 select_idle_routine(c);
871 871
872#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 872#ifdef CONFIG_NUMA
873 numa_add_cpu(smp_processor_id()); 873 numa_add_cpu(smp_processor_id());
874#endif 874#endif
875} 875}
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 03162dac6271..cf48cdd6907d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -444,7 +444,7 @@ static int __cpuinit longhaul_get_ranges(void)
444 return -EINVAL; 444 return -EINVAL;
445 } 445 }
446 /* Get max multiplier - as we always did. 446 /* Get max multiplier - as we always did.
447 * Longhaul MSR is usefull only when voltage scaling is enabled. 447 * Longhaul MSR is useful only when voltage scaling is enabled.
448 * C3 is booting at max anyway. */ 448 * C3 is booting at max anyway. */
449 maxmult = mult; 449 maxmult = mult;
450 /* Get min multiplier */ 450 /* Get min multiplier */
@@ -1011,7 +1011,7 @@ static void __exit longhaul_exit(void)
1011 * trigger frequency transition in some cases. */ 1011 * trigger frequency transition in some cases. */
1012module_param(disable_acpi_c3, int, 0644); 1012module_param(disable_acpi_c3, int, 0644);
1013MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); 1013MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
1014/* Change CPU voltage with frequency. Very usefull to save 1014/* Change CPU voltage with frequency. Very useful to save
1015 * power, but most VIA C3 processors aren't supporting it. */ 1015 * power, but most VIA C3 processors aren't supporting it. */
1016module_param(scale_voltage, int, 0644); 1016module_param(scale_voltage, int, 0644);
1017MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); 1017MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index 4a5a42b842ad..755a31e0f5b0 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -315,8 +315,6 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
315 315
316 input.count = 4; 316 input.count = 4;
317 input.pointer = in_params; 317 input.pointer = in_params;
318 input.count = 4;
319 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER; 318 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16; 319 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID; 320 in_params[0].buffer.pointer = OSC_UUID;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index c567dec854f6..2368e38327b3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -630,8 +630,7 @@ static void print_basics(struct powernow_k8_data *data)
630 data->powernow_table[j].frequency/1000); 630 data->powernow_table[j].frequency/1000);
631 } else { 631 } else {
632 printk(KERN_INFO PFX 632 printk(KERN_INFO PFX
633 " %d : fid 0x%x (%d MHz), vid 0x%x\n", 633 "fid 0x%x (%d MHz), vid 0x%x\n",
634 j,
635 data->powernow_table[j].index & 0xff, 634 data->powernow_table[j].index & 0xff,
636 data->powernow_table[j].frequency/1000, 635 data->powernow_table[j].frequency/1000,
637 data->powernow_table[j].index >> 8); 636 data->powernow_table[j].index >> 8);
@@ -1276,7 +1275,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1276 1275
1277 if (powernow_k8_cpu_init_acpi(data)) { 1276 if (powernow_k8_cpu_init_acpi(data)) {
1278 /* 1277 /*
1279 * Use the PSB BIOS structure. This is only availabe on 1278 * Use the PSB BIOS structure. This is only available on
1280 * an UP version, and is deprecated by AMD. 1279 * an UP version, and is deprecated by AMD.
1281 */ 1280 */
1282 if (num_online_cpus() != 1) { 1281 if (num_online_cpus() != 1) {
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 8abd869baabf..91bc25b67bc1 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -292,7 +292,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
292 292
293 result = speedstep_smi_ownership(); 293 result = speedstep_smi_ownership();
294 if (result) { 294 if (result) {
295 dprintk("fails in aquiring ownership of a SMI interface.\n"); 295 dprintk("fails in acquiring ownership of a SMI interface.\n");
296 return -EINVAL; 296 return -EINVAL;
297 } 297 }
298 298
@@ -360,7 +360,7 @@ static int speedstep_resume(struct cpufreq_policy *policy)
360 int result = speedstep_smi_ownership(); 360 int result = speedstep_smi_ownership();
361 361
362 if (result) 362 if (result)
363 dprintk("fails in re-aquiring ownership of a SMI interface.\n"); 363 dprintk("fails in re-acquiring ownership of a SMI interface.\n");
364 364
365 return result; 365 return result;
366} 366}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index d16c2c53d6bf..df86bc8c859d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -276,14 +276,13 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
276 276
277static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 277static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
278{ 278{
279#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 279#ifdef CONFIG_NUMA
280 unsigned node; 280 unsigned node;
281 int cpu = smp_processor_id(); 281 int cpu = smp_processor_id();
282 int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
283 282
284 /* Don't do the funky fallback heuristics the AMD version employs 283 /* Don't do the funky fallback heuristics the AMD version employs
285 for now. */ 284 for now. */
286 node = apicid_to_node[apicid]; 285 node = numa_cpu_node(cpu);
287 if (node == NUMA_NO_NODE || !node_online(node)) { 286 if (node == NUMA_NO_NODE || !node_online(node)) {
288 /* reuse the value from init_cpu_to_node() */ 287 /* reuse the value from init_cpu_to_node() */
289 node = cpu_to_node(cpu); 288 node = cpu_to_node(cpu);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index ec2c19a7b8ef..1ce1af2899df 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -304,8 +304,9 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
304 304
305struct _cache_attr { 305struct _cache_attr {
306 struct attribute attr; 306 struct attribute attr;
307 ssize_t (*show)(struct _cpuid4_info *, char *); 307 ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
308 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); 308 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
309 unsigned int);
309}; 310};
310 311
311#ifdef CONFIG_AMD_NB 312#ifdef CONFIG_AMD_NB
@@ -400,7 +401,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
400 401
401#define SHOW_CACHE_DISABLE(slot) \ 402#define SHOW_CACHE_DISABLE(slot) \
402static ssize_t \ 403static ssize_t \
403show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ 404show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
405 unsigned int cpu) \
404{ \ 406{ \
405 return show_cache_disable(this_leaf, buf, slot); \ 407 return show_cache_disable(this_leaf, buf, slot); \
406} 408}
@@ -512,7 +514,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
512#define STORE_CACHE_DISABLE(slot) \ 514#define STORE_CACHE_DISABLE(slot) \
513static ssize_t \ 515static ssize_t \
514store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ 516store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
515 const char *buf, size_t count) \ 517 const char *buf, size_t count, \
518 unsigned int cpu) \
516{ \ 519{ \
517 return store_cache_disable(this_leaf, buf, count, slot); \ 520 return store_cache_disable(this_leaf, buf, count, slot); \
518} 521}
@@ -524,6 +527,39 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
524static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 527static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
525 show_cache_disable_1, store_cache_disable_1); 528 show_cache_disable_1, store_cache_disable_1);
526 529
530static ssize_t
531show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
532{
533 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
534 return -EINVAL;
535
536 return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
537}
538
539static ssize_t
540store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
541 unsigned int cpu)
542{
543 unsigned long val;
544
545 if (!capable(CAP_SYS_ADMIN))
546 return -EPERM;
547
548 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
549 return -EINVAL;
550
551 if (strict_strtoul(buf, 16, &val) < 0)
552 return -EINVAL;
553
554 if (amd_set_subcaches(cpu, val))
555 return -EINVAL;
556
557 return count;
558}
559
560static struct _cache_attr subcaches =
561 __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
562
527#else /* CONFIG_AMD_NB */ 563#else /* CONFIG_AMD_NB */
528#define amd_init_l3_cache(x, y) 564#define amd_init_l3_cache(x, y)
529#endif /* CONFIG_AMD_NB */ 565#endif /* CONFIG_AMD_NB */
@@ -532,9 +568,9 @@ static int
532__cpuinit cpuid4_cache_lookup_regs(int index, 568__cpuinit cpuid4_cache_lookup_regs(int index,
533 struct _cpuid4_info_regs *this_leaf) 569 struct _cpuid4_info_regs *this_leaf)
534{ 570{
535 union _cpuid4_leaf_eax eax; 571 union _cpuid4_leaf_eax eax;
536 union _cpuid4_leaf_ebx ebx; 572 union _cpuid4_leaf_ebx ebx;
537 union _cpuid4_leaf_ecx ecx; 573 union _cpuid4_leaf_ecx ecx;
538 unsigned edx; 574 unsigned edx;
539 575
540 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 576 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
@@ -732,11 +768,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
732 struct cpuinfo_x86 *c = &cpu_data(cpu); 768 struct cpuinfo_x86 *c = &cpu_data(cpu);
733 769
734 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 770 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
735 for_each_cpu(i, c->llc_shared_map) { 771 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
736 if (!per_cpu(ici_cpuid4_info, i)) 772 if (!per_cpu(ici_cpuid4_info, i))
737 continue; 773 continue;
738 this_leaf = CPUID4_INFO_IDX(i, index); 774 this_leaf = CPUID4_INFO_IDX(i, index);
739 for_each_cpu(sibling, c->llc_shared_map) { 775 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
740 if (!cpu_online(sibling)) 776 if (!cpu_online(sibling))
741 continue; 777 continue;
742 set_bit(sibling, this_leaf->shared_cpu_map); 778 set_bit(sibling, this_leaf->shared_cpu_map);
@@ -870,8 +906,8 @@ static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
870#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) 906#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
871 907
872#define show_one_plus(file_name, object, val) \ 908#define show_one_plus(file_name, object, val) \
873static ssize_t show_##file_name \ 909static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
874 (struct _cpuid4_info *this_leaf, char *buf) \ 910 unsigned int cpu) \
875{ \ 911{ \
876 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ 912 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
877} 913}
@@ -882,7 +918,8 @@ show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
882show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); 918show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
883show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); 919show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
884 920
885static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) 921static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
922 unsigned int cpu)
886{ 923{
887 return sprintf(buf, "%luK\n", this_leaf->size / 1024); 924 return sprintf(buf, "%luK\n", this_leaf->size / 1024);
888} 925}
@@ -906,17 +943,20 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
906 return n; 943 return n;
907} 944}
908 945
909static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) 946static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
947 unsigned int cpu)
910{ 948{
911 return show_shared_cpu_map_func(leaf, 0, buf); 949 return show_shared_cpu_map_func(leaf, 0, buf);
912} 950}
913 951
914static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) 952static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
953 unsigned int cpu)
915{ 954{
916 return show_shared_cpu_map_func(leaf, 1, buf); 955 return show_shared_cpu_map_func(leaf, 1, buf);
917} 956}
918 957
919static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) 958static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
959 unsigned int cpu)
920{ 960{
921 switch (this_leaf->eax.split.type) { 961 switch (this_leaf->eax.split.type) {
922 case CACHE_TYPE_DATA: 962 case CACHE_TYPE_DATA:
@@ -974,6 +1014,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void)
974 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) 1014 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
975 n += 2; 1015 n += 2;
976 1016
1017 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1018 n += 1;
1019
977 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); 1020 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
978 if (attrs == NULL) 1021 if (attrs == NULL)
979 return attrs = default_attrs; 1022 return attrs = default_attrs;
@@ -986,6 +1029,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void)
986 attrs[n++] = &cache_disable_1.attr; 1029 attrs[n++] = &cache_disable_1.attr;
987 } 1030 }
988 1031
1032 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1033 attrs[n++] = &subcaches.attr;
1034
989 return attrs; 1035 return attrs;
990} 1036}
991#endif 1037#endif
@@ -998,7 +1044,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
998 1044
999 ret = fattr->show ? 1045 ret = fattr->show ?
1000 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1046 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1001 buf) : 1047 buf, this_leaf->cpu) :
1002 0; 1048 0;
1003 return ret; 1049 return ret;
1004} 1050}
@@ -1012,7 +1058,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
1012 1058
1013 ret = fattr->store ? 1059 ret = fattr->store ?
1014 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1060 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1015 buf, count) : 1061 buf, count, this_leaf->cpu) :
1016 0; 1062 0;
1017 return ret; 1063 return ret;
1018} 1064}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index a77971979564..0ed633c5048b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -32,7 +32,7 @@ static void inject_mce(struct mce *m)
32{ 32{
33 struct mce *i = &per_cpu(injectm, m->extcpu); 33 struct mce *i = &per_cpu(injectm, m->extcpu);
34 34
35 /* Make sure noone reads partially written injectm */ 35 /* Make sure no one reads partially written injectm */
36 i->finished = 0; 36 i->finished = 0;
37 mb(); 37 mb();
38 m->finished = 0; 38 m->finished = 0;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d916183b7f9c..ab1122998dba 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -881,7 +881,7 @@ reset:
881 * Check if the address reported by the CPU is in a format we can parse. 881 * Check if the address reported by the CPU is in a format we can parse.
882 * It would be possible to add code for most other cases, but all would 882 * It would be possible to add code for most other cases, but all would
883 * be somewhat complicated (e.g. segment offset would require an instruction 883 * be somewhat complicated (e.g. segment offset would require an instruction
884 * parser). So only support physical addresses upto page granuality for now. 884 * parser). So only support physical addresses up to page granuality for now.
885 */ 885 */
886static int mce_usable_address(struct mce *m) 886static int mce_usable_address(struct mce *m)
887{ 887{
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 5bf2fac52aca..167f97b5596e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -527,15 +527,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 int i, err = 0; 527 int i, err = 0;
528 struct threshold_bank *b = NULL; 528 struct threshold_bank *b = NULL;
529 char name[32]; 529 char name[32];
530#ifdef CONFIG_SMP
531 struct cpuinfo_x86 *c = &cpu_data(cpu);
532#endif
533 530
534 sprintf(name, "threshold_bank%i", bank); 531 sprintf(name, "threshold_bank%i", bank);
535 532
536#ifdef CONFIG_SMP 533#ifdef CONFIG_SMP
537 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 534 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
538 i = cpumask_first(c->llc_shared_map); 535 i = cpumask_first(cpu_llc_shared_mask(cpu));
539 536
540 /* first core not up yet */ 537 /* first core not up yet */
541 if (cpu_data(i).cpu_core_id) 538 if (cpu_data(i).cpu_core_id)
@@ -555,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
555 if (err) 552 if (err)
556 goto out; 553 goto out;
557 554
558 cpumask_copy(b->cpus, c->llc_shared_map); 555 cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
559 per_cpu(threshold_banks, cpu)[bank] = b; 556 per_cpu(threshold_banks, cpu)[bank] = b;
560 557
561 goto out; 558 goto out;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 9f27228ceffd..a71efcdbb092 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong 2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
3 * because MTRRs can span upto 40 bits (36bits on most modern x86) 3 * because MTRRs can span up to 40 bits (36bits on most modern x86)
4 */ 4 */
5#define DEBUG 5#define DEBUG
6 6
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9d977a2ea693..87eab4a27dfc 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -30,6 +30,7 @@
30#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33#include <asm/smp.h>
33 34
34#if 0 35#if 0
35#undef wrmsrl 36#undef wrmsrl
@@ -93,6 +94,8 @@ struct amd_nb {
93 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 94 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
94}; 95};
95 96
97struct intel_percore;
98
96#define MAX_LBR_ENTRIES 16 99#define MAX_LBR_ENTRIES 16
97 100
98struct cpu_hw_events { 101struct cpu_hw_events {
@@ -128,6 +131,13 @@ struct cpu_hw_events {
128 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 131 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
129 132
130 /* 133 /*
134 * Intel percore register state.
135 * Coordinate shared resources between HT threads.
136 */
137 int percore_used; /* Used by this CPU? */
138 struct intel_percore *per_core;
139
140 /*
131 * AMD specific bits 141 * AMD specific bits
132 */ 142 */
133 struct amd_nb *amd_nb; 143 struct amd_nb *amd_nb;
@@ -166,7 +176,7 @@ struct cpu_hw_events {
166/* 176/*
167 * Constraint on the Event code + UMask 177 * Constraint on the Event code + UMask
168 */ 178 */
169#define PEBS_EVENT_CONSTRAINT(c, n) \ 179#define INTEL_UEVENT_CONSTRAINT(c, n) \
170 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) 180 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
171 181
172#define EVENT_CONSTRAINT_END \ 182#define EVENT_CONSTRAINT_END \
@@ -175,6 +185,28 @@ struct cpu_hw_events {
175#define for_each_event_constraint(e, c) \ 185#define for_each_event_constraint(e, c) \
176 for ((e) = (c); (e)->weight; (e)++) 186 for ((e) = (c); (e)->weight; (e)++)
177 187
188/*
189 * Extra registers for specific events.
190 * Some events need large masks and require external MSRs.
191 * Define a mapping to these extra registers.
192 */
193struct extra_reg {
194 unsigned int event;
195 unsigned int msr;
196 u64 config_mask;
197 u64 valid_mask;
198};
199
200#define EVENT_EXTRA_REG(e, ms, m, vm) { \
201 .event = (e), \
202 .msr = (ms), \
203 .config_mask = (m), \
204 .valid_mask = (vm), \
205 }
206#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
207 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
208#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
209
178union perf_capabilities { 210union perf_capabilities {
179 struct { 211 struct {
180 u64 lbr_format : 6; 212 u64 lbr_format : 6;
@@ -219,6 +251,7 @@ struct x86_pmu {
219 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 251 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
220 struct perf_event *event); 252 struct perf_event *event);
221 struct event_constraint *event_constraints; 253 struct event_constraint *event_constraints;
254 struct event_constraint *percore_constraints;
222 void (*quirks)(void); 255 void (*quirks)(void);
223 int perfctr_second_write; 256 int perfctr_second_write;
224 257
@@ -247,6 +280,11 @@ struct x86_pmu {
247 */ 280 */
248 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ 281 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
249 int lbr_nr; /* hardware stack size */ 282 int lbr_nr; /* hardware stack size */
283
284 /*
285 * Extra registers for events
286 */
287 struct extra_reg *extra_regs;
250}; 288};
251 289
252static struct x86_pmu x86_pmu __read_mostly; 290static struct x86_pmu x86_pmu __read_mostly;
@@ -271,6 +309,10 @@ static u64 __read_mostly hw_cache_event_ids
271 [PERF_COUNT_HW_CACHE_MAX] 309 [PERF_COUNT_HW_CACHE_MAX]
272 [PERF_COUNT_HW_CACHE_OP_MAX] 310 [PERF_COUNT_HW_CACHE_OP_MAX]
273 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 311 [PERF_COUNT_HW_CACHE_RESULT_MAX];
312static u64 __read_mostly hw_cache_extra_regs
313 [PERF_COUNT_HW_CACHE_MAX]
314 [PERF_COUNT_HW_CACHE_OP_MAX]
315 [PERF_COUNT_HW_CACHE_RESULT_MAX];
274 316
275/* 317/*
276 * Propagate event elapsed time into the generic event. 318 * Propagate event elapsed time into the generic event.
@@ -298,7 +340,7 @@ x86_perf_event_update(struct perf_event *event)
298 */ 340 */
299again: 341again:
300 prev_raw_count = local64_read(&hwc->prev_count); 342 prev_raw_count = local64_read(&hwc->prev_count);
301 rdmsrl(hwc->event_base + idx, new_raw_count); 343 rdmsrl(hwc->event_base, new_raw_count);
302 344
303 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 345 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
304 new_raw_count) != prev_raw_count) 346 new_raw_count) != prev_raw_count)
@@ -321,6 +363,49 @@ again:
321 return new_raw_count; 363 return new_raw_count;
322} 364}
323 365
366/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */
367static inline int x86_pmu_addr_offset(int index)
368{
369 if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
370 return index << 1;
371 return index;
372}
373
374static inline unsigned int x86_pmu_config_addr(int index)
375{
376 return x86_pmu.eventsel + x86_pmu_addr_offset(index);
377}
378
379static inline unsigned int x86_pmu_event_addr(int index)
380{
381 return x86_pmu.perfctr + x86_pmu_addr_offset(index);
382}
383
384/*
385 * Find and validate any extra registers to set up.
386 */
387static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
388{
389 struct extra_reg *er;
390
391 event->hw.extra_reg = 0;
392 event->hw.extra_config = 0;
393
394 if (!x86_pmu.extra_regs)
395 return 0;
396
397 for (er = x86_pmu.extra_regs; er->msr; er++) {
398 if (er->event != (config & er->config_mask))
399 continue;
400 if (event->attr.config1 & ~er->valid_mask)
401 return -EINVAL;
402 event->hw.extra_reg = er->msr;
403 event->hw.extra_config = event->attr.config1;
404 break;
405 }
406 return 0;
407}
408
324static atomic_t active_events; 409static atomic_t active_events;
325static DEFINE_MUTEX(pmc_reserve_mutex); 410static DEFINE_MUTEX(pmc_reserve_mutex);
326 411
@@ -331,12 +416,12 @@ static bool reserve_pmc_hardware(void)
331 int i; 416 int i;
332 417
333 for (i = 0; i < x86_pmu.num_counters; i++) { 418 for (i = 0; i < x86_pmu.num_counters; i++) {
334 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 419 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
335 goto perfctr_fail; 420 goto perfctr_fail;
336 } 421 }
337 422
338 for (i = 0; i < x86_pmu.num_counters; i++) { 423 for (i = 0; i < x86_pmu.num_counters; i++) {
339 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 424 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
340 goto eventsel_fail; 425 goto eventsel_fail;
341 } 426 }
342 427
@@ -344,13 +429,13 @@ static bool reserve_pmc_hardware(void)
344 429
345eventsel_fail: 430eventsel_fail:
346 for (i--; i >= 0; i--) 431 for (i--; i >= 0; i--)
347 release_evntsel_nmi(x86_pmu.eventsel + i); 432 release_evntsel_nmi(x86_pmu_config_addr(i));
348 433
349 i = x86_pmu.num_counters; 434 i = x86_pmu.num_counters;
350 435
351perfctr_fail: 436perfctr_fail:
352 for (i--; i >= 0; i--) 437 for (i--; i >= 0; i--)
353 release_perfctr_nmi(x86_pmu.perfctr + i); 438 release_perfctr_nmi(x86_pmu_event_addr(i));
354 439
355 return false; 440 return false;
356} 441}
@@ -360,8 +445,8 @@ static void release_pmc_hardware(void)
360 int i; 445 int i;
361 446
362 for (i = 0; i < x86_pmu.num_counters; i++) { 447 for (i = 0; i < x86_pmu.num_counters; i++) {
363 release_perfctr_nmi(x86_pmu.perfctr + i); 448 release_perfctr_nmi(x86_pmu_event_addr(i));
364 release_evntsel_nmi(x86_pmu.eventsel + i); 449 release_evntsel_nmi(x86_pmu_config_addr(i));
365 } 450 }
366} 451}
367 452
@@ -382,7 +467,7 @@ static bool check_hw_exists(void)
382 * complain and bail. 467 * complain and bail.
383 */ 468 */
384 for (i = 0; i < x86_pmu.num_counters; i++) { 469 for (i = 0; i < x86_pmu.num_counters; i++) {
385 reg = x86_pmu.eventsel + i; 470 reg = x86_pmu_config_addr(i);
386 ret = rdmsrl_safe(reg, &val); 471 ret = rdmsrl_safe(reg, &val);
387 if (ret) 472 if (ret)
388 goto msr_fail; 473 goto msr_fail;
@@ -407,8 +492,8 @@ static bool check_hw_exists(void)
407 * that don't trap on the MSR access and always return 0s. 492 * that don't trap on the MSR access and always return 0s.
408 */ 493 */
409 val = 0xabcdUL; 494 val = 0xabcdUL;
410 ret = checking_wrmsrl(x86_pmu.perfctr, val); 495 ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
411 ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); 496 ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
412 if (ret || val != val_new) 497 if (ret || val != val_new)
413 goto msr_fail; 498 goto msr_fail;
414 499
@@ -442,8 +527,9 @@ static inline int x86_pmu_initialized(void)
442} 527}
443 528
444static inline int 529static inline int
445set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) 530set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
446{ 531{
532 struct perf_event_attr *attr = &event->attr;
447 unsigned int cache_type, cache_op, cache_result; 533 unsigned int cache_type, cache_op, cache_result;
448 u64 config, val; 534 u64 config, val;
449 535
@@ -470,8 +556,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
470 return -EINVAL; 556 return -EINVAL;
471 557
472 hwc->config |= val; 558 hwc->config |= val;
473 559 attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
474 return 0; 560 return x86_pmu_extra_regs(val, event);
475} 561}
476 562
477static int x86_setup_perfctr(struct perf_event *event) 563static int x86_setup_perfctr(struct perf_event *event)
@@ -496,10 +582,10 @@ static int x86_setup_perfctr(struct perf_event *event)
496 } 582 }
497 583
498 if (attr->type == PERF_TYPE_RAW) 584 if (attr->type == PERF_TYPE_RAW)
499 return 0; 585 return x86_pmu_extra_regs(event->attr.config, event);
500 586
501 if (attr->type == PERF_TYPE_HW_CACHE) 587 if (attr->type == PERF_TYPE_HW_CACHE)
502 return set_ext_hw_attr(hwc, attr); 588 return set_ext_hw_attr(hwc, event);
503 589
504 if (attr->config >= x86_pmu.max_events) 590 if (attr->config >= x86_pmu.max_events)
505 return -EINVAL; 591 return -EINVAL;
@@ -617,11 +703,11 @@ static void x86_pmu_disable_all(void)
617 703
618 if (!test_bit(idx, cpuc->active_mask)) 704 if (!test_bit(idx, cpuc->active_mask))
619 continue; 705 continue;
620 rdmsrl(x86_pmu.eventsel + idx, val); 706 rdmsrl(x86_pmu_config_addr(idx), val);
621 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) 707 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
622 continue; 708 continue;
623 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; 709 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
624 wrmsrl(x86_pmu.eventsel + idx, val); 710 wrmsrl(x86_pmu_config_addr(idx), val);
625 } 711 }
626} 712}
627 713
@@ -642,21 +728,26 @@ static void x86_pmu_disable(struct pmu *pmu)
642 x86_pmu.disable_all(); 728 x86_pmu.disable_all();
643} 729}
644 730
731static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
732 u64 enable_mask)
733{
734 if (hwc->extra_reg)
735 wrmsrl(hwc->extra_reg, hwc->extra_config);
736 wrmsrl(hwc->config_base, hwc->config | enable_mask);
737}
738
645static void x86_pmu_enable_all(int added) 739static void x86_pmu_enable_all(int added)
646{ 740{
647 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 741 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
648 int idx; 742 int idx;
649 743
650 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 744 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
651 struct perf_event *event = cpuc->events[idx]; 745 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
652 u64 val;
653 746
654 if (!test_bit(idx, cpuc->active_mask)) 747 if (!test_bit(idx, cpuc->active_mask))
655 continue; 748 continue;
656 749
657 val = event->hw.config; 750 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
658 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
659 wrmsrl(x86_pmu.eventsel + idx, val);
660 } 751 }
661} 752}
662 753
@@ -821,15 +912,10 @@ static inline void x86_assign_hw_event(struct perf_event *event,
821 hwc->event_base = 0; 912 hwc->event_base = 0;
822 } else if (hwc->idx >= X86_PMC_IDX_FIXED) { 913 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
823 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 914 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
824 /* 915 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0;
825 * We set it so that event_base + idx in wrmsr/rdmsr maps to
826 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
827 */
828 hwc->event_base =
829 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
830 } else { 916 } else {
831 hwc->config_base = x86_pmu.eventsel; 917 hwc->config_base = x86_pmu_config_addr(hwc->idx);
832 hwc->event_base = x86_pmu.perfctr; 918 hwc->event_base = x86_pmu_event_addr(hwc->idx);
833 } 919 }
834} 920}
835 921
@@ -915,17 +1001,11 @@ static void x86_pmu_enable(struct pmu *pmu)
915 x86_pmu.enable_all(added); 1001 x86_pmu.enable_all(added);
916} 1002}
917 1003
918static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
919 u64 enable_mask)
920{
921 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
922}
923
924static inline void x86_pmu_disable_event(struct perf_event *event) 1004static inline void x86_pmu_disable_event(struct perf_event *event)
925{ 1005{
926 struct hw_perf_event *hwc = &event->hw; 1006 struct hw_perf_event *hwc = &event->hw;
927 1007
928 wrmsrl(hwc->config_base + hwc->idx, hwc->config); 1008 wrmsrl(hwc->config_base, hwc->config);
929} 1009}
930 1010
931static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1011static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -978,7 +1058,7 @@ x86_perf_event_set_period(struct perf_event *event)
978 */ 1058 */
979 local64_set(&hwc->prev_count, (u64)-left); 1059 local64_set(&hwc->prev_count, (u64)-left);
980 1060
981 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); 1061 wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
982 1062
983 /* 1063 /*
984 * Due to erratum on certan cpu we need 1064 * Due to erratum on certan cpu we need
@@ -986,7 +1066,7 @@ x86_perf_event_set_period(struct perf_event *event)
986 * is updated properly 1066 * is updated properly
987 */ 1067 */
988 if (x86_pmu.perfctr_second_write) { 1068 if (x86_pmu.perfctr_second_write) {
989 wrmsrl(hwc->event_base + idx, 1069 wrmsrl(hwc->event_base,
990 (u64)(-left) & x86_pmu.cntval_mask); 1070 (u64)(-left) & x86_pmu.cntval_mask);
991 } 1071 }
992 1072
@@ -1029,7 +1109,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
1029 1109
1030 /* 1110 /*
1031 * If group events scheduling transaction was started, 1111 * If group events scheduling transaction was started,
1032 * skip the schedulability test here, it will be peformed 1112 * skip the schedulability test here, it will be performed
1033 * at commit time (->commit_txn) as a whole 1113 * at commit time (->commit_txn) as a whole
1034 */ 1114 */
1035 if (cpuc->group_flag & PERF_EVENT_TXN) 1115 if (cpuc->group_flag & PERF_EVENT_TXN)
@@ -1113,8 +1193,8 @@ void perf_event_print_debug(void)
1113 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1193 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1114 1194
1115 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1195 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1116 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1196 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1117 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1197 rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1118 1198
1119 prev_left = per_cpu(pmc_prev_left[idx], cpu); 1199 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1120 1200
@@ -1389,7 +1469,7 @@ static void __init pmu_check_apic(void)
1389 pr_info("no hardware sampling interrupt available.\n"); 1469 pr_info("no hardware sampling interrupt available.\n");
1390} 1470}
1391 1471
1392int __init init_hw_perf_events(void) 1472static int __init init_hw_perf_events(void)
1393{ 1473{
1394 struct event_constraint *c; 1474 struct event_constraint *c;
1395 int err; 1475 int err;
@@ -1608,7 +1688,7 @@ out:
1608 return ret; 1688 return ret;
1609} 1689}
1610 1690
1611int x86_pmu_event_init(struct perf_event *event) 1691static int x86_pmu_event_init(struct perf_event *event)
1612{ 1692{
1613 struct pmu *tmp; 1693 struct pmu *tmp;
1614 int err; 1694 int err;
@@ -1710,7 +1790,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1710 1790
1711 perf_callchain_store(entry, regs->ip); 1791 perf_callchain_store(entry, regs->ip);
1712 1792
1713 dump_trace(NULL, regs, NULL, &backtrace_ops, entry); 1793 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1714} 1794}
1715 1795
1716#ifdef CONFIG_COMPAT 1796#ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 67e2202a6039..461f62bbd774 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -127,6 +127,11 @@ static int amd_pmu_hw_config(struct perf_event *event)
127/* 127/*
128 * AMD64 events are detected based on their event codes. 128 * AMD64 events are detected based on their event codes.
129 */ 129 */
130static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
131{
132 return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
133}
134
130static inline int amd_is_nb_event(struct hw_perf_event *hwc) 135static inline int amd_is_nb_event(struct hw_perf_event *hwc)
131{ 136{
132 return (hwc->config & 0xe0) == 0xe0; 137 return (hwc->config & 0xe0) == 0xe0;
@@ -385,13 +390,181 @@ static __initconst const struct x86_pmu amd_pmu = {
385 .cpu_dead = amd_pmu_cpu_dead, 390 .cpu_dead = amd_pmu_cpu_dead,
386}; 391};
387 392
393/* AMD Family 15h */
394
395#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
396
397#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
398#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
399#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
400#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
401#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
402#define AMD_EVENT_EX_LS 0x000000C0ULL
403#define AMD_EVENT_DE 0x000000D0ULL
404#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
405
406/*
407 * AMD family 15h event code/PMC mappings:
408 *
409 * type = event_code & 0x0F0:
410 *
411 * 0x000 FP PERF_CTL[5:3]
412 * 0x010 FP PERF_CTL[5:3]
413 * 0x020 LS PERF_CTL[5:0]
414 * 0x030 LS PERF_CTL[5:0]
415 * 0x040 DC PERF_CTL[5:0]
416 * 0x050 DC PERF_CTL[5:0]
417 * 0x060 CU PERF_CTL[2:0]
418 * 0x070 CU PERF_CTL[2:0]
419 * 0x080 IC/DE PERF_CTL[2:0]
420 * 0x090 IC/DE PERF_CTL[2:0]
421 * 0x0A0 ---
422 * 0x0B0 ---
423 * 0x0C0 EX/LS PERF_CTL[5:0]
424 * 0x0D0 DE PERF_CTL[2:0]
425 * 0x0E0 NB NB_PERF_CTL[3:0]
426 * 0x0F0 NB NB_PERF_CTL[3:0]
427 *
428 * Exceptions:
429 *
430 * 0x003 FP PERF_CTL[3]
431 * 0x00B FP PERF_CTL[3]
432 * 0x00D FP PERF_CTL[3]
433 * 0x023 DE PERF_CTL[2:0]
434 * 0x02D LS PERF_CTL[3]
435 * 0x02E LS PERF_CTL[3,0]
436 * 0x043 CU PERF_CTL[2:0]
437 * 0x045 CU PERF_CTL[2:0]
438 * 0x046 CU PERF_CTL[2:0]
439 * 0x054 CU PERF_CTL[2:0]
440 * 0x055 CU PERF_CTL[2:0]
441 * 0x08F IC PERF_CTL[0]
442 * 0x187 DE PERF_CTL[0]
443 * 0x188 DE PERF_CTL[0]
444 * 0x0DB EX PERF_CTL[5:0]
445 * 0x0DC LS PERF_CTL[5:0]
446 * 0x0DD LS PERF_CTL[5:0]
447 * 0x0DE LS PERF_CTL[5:0]
448 * 0x0DF LS PERF_CTL[5:0]
449 * 0x1D6 EX PERF_CTL[5:0]
450 * 0x1D8 EX PERF_CTL[5:0]
451 */
452
453static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
454static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
455static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
456static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
457static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
458static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
459
460static struct event_constraint *
461amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
462{
463 unsigned int event_code = amd_get_event_code(&event->hw);
464
465 switch (event_code & AMD_EVENT_TYPE_MASK) {
466 case AMD_EVENT_FP:
467 switch (event_code) {
468 case 0x003:
469 case 0x00B:
470 case 0x00D:
471 return &amd_f15_PMC3;
472 default:
473 return &amd_f15_PMC53;
474 }
475 case AMD_EVENT_LS:
476 case AMD_EVENT_DC:
477 case AMD_EVENT_EX_LS:
478 switch (event_code) {
479 case 0x023:
480 case 0x043:
481 case 0x045:
482 case 0x046:
483 case 0x054:
484 case 0x055:
485 return &amd_f15_PMC20;
486 case 0x02D:
487 return &amd_f15_PMC3;
488 case 0x02E:
489 return &amd_f15_PMC30;
490 default:
491 return &amd_f15_PMC50;
492 }
493 case AMD_EVENT_CU:
494 case AMD_EVENT_IC_DE:
495 case AMD_EVENT_DE:
496 switch (event_code) {
497 case 0x08F:
498 case 0x187:
499 case 0x188:
500 return &amd_f15_PMC0;
501 case 0x0DB ... 0x0DF:
502 case 0x1D6:
503 case 0x1D8:
504 return &amd_f15_PMC50;
505 default:
506 return &amd_f15_PMC20;
507 }
508 case AMD_EVENT_NB:
509 /* not yet implemented */
510 return &emptyconstraint;
511 default:
512 return &emptyconstraint;
513 }
514}
515
516static __initconst const struct x86_pmu amd_pmu_f15h = {
517 .name = "AMD Family 15h",
518 .handle_irq = x86_pmu_handle_irq,
519 .disable_all = x86_pmu_disable_all,
520 .enable_all = x86_pmu_enable_all,
521 .enable = x86_pmu_enable_event,
522 .disable = x86_pmu_disable_event,
523 .hw_config = amd_pmu_hw_config,
524 .schedule_events = x86_schedule_events,
525 .eventsel = MSR_F15H_PERF_CTL,
526 .perfctr = MSR_F15H_PERF_CTR,
527 .event_map = amd_pmu_event_map,
528 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
529 .num_counters = 6,
530 .cntval_bits = 48,
531 .cntval_mask = (1ULL << 48) - 1,
532 .apic = 1,
533 /* use highest bit to detect overflow */
534 .max_period = (1ULL << 47) - 1,
535 .get_event_constraints = amd_get_event_constraints_f15h,
536 /* nortbridge counters not yet implemented: */
537#if 0
538 .put_event_constraints = amd_put_event_constraints,
539
540 .cpu_prepare = amd_pmu_cpu_prepare,
541 .cpu_starting = amd_pmu_cpu_starting,
542 .cpu_dead = amd_pmu_cpu_dead,
543#endif
544};
545
388static __init int amd_pmu_init(void) 546static __init int amd_pmu_init(void)
389{ 547{
390 /* Performance-monitoring supported from K7 and later: */ 548 /* Performance-monitoring supported from K7 and later: */
391 if (boot_cpu_data.x86 < 6) 549 if (boot_cpu_data.x86 < 6)
392 return -ENODEV; 550 return -ENODEV;
393 551
394 x86_pmu = amd_pmu; 552 /*
553 * If core performance counter extensions exists, it must be
554 * family 15h, otherwise fail. See x86_pmu_addr_offset().
555 */
556 switch (boot_cpu_data.x86) {
557 case 0x15:
558 if (!cpu_has_perfctr_core)
559 return -ENODEV;
560 x86_pmu = amd_pmu_f15h;
561 break;
562 default:
563 if (cpu_has_perfctr_core)
564 return -ENODEV;
565 x86_pmu = amd_pmu;
566 break;
567 }
395 568
396 /* Events are common for all AMDs */ 569 /* Events are common for all AMDs */
397 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 570 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 008835c1d79c..8fc2b2cee1da 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,5 +1,27 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/*
15 * Per core state
16 * This used to coordinate shared registers for HT threads.
17 */
18struct intel_percore {
19 raw_spinlock_t lock; /* protect structure */
20 struct er_account regs[MAX_EXTRA_REGS];
21 int refcnt; /* number of threads */
22 unsigned core_id;
23};
24
3/* 25/*
4 * Intel PerfMon, used on Core and later. 26 * Intel PerfMon, used on Core and later.
5 */ 27 */
@@ -64,6 +86,18 @@ static struct event_constraint intel_nehalem_event_constraints[] =
64 EVENT_CONSTRAINT_END 86 EVENT_CONSTRAINT_END
65}; 87};
66 88
89static struct extra_reg intel_nehalem_extra_regs[] =
90{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
92 EVENT_EXTRA_END
93};
94
95static struct event_constraint intel_nehalem_percore_constraints[] =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
67static struct event_constraint intel_westmere_event_constraints[] = 101static struct event_constraint intel_westmere_event_constraints[] =
68{ 102{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -76,6 +110,33 @@ static struct event_constraint intel_westmere_event_constraints[] =
76 EVENT_CONSTRAINT_END 110 EVENT_CONSTRAINT_END
77}; 111};
78 112
113static struct event_constraint intel_snb_event_constraints[] =
114{
115 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END
124};
125
126static struct extra_reg intel_westmere_extra_regs[] =
127{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
130 EVENT_EXTRA_END
131};
132
133static struct event_constraint intel_westmere_percore_constraints[] =
134{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END
138};
139
79static struct event_constraint intel_gen_event_constraints[] = 140static struct event_constraint intel_gen_event_constraints[] =
80{ 141{
81 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 142 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -89,6 +150,106 @@ static u64 intel_pmu_event_map(int hw_event)
89 return intel_perfmon_event_map[hw_event]; 150 return intel_perfmon_event_map[hw_event];
90} 151}
91 152
153static __initconst const u64 snb_hw_cache_event_ids
154 [PERF_COUNT_HW_CACHE_MAX]
155 [PERF_COUNT_HW_CACHE_OP_MAX]
156 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
157{
158 [ C(L1D) ] = {
159 [ C(OP_READ) ] = {
160 [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
161 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
162 },
163 [ C(OP_WRITE) ] = {
164 [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
165 [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
166 },
167 [ C(OP_PREFETCH) ] = {
168 [ C(RESULT_ACCESS) ] = 0x0,
169 [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
170 },
171 },
172 [ C(L1I ) ] = {
173 [ C(OP_READ) ] = {
174 [ C(RESULT_ACCESS) ] = 0x0,
175 [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
176 },
177 [ C(OP_WRITE) ] = {
178 [ C(RESULT_ACCESS) ] = -1,
179 [ C(RESULT_MISS) ] = -1,
180 },
181 [ C(OP_PREFETCH) ] = {
182 [ C(RESULT_ACCESS) ] = 0x0,
183 [ C(RESULT_MISS) ] = 0x0,
184 },
185 },
186 [ C(LL ) ] = {
187 /*
188 * TBD: Need Off-core Response Performance Monitoring support
189 */
190 [ C(OP_READ) ] = {
191 /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
192 [ C(RESULT_ACCESS) ] = 0x01b7,
193 /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
194 [ C(RESULT_MISS) ] = 0x01bb,
195 },
196 [ C(OP_WRITE) ] = {
197 /* OFFCORE_RESPONSE_0.ANY_RFO.LOCAL_CACHE */
198 [ C(RESULT_ACCESS) ] = 0x01b7,
199 /* OFFCORE_RESPONSE_1.ANY_RFO.ANY_LLC_MISS */
200 [ C(RESULT_MISS) ] = 0x01bb,
201 },
202 [ C(OP_PREFETCH) ] = {
203 /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
204 [ C(RESULT_ACCESS) ] = 0x01b7,
205 /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
206 [ C(RESULT_MISS) ] = 0x01bb,
207 },
208 },
209 [ C(DTLB) ] = {
210 [ C(OP_READ) ] = {
211 [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
212 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
213 },
214 [ C(OP_WRITE) ] = {
215 [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
216 [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
217 },
218 [ C(OP_PREFETCH) ] = {
219 [ C(RESULT_ACCESS) ] = 0x0,
220 [ C(RESULT_MISS) ] = 0x0,
221 },
222 },
223 [ C(ITLB) ] = {
224 [ C(OP_READ) ] = {
225 [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
226 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
227 },
228 [ C(OP_WRITE) ] = {
229 [ C(RESULT_ACCESS) ] = -1,
230 [ C(RESULT_MISS) ] = -1,
231 },
232 [ C(OP_PREFETCH) ] = {
233 [ C(RESULT_ACCESS) ] = -1,
234 [ C(RESULT_MISS) ] = -1,
235 },
236 },
237 [ C(BPU ) ] = {
238 [ C(OP_READ) ] = {
239 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
240 [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
241 },
242 [ C(OP_WRITE) ] = {
243 [ C(RESULT_ACCESS) ] = -1,
244 [ C(RESULT_MISS) ] = -1,
245 },
246 [ C(OP_PREFETCH) ] = {
247 [ C(RESULT_ACCESS) ] = -1,
248 [ C(RESULT_MISS) ] = -1,
249 },
250 },
251};
252
92static __initconst const u64 westmere_hw_cache_event_ids 253static __initconst const u64 westmere_hw_cache_event_ids
93 [PERF_COUNT_HW_CACHE_MAX] 254 [PERF_COUNT_HW_CACHE_MAX]
94 [PERF_COUNT_HW_CACHE_OP_MAX] 255 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -124,16 +285,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
124 }, 285 },
125 [ C(LL ) ] = { 286 [ C(LL ) ] = {
126 [ C(OP_READ) ] = { 287 [ C(OP_READ) ] = {
127 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 288 /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
128 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 289 [ C(RESULT_ACCESS) ] = 0x01b7,
290 /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
291 [ C(RESULT_MISS) ] = 0x01bb,
129 }, 292 },
293 /*
294 * Use RFO, not WRITEBACK, because a write miss would typically occur
295 * on RFO.
296 */
130 [ C(OP_WRITE) ] = { 297 [ C(OP_WRITE) ] = {
131 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 298 /* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */
132 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 299 [ C(RESULT_ACCESS) ] = 0x01bb,
300 /* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */
301 [ C(RESULT_MISS) ] = 0x01b7,
133 }, 302 },
134 [ C(OP_PREFETCH) ] = { 303 [ C(OP_PREFETCH) ] = {
135 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 304 /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
136 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 305 [ C(RESULT_ACCESS) ] = 0x01b7,
306 /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
307 [ C(RESULT_MISS) ] = 0x01bb,
137 }, 308 },
138 }, 309 },
139 [ C(DTLB) ] = { 310 [ C(DTLB) ] = {
@@ -180,6 +351,39 @@ static __initconst const u64 westmere_hw_cache_event_ids
180 }, 351 },
181}; 352};
182 353
354/*
355 * OFFCORE_RESPONSE MSR bits (subset), See IA32 SDM Vol 3 30.6.1.3
356 */
357
358#define DMND_DATA_RD (1 << 0)
359#define DMND_RFO (1 << 1)
360#define DMND_WB (1 << 3)
361#define PF_DATA_RD (1 << 4)
362#define PF_DATA_RFO (1 << 5)
363#define RESP_UNCORE_HIT (1 << 8)
364#define RESP_MISS (0xf600) /* non uncore hit */
365
366static __initconst const u64 nehalem_hw_cache_extra_regs
367 [PERF_COUNT_HW_CACHE_MAX]
368 [PERF_COUNT_HW_CACHE_OP_MAX]
369 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
370{
371 [ C(LL ) ] = {
372 [ C(OP_READ) ] = {
373 [ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT,
374 [ C(RESULT_MISS) ] = DMND_DATA_RD|RESP_MISS,
375 },
376 [ C(OP_WRITE) ] = {
377 [ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT,
378 [ C(RESULT_MISS) ] = DMND_RFO|DMND_WB|RESP_MISS,
379 },
380 [ C(OP_PREFETCH) ] = {
381 [ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT,
382 [ C(RESULT_MISS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS,
383 },
384 }
385};
386
183static __initconst const u64 nehalem_hw_cache_event_ids 387static __initconst const u64 nehalem_hw_cache_event_ids
184 [PERF_COUNT_HW_CACHE_MAX] 388 [PERF_COUNT_HW_CACHE_MAX]
185 [PERF_COUNT_HW_CACHE_OP_MAX] 389 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -215,16 +419,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids
215 }, 419 },
216 [ C(LL ) ] = { 420 [ C(LL ) ] = {
217 [ C(OP_READ) ] = { 421 [ C(OP_READ) ] = {
218 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 422 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
219 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 423 [ C(RESULT_ACCESS) ] = 0x01b7,
424 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
425 [ C(RESULT_MISS) ] = 0x01b7,
220 }, 426 },
427 /*
428 * Use RFO, not WRITEBACK, because a write miss would typically occur
429 * on RFO.
430 */
221 [ C(OP_WRITE) ] = { 431 [ C(OP_WRITE) ] = {
222 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 432 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
223 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 433 [ C(RESULT_ACCESS) ] = 0x01b7,
434 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
435 [ C(RESULT_MISS) ] = 0x01b7,
224 }, 436 },
225 [ C(OP_PREFETCH) ] = { 437 [ C(OP_PREFETCH) ] = {
226 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 438 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
227 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 439 [ C(RESULT_ACCESS) ] = 0x01b7,
440 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
441 [ C(RESULT_MISS) ] = 0x01b7,
228 }, 442 },
229 }, 443 },
230 [ C(DTLB) ] = { 444 [ C(DTLB) ] = {
@@ -691,8 +905,8 @@ static void intel_pmu_reset(void)
691 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 905 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
692 906
693 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 907 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
694 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 908 checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
695 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 909 checking_wrmsrl(x86_pmu_event_addr(idx), 0ull);
696 } 910 }
697 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) 911 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
698 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 912 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
@@ -794,6 +1008,67 @@ intel_bts_constraints(struct perf_event *event)
794} 1008}
795 1009
796static struct event_constraint * 1010static struct event_constraint *
1011intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1012{
1013 struct hw_perf_event *hwc = &event->hw;
1014 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
1015 struct event_constraint *c;
1016 struct intel_percore *pc;
1017 struct er_account *era;
1018 int i;
1019 int free_slot;
1020 int found;
1021
1022 if (!x86_pmu.percore_constraints || hwc->extra_alloc)
1023 return NULL;
1024
1025 for (c = x86_pmu.percore_constraints; c->cmask; c++) {
1026 if (e != c->code)
1027 continue;
1028
1029 /*
1030 * Allocate resource per core.
1031 */
1032 pc = cpuc->per_core;
1033 if (!pc)
1034 break;
1035 c = &emptyconstraint;
1036 raw_spin_lock(&pc->lock);
1037 free_slot = -1;
1038 found = 0;
1039 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1040 era = &pc->regs[i];
1041 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1042 /* Allow sharing same config */
1043 if (hwc->extra_config == era->extra_config) {
1044 era->ref++;
1045 cpuc->percore_used = 1;
1046 hwc->extra_alloc = 1;
1047 c = NULL;
1048 }
1049 /* else conflict */
1050 found = 1;
1051 break;
1052 } else if (era->ref == 0 && free_slot == -1)
1053 free_slot = i;
1054 }
1055 if (!found && free_slot != -1) {
1056 era = &pc->regs[free_slot];
1057 era->ref = 1;
1058 era->extra_reg = hwc->extra_reg;
1059 era->extra_config = hwc->extra_config;
1060 cpuc->percore_used = 1;
1061 hwc->extra_alloc = 1;
1062 c = NULL;
1063 }
1064 raw_spin_unlock(&pc->lock);
1065 return c;
1066 }
1067
1068 return NULL;
1069}
1070
1071static struct event_constraint *
797intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1072intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
798{ 1073{
799 struct event_constraint *c; 1074 struct event_constraint *c;
@@ -806,9 +1081,51 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
806 if (c) 1081 if (c)
807 return c; 1082 return c;
808 1083
1084 c = intel_percore_constraints(cpuc, event);
1085 if (c)
1086 return c;
1087
809 return x86_get_event_constraints(cpuc, event); 1088 return x86_get_event_constraints(cpuc, event);
810} 1089}
811 1090
1091static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1092 struct perf_event *event)
1093{
1094 struct extra_reg *er;
1095 struct intel_percore *pc;
1096 struct er_account *era;
1097 struct hw_perf_event *hwc = &event->hw;
1098 int i, allref;
1099
1100 if (!cpuc->percore_used)
1101 return;
1102
1103 for (er = x86_pmu.extra_regs; er->msr; er++) {
1104 if (er->event != (hwc->config & er->config_mask))
1105 continue;
1106
1107 pc = cpuc->per_core;
1108 raw_spin_lock(&pc->lock);
1109 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1110 era = &pc->regs[i];
1111 if (era->ref > 0 &&
1112 era->extra_config == hwc->extra_config &&
1113 era->extra_reg == er->msr) {
1114 era->ref--;
1115 hwc->extra_alloc = 0;
1116 break;
1117 }
1118 }
1119 allref = 0;
1120 for (i = 0; i < MAX_EXTRA_REGS; i++)
1121 allref += pc->regs[i].ref;
1122 if (allref == 0)
1123 cpuc->percore_used = 0;
1124 raw_spin_unlock(&pc->lock);
1125 break;
1126 }
1127}
1128
812static int intel_pmu_hw_config(struct perf_event *event) 1129static int intel_pmu_hw_config(struct perf_event *event)
813{ 1130{
814 int ret = x86_pmu_hw_config(event); 1131 int ret = x86_pmu_hw_config(event);
@@ -880,20 +1197,67 @@ static __initconst const struct x86_pmu core_pmu = {
880 */ 1197 */
881 .max_period = (1ULL << 31) - 1, 1198 .max_period = (1ULL << 31) - 1,
882 .get_event_constraints = intel_get_event_constraints, 1199 .get_event_constraints = intel_get_event_constraints,
1200 .put_event_constraints = intel_put_event_constraints,
883 .event_constraints = intel_core_event_constraints, 1201 .event_constraints = intel_core_event_constraints,
884}; 1202};
885 1203
1204static int intel_pmu_cpu_prepare(int cpu)
1205{
1206 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1207
1208 if (!cpu_has_ht_siblings())
1209 return NOTIFY_OK;
1210
1211 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
1212 GFP_KERNEL, cpu_to_node(cpu));
1213 if (!cpuc->per_core)
1214 return NOTIFY_BAD;
1215
1216 raw_spin_lock_init(&cpuc->per_core->lock);
1217 cpuc->per_core->core_id = -1;
1218 return NOTIFY_OK;
1219}
1220
886static void intel_pmu_cpu_starting(int cpu) 1221static void intel_pmu_cpu_starting(int cpu)
887{ 1222{
1223 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1224 int core_id = topology_core_id(cpu);
1225 int i;
1226
888 init_debug_store_on_cpu(cpu); 1227 init_debug_store_on_cpu(cpu);
889 /* 1228 /*
890 * Deal with CPUs that don't clear their LBRs on power-up. 1229 * Deal with CPUs that don't clear their LBRs on power-up.
891 */ 1230 */
892 intel_pmu_lbr_reset(); 1231 intel_pmu_lbr_reset();
1232
1233 if (!cpu_has_ht_siblings())
1234 return;
1235
1236 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1237 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
1238
1239 if (pc && pc->core_id == core_id) {
1240 kfree(cpuc->per_core);
1241 cpuc->per_core = pc;
1242 break;
1243 }
1244 }
1245
1246 cpuc->per_core->core_id = core_id;
1247 cpuc->per_core->refcnt++;
893} 1248}
894 1249
895static void intel_pmu_cpu_dying(int cpu) 1250static void intel_pmu_cpu_dying(int cpu)
896{ 1251{
1252 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1253 struct intel_percore *pc = cpuc->per_core;
1254
1255 if (pc) {
1256 if (pc->core_id == -1 || --pc->refcnt == 0)
1257 kfree(pc);
1258 cpuc->per_core = NULL;
1259 }
1260
897 fini_debug_store_on_cpu(cpu); 1261 fini_debug_store_on_cpu(cpu);
898} 1262}
899 1263
@@ -918,7 +1282,9 @@ static __initconst const struct x86_pmu intel_pmu = {
918 */ 1282 */
919 .max_period = (1ULL << 31) - 1, 1283 .max_period = (1ULL << 31) - 1,
920 .get_event_constraints = intel_get_event_constraints, 1284 .get_event_constraints = intel_get_event_constraints,
1285 .put_event_constraints = intel_put_event_constraints,
921 1286
1287 .cpu_prepare = intel_pmu_cpu_prepare,
922 .cpu_starting = intel_pmu_cpu_starting, 1288 .cpu_starting = intel_pmu_cpu_starting,
923 .cpu_dying = intel_pmu_cpu_dying, 1289 .cpu_dying = intel_pmu_cpu_dying,
924}; 1290};
@@ -1024,6 +1390,7 @@ static __init int intel_pmu_init(void)
1024 intel_pmu_lbr_init_core(); 1390 intel_pmu_lbr_init_core();
1025 1391
1026 x86_pmu.event_constraints = intel_core2_event_constraints; 1392 x86_pmu.event_constraints = intel_core2_event_constraints;
1393 x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
1027 pr_cont("Core2 events, "); 1394 pr_cont("Core2 events, ");
1028 break; 1395 break;
1029 1396
@@ -1032,11 +1399,16 @@ static __init int intel_pmu_init(void)
1032 case 46: /* 45 nm nehalem-ex, "Beckton" */ 1399 case 46: /* 45 nm nehalem-ex, "Beckton" */
1033 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 1400 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1034 sizeof(hw_cache_event_ids)); 1401 sizeof(hw_cache_event_ids));
1402 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1403 sizeof(hw_cache_extra_regs));
1035 1404
1036 intel_pmu_lbr_init_nhm(); 1405 intel_pmu_lbr_init_nhm();
1037 1406
1038 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1407 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1408 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1409 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1039 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1410 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1411 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1040 pr_cont("Nehalem events, "); 1412 pr_cont("Nehalem events, ");
1041 break; 1413 break;
1042 1414
@@ -1047,6 +1419,7 @@ static __init int intel_pmu_init(void)
1047 intel_pmu_lbr_init_atom(); 1419 intel_pmu_lbr_init_atom();
1048 1420
1049 x86_pmu.event_constraints = intel_gen_event_constraints; 1421 x86_pmu.event_constraints = intel_gen_event_constraints;
1422 x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
1050 pr_cont("Atom events, "); 1423 pr_cont("Atom events, ");
1051 break; 1424 break;
1052 1425
@@ -1054,14 +1427,30 @@ static __init int intel_pmu_init(void)
1054 case 44: /* 32 nm nehalem, "Gulftown" */ 1427 case 44: /* 32 nm nehalem, "Gulftown" */
1055 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 1428 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
1056 sizeof(hw_cache_event_ids)); 1429 sizeof(hw_cache_event_ids));
1430 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1431 sizeof(hw_cache_extra_regs));
1057 1432
1058 intel_pmu_lbr_init_nhm(); 1433 intel_pmu_lbr_init_nhm();
1059 1434
1060 x86_pmu.event_constraints = intel_westmere_event_constraints; 1435 x86_pmu.event_constraints = intel_westmere_event_constraints;
1436 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1061 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1437 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1438 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1439 x86_pmu.extra_regs = intel_westmere_extra_regs;
1062 pr_cont("Westmere events, "); 1440 pr_cont("Westmere events, ");
1063 break; 1441 break;
1064 1442
1443 case 42: /* SandyBridge */
1444 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1445 sizeof(hw_cache_event_ids));
1446
1447 intel_pmu_lbr_init_nhm();
1448
1449 x86_pmu.event_constraints = intel_snb_event_constraints;
1450 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1451 pr_cont("SandyBridge events, ");
1452 break;
1453
1065 default: 1454 default:
1066 /* 1455 /*
1067 * default constraints for v2 and up 1456 * default constraints for v2 and up
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index b7dcd9f2b8a0..bab491b8ee25 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -361,30 +361,70 @@ static int intel_pmu_drain_bts_buffer(void)
361/* 361/*
362 * PEBS 362 * PEBS
363 */ 363 */
364static struct event_constraint intel_core2_pebs_event_constraints[] = {
365 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
366 INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
367 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
368 INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
369 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
370 EVENT_CONSTRAINT_END
371};
372
373static struct event_constraint intel_atom_pebs_event_constraints[] = {
374 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
375 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
376 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
377 EVENT_CONSTRAINT_END
378};
364 379
365static struct event_constraint intel_core_pebs_events[] = { 380static struct event_constraint intel_nehalem_pebs_event_constraints[] = {
366 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ 381 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
367 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ 382 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
368 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ 383 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
369 PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ 384 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
370 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ 385 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
371 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 386 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
372 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ 387 INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
373 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 388 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
374 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ 389 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
390 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
391 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
375 EVENT_CONSTRAINT_END 392 EVENT_CONSTRAINT_END
376}; 393};
377 394
378static struct event_constraint intel_nehalem_pebs_events[] = { 395static struct event_constraint intel_westmere_pebs_event_constraints[] = {
379 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ 396 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
380 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ 397 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
381 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ 398 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
382 PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */ 399 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
383 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ 400 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
384 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 401 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
385 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ 402 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
386 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 403 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
387 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ 404 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
405 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
406 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
407 EVENT_CONSTRAINT_END
408};
409
410static struct event_constraint intel_snb_pebs_events[] = {
411 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
412 INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
413 INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
414 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
415 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
416 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
417 INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
418 INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
419 INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
420 INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
421 INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
422 INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
423 INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
424 INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
425 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
426 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
427 INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
388 EVENT_CONSTRAINT_END 428 EVENT_CONSTRAINT_END
389}; 429};
390 430
@@ -695,20 +735,17 @@ static void intel_ds_init(void)
695 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); 735 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
696 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); 736 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
697 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; 737 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
698 x86_pmu.pebs_constraints = intel_core_pebs_events;
699 break; 738 break;
700 739
701 case 1: 740 case 1:
702 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); 741 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
703 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); 742 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
704 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; 743 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
705 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
706 break; 744 break;
707 745
708 default: 746 default:
709 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); 747 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
710 x86_pmu.pebs = 0; 748 x86_pmu.pebs = 0;
711 break;
712 } 749 }
713 } 750 }
714} 751}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ff751a9f182b..0811f5ebfba6 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Netburst Perfomance Events (P4, old Xeon) 2 * Netburst Performance Events (P4, old Xeon)
3 * 3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> 4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> 5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
@@ -679,7 +679,7 @@ static int p4_validate_raw_event(struct perf_event *event)
679 */ 679 */
680 680
681 /* 681 /*
682 * if an event is shared accross the logical threads 682 * if an event is shared across the logical threads
683 * the user needs special permissions to be able to use it 683 * the user needs special permissions to be able to use it
684 */ 684 */
685 if (p4_ht_active() && p4_event_bind_map[v].shared) { 685 if (p4_ht_active() && p4_event_bind_map[v].shared) {
@@ -764,9 +764,9 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
764 u64 v; 764 u64 v;
765 765
766 /* an official way for overflow indication */ 766 /* an official way for overflow indication */
767 rdmsrl(hwc->config_base + hwc->idx, v); 767 rdmsrl(hwc->config_base, v);
768 if (v & P4_CCCR_OVF) { 768 if (v & P4_CCCR_OVF) {
769 wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF); 769 wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
770 return 1; 770 return 1;
771 } 771 }
772 772
@@ -790,13 +790,13 @@ static void p4_pmu_disable_pebs(void)
790 * 790 *
791 * It's still allowed that two threads setup same cache 791 * It's still allowed that two threads setup same cache
792 * events so we can't simply clear metrics until we knew 792 * events so we can't simply clear metrics until we knew
793 * noone is depending on us, so we need kind of counter 793 * no one is depending on us, so we need kind of counter
794 * for "ReplayEvent" users. 794 * for "ReplayEvent" users.
795 * 795 *
796 * What is more complex -- RAW events, if user (for some 796 * What is more complex -- RAW events, if user (for some
797 * reason) will pass some cache event metric with improper 797 * reason) will pass some cache event metric with improper
798 * event opcode -- it's fine from hardware point of view 798 * event opcode -- it's fine from hardware point of view
799 * but completely nonsence from "meaning" of such action. 799 * but completely nonsense from "meaning" of such action.
800 * 800 *
801 * So at moment let leave metrics turned on forever -- it's 801 * So at moment let leave metrics turned on forever -- it's
802 * ok for now but need to be revisited! 802 * ok for now but need to be revisited!
@@ -815,7 +815,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
815 * state we need to clear P4_CCCR_OVF, otherwise interrupt get 815 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
816 * asserted again and again 816 * asserted again and again
817 */ 817 */
818 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 818 (void)checking_wrmsrl(hwc->config_base,
819 (u64)(p4_config_unpack_cccr(hwc->config)) & 819 (u64)(p4_config_unpack_cccr(hwc->config)) &
820 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); 820 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
821} 821}
@@ -885,7 +885,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
885 p4_pmu_enable_pebs(hwc->config); 885 p4_pmu_enable_pebs(hwc->config);
886 886
887 (void)checking_wrmsrl(escr_addr, escr_conf); 887 (void)checking_wrmsrl(escr_addr, escr_conf);
888 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 888 (void)checking_wrmsrl(hwc->config_base,
889 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); 889 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
890} 890}
891 891
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 34ba07be2cda..20c097e33860 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -68,7 +68,7 @@ p6_pmu_disable_event(struct perf_event *event)
68 if (cpuc->enabled) 68 if (cpuc->enabled)
69 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 69 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
70 70
71 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 71 (void)checking_wrmsrl(hwc->config_base, val);
72} 72}
73 73
74static void p6_pmu_enable_event(struct perf_event *event) 74static void p6_pmu_enable_event(struct perf_event *event)
@@ -81,7 +81,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
81 if (cpuc->enabled) 81 if (cpuc->enabled)
82 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 82 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
83 83
84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base, val);
85} 85}
86 86
87static __initconst const struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d5a236615501..966512b2cacf 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -46,6 +46,8 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
46 /* returns the bit offset of the performance counter register */ 46 /* returns the bit offset of the performance counter register */
47 switch (boot_cpu_data.x86_vendor) { 47 switch (boot_cpu_data.x86_vendor) {
48 case X86_VENDOR_AMD: 48 case X86_VENDOR_AMD:
49 if (msr >= MSR_F15H_PERF_CTR)
50 return (msr - MSR_F15H_PERF_CTR) >> 1;
49 return msr - MSR_K7_PERFCTR0; 51 return msr - MSR_K7_PERFCTR0;
50 case X86_VENDOR_INTEL: 52 case X86_VENDOR_INTEL:
51 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 53 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -70,6 +72,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
70 /* returns the bit offset of the event selection register */ 72 /* returns the bit offset of the event selection register */
71 switch (boot_cpu_data.x86_vendor) { 73 switch (boot_cpu_data.x86_vendor) {
72 case X86_VENDOR_AMD: 74 case X86_VENDOR_AMD:
75 if (msr >= MSR_F15H_PERF_CTL)
76 return (msr - MSR_F15H_PERF_CTL) >> 1;
73 return msr - MSR_K7_EVNTSEL0; 77 return msr - MSR_K7_EVNTSEL0;
74 case X86_VENDOR_INTEL: 78 case X86_VENDOR_INTEL:
75 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 79 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 227b0448960d..d22d0c4edcfd 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -86,7 +86,7 @@ static void __init vmware_platform_setup(void)
86} 86}
87 87
88/* 88/*
89 * While checking the dmi string infomation, just checking the product 89 * While checking the dmi string information, just checking the product
90 * serial key should be enough, as this will always have a VMware 90 * serial key should be enough, as this will always have a VMware
91 * specific string when running under VMware hypervisor. 91 * specific string when running under VMware hypervisor.
92 */ 92 */
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 000000000000..7a8cebc9ff29
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,441 @@
1/*
2 * Architecture specific OF callbacks.
3 */
4#include <linux/bootmem.h>
5#include <linux/io.h>
6#include <linux/interrupt.h>
7#include <linux/list.h>
8#include <linux/of.h>
9#include <linux/of_fdt.h>
10#include <linux/of_address.h>
11#include <linux/of_platform.h>
12#include <linux/of_irq.h>
13#include <linux/slab.h>
14#include <linux/pci.h>
15#include <linux/of_pci.h>
16
17#include <asm/hpet.h>
18#include <asm/irq_controller.h>
19#include <asm/apic.h>
20#include <asm/pci_x86.h>
21
22__initdata u64 initial_dtb;
23char __initdata cmd_line[COMMAND_LINE_SIZE];
24static LIST_HEAD(irq_domains);
25static DEFINE_RAW_SPINLOCK(big_irq_lock);
26
27int __initdata of_ioapic;
28
29#ifdef CONFIG_X86_IO_APIC
30static void add_interrupt_host(struct irq_domain *ih)
31{
32 unsigned long flags;
33
34 raw_spin_lock_irqsave(&big_irq_lock, flags);
35 list_add(&ih->l, &irq_domains);
36 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
37}
38#endif
39
40static struct irq_domain *get_ih_from_node(struct device_node *controller)
41{
42 struct irq_domain *ih, *found = NULL;
43 unsigned long flags;
44
45 raw_spin_lock_irqsave(&big_irq_lock, flags);
46 list_for_each_entry(ih, &irq_domains, l) {
47 if (ih->controller == controller) {
48 found = ih;
49 break;
50 }
51 }
52 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
53 return found;
54}
55
56unsigned int irq_create_of_mapping(struct device_node *controller,
57 const u32 *intspec, unsigned int intsize)
58{
59 struct irq_domain *ih;
60 u32 virq, type;
61 int ret;
62
63 ih = get_ih_from_node(controller);
64 if (!ih)
65 return 0;
66 ret = ih->xlate(ih, intspec, intsize, &virq, &type);
67 if (ret)
68 return ret;
69 if (type == IRQ_TYPE_NONE)
70 return virq;
71 /* set the mask if it is different from current */
72 if (type == (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK))
73 set_irq_type(virq, type);
74 return virq;
75}
76EXPORT_SYMBOL_GPL(irq_create_of_mapping);
77
78unsigned long pci_address_to_pio(phys_addr_t address)
79{
80 /*
81 * The ioport address can be directly used by inX / outX
82 */
83 BUG_ON(address >= (1 << 16));
84 return (unsigned long)address;
85}
86EXPORT_SYMBOL_GPL(pci_address_to_pio);
87
88void __init early_init_dt_scan_chosen_arch(unsigned long node)
89{
90 BUG();
91}
92
93void __init early_init_dt_add_memory_arch(u64 base, u64 size)
94{
95 BUG();
96}
97
98void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
99{
100 return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
101}
102
103void __init add_dtb(u64 data)
104{
105 initial_dtb = data + offsetof(struct setup_data, data);
106}
107
108/*
109 * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
110 */
111static struct of_device_id __initdata ce4100_ids[] = {
112 { .compatible = "intel,ce4100-cp", },
113 { .compatible = "isa", },
114 { .compatible = "pci", },
115 {},
116};
117
118static int __init add_bus_probe(void)
119{
120 if (!of_have_populated_dt())
121 return 0;
122
123 return of_platform_bus_probe(NULL, ce4100_ids, NULL);
124}
125module_init(add_bus_probe);
126
127#ifdef CONFIG_PCI
128static int x86_of_pci_irq_enable(struct pci_dev *dev)
129{
130 struct of_irq oirq;
131 u32 virq;
132 int ret;
133 u8 pin;
134
135 ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
136 if (ret)
137 return ret;
138 if (!pin)
139 return 0;
140
141 ret = of_irq_map_pci(dev, &oirq);
142 if (ret)
143 return ret;
144
145 virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
146 oirq.size);
147 if (virq == 0)
148 return -EINVAL;
149 dev->irq = virq;
150 return 0;
151}
152
153static void x86_of_pci_irq_disable(struct pci_dev *dev)
154{
155}
156
157void __cpuinit x86_of_pci_init(void)
158{
159 struct device_node *np;
160
161 pcibios_enable_irq = x86_of_pci_irq_enable;
162 pcibios_disable_irq = x86_of_pci_irq_disable;
163
164 for_each_node_by_type(np, "pci") {
165 const void *prop;
166 struct pci_bus *bus;
167 unsigned int bus_min;
168 struct device_node *child;
169
170 prop = of_get_property(np, "bus-range", NULL);
171 if (!prop)
172 continue;
173 bus_min = be32_to_cpup(prop);
174
175 bus = pci_find_bus(0, bus_min);
176 if (!bus) {
177 printk(KERN_ERR "Can't find a node for bus %s.\n",
178 np->full_name);
179 continue;
180 }
181
182 if (bus->self)
183 bus->self->dev.of_node = np;
184 else
185 bus->dev.of_node = np;
186
187 for_each_child_of_node(np, child) {
188 struct pci_dev *dev;
189 u32 devfn;
190
191 prop = of_get_property(child, "reg", NULL);
192 if (!prop)
193 continue;
194
195 devfn = (be32_to_cpup(prop) >> 8) & 0xff;
196 dev = pci_get_slot(bus, devfn);
197 if (!dev)
198 continue;
199 dev->dev.of_node = child;
200 pci_dev_put(dev);
201 }
202 }
203}
204#endif
205
206static void __init dtb_setup_hpet(void)
207{
208#ifdef CONFIG_HPET_TIMER
209 struct device_node *dn;
210 struct resource r;
211 int ret;
212
213 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
214 if (!dn)
215 return;
216 ret = of_address_to_resource(dn, 0, &r);
217 if (ret) {
218 WARN_ON(1);
219 return;
220 }
221 hpet_address = r.start;
222#endif
223}
224
225static void __init dtb_lapic_setup(void)
226{
227#ifdef CONFIG_X86_LOCAL_APIC
228 struct device_node *dn;
229 struct resource r;
230 int ret;
231
232 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
233 if (!dn)
234 return;
235
236 ret = of_address_to_resource(dn, 0, &r);
237 if (WARN_ON(ret))
238 return;
239
240 /* Did the boot loader setup the local APIC ? */
241 if (!cpu_has_apic) {
242 if (apic_force_enable(r.start))
243 return;
244 }
245 smp_found_config = 1;
246 pic_mode = 1;
247 register_lapic_address(r.start);
248 generic_processor_info(boot_cpu_physical_apicid,
249 GET_APIC_VERSION(apic_read(APIC_LVR)));
250#endif
251}
252
253#ifdef CONFIG_X86_IO_APIC
254static unsigned int ioapic_id;
255
256static void __init dtb_add_ioapic(struct device_node *dn)
257{
258 struct resource r;
259 int ret;
260
261 ret = of_address_to_resource(dn, 0, &r);
262 if (ret) {
263 printk(KERN_ERR "Can't obtain address from node %s.\n",
264 dn->full_name);
265 return;
266 }
267 mp_register_ioapic(++ioapic_id, r.start, gsi_top);
268}
269
270static void __init dtb_ioapic_setup(void)
271{
272 struct device_node *dn;
273
274 for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
275 dtb_add_ioapic(dn);
276
277 if (nr_ioapics) {
278 of_ioapic = 1;
279 return;
280 }
281 printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
282}
283#else
284static void __init dtb_ioapic_setup(void) {}
285#endif
286
287static void __init dtb_apic_setup(void)
288{
289 dtb_lapic_setup();
290 dtb_ioapic_setup();
291}
292
293#ifdef CONFIG_OF_FLATTREE
294static void __init x86_flattree_get_config(void)
295{
296 u32 size, map_len;
297 void *new_dtb;
298
299 if (!initial_dtb)
300 return;
301
302 map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK),
303 (u64)sizeof(struct boot_param_header));
304
305 initial_boot_params = early_memremap(initial_dtb, map_len);
306 size = be32_to_cpu(initial_boot_params->totalsize);
307 if (map_len < size) {
308 early_iounmap(initial_boot_params, map_len);
309 initial_boot_params = early_memremap(initial_dtb, size);
310 map_len = size;
311 }
312
313 new_dtb = alloc_bootmem(size);
314 memcpy(new_dtb, initial_boot_params, size);
315 early_iounmap(initial_boot_params, map_len);
316
317 initial_boot_params = new_dtb;
318
319 /* root level address cells */
320 of_scan_flat_dt(early_init_dt_scan_root, NULL);
321
322 unflatten_device_tree();
323}
324#else
325static inline void x86_flattree_get_config(void) { }
326#endif
327
328void __init x86_dtb_init(void)
329{
330 x86_flattree_get_config();
331
332 if (!of_have_populated_dt())
333 return;
334
335 dtb_setup_hpet();
336 dtb_apic_setup();
337}
338
339#ifdef CONFIG_X86_IO_APIC
340
341struct of_ioapic_type {
342 u32 out_type;
343 u32 trigger;
344 u32 polarity;
345};
346
347static struct of_ioapic_type of_ioapic_type[] =
348{
349 {
350 .out_type = IRQ_TYPE_EDGE_RISING,
351 .trigger = IOAPIC_EDGE,
352 .polarity = 1,
353 },
354 {
355 .out_type = IRQ_TYPE_LEVEL_LOW,
356 .trigger = IOAPIC_LEVEL,
357 .polarity = 0,
358 },
359 {
360 .out_type = IRQ_TYPE_LEVEL_HIGH,
361 .trigger = IOAPIC_LEVEL,
362 .polarity = 1,
363 },
364 {
365 .out_type = IRQ_TYPE_EDGE_FALLING,
366 .trigger = IOAPIC_EDGE,
367 .polarity = 0,
368 },
369};
370
371static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
372 u32 *out_hwirq, u32 *out_type)
373{
374 struct io_apic_irq_attr attr;
375 struct of_ioapic_type *it;
376 u32 line, idx, type;
377
378 if (intsize < 2)
379 return -EINVAL;
380
381 line = *intspec;
382 idx = (u32) id->priv;
383 *out_hwirq = line + mp_gsi_routing[idx].gsi_base;
384
385 intspec++;
386 type = *intspec;
387
388 if (type >= ARRAY_SIZE(of_ioapic_type))
389 return -EINVAL;
390
391 it = of_ioapic_type + type;
392 *out_type = it->out_type;
393
394 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
395
396 return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr);
397}
398
399static void __init ioapic_add_ofnode(struct device_node *np)
400{
401 struct resource r;
402 int i, ret;
403
404 ret = of_address_to_resource(np, 0, &r);
405 if (ret) {
406 printk(KERN_ERR "Failed to obtain address for %s\n",
407 np->full_name);
408 return;
409 }
410
411 for (i = 0; i < nr_ioapics; i++) {
412 if (r.start == mp_ioapics[i].apicaddr) {
413 struct irq_domain *id;
414
415 id = kzalloc(sizeof(*id), GFP_KERNEL);
416 BUG_ON(!id);
417 id->controller = np;
418 id->xlate = ioapic_xlate;
419 id->priv = (void *)i;
420 add_interrupt_host(id);
421 return;
422 }
423 }
424 printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
425}
426
427void __init x86_add_irq_domains(void)
428{
429 struct device_node *dp;
430
431 if (!of_have_populated_dt())
432 return;
433
434 for_each_node_with_property(dp, "interrupt-controller") {
435 if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
436 ioapic_add_ofnode(dp);
437 }
438}
439#else
440void __init x86_add_irq_domains(void) { }
441#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index df20723a6a1b..81ac6c78c01c 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = {
175 175
176void 176void
177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
178 unsigned long *stack, char *log_lvl) 178 unsigned long *stack, unsigned long bp, char *log_lvl)
179{ 179{
180 printk("%sCall Trace:\n", log_lvl); 180 printk("%sCall Trace:\n", log_lvl);
181 dump_trace(task, regs, stack, &print_trace_ops, log_lvl); 181 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
182} 182}
183 183
184void show_trace(struct task_struct *task, struct pt_regs *regs, 184void show_trace(struct task_struct *task, struct pt_regs *regs,
185 unsigned long *stack) 185 unsigned long *stack, unsigned long bp)
186{ 186{
187 show_trace_log_lvl(task, regs, stack, ""); 187 show_trace_log_lvl(task, regs, stack, bp, "");
188} 188}
189 189
190void show_stack(struct task_struct *task, unsigned long *sp) 190void show_stack(struct task_struct *task, unsigned long *sp)
191{ 191{
192 show_stack_log_lvl(task, NULL, sp, ""); 192 show_stack_log_lvl(task, NULL, sp, 0, "");
193} 193}
194 194
195/* 195/*
@@ -197,14 +197,16 @@ void show_stack(struct task_struct *task, unsigned long *sp)
197 */ 197 */
198void dump_stack(void) 198void dump_stack(void)
199{ 199{
200 unsigned long bp;
200 unsigned long stack; 201 unsigned long stack;
201 202
203 bp = stack_frame(current, NULL);
202 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 204 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
203 current->pid, current->comm, print_tainted(), 205 current->pid, current->comm, print_tainted(),
204 init_utsname()->release, 206 init_utsname()->release,
205 (int)strcspn(init_utsname()->version, " "), 207 (int)strcspn(init_utsname()->version, " "),
206 init_utsname()->version); 208 init_utsname()->version);
207 show_trace(NULL, NULL, &stack); 209 show_trace(NULL, NULL, &stack, bp);
208} 210}
209EXPORT_SYMBOL(dump_stack); 211EXPORT_SYMBOL(dump_stack);
210 212
@@ -320,41 +322,6 @@ void die(const char *str, struct pt_regs *regs, long err)
320 oops_end(flags, regs, sig); 322 oops_end(flags, regs, sig);
321} 323}
322 324
323void notrace __kprobes
324die_nmi(char *str, struct pt_regs *regs, int do_panic)
325{
326 unsigned long flags;
327
328 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
329 return;
330
331 /*
332 * We are in trouble anyway, lets at least try
333 * to get a message out.
334 */
335 flags = oops_begin();
336 printk(KERN_EMERG "%s", str);
337 printk(" on CPU%d, ip %08lx, registers:\n",
338 smp_processor_id(), regs->ip);
339 show_registers(regs);
340 oops_end(flags, regs, 0);
341 if (do_panic || panic_on_oops)
342 panic("Non maskable interrupt");
343 nmi_exit();
344 local_irq_enable();
345 do_exit(SIGBUS);
346}
347
348static int __init oops_setup(char *s)
349{
350 if (!s)
351 return -EINVAL;
352 if (!strcmp(s, "panic"))
353 panic_on_oops = 1;
354 return 0;
355}
356early_param("oops", oops_setup);
357
358static int __init kstack_setup(char *s) 325static int __init kstack_setup(char *s)
359{ 326{
360 if (!s) 327 if (!s)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 74cc1eda384b..3b97a80ce329 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,12 +17,11 @@
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19 19
20void dump_trace(struct task_struct *task, 20void dump_trace(struct task_struct *task, struct pt_regs *regs,
21 struct pt_regs *regs, unsigned long *stack, 21 unsigned long *stack, unsigned long bp,
22 const struct stacktrace_ops *ops, void *data) 22 const struct stacktrace_ops *ops, void *data)
23{ 23{
24 int graph = 0; 24 int graph = 0;
25 unsigned long bp;
26 25
27 if (!task) 26 if (!task)
28 task = current; 27 task = current;
@@ -35,7 +34,9 @@ void dump_trace(struct task_struct *task,
35 stack = (unsigned long *)task->thread.sp; 34 stack = (unsigned long *)task->thread.sp;
36 } 35 }
37 36
38 bp = stack_frame(task, regs); 37 if (!bp)
38 bp = stack_frame(task, regs);
39
39 for (;;) { 40 for (;;) {
40 struct thread_info *context; 41 struct thread_info *context;
41 42
@@ -55,7 +56,7 @@ EXPORT_SYMBOL(dump_trace);
55 56
56void 57void
57show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 58show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
58 unsigned long *sp, char *log_lvl) 59 unsigned long *sp, unsigned long bp, char *log_lvl)
59{ 60{
60 unsigned long *stack; 61 unsigned long *stack;
61 int i; 62 int i;
@@ -77,7 +78,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
77 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
78 } 79 }
79 printk(KERN_CONT "\n"); 80 printk(KERN_CONT "\n");
80 show_trace_log_lvl(task, regs, sp, log_lvl); 81 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
81} 82}
82 83
83 84
@@ -102,7 +103,7 @@ void show_registers(struct pt_regs *regs)
102 u8 *ip; 103 u8 *ip;
103 104
104 printk(KERN_EMERG "Stack:\n"); 105 printk(KERN_EMERG "Stack:\n");
105 show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG); 106 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
106 107
107 printk(KERN_EMERG "Code: "); 108 printk(KERN_EMERG "Code: ");
108 109
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a6b6fcf7f0ae..e71c98d3c0d2 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
140 */ 140 */
141 141
142void dump_trace(struct task_struct *task, 142void dump_trace(struct task_struct *task, struct pt_regs *regs,
143 struct pt_regs *regs, unsigned long *stack, 143 unsigned long *stack, unsigned long bp,
144 const struct stacktrace_ops *ops, void *data) 144 const struct stacktrace_ops *ops, void *data)
145{ 145{
146 const unsigned cpu = get_cpu(); 146 const unsigned cpu = get_cpu();
@@ -150,7 +150,6 @@ void dump_trace(struct task_struct *task,
150 struct thread_info *tinfo; 150 struct thread_info *tinfo;
151 int graph = 0; 151 int graph = 0;
152 unsigned long dummy; 152 unsigned long dummy;
153 unsigned long bp;
154 153
155 if (!task) 154 if (!task)
156 task = current; 155 task = current;
@@ -161,7 +160,8 @@ void dump_trace(struct task_struct *task,
161 stack = (unsigned long *)task->thread.sp; 160 stack = (unsigned long *)task->thread.sp;
162 } 161 }
163 162
164 bp = stack_frame(task, regs); 163 if (!bp)
164 bp = stack_frame(task, regs);
165 /* 165 /*
166 * Print function call entries in all stacks, starting at the 166 * Print function call entries in all stacks, starting at the
167 * current stack address. If the stacks consist of nested 167 * current stack address. If the stacks consist of nested
@@ -225,7 +225,7 @@ EXPORT_SYMBOL(dump_trace);
225 225
226void 226void
227show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 227show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
228 unsigned long *sp, char *log_lvl) 228 unsigned long *sp, unsigned long bp, char *log_lvl)
229{ 229{
230 unsigned long *irq_stack_end; 230 unsigned long *irq_stack_end;
231 unsigned long *irq_stack; 231 unsigned long *irq_stack;
@@ -269,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
269 preempt_enable(); 269 preempt_enable();
270 270
271 printk(KERN_CONT "\n"); 271 printk(KERN_CONT "\n");
272 show_trace_log_lvl(task, regs, sp, log_lvl); 272 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
273} 273}
274 274
275void show_registers(struct pt_regs *regs) 275void show_registers(struct pt_regs *regs)
@@ -298,7 +298,7 @@ void show_registers(struct pt_regs *regs)
298 298
299 printk(KERN_EMERG "Stack:\n"); 299 printk(KERN_EMERG "Stack:\n");
300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
301 KERN_EMERG); 301 0, KERN_EMERG);
302 302
303 printk(KERN_EMERG "Code: "); 303 printk(KERN_EMERG "Code: ");
304 304
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 294f26da0c0c..cdf5bfd9d4d5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -667,21 +667,15 @@ __init void e820_setup_gap(void)
667 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of 667 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
668 * linked list of struct setup_data, which is parsed here. 668 * linked list of struct setup_data, which is parsed here.
669 */ 669 */
670void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) 670void __init parse_e820_ext(struct setup_data *sdata)
671{ 671{
672 u32 map_len;
673 int entries; 672 int entries;
674 struct e820entry *extmap; 673 struct e820entry *extmap;
675 674
676 entries = sdata->len / sizeof(struct e820entry); 675 entries = sdata->len / sizeof(struct e820entry);
677 map_len = sdata->len + sizeof(struct setup_data);
678 if (map_len > PAGE_SIZE)
679 sdata = early_ioremap(pa_data, map_len);
680 extmap = (struct e820entry *)(sdata->data); 676 extmap = (struct e820entry *)(sdata->data);
681 __append_e820_map(extmap, entries); 677 __append_e820_map(extmap, entries);
682 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 678 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
683 if (map_len > PAGE_SIZE)
684 early_iounmap(sdata, map_len);
685 printk(KERN_INFO "extended physical RAM map:\n"); 679 printk(KERN_INFO "extended physical RAM map:\n");
686 e820_print_map("extended"); 680 e820_print_map("extended");
687} 681}
@@ -847,15 +841,21 @@ static int __init parse_memopt(char *p)
847 if (!p) 841 if (!p)
848 return -EINVAL; 842 return -EINVAL;
849 843
850#ifdef CONFIG_X86_32
851 if (!strcmp(p, "nopentium")) { 844 if (!strcmp(p, "nopentium")) {
845#ifdef CONFIG_X86_32
852 setup_clear_cpu_cap(X86_FEATURE_PSE); 846 setup_clear_cpu_cap(X86_FEATURE_PSE);
853 return 0; 847 return 0;
854 } 848#else
849 printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
850 return -EINVAL;
855#endif 851#endif
852 }
856 853
857 userdef = 1; 854 userdef = 1;
858 mem_size = memparse(p, &p); 855 mem_size = memparse(p, &p);
856 /* don't remove all of memory when handling "mem={invalid}" param */
857 if (mem_size == 0)
858 return -EINVAL;
859 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); 859 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
860 860
861 return 0; 861 return 0;
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 9efbdcc56425..3755ef494390 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -159,7 +159,12 @@ static void __init ati_bugs_contd(int num, int slot, int func)
159 if (rev >= 0x40) 159 if (rev >= 0x40)
160 acpi_fix_pin2_polarity = 1; 160 acpi_fix_pin2_polarity = 1;
161 161
162 if (rev > 0x13) 162 /*
163 * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
164 * SB700: revisions 0x39, 0x3a, ...
165 * SB800: revisions 0x40, 0x41, ...
166 */
167 if (rev >= 0x39)
163 return; 168 return;
164 169
165 if (acpi_use_timer_override) 170 if (acpi_use_timer_override)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efad7ebb..5c1a91974918 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
65#define sysexit_audit syscall_exit_work 65#define sysexit_audit syscall_exit_work
66#endif 66#endif
67 67
68 .section .entry.text, "ax"
69
68/* 70/*
69 * We use macros for low-level operations which need to be overridden 71 * We use macros for low-level operations which need to be overridden
70 * for paravirtualization. The following will never clobber any registers: 72 * for paravirtualization. The following will never clobber any registers:
@@ -395,7 +397,7 @@ sysenter_past_esp:
395 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 397 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
396 * pushed above; +8 corresponds to copy_thread's esp0 setting. 398 * pushed above; +8 corresponds to copy_thread's esp0 setting.
397 */ 399 */
398 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp) 400 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
399 CFI_REL_OFFSET eip, 0 401 CFI_REL_OFFSET eip, 0
400 402
401 pushl_cfi %eax 403 pushl_cfi %eax
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
788 */ 790 */
789.section .init.rodata,"a" 791.section .init.rodata,"a"
790ENTRY(interrupt) 792ENTRY(interrupt)
791.text 793.section .entry.text, "ax"
792 .p2align 5 794 .p2align 5
793 .p2align CONFIG_X86_L1_CACHE_SHIFT 795 .p2align CONFIG_X86_L1_CACHE_SHIFT
794ENTRY(irq_entries_start) 796ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
807 .endif 809 .endif
808 .previous 810 .previous
809 .long 1b 811 .long 1b
810 .text 812 .section .entry.text, "ax"
811vector=vector+1 813vector=vector+1
812 .endif 814 .endif
813 .endr 815 .endr
@@ -1409,11 +1411,10 @@ END(general_protection)
1409#ifdef CONFIG_KVM_GUEST 1411#ifdef CONFIG_KVM_GUEST
1410ENTRY(async_page_fault) 1412ENTRY(async_page_fault)
1411 RING0_EC_FRAME 1413 RING0_EC_FRAME
1412 pushl $do_async_page_fault 1414 pushl_cfi $do_async_page_fault
1413 CFI_ADJUST_CFA_OFFSET 4
1414 jmp error_code 1415 jmp error_code
1415 CFI_ENDPROC 1416 CFI_ENDPROC
1416END(apf_page_fault) 1417END(async_page_fault)
1417#endif 1418#endif
1418 1419
1419/* 1420/*
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index aed1ffbeb0c9..8a445a0c989e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -18,7 +18,7 @@
18 * A note on terminology: 18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP 19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers upto R11. 21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
23 * 23 *
24 * Some macro usage: 24 * Some macro usage:
@@ -61,6 +61,8 @@
61#define __AUDIT_ARCH_LE 0x40000000 61#define __AUDIT_ARCH_LE 0x40000000
62 62
63 .code64 63 .code64
64 .section .entry.text, "ax"
65
64#ifdef CONFIG_FUNCTION_TRACER 66#ifdef CONFIG_FUNCTION_TRACER
65#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 68ENTRY(mcount)
@@ -420,7 +422,7 @@ ENTRY(ret_from_fork)
420END(ret_from_fork) 422END(ret_from_fork)
421 423
422/* 424/*
423 * System call entry. Upto 6 arguments in registers are supported. 425 * System call entry. Up to 6 arguments in registers are supported.
424 * 426 *
425 * SYSCALL does not save anything on the stack and does not change the 427 * SYSCALL does not save anything on the stack and does not change the
426 * stack pointer. 428 * stack pointer.
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
744 */ 746 */
745 .section .init.rodata,"a" 747 .section .init.rodata,"a"
746ENTRY(interrupt) 748ENTRY(interrupt)
747 .text 749 .section .entry.text
748 .p2align 5 750 .p2align 5
749 .p2align CONFIG_X86_L1_CACHE_SHIFT 751 .p2align CONFIG_X86_L1_CACHE_SHIFT
750ENTRY(irq_entries_start) 752ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
763 .endif 765 .endif
764 .previous 766 .previous
765 .quad 1b 767 .quad 1b
766 .text 768 .section .entry.text
767vector=vector+1 769vector=vector+1
768 .endif 770 .endif
769 .endr 771 .endr
@@ -975,9 +977,12 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
975 x86_platform_ipi smp_x86_platform_ipi 977 x86_platform_ipi smp_x86_platform_ipi
976 978
977#ifdef CONFIG_SMP 979#ifdef CONFIG_SMP
978.irpc idx, "01234567" 980.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
981 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
982.if NUM_INVALIDATE_TLB_VECTORS > \idx
979apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ 983apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
980 invalidate_interrupt\idx smp_invalidate_interrupt 984 invalidate_interrupt\idx smp_invalidate_interrupt
985.endif
981.endr 986.endr
982#endif 987#endif
983 988
@@ -1248,7 +1253,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1248 decl PER_CPU_VAR(irq_count) 1253 decl PER_CPU_VAR(irq_count)
1249 jmp error_exit 1254 jmp error_exit
1250 CFI_ENDPROC 1255 CFI_ENDPROC
1251END(do_hypervisor_callback) 1256END(xen_do_hypervisor_callback)
1252 1257
1253/* 1258/*
1254 * Hypervisor uses this for application faults while it executes. 1259 * Hypervisor uses this for application faults while it executes.
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 382eb2936d4d..a93742a57468 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -437,18 +437,19 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
437 return; 437 return;
438 } 438 }
439 439
440 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
441 frame_pointer) == -EBUSY) {
442 *parent = old;
443 return;
444 }
445
446 trace.func = self_addr; 440 trace.func = self_addr;
441 trace.depth = current->curr_ret_stack + 1;
447 442
448 /* Only trace if the calling function expects to */ 443 /* Only trace if the calling function expects to */
449 if (!ftrace_graph_entry(&trace)) { 444 if (!ftrace_graph_entry(&trace)) {
450 current->curr_ret_stack--;
451 *parent = old; 445 *parent = old;
446 return;
447 }
448
449 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
450 frame_pointer) == -EBUSY) {
451 *parent = old;
452 return;
452 } 453 }
453} 454}
454#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 455#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 7f138b3c3c52..d6d6bb361931 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -34,15 +34,6 @@ void __init i386_start_kernel(void)
34{ 34{
35 memblock_init(); 35 memblock_init();
36 36
37#ifdef CONFIG_X86_TRAMPOLINE
38 /*
39 * But first pinch a few for the stack/trampoline stuff
40 * FIXME: Don't need the extra page at 4K, but need to fix
41 * trampoline before removing it. (see the GDT stuff)
42 */
43 memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
44#endif
45
46 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 37 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
47 38
48#ifdef CONFIG_BLK_DEV_INITRD 39#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2d2673c28aff..5655c2272adb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -77,9 +77,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
77 /* Make NULL pointers segfault */ 77 /* Make NULL pointers segfault */
78 zap_identity_mappings(); 78 zap_identity_mappings();
79 79
80 /* Cleanup the over mapped high alias */
81 cleanup_highmap();
82
83 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; 80 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
84 81
85 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { 82 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 767d6c43de37..ce0be7cd085e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -73,7 +73,7 @@ MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
73 */ 73 */
74KERNEL_PAGES = LOWMEM_PAGES 74KERNEL_PAGES = LOWMEM_PAGES
75 75
76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm 76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
77RESERVE_BRK(pagetables, INIT_MAP_SIZE) 77RESERVE_BRK(pagetables, INIT_MAP_SIZE)
78 78
79/* 79/*
@@ -137,7 +137,7 @@ ENTRY(startup_32)
137 movsl 137 movsl
1381: 1381:
139 139
140#ifdef CONFIG_OLPC_OPENFIRMWARE 140#ifdef CONFIG_OLPC
141 /* save OFW's pgdir table for later use when calling into OFW */ 141 /* save OFW's pgdir table for later use when calling into OFW */
142 movl %cr3, %eax 142 movl %cr3, %eax
143 movl %eax, pa(olpc_ofw_pgd) 143 movl %eax, pa(olpc_ofw_pgd)
@@ -623,7 +623,7 @@ ENTRY(initial_code)
623 * BSS section 623 * BSS section
624 */ 624 */
625__PAGE_ALIGNED_BSS 625__PAGE_ALIGNED_BSS
626 .align PAGE_SIZE_asm 626 .align PAGE_SIZE
627#ifdef CONFIG_X86_PAE 627#ifdef CONFIG_X86_PAE
628initial_pg_pmd: 628initial_pg_pmd:
629 .fill 1024*KPMDS,4,0 629 .fill 1024*KPMDS,4,0
@@ -644,7 +644,7 @@ ENTRY(swapper_pg_dir)
644#ifdef CONFIG_X86_PAE 644#ifdef CONFIG_X86_PAE
645__PAGE_ALIGNED_DATA 645__PAGE_ALIGNED_DATA
646 /* Page-aligned for the benefit of paravirt? */ 646 /* Page-aligned for the benefit of paravirt? */
647 .align PAGE_SIZE_asm 647 .align PAGE_SIZE
648ENTRY(initial_page_table) 648ENTRY(initial_page_table)
649 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ 649 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
650# if KPMDS == 3 650# if KPMDS == 3
@@ -662,7 +662,7 @@ ENTRY(initial_page_table)
662# else 662# else
663# error "Kernel PMDs should be 1, 2 or 3" 663# error "Kernel PMDs should be 1, 2 or 3"
664# endif 664# endif
665 .align PAGE_SIZE_asm /* needs to be page-sized too */ 665 .align PAGE_SIZE /* needs to be page-sized too */
666#endif 666#endif
667 667
668.data 668.data
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 239046bd447f..e11e39478a49 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -136,10 +136,9 @@ ident_complete:
136 /* Fixup phys_base */ 136 /* Fixup phys_base */
137 addq %rbp, phys_base(%rip) 137 addq %rbp, phys_base(%rip)
138 138
139#ifdef CONFIG_X86_TRAMPOLINE 139 /* Fixup trampoline */
140 addq %rbp, trampoline_level4_pgt + 0(%rip) 140 addq %rbp, trampoline_level4_pgt + 0(%rip)
141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip) 141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
142#endif
143 142
144 /* Due to ENTRY(), sometimes the empty space gets filled with 143 /* Due to ENTRY(), sometimes the empty space gets filled with
145 * zeros. Better take a jmp than relying on empty space being 144 * zeros. Better take a jmp than relying on empty space being
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 4ff5968f12d2..bfe8f729e086 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -503,7 +503,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
503 if (!irq) 503 if (!irq)
504 return -EINVAL; 504 return -EINVAL;
505 505
506 set_irq_data(irq, dev); 506 irq_set_handler_data(irq, dev);
507 507
508 if (hpet_setup_msi_irq(irq)) 508 if (hpet_setup_msi_irq(irq))
509 return -EINVAL; 509 return -EINVAL;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e60c38cc0eed..12aff2537682 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -145,7 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_finit);
145 * The _current_ task is using the FPU for the first time 145 * The _current_ task is using the FPU for the first time
146 * so initialize it and set the mxcsr to its default 146 * so initialize it and set the mxcsr to its default
147 * value at reset if we support XMM instructions and then 147 * value at reset if we support XMM instructions and then
148 * remeber the current task has used the FPU. 148 * remember the current task has used the FPU.
149 */ 149 */
150int init_fpu(struct task_struct *tsk) 150int init_fpu(struct task_struct *tsk)
151{ 151{
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 20757cb2efa3..d9ca749c123b 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -112,7 +112,7 @@ static void make_8259A_irq(unsigned int irq)
112{ 112{
113 disable_irq_nosync(irq); 113 disable_irq_nosync(irq);
114 io_apic_irqs &= ~(1<<irq); 114 io_apic_irqs &= ~(1<<irq);
115 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, 115 irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
116 i8259A_chip.name); 116 i8259A_chip.name);
117 enable_irq(irq); 117 enable_irq(irq);
118} 118}
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8eec0ec59af2..8c968974253d 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,22 +14,9 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/bitmap.h>
17#include <asm/syscalls.h> 18#include <asm/syscalls.h>
18 19
19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
20static void set_bitmap(unsigned long *bitmap, unsigned int base,
21 unsigned int extent, int new_value)
22{
23 unsigned int i;
24
25 for (i = base; i < base + extent; i++) {
26 if (new_value)
27 __set_bit(i, bitmap);
28 else
29 __clear_bit(i, bitmap);
30 }
31}
32
33/* 20/*
34 * this changes the io permissions bitmap in the current task. 21 * this changes the io permissions bitmap in the current task.
35 */ 22 */
@@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
69 */ 56 */
70 tss = &per_cpu(init_tss, get_cpu()); 57 tss = &per_cpu(init_tss, get_cpu());
71 58
72 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); 59 if (turn_on)
60 bitmap_clear(t->io_bitmap_ptr, from, num);
61 else
62 bitmap_set(t->io_bitmap_ptr, from, num);
73 63
74 /* 64 /*
75 * Search for a (possibly new) maximum. This is simple and stupid, 65 * Search for a (possibly new) maximum. This is simple and stupid,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 387b6a0c9e81..948a31eae75f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -44,9 +44,9 @@ void ack_bad_irq(unsigned int irq)
44 44
45#define irq_stats(x) (&per_cpu(irq_stat, x)) 45#define irq_stats(x) (&per_cpu(irq_stat, x))
46/* 46/*
47 * /proc/interrupts printing: 47 * /proc/interrupts printing for arch specific interrupts
48 */ 48 */
49static int show_other_interrupts(struct seq_file *p, int prec) 49int arch_show_interrupts(struct seq_file *p, int prec)
50{ 50{
51 int j; 51 int j;
52 52
@@ -122,59 +122,6 @@ static int show_other_interrupts(struct seq_file *p, int prec)
122 return 0; 122 return 0;
123} 123}
124 124
125int show_interrupts(struct seq_file *p, void *v)
126{
127 unsigned long flags, any_count = 0;
128 int i = *(loff_t *) v, j, prec;
129 struct irqaction *action;
130 struct irq_desc *desc;
131
132 if (i > nr_irqs)
133 return 0;
134
135 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
136 j *= 10;
137
138 if (i == nr_irqs)
139 return show_other_interrupts(p, prec);
140
141 /* print header */
142 if (i == 0) {
143 seq_printf(p, "%*s", prec + 8, "");
144 for_each_online_cpu(j)
145 seq_printf(p, "CPU%-8d", j);
146 seq_putc(p, '\n');
147 }
148
149 desc = irq_to_desc(i);
150 if (!desc)
151 return 0;
152
153 raw_spin_lock_irqsave(&desc->lock, flags);
154 for_each_online_cpu(j)
155 any_count |= kstat_irqs_cpu(i, j);
156 action = desc->action;
157 if (!action && !any_count)
158 goto out;
159
160 seq_printf(p, "%*d: ", prec, i);
161 for_each_online_cpu(j)
162 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
163 seq_printf(p, " %8s", desc->irq_data.chip->name);
164 seq_printf(p, "-%-8s", desc->name);
165
166 if (action) {
167 seq_printf(p, " %s", action->name);
168 while ((action = action->next) != NULL)
169 seq_printf(p, ", %s", action->name);
170 }
171
172 seq_putc(p, '\n');
173out:
174 raw_spin_unlock_irqrestore(&desc->lock, flags);
175 return 0;
176}
177
178/* 125/*
179 * /proc/stat helpers 126 * /proc/stat helpers
180 */ 127 */
@@ -276,15 +223,6 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
276 223
277EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 224EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
278 225
279#ifdef CONFIG_OF
280unsigned int irq_create_of_mapping(struct device_node *controller,
281 const u32 *intspec, unsigned int intsize)
282{
283 return intspec[0];
284}
285EXPORT_SYMBOL_GPL(irq_create_of_mapping);
286#endif
287
288#ifdef CONFIG_HOTPLUG_CPU 226#ifdef CONFIG_HOTPLUG_CPU
289/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 227/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
290void fixup_irqs(void) 228void fixup_irqs(void)
@@ -293,6 +231,7 @@ void fixup_irqs(void)
293 static int warned; 231 static int warned;
294 struct irq_desc *desc; 232 struct irq_desc *desc;
295 struct irq_data *data; 233 struct irq_data *data;
234 struct irq_chip *chip;
296 235
297 for_each_irq_desc(irq, desc) { 236 for_each_irq_desc(irq, desc) {
298 int break_affinity = 0; 237 int break_affinity = 0;
@@ -307,10 +246,10 @@ void fixup_irqs(void)
307 /* interrupt's are disabled at this point */ 246 /* interrupt's are disabled at this point */
308 raw_spin_lock(&desc->lock); 247 raw_spin_lock(&desc->lock);
309 248
310 data = &desc->irq_data; 249 data = irq_desc_get_irq_data(desc);
311 affinity = data->affinity; 250 affinity = data->affinity;
312 if (!irq_has_action(irq) || 251 if (!irq_has_action(irq) ||
313 cpumask_equal(affinity, cpu_online_mask)) { 252 cpumask_subset(affinity, cpu_online_mask)) {
314 raw_spin_unlock(&desc->lock); 253 raw_spin_unlock(&desc->lock);
315 continue; 254 continue;
316 } 255 }
@@ -327,16 +266,17 @@ void fixup_irqs(void)
327 affinity = cpu_all_mask; 266 affinity = cpu_all_mask;
328 } 267 }
329 268
330 if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask) 269 chip = irq_data_get_irq_chip(data);
331 data->chip->irq_mask(data); 270 if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
271 chip->irq_mask(data);
332 272
333 if (data->chip->irq_set_affinity) 273 if (chip->irq_set_affinity)
334 data->chip->irq_set_affinity(data, affinity, true); 274 chip->irq_set_affinity(data, affinity, true);
335 else if (!(warned++)) 275 else if (!(warned++))
336 set_affinity = 0; 276 set_affinity = 0;
337 277
338 if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask) 278 if (!irqd_can_move_in_process_context(data) && chip->irq_unmask)
339 data->chip->irq_unmask(data); 279 chip->irq_unmask(data);
340 280
341 raw_spin_unlock(&desc->lock); 281 raw_spin_unlock(&desc->lock);
342 282
@@ -368,10 +308,11 @@ void fixup_irqs(void)
368 irq = __this_cpu_read(vector_irq[vector]); 308 irq = __this_cpu_read(vector_irq[vector]);
369 309
370 desc = irq_to_desc(irq); 310 desc = irq_to_desc(irq);
371 data = &desc->irq_data; 311 data = irq_desc_get_irq_data(desc);
312 chip = irq_data_get_irq_chip(data);
372 raw_spin_lock(&desc->lock); 313 raw_spin_lock(&desc->lock);
373 if (data->chip->irq_retrigger) 314 if (chip->irq_retrigger)
374 data->chip->irq_retrigger(data); 315 chip->irq_retrigger(data);
375 raw_spin_unlock(&desc->lock); 316 raw_spin_unlock(&desc->lock);
376 } 317 }
377 } 318 }
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9974d21048fd..72090705a656 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -172,7 +172,7 @@ asmlinkage void do_softirq(void)
172 172
173 call_on_stack(__do_softirq, isp); 173 call_on_stack(__do_softirq, isp);
174 /* 174 /*
175 * Shouldnt happen, we returned above if in_interrupt(): 175 * Shouldn't happen, we returned above if in_interrupt():
176 */ 176 */
177 WARN_ON_ONCE(softirq_count()); 177 WARN_ON_ONCE(softirq_count());
178 } 178 }
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c752e973958d..f470e4ef993e 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -25,6 +25,7 @@
25#include <asm/setup.h> 25#include <asm/setup.h>
26#include <asm/i8259.h> 26#include <asm/i8259.h>
27#include <asm/traps.h> 27#include <asm/traps.h>
28#include <asm/prom.h>
28 29
29/* 30/*
30 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: 31 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -71,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
71static struct irqaction fpu_irq = { 72static struct irqaction fpu_irq = {
72 .handler = math_error_irq, 73 .handler = math_error_irq,
73 .name = "fpu", 74 .name = "fpu",
75 .flags = IRQF_NO_THREAD,
74}; 76};
75#endif 77#endif
76 78
@@ -80,6 +82,7 @@ static struct irqaction fpu_irq = {
80static struct irqaction irq2 = { 82static struct irqaction irq2 = {
81 .handler = no_action, 83 .handler = no_action,
82 .name = "cascade", 84 .name = "cascade",
85 .flags = IRQF_NO_THREAD,
83}; 86};
84 87
85DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 88DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@ -110,7 +113,7 @@ void __init init_ISA_irqs(void)
110 legacy_pic->init(0); 113 legacy_pic->init(0);
111 114
112 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) 115 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
113 set_irq_chip_and_handler_name(i, chip, handle_level_irq, name); 116 irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
114} 117}
115 118
116void __init init_IRQ(void) 119void __init init_IRQ(void)
@@ -118,6 +121,12 @@ void __init init_IRQ(void)
118 int i; 121 int i;
119 122
120 /* 123 /*
124 * We probably need a better place for this, but it works for
125 * now ...
126 */
127 x86_add_irq_domains();
128
129 /*
121 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. 130 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
122 * If these IRQ's are handled by legacy interrupt-controllers like PIC, 131 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
123 * then this configuration will likely be static after the boot. If 132 * then this configuration will likely be static after the boot. If
@@ -164,14 +173,77 @@ static void __init smp_intr_init(void)
164 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 173 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
165 174
166 /* IPIs for invalidation */ 175 /* IPIs for invalidation */
167 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); 176#define ALLOC_INVTLB_VEC(NR) \
168 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); 177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
169 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); 178 invalidate_interrupt##NR)
170 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); 179
171 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); 180 switch (NUM_INVALIDATE_TLB_VECTORS) {
172 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); 181 default:
173 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); 182 ALLOC_INVTLB_VEC(31);
174 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); 183 case 31:
184 ALLOC_INVTLB_VEC(30);
185 case 30:
186 ALLOC_INVTLB_VEC(29);
187 case 29:
188 ALLOC_INVTLB_VEC(28);
189 case 28:
190 ALLOC_INVTLB_VEC(27);
191 case 27:
192 ALLOC_INVTLB_VEC(26);
193 case 26:
194 ALLOC_INVTLB_VEC(25);
195 case 25:
196 ALLOC_INVTLB_VEC(24);
197 case 24:
198 ALLOC_INVTLB_VEC(23);
199 case 23:
200 ALLOC_INVTLB_VEC(22);
201 case 22:
202 ALLOC_INVTLB_VEC(21);
203 case 21:
204 ALLOC_INVTLB_VEC(20);
205 case 20:
206 ALLOC_INVTLB_VEC(19);
207 case 19:
208 ALLOC_INVTLB_VEC(18);
209 case 18:
210 ALLOC_INVTLB_VEC(17);
211 case 17:
212 ALLOC_INVTLB_VEC(16);
213 case 16:
214 ALLOC_INVTLB_VEC(15);
215 case 15:
216 ALLOC_INVTLB_VEC(14);
217 case 14:
218 ALLOC_INVTLB_VEC(13);
219 case 13:
220 ALLOC_INVTLB_VEC(12);
221 case 12:
222 ALLOC_INVTLB_VEC(11);
223 case 11:
224 ALLOC_INVTLB_VEC(10);
225 case 10:
226 ALLOC_INVTLB_VEC(9);
227 case 9:
228 ALLOC_INVTLB_VEC(8);
229 case 8:
230 ALLOC_INVTLB_VEC(7);
231 case 7:
232 ALLOC_INVTLB_VEC(6);
233 case 6:
234 ALLOC_INVTLB_VEC(5);
235 case 5:
236 ALLOC_INVTLB_VEC(4);
237 case 4:
238 ALLOC_INVTLB_VEC(3);
239 case 3:
240 ALLOC_INVTLB_VEC(2);
241 case 2:
242 ALLOC_INVTLB_VEC(1);
243 case 1:
244 ALLOC_INVTLB_VEC(0);
245 break;
246 }
175 247
176 /* IPI for generic function call */ 248 /* IPI for generic function call */
177 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 249 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -243,7 +315,7 @@ void __init native_init_IRQ(void)
243 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); 315 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
244 } 316 }
245 317
246 if (!acpi_ioapic) 318 if (!acpi_ioapic && !of_ioapic)
247 setup_irq(2, &irq2); 319 setup_irq(2, &irq2);
248 320
249#ifdef CONFIG_X86_32 321#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index a4130005028a..dba0b36941a5 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -278,7 +278,7 @@ static int hw_break_release_slot(int breakno)
278 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); 278 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
279 if (dbg_release_bp_slot(*pevent)) 279 if (dbg_release_bp_slot(*pevent))
280 /* 280 /*
281 * The debugger is responisble for handing the retry on 281 * The debugger is responsible for handing the retry on
282 * remove failure. 282 * remove failure.
283 */ 283 */
284 return -1; 284 return -1;
@@ -533,15 +533,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
533 } 533 }
534 return NOTIFY_DONE; 534 return NOTIFY_DONE;
535 535
536 case DIE_NMIWATCHDOG:
537 if (atomic_read(&kgdb_active) != -1) {
538 /* KGDB CPU roundup: */
539 kgdb_nmicallback(raw_smp_processor_id(), regs);
540 return NOTIFY_STOP;
541 }
542 /* Enter debugger: */
543 break;
544
545 case DIE_DEBUG: 536 case DIE_DEBUG:
546 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { 537 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
547 if (user_mode(regs)) 538 if (user_mode(regs))
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index d91c477b3f62..c969fd9d1566 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
1276 if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) 1276 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1277 return 0; 1277 return 0;
1278 1278
1279 /*
1280 * Do not optimize in the entry code due to the unstable
1281 * stack handling.
1282 */
1283 if ((paddr >= (unsigned long )__entry_text_start) &&
1284 (paddr < (unsigned long )__entry_text_end))
1285 return 0;
1286
1279 /* Check there is enough space for a relative jump. */ 1287 /* Check there is enough space for a relative jump. */
1280 if (size - offset < RELATIVEJUMP_SIZE) 1288 if (size - offset < RELATIVEJUMP_SIZE)
1281 return 0; 1289 return 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8dc44662394b..33c07b0b122e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -493,7 +493,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
493 native_smp_prepare_boot_cpu(); 493 native_smp_prepare_boot_cpu();
494} 494}
495 495
496static void kvm_guest_cpu_online(void *dummy) 496static void __cpuinit kvm_guest_cpu_online(void *dummy)
497{ 497{
498 kvm_guest_cpu_init(); 498 kvm_guest_cpu_init();
499} 499}
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 63eaf6596233..177183cbb6ae 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -259,7 +259,7 @@ static int __init mca_init(void)
259 /* 259 /*
260 * WARNING: Be careful when making changes here. Putting an adapter 260 * WARNING: Be careful when making changes here. Putting an adapter
261 * and the motherboard simultaneously into setup mode may result in 261 * and the motherboard simultaneously into setup mode may result in
262 * damage to chips (according to The Indispensible PC Hardware Book 262 * damage to chips (according to The Indispensable PC Hardware Book
263 * by Hans-Peter Messmer). Also, we disable system interrupts (so 263 * by Hans-Peter Messmer). Also, we disable system interrupts (so
264 * that we are not disturbed in the middle of this). 264 * that we are not disturbed in the middle of this).
265 */ 265 */
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 0fe6d1a66c38..c5610384ab16 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,7 +66,6 @@ struct microcode_amd {
66 unsigned int mpb[0]; 66 unsigned int mpb[0];
67}; 67};
68 68
69#define UCODE_MAX_SIZE 2048
70#define UCODE_CONTAINER_SECTION_HDR 8 69#define UCODE_CONTAINER_SECTION_HDR 8
71#define UCODE_CONTAINER_HEADER_SIZE 12 70#define UCODE_CONTAINER_HEADER_SIZE 12
72 71
@@ -77,20 +76,20 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
77 struct cpuinfo_x86 *c = &cpu_data(cpu); 76 struct cpuinfo_x86 *c = &cpu_data(cpu);
78 u32 dummy; 77 u32 dummy;
79 78
80 memset(csig, 0, sizeof(*csig));
81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 79 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " 80 pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
83 "supported\n", cpu, c->x86);
84 return -1; 81 return -1;
85 } 82 }
83
86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 84 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); 85 pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
86
88 return 0; 87 return 0;
89} 88}
90 89
91static int get_matching_microcode(int cpu, void *mc, int rev) 90static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
91 int rev)
92{ 92{
93 struct microcode_header_amd *mc_header = mc;
94 unsigned int current_cpu_id; 93 unsigned int current_cpu_id;
95 u16 equiv_cpu_id = 0; 94 u16 equiv_cpu_id = 0;
96 unsigned int i = 0; 95 unsigned int i = 0;
@@ -109,17 +108,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
109 if (!equiv_cpu_id) 108 if (!equiv_cpu_id)
110 return 0; 109 return 0;
111 110
112 if (mc_header->processor_rev_id != equiv_cpu_id) 111 if (mc_hdr->processor_rev_id != equiv_cpu_id)
113 return 0; 112 return 0;
114 113
115 /* ucode might be chipset specific -- currently we don't support this */ 114 /* ucode might be chipset specific -- currently we don't support this */
116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 115 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
117 pr_err("CPU%d: loading of chipset specific code not yet supported\n", 116 pr_err("CPU%d: chipset specific code not yet supported\n",
118 cpu); 117 cpu);
119 return 0; 118 return 0;
120 } 119 }
121 120
122 if (mc_header->patch_id <= rev) 121 if (mc_hdr->patch_id <= rev)
123 return 0; 122 return 0;
124 123
125 return 1; 124 return 1;
@@ -144,71 +143,93 @@ static int apply_microcode_amd(int cpu)
144 143
145 /* check current patch id and patch's id for match */ 144 /* check current patch id and patch's id for match */
146 if (rev != mc_amd->hdr.patch_id) { 145 if (rev != mc_amd->hdr.patch_id) {
147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n", 146 pr_err("CPU%d: update failed for patch_level=0x%08x\n",
148 cpu, mc_amd->hdr.patch_id); 147 cpu, mc_amd->hdr.patch_id);
149 return -1; 148 return -1;
150 } 149 }
151 150
152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); 151 pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
153 uci->cpu_sig.rev = rev; 152 uci->cpu_sig.rev = rev;
154 153
155 return 0; 154 return 0;
156} 155}
157 156
158static void * 157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
159get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
160{ 158{
161 unsigned int total_size; 159 struct cpuinfo_x86 *c = &cpu_data(cpu);
162 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 160 unsigned int max_size, actual_size;
163 void *mc; 161
162#define F1XH_MPB_MAX_SIZE 2048
163#define F14H_MPB_MAX_SIZE 1824
164#define F15H_MPB_MAX_SIZE 4096
165
166 switch (c->x86) {
167 case 0x14:
168 max_size = F14H_MPB_MAX_SIZE;
169 break;
170 case 0x15:
171 max_size = F15H_MPB_MAX_SIZE;
172 break;
173 default:
174 max_size = F1XH_MPB_MAX_SIZE;
175 break;
176 }
164 177
165 get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR); 178 actual_size = buf[4] + (buf[5] << 8);
166 179
167 if (section_hdr[0] != UCODE_UCODE_TYPE) { 180 if (actual_size > size || actual_size > max_size) {
168 pr_err("error: invalid type field in container file section header\n"); 181 pr_err("section size mismatch\n");
169 return NULL; 182 return 0;
170 } 183 }
171 184
172 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 185 return actual_size;
186}
173 187
174 if (total_size > size || total_size > UCODE_MAX_SIZE) { 188static struct microcode_header_amd *
175 pr_err("error: size mismatch\n"); 189get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
176 return NULL; 190{
191 struct microcode_header_amd *mc = NULL;
192 unsigned int actual_size = 0;
193
194 if (buf[0] != UCODE_UCODE_TYPE) {
195 pr_err("invalid type field in container file section header\n");
196 goto out;
177 } 197 }
178 198
179 mc = vzalloc(UCODE_MAX_SIZE); 199 actual_size = verify_ucode_size(cpu, buf, size);
200 if (!actual_size)
201 goto out;
202
203 mc = vzalloc(actual_size);
180 if (!mc) 204 if (!mc)
181 return NULL; 205 goto out;
182 206
183 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size); 207 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
184 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; 208 *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
185 209
210out:
186 return mc; 211 return mc;
187} 212}
188 213
189static int install_equiv_cpu_table(const u8 *buf) 214static int install_equiv_cpu_table(const u8 *buf)
190{ 215{
191 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 216 unsigned int *ibuf = (unsigned int *)buf;
192 unsigned int *buf_pos = (unsigned int *)container_hdr; 217 unsigned int type = ibuf[1];
193 unsigned long size; 218 unsigned int size = ibuf[2];
194 219
195 get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE); 220 if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
196 221 pr_err("empty section/"
197 size = buf_pos[2]; 222 "invalid type field in container file section header\n");
198 223 return -EINVAL;
199 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
200 pr_err("error: invalid type field in container file section header\n");
201 return 0;
202 } 224 }
203 225
204 equiv_cpu_table = vmalloc(size); 226 equiv_cpu_table = vmalloc(size);
205 if (!equiv_cpu_table) { 227 if (!equiv_cpu_table) {
206 pr_err("failed to allocate equivalent CPU table\n"); 228 pr_err("failed to allocate equivalent CPU table\n");
207 return 0; 229 return -ENOMEM;
208 } 230 }
209 231
210 buf += UCODE_CONTAINER_HEADER_SIZE; 232 get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
211 get_ucode_data(equiv_cpu_table, buf, size);
212 233
213 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 234 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
214} 235}
@@ -223,16 +244,16 @@ static enum ucode_state
223generic_load_microcode(int cpu, const u8 *data, size_t size) 244generic_load_microcode(int cpu, const u8 *data, size_t size)
224{ 245{
225 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 246 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
247 struct microcode_header_amd *mc_hdr = NULL;
248 unsigned int mc_size, leftover;
249 int offset;
226 const u8 *ucode_ptr = data; 250 const u8 *ucode_ptr = data;
227 void *new_mc = NULL; 251 void *new_mc = NULL;
228 void *mc; 252 unsigned int new_rev = uci->cpu_sig.rev;
229 int new_rev = uci->cpu_sig.rev;
230 unsigned int leftover;
231 unsigned long offset;
232 enum ucode_state state = UCODE_OK; 253 enum ucode_state state = UCODE_OK;
233 254
234 offset = install_equiv_cpu_table(ucode_ptr); 255 offset = install_equiv_cpu_table(ucode_ptr);
235 if (!offset) { 256 if (offset < 0) {
236 pr_err("failed to create equivalent cpu table\n"); 257 pr_err("failed to create equivalent cpu table\n");
237 return UCODE_ERROR; 258 return UCODE_ERROR;
238 } 259 }
@@ -241,64 +262,65 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
241 leftover = size - offset; 262 leftover = size - offset;
242 263
243 while (leftover) { 264 while (leftover) {
244 unsigned int uninitialized_var(mc_size); 265 mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
245 struct microcode_header_amd *mc_header; 266 if (!mc_hdr)
246
247 mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
248 if (!mc)
249 break; 267 break;
250 268
251 mc_header = (struct microcode_header_amd *)mc; 269 if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
252 if (get_matching_microcode(cpu, mc, new_rev)) {
253 vfree(new_mc); 270 vfree(new_mc);
254 new_rev = mc_header->patch_id; 271 new_rev = mc_hdr->patch_id;
255 new_mc = mc; 272 new_mc = mc_hdr;
256 } else 273 } else
257 vfree(mc); 274 vfree(mc_hdr);
258 275
259 ucode_ptr += mc_size; 276 ucode_ptr += mc_size;
260 leftover -= mc_size; 277 leftover -= mc_size;
261 } 278 }
262 279
263 if (new_mc) { 280 if (!new_mc) {
264 if (!leftover) {
265 vfree(uci->mc);
266 uci->mc = new_mc;
267 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
268 cpu, new_rev, uci->cpu_sig.rev);
269 } else {
270 vfree(new_mc);
271 state = UCODE_ERROR;
272 }
273 } else
274 state = UCODE_NFOUND; 281 state = UCODE_NFOUND;
282 goto free_table;
283 }
275 284
285 if (!leftover) {
286 vfree(uci->mc);
287 uci->mc = new_mc;
288 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
289 cpu, uci->cpu_sig.rev, new_rev);
290 } else {
291 vfree(new_mc);
292 state = UCODE_ERROR;
293 }
294
295free_table:
276 free_equiv_cpu_table(); 296 free_equiv_cpu_table();
277 297
278 return state; 298 return state;
279} 299}
280 300
281static enum ucode_state request_microcode_fw(int cpu, struct device *device) 301static enum ucode_state request_microcode_amd(int cpu, struct device *device)
282{ 302{
283 const char *fw_name = "amd-ucode/microcode_amd.bin"; 303 const char *fw_name = "amd-ucode/microcode_amd.bin";
284 const struct firmware *firmware; 304 const struct firmware *fw;
285 enum ucode_state ret; 305 enum ucode_state ret = UCODE_NFOUND;
286 306
287 if (request_firmware(&firmware, fw_name, device)) { 307 if (request_firmware(&fw, fw_name, device)) {
288 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 308 pr_err("failed to load file %s\n", fw_name);
289 return UCODE_NFOUND; 309 goto out;
290 } 310 }
291 311
292 if (*(u32 *)firmware->data != UCODE_MAGIC) { 312 ret = UCODE_ERROR;
293 pr_err("invalid UCODE_MAGIC (0x%08x)\n", 313 if (*(u32 *)fw->data != UCODE_MAGIC) {
294 *(u32 *)firmware->data); 314 pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
295 return UCODE_ERROR; 315 goto fw_release;
296 } 316 }
297 317
298 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 318 ret = generic_load_microcode(cpu, fw->data, fw->size);
299 319
300 release_firmware(firmware); 320fw_release:
321 release_firmware(fw);
301 322
323out:
302 return ret; 324 return ret;
303} 325}
304 326
@@ -319,7 +341,7 @@ static void microcode_fini_cpu_amd(int cpu)
319 341
320static struct microcode_ops microcode_amd_ops = { 342static struct microcode_ops microcode_amd_ops = {
321 .request_microcode_user = request_microcode_user, 343 .request_microcode_user = request_microcode_user,
322 .request_microcode_fw = request_microcode_fw, 344 .request_microcode_fw = request_microcode_amd,
323 .collect_cpu_info = collect_cpu_info_amd, 345 .collect_cpu_info = collect_cpu_info_amd,
324 .apply_microcode = apply_microcode_amd, 346 .apply_microcode = apply_microcode_amd,
325 .microcode_fini_cpu = microcode_fini_cpu_amd, 347 .microcode_fini_cpu = microcode_fini_cpu_amd,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 1cca374a2bac..87af68e0e1e1 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -417,8 +417,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
417 if (err) 417 if (err)
418 return err; 418 return err;
419 419
420 if (microcode_init_cpu(cpu) == UCODE_ERROR) 420 if (microcode_init_cpu(cpu) == UCODE_ERROR) {
421 err = -EINVAL; 421 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
422 return -EINVAL;
423 }
422 424
423 return err; 425 return err;
424} 426}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 01b0f6d06451..6f789a887c06 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -883,7 +883,7 @@ static int __init update_mp_table(void)
883 883
884 if (!mpc_new_phys) { 884 if (!mpc_new_phys) {
885 unsigned char old, new; 885 unsigned char old, new;
886 /* check if we can change the postion */ 886 /* check if we can change the position */
887 mpc->checksum = 0; 887 mpc->checksum = 0;
888 old = mpf_checksum((unsigned char *)mpc, mpc->length); 888 old = mpf_checksum((unsigned char *)mpc, mpc->length);
889 mpc->checksum = 0xff; 889 mpc->checksum = 0xff;
@@ -892,7 +892,7 @@ static int __init update_mp_table(void)
892 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); 892 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
893 return 0; 893 return 0;
894 } 894 }
895 printk(KERN_INFO "use in-positon replacing\n"); 895 printk(KERN_INFO "use in-position replacing\n");
896 } else { 896 } else {
897 mpf->physptr = mpc_new_phys; 897 mpf->physptr = mpc_new_phys;
898 mpc_new = phys_to_virt(mpc_new_phys); 898 mpc_new = phys_to_virt(mpc_new_phys);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index f56a117cef68..e8c33a302006 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1279,7 +1279,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1279 1279
1280 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { 1280 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
1281 /* 1281 /*
1282 * FIXME: properly scan for devices accross the 1282 * FIXME: properly scan for devices across the
1283 * PCI-to-PCI bridge on every CalIOC2 port. 1283 * PCI-to-PCI bridge on every CalIOC2 port.
1284 */ 1284 */
1285 return 1; 1285 return 1;
@@ -1295,7 +1295,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1295 1295
1296/* 1296/*
1297 * calgary_init_bitmap_from_tce_table(): 1297 * calgary_init_bitmap_from_tce_table():
1298 * Funtion for kdump case. In the second/kdump kernel initialize 1298 * Function for kdump case. In the second/kdump kernel initialize
1299 * the bitmap based on the tce table entries obtained from first kernel 1299 * the bitmap based on the tce table entries obtained from first kernel
1300 */ 1300 */
1301static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) 1301static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff4554198981..d46cbe46b7ab 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -87,7 +87,7 @@ void exit_thread(void)
87void show_regs(struct pt_regs *regs) 87void show_regs(struct pt_regs *regs)
88{ 88{
89 show_registers(regs); 89 show_registers(regs);
90 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs)); 90 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
91} 91}
92 92
93void show_regs_common(void) 93void show_regs_common(void)
@@ -110,12 +110,9 @@ void show_regs_common(void)
110 init_utsname()->release, 110 init_utsname()->release,
111 (int)strcspn(init_utsname()->version, " "), 111 (int)strcspn(init_utsname()->version, " "),
112 init_utsname()->version); 112 init_utsname()->version);
113 printk(KERN_CONT " "); 113 printk(KERN_CONT " %s %s", vendor, product);
114 printk(KERN_CONT "%s %s", vendor, product); 114 if (board)
115 if (board) { 115 printk(KERN_CONT "/%s", board);
116 printk(KERN_CONT "/");
117 printk(KERN_CONT "%s", board);
118 }
119 printk(KERN_CONT "\n"); 116 printk(KERN_CONT "\n");
120} 117}
121 118
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 715037caeb43..d3ce37edb54d 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -303,68 +303,16 @@ static int __init reboot_init(void)
303} 303}
304core_initcall(reboot_init); 304core_initcall(reboot_init);
305 305
306/* The following code and data reboots the machine by switching to real 306extern const unsigned char machine_real_restart_asm[];
307 mode and jumping to the BIOS reset entry point, as if the CPU has 307extern const u64 machine_real_restart_gdt[3];
308 really been reset. The previous version asked the keyboard
309 controller to pulse the CPU reset line, which is more thorough, but
310 doesn't work with at least one type of 486 motherboard. It is easy
311 to stop this code working; hence the copious comments. */
312static const unsigned long long
313real_mode_gdt_entries [3] =
314{
315 0x0000000000000000ULL, /* Null descriptor */
316 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
317 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
318};
319 308
320static const struct desc_ptr 309void machine_real_restart(unsigned int type)
321real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
322real_mode_idt = { 0x3ff, 0 };
323
324/* This is 16-bit protected mode code to disable paging and the cache,
325 switch to real mode and jump to the BIOS reset code.
326
327 The instruction that switches to real mode by writing to CR0 must be
328 followed immediately by a far jump instruction, which set CS to a
329 valid value for real mode, and flushes the prefetch queue to avoid
330 running instructions that have already been decoded in protected
331 mode.
332
333 Clears all the flags except ET, especially PG (paging), PE
334 (protected-mode enable) and TS (task switch for coprocessor state
335 save). Flushes the TLB after paging has been disabled. Sets CD and
336 NW, to disable the cache on a 486, and invalidates the cache. This
337 is more like the state of a 486 after reset. I don't know if
338 something else should be done for other chips.
339
340 More could be done here to set up the registers as if a CPU reset had
341 occurred; hopefully real BIOSs don't assume much. */
342static const unsigned char real_mode_switch [] =
343{
344 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
345 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
346 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */
347 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */
348 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
349 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
350 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
351 0x74, 0x02, /* jz f */
352 0x0f, 0x09, /* wbinvd */
353 0x24, 0x10, /* f: andb $0x10,al */
354 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
355};
356static const unsigned char jump_to_bios [] =
357{ 310{
358 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ 311 void *restart_va;
359}; 312 unsigned long restart_pa;
313 void (*restart_lowmem)(unsigned int);
314 u64 *lowmem_gdt;
360 315
361/*
362 * Switch to real mode and then execute the code
363 * specified by the code and length parameters.
364 * We assume that length will aways be less that 100!
365 */
366void machine_real_restart(const unsigned char *code, int length)
367{
368 local_irq_disable(); 316 local_irq_disable();
369 317
370 /* Write zero to CMOS register number 0x0f, which the BIOS POST 318 /* Write zero to CMOS register number 0x0f, which the BIOS POST
@@ -392,41 +340,23 @@ void machine_real_restart(const unsigned char *code, int length)
392 too. */ 340 too. */
393 *((unsigned short *)0x472) = reboot_mode; 341 *((unsigned short *)0x472) = reboot_mode;
394 342
395 /* For the switch to real mode, copy some code to low memory. It has 343 /* Patch the GDT in the low memory trampoline */
396 to be in the first 64k because it is running in 16-bit mode, and it 344 lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
397 has to have the same physical and virtual address, because it turns 345
398 off paging. Copy it near the end of the first page, out of the way 346 restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
399 of BIOS variables. */ 347 restart_pa = virt_to_phys(restart_va);
400 memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), 348 restart_lowmem = (void (*)(unsigned int))restart_pa;
401 real_mode_switch, sizeof (real_mode_switch)); 349
402 memcpy((void *)(0x1000 - 100), code, length); 350 /* GDT[0]: GDT self-pointer */
403 351 lowmem_gdt[0] =
404 /* Set up the IDT for real mode. */ 352 (u64)(sizeof(machine_real_restart_gdt) - 1) +
405 load_idt(&real_mode_idt); 353 ((u64)virt_to_phys(lowmem_gdt) << 16);
406 354 /* GDT[1]: 64K real mode code segment */
407 /* Set up a GDT from which we can load segment descriptors for real 355 lowmem_gdt[1] =
408 mode. The GDT is not used in real mode; it is just needed here to 356 GDT_ENTRY(0x009b, restart_pa, 0xffff);
409 prepare the descriptors. */ 357
410 load_gdt(&real_mode_gdt); 358 /* Jump to the identity-mapped low memory code */
411 359 restart_lowmem(type);
412 /* Load the data segment registers, and thus the descriptors ready for
413 real mode. The base address of each segment is 0x100, 16 times the
414 selector value being loaded here. This is so that the segment
415 registers don't have to be reloaded after switching to real mode:
416 the values are consistent for real mode operation already. */
417 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
418 "\tmovl %%eax,%%ds\n"
419 "\tmovl %%eax,%%es\n"
420 "\tmovl %%eax,%%fs\n"
421 "\tmovl %%eax,%%gs\n"
422 "\tmovl %%eax,%%ss" : : : "eax");
423
424 /* Jump to the 16-bit code that we copied earlier. It disables paging
425 and the cache, switches to real mode, and jumps to the BIOS reset
426 entry point. */
427 __asm__ __volatile__ ("ljmp $0x0008,%0"
428 :
429 : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
430} 360}
431#ifdef CONFIG_APM_MODULE 361#ifdef CONFIG_APM_MODULE
432EXPORT_SYMBOL(machine_real_restart); 362EXPORT_SYMBOL(machine_real_restart);
@@ -581,7 +511,7 @@ static void native_machine_emergency_restart(void)
581 511
582#ifdef CONFIG_X86_32 512#ifdef CONFIG_X86_32
583 case BOOT_BIOS: 513 case BOOT_BIOS:
584 machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); 514 machine_real_restart(MRR_BIOS);
585 515
586 reboot_type = BOOT_KBD; 516 reboot_type = BOOT_KBD;
587 break; 517 break;
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
new file mode 100644
index 000000000000..29092b38d816
--- /dev/null
+++ b/arch/x86/kernel/reboot_32.S
@@ -0,0 +1,135 @@
1#include <linux/linkage.h>
2#include <linux/init.h>
3#include <asm/segment.h>
4#include <asm/page_types.h>
5
6/*
7 * The following code and data reboots the machine by switching to real
8 * mode and jumping to the BIOS reset entry point, as if the CPU has
9 * really been reset. The previous version asked the keyboard
10 * controller to pulse the CPU reset line, which is more thorough, but
11 * doesn't work with at least one type of 486 motherboard. It is easy
12 * to stop this code working; hence the copious comments.
13 *
14 * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
15 */
16 .section ".x86_trampoline","a"
17 .balign 16
18 .code32
19ENTRY(machine_real_restart_asm)
20r_base = .
21 /* Get our own relocated address */
22 call 1f
231: popl %ebx
24 subl $1b, %ebx
25
26 /* Compute the equivalent real-mode segment */
27 movl %ebx, %ecx
28 shrl $4, %ecx
29
30 /* Patch post-real-mode segment jump */
31 movw dispatch_table(%ebx,%eax,2),%ax
32 movw %ax, 101f(%ebx)
33 movw %cx, 102f(%ebx)
34
35 /* Set up the IDT for real mode. */
36 lidtl machine_real_restart_idt(%ebx)
37
38 /*
39 * Set up a GDT from which we can load segment descriptors for real
40 * mode. The GDT is not used in real mode; it is just needed here to
41 * prepare the descriptors.
42 */
43 lgdtl machine_real_restart_gdt(%ebx)
44
45 /*
46 * Load the data segment registers with 16-bit compatible values
47 */
48 movl $16, %ecx
49 movl %ecx, %ds
50 movl %ecx, %es
51 movl %ecx, %fs
52 movl %ecx, %gs
53 movl %ecx, %ss
54 ljmpl $8, $1f - r_base
55
56/*
57 * This is 16-bit protected mode code to disable paging and the cache,
58 * switch to real mode and jump to the BIOS reset code.
59 *
60 * The instruction that switches to real mode by writing to CR0 must be
61 * followed immediately by a far jump instruction, which set CS to a
62 * valid value for real mode, and flushes the prefetch queue to avoid
63 * running instructions that have already been decoded in protected
64 * mode.
65 *
66 * Clears all the flags except ET, especially PG (paging), PE
67 * (protected-mode enable) and TS (task switch for coprocessor state
68 * save). Flushes the TLB after paging has been disabled. Sets CD and
69 * NW, to disable the cache on a 486, and invalidates the cache. This
70 * is more like the state of a 486 after reset. I don't know if
71 * something else should be done for other chips.
72 *
73 * More could be done here to set up the registers as if a CPU reset had
74 * occurred; hopefully real BIOSs don't assume much. This is not the
75 * actual BIOS entry point, anyway (that is at 0xfffffff0).
76 *
77 * Most of this work is probably excessive, but it is what is tested.
78 */
79 .code16
801:
81 xorl %ecx, %ecx
82 movl %cr0, %eax
83 andl $0x00000011, %eax
84 orl $0x60000000, %eax
85 movl %eax, %cr0
86 movl %ecx, %cr3
87 movl %cr0, %edx
88 andl $0x60000000, %edx /* If no cache bits -> no wbinvd */
89 jz 2f
90 wbinvd
912:
92 andb $0x10, %al
93 movl %eax, %cr0
94 .byte 0xea /* ljmpw */
95101: .word 0 /* Offset */
96102: .word 0 /* Segment */
97
98bios:
99 ljmpw $0xf000, $0xfff0
100
101apm:
102 movw $0x1000, %ax
103 movw %ax, %ss
104 movw $0xf000, %sp
105 movw $0x5307, %ax
106 movw $0x0001, %bx
107 movw $0x0003, %cx
108 int $0x15
109
110END(machine_real_restart_asm)
111
112 .balign 16
113 /* These must match <asm/reboot.h */
114dispatch_table:
115 .word bios - r_base
116 .word apm - r_base
117END(dispatch_table)
118
119 .balign 16
120machine_real_restart_idt:
121 .word 0xffff /* Length - real mode default value */
122 .long 0 /* Base - real mode default value */
123END(machine_real_restart_idt)
124
125 .balign 16
126ENTRY(machine_real_restart_gdt)
127 .quad 0 /* Self-pointer, filled in by PM code */
128 .quad 0 /* 16-bit code segment, filled in by PM code */
129 /*
130 * 16-bit data segment with the selector value 16 = 0x10 and
131 * base value 0x100; since this is consistent with real mode
132 * semantics we don't have to reload the segments once CR0.PE = 0.
133 */
134 .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
135END(machine_real_restart_gdt)
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 6f39cab052d5..3f2ad2640d85 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -6,6 +6,7 @@
6#include <linux/acpi.h> 6#include <linux/acpi.h>
7#include <linux/bcd.h> 7#include <linux/bcd.h>
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9#include <linux/of.h>
9 10
10#include <asm/vsyscall.h> 11#include <asm/vsyscall.h>
11#include <asm/x86_init.h> 12#include <asm/x86_init.h>
@@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void)
236 } 237 }
237 } 238 }
238#endif 239#endif
240 if (of_have_populated_dt())
241 return 0;
239 242
240 platform_device_register(&rtc_device); 243 platform_device_register(&rtc_device);
241 dev_info(&rtc_device.dev, 244 dev_info(&rtc_device.dev,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d3cfe26c0252..32bd87cbf982 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -113,6 +113,7 @@
113#endif 113#endif
114#include <asm/mce.h> 114#include <asm/mce.h>
115#include <asm/alternative.h> 115#include <asm/alternative.h>
116#include <asm/prom.h>
116 117
117/* 118/*
118 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -297,6 +298,9 @@ static void __init init_gbpages(void)
297static inline void init_gbpages(void) 298static inline void init_gbpages(void)
298{ 299{
299} 300}
301static void __init cleanup_highmap(void)
302{
303}
300#endif 304#endif
301 305
302static void __init reserve_brk(void) 306static void __init reserve_brk(void)
@@ -429,16 +433,30 @@ static void __init parse_setup_data(void)
429 return; 433 return;
430 pa_data = boot_params.hdr.setup_data; 434 pa_data = boot_params.hdr.setup_data;
431 while (pa_data) { 435 while (pa_data) {
432 data = early_memremap(pa_data, PAGE_SIZE); 436 u32 data_len, map_len;
437
438 map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
439 (u64)sizeof(struct setup_data));
440 data = early_memremap(pa_data, map_len);
441 data_len = data->len + sizeof(struct setup_data);
442 if (data_len > map_len) {
443 early_iounmap(data, map_len);
444 data = early_memremap(pa_data, data_len);
445 map_len = data_len;
446 }
447
433 switch (data->type) { 448 switch (data->type) {
434 case SETUP_E820_EXT: 449 case SETUP_E820_EXT:
435 parse_e820_ext(data, pa_data); 450 parse_e820_ext(data);
451 break;
452 case SETUP_DTB:
453 add_dtb(pa_data);
436 break; 454 break;
437 default: 455 default:
438 break; 456 break;
439 } 457 }
440 pa_data = data->next; 458 pa_data = data->next;
441 early_iounmap(data, PAGE_SIZE); 459 early_iounmap(data, map_len);
442 } 460 }
443} 461}
444 462
@@ -680,15 +698,6 @@ static int __init parse_reservelow(char *p)
680 698
681early_param("reservelow", parse_reservelow); 699early_param("reservelow", parse_reservelow);
682 700
683static u64 __init get_max_mapped(void)
684{
685 u64 end = max_pfn_mapped;
686
687 end <<= PAGE_SHIFT;
688
689 return end;
690}
691
692/* 701/*
693 * Determine if we were loaded by an EFI loader. If so, then we have also been 702 * Determine if we were loaded by an EFI loader. If so, then we have also been
694 * passed the efi memmap, systab, etc., so we should use these data structures 703 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -704,8 +713,6 @@ static u64 __init get_max_mapped(void)
704 713
705void __init setup_arch(char **cmdline_p) 714void __init setup_arch(char **cmdline_p)
706{ 715{
707 int acpi = 0;
708 int amd = 0;
709 unsigned long flags; 716 unsigned long flags;
710 717
711#ifdef CONFIG_X86_32 718#ifdef CONFIG_X86_32
@@ -922,6 +929,8 @@ void __init setup_arch(char **cmdline_p)
922 */ 929 */
923 reserve_brk(); 930 reserve_brk();
924 931
932 cleanup_highmap();
933
925 memblock.current_limit = get_max_mapped(); 934 memblock.current_limit = get_max_mapped();
926 memblock_x86_fill(); 935 memblock_x86_fill();
927 936
@@ -935,15 +944,8 @@ void __init setup_arch(char **cmdline_p)
935 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", 944 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
936 max_pfn_mapped<<PAGE_SHIFT); 945 max_pfn_mapped<<PAGE_SHIFT);
937 946
938 reserve_trampoline_memory(); 947 setup_trampolines();
939 948
940#ifdef CONFIG_ACPI_SLEEP
941 /*
942 * Reserve low memory region for sleep support.
943 * even before init_memory_mapping
944 */
945 acpi_reserve_wakeup_memory();
946#endif
947 init_gbpages(); 949 init_gbpages();
948 950
949 /* max_pfn_mapped is updated here */ 951 /* max_pfn_mapped is updated here */
@@ -984,19 +986,7 @@ void __init setup_arch(char **cmdline_p)
984 986
985 early_acpi_boot_init(); 987 early_acpi_boot_init();
986 988
987#ifdef CONFIG_ACPI_NUMA 989 initmem_init();
988 /*
989 * Parse SRAT to discover nodes.
990 */
991 acpi = acpi_numa_init();
992#endif
993
994#ifdef CONFIG_AMD_NUMA
995 if (!acpi)
996 amd = !amd_numa_init(0, max_pfn);
997#endif
998
999 initmem_init(0, max_pfn, acpi, amd);
1000 memblock_find_dma_reserve(); 990 memblock_find_dma_reserve();
1001 dma32_reserve_bootmem(); 991 dma32_reserve_bootmem();
1002 992
@@ -1029,8 +1019,8 @@ void __init setup_arch(char **cmdline_p)
1029 * Read APIC and some other early information from ACPI tables. 1019 * Read APIC and some other early information from ACPI tables.
1030 */ 1020 */
1031 acpi_boot_init(); 1021 acpi_boot_init();
1032
1033 sfi_init(); 1022 sfi_init();
1023 x86_dtb_init();
1034 1024
1035 /* 1025 /*
1036 * get boot-time SMP configuration: 1026 * get boot-time SMP configuration:
@@ -1040,9 +1030,7 @@ void __init setup_arch(char **cmdline_p)
1040 1030
1041 prefill_possible_map(); 1031 prefill_possible_map();
1042 1032
1043#ifdef CONFIG_X86_64
1044 init_cpu_to_node(); 1033 init_cpu_to_node();
1045#endif
1046 1034
1047 init_apic_mappings(); 1035 init_apic_mappings();
1048 ioapic_and_gsi_init(); 1036 ioapic_and_gsi_init();
@@ -1066,6 +1054,8 @@ void __init setup_arch(char **cmdline_p)
1066#endif 1054#endif
1067 x86_init.oem.banner(); 1055 x86_init.oem.banner();
1068 1056
1057 x86_init.timers.wallclock_init();
1058
1069 mcheck_init(); 1059 mcheck_init();
1070 1060
1071 local_irq_save(flags); 1061 local_irq_save(flags);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 002b79685f73..71f4727da373 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -225,10 +225,15 @@ void __init setup_per_cpu_areas(void)
225 per_cpu(x86_bios_cpu_apicid, cpu) = 225 per_cpu(x86_bios_cpu_apicid, cpu) =
226 early_per_cpu_map(x86_bios_cpu_apicid, cpu); 226 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
227#endif 227#endif
228#ifdef CONFIG_X86_32
229 per_cpu(x86_cpu_to_logical_apicid, cpu) =
230 early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
231#endif
228#ifdef CONFIG_X86_64 232#ifdef CONFIG_X86_64
229 per_cpu(irq_stack_ptr, cpu) = 233 per_cpu(irq_stack_ptr, cpu) =
230 per_cpu(irq_stack_union.irq_stack, cpu) + 234 per_cpu(irq_stack_union.irq_stack, cpu) +
231 IRQ_STACK_SIZE - 64; 235 IRQ_STACK_SIZE - 64;
236#endif
232#ifdef CONFIG_NUMA 237#ifdef CONFIG_NUMA
233 per_cpu(x86_cpu_to_node_map, cpu) = 238 per_cpu(x86_cpu_to_node_map, cpu) =
234 early_per_cpu_map(x86_cpu_to_node_map, cpu); 239 early_per_cpu_map(x86_cpu_to_node_map, cpu);
@@ -242,7 +247,6 @@ void __init setup_per_cpu_areas(void)
242 */ 247 */
243 set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); 248 set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
244#endif 249#endif
245#endif
246 /* 250 /*
247 * Up to this point, the boot CPU has been using .init.data 251 * Up to this point, the boot CPU has been using .init.data
248 * area. Reload any changed state for the boot CPU. 252 * area. Reload any changed state for the boot CPU.
@@ -256,7 +260,10 @@ void __init setup_per_cpu_areas(void)
256 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 260 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
257 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; 261 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
258#endif 262#endif
259#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) 263#ifdef CONFIG_X86_32
264 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
265#endif
266#ifdef CONFIG_NUMA
260 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 267 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
261#endif 268#endif
262 269
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 08776a953487..c2871d3c71b6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -64,6 +64,7 @@
64#include <asm/mtrr.h> 64#include <asm/mtrr.h>
65#include <asm/mwait.h> 65#include <asm/mwait.h>
66#include <asm/apic.h> 66#include <asm/apic.h>
67#include <asm/io_apic.h>
67#include <asm/setup.h> 68#include <asm/setup.h>
68#include <asm/uv/uv.h> 69#include <asm/uv/uv.h>
69#include <linux/mc146818rtc.h> 70#include <linux/mc146818rtc.h>
@@ -71,10 +72,6 @@
71#include <asm/smpboot_hooks.h> 72#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h> 73#include <asm/i8259.h>
73 74
74#ifdef CONFIG_X86_32
75u8 apicid_2_node[MAX_APICID];
76#endif
77
78/* State of each CPU */ 75/* State of each CPU */
79DEFINE_PER_CPU(int, cpu_state) = { 0 }; 76DEFINE_PER_CPU(int, cpu_state) = { 0 };
80 77
@@ -130,68 +127,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
130DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); 127DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
131EXPORT_PER_CPU_SYMBOL(cpu_core_map); 128EXPORT_PER_CPU_SYMBOL(cpu_core_map);
132 129
130DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
131
133/* Per CPU bogomips and other parameters */ 132/* Per CPU bogomips and other parameters */
134DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 133DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
135EXPORT_PER_CPU_SYMBOL(cpu_info); 134EXPORT_PER_CPU_SYMBOL(cpu_info);
136 135
137atomic_t init_deasserted; 136atomic_t init_deasserted;
138 137
139#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
140/* which node each logical CPU is on */
141int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
142EXPORT_SYMBOL(cpu_to_node_map);
143
144/* set up a mapping between cpu and node. */
145static void map_cpu_to_node(int cpu, int node)
146{
147 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
148 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
149 cpu_to_node_map[cpu] = node;
150}
151
152/* undo a mapping between cpu and node. */
153static void unmap_cpu_to_node(int cpu)
154{
155 int node;
156
157 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
158 for (node = 0; node < MAX_NUMNODES; node++)
159 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
160 cpu_to_node_map[cpu] = 0;
161}
162#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
163#define map_cpu_to_node(cpu, node) ({})
164#define unmap_cpu_to_node(cpu) ({})
165#endif
166
167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
171 { [0 ... NR_CPUS-1] = BAD_APICID };
172
173static void map_cpu_to_logical_apicid(void)
174{
175 int cpu = smp_processor_id();
176 int apicid = logical_smp_processor_id();
177 int node = apic->apicid_to_node(apicid);
178
179 if (!node_online(node))
180 node = first_online_node;
181
182 cpu_2_logical_apicid[cpu] = apicid;
183 map_cpu_to_node(cpu, node);
184}
185
186void numa_remove_cpu(int cpu)
187{
188 cpu_2_logical_apicid[cpu] = BAD_APICID;
189 unmap_cpu_to_node(cpu);
190}
191#else
192#define map_cpu_to_logical_apicid() do {} while (0)
193#endif
194
195/* 138/*
196 * Report back to the Boot Processor. 139 * Report back to the Boot Processor.
197 * Running on AP. 140 * Running on AP.
@@ -259,7 +202,6 @@ static void __cpuinit smp_callin(void)
259 apic->smp_callin_clear_local_apic(); 202 apic->smp_callin_clear_local_apic();
260 setup_local_APIC(); 203 setup_local_APIC();
261 end_local_APIC_setup(); 204 end_local_APIC_setup();
262 map_cpu_to_logical_apicid();
263 205
264 /* 206 /*
265 * Need to setup vector mappings before we enable interrupts. 207 * Need to setup vector mappings before we enable interrupts.
@@ -355,23 +297,6 @@ notrace static void __cpuinit start_secondary(void *unused)
355 cpu_idle(); 297 cpu_idle();
356} 298}
357 299
358#ifdef CONFIG_CPUMASK_OFFSTACK
359/* In this case, llc_shared_map is a pointer to a cpumask. */
360static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
361 const struct cpuinfo_x86 *src)
362{
363 struct cpumask *llc = dst->llc_shared_map;
364 *dst = *src;
365 dst->llc_shared_map = llc;
366}
367#else
368static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
369 const struct cpuinfo_x86 *src)
370{
371 *dst = *src;
372}
373#endif /* CONFIG_CPUMASK_OFFSTACK */
374
375/* 300/*
376 * The bootstrap kernel entry code has set these up. Save them for 301 * The bootstrap kernel entry code has set these up. Save them for
377 * a given CPU 302 * a given CPU
@@ -381,7 +306,7 @@ void __cpuinit smp_store_cpu_info(int id)
381{ 306{
382 struct cpuinfo_x86 *c = &cpu_data(id); 307 struct cpuinfo_x86 *c = &cpu_data(id);
383 308
384 copy_cpuinfo_x86(c, &boot_cpu_data); 309 *c = boot_cpu_data;
385 c->cpu_index = id; 310 c->cpu_index = id;
386 if (id != 0) 311 if (id != 0)
387 identify_secondary_cpu(c); 312 identify_secondary_cpu(c);
@@ -389,15 +314,12 @@ void __cpuinit smp_store_cpu_info(int id)
389 314
390static void __cpuinit link_thread_siblings(int cpu1, int cpu2) 315static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
391{ 316{
392 struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
393 struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
394
395 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); 317 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
396 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); 318 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
397 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); 319 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
398 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); 320 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
399 cpumask_set_cpu(cpu1, c2->llc_shared_map); 321 cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
400 cpumask_set_cpu(cpu2, c1->llc_shared_map); 322 cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
401} 323}
402 324
403 325
@@ -414,6 +336,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
414 336
415 if (cpu_has(c, X86_FEATURE_TOPOEXT)) { 337 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
416 if (c->phys_proc_id == o->phys_proc_id && 338 if (c->phys_proc_id == o->phys_proc_id &&
339 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
417 c->compute_unit_id == o->compute_unit_id) 340 c->compute_unit_id == o->compute_unit_id)
418 link_thread_siblings(cpu, i); 341 link_thread_siblings(cpu, i);
419 } else if (c->phys_proc_id == o->phys_proc_id && 342 } else if (c->phys_proc_id == o->phys_proc_id &&
@@ -425,7 +348,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
425 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 348 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
426 } 349 }
427 350
428 cpumask_set_cpu(cpu, c->llc_shared_map); 351 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
429 352
430 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { 353 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
431 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); 354 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
@@ -436,8 +359,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
436 for_each_cpu(i, cpu_sibling_setup_mask) { 359 for_each_cpu(i, cpu_sibling_setup_mask) {
437 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 360 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
438 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 361 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
439 cpumask_set_cpu(i, c->llc_shared_map); 362 cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
440 cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); 363 cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
441 } 364 }
442 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 365 if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
443 cpumask_set_cpu(i, cpu_core_mask(cpu)); 366 cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -476,7 +399,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
476 !(cpu_has(c, X86_FEATURE_AMD_DCM))) 399 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
477 return cpu_core_mask(cpu); 400 return cpu_core_mask(cpu);
478 else 401 else
479 return c->llc_shared_map; 402 return cpu_llc_shared_mask(cpu);
480} 403}
481 404
482static void impress_friends(void) 405static void impress_friends(void)
@@ -788,7 +711,7 @@ do_rest:
788 stack_start = c_idle.idle->thread.sp; 711 stack_start = c_idle.idle->thread.sp;
789 712
790 /* start_ip had better be page-aligned! */ 713 /* start_ip had better be page-aligned! */
791 start_ip = setup_trampoline(); 714 start_ip = trampoline_address();
792 715
793 /* So we see what's up */ 716 /* So we see what's up */
794 announce_cpu(cpu, apicid); 717 announce_cpu(cpu, apicid);
@@ -798,6 +721,8 @@ do_rest:
798 * the targeted processor. 721 * the targeted processor.
799 */ 722 */
800 723
724 printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
725
801 atomic_set(&init_deasserted, 0); 726 atomic_set(&init_deasserted, 0);
802 727
803 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 728 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -851,8 +776,8 @@ do_rest:
851 pr_debug("CPU%d: has booted.\n", cpu); 776 pr_debug("CPU%d: has booted.\n", cpu);
852 else { 777 else {
853 boot_error = 1; 778 boot_error = 1;
854 if (*((volatile unsigned char *)trampoline_base) 779 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
855 == 0xA5) 780 == 0xA5A5A5A5)
856 /* trampoline started but...? */ 781 /* trampoline started but...? */
857 pr_err("CPU%d: Stuck ??\n", cpu); 782 pr_err("CPU%d: Stuck ??\n", cpu);
858 else 783 else
@@ -878,7 +803,7 @@ do_rest:
878 } 803 }
879 804
880 /* mark "stuck" area as not stuck */ 805 /* mark "stuck" area as not stuck */
881 *((volatile unsigned long *)trampoline_base) = 0; 806 *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
882 807
883 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 808 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
884 /* 809 /*
@@ -945,6 +870,14 @@ int __cpuinit native_cpu_up(unsigned int cpu)
945 return 0; 870 return 0;
946} 871}
947 872
873/**
874 * arch_disable_smp_support() - disables SMP support for x86 at runtime
875 */
876void arch_disable_smp_support(void)
877{
878 disable_ioapic_support();
879}
880
948/* 881/*
949 * Fall back to non SMP mode after errors. 882 * Fall back to non SMP mode after errors.
950 * 883 *
@@ -960,7 +893,6 @@ static __init void disable_smp(void)
960 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 893 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
961 else 894 else
962 physid_set_mask_of_physid(0, &phys_cpu_present_map); 895 physid_set_mask_of_physid(0, &phys_cpu_present_map);
963 map_cpu_to_logical_apicid();
964 cpumask_set_cpu(0, cpu_sibling_mask(0)); 896 cpumask_set_cpu(0, cpu_sibling_mask(0));
965 cpumask_set_cpu(0, cpu_core_mask(0)); 897 cpumask_set_cpu(0, cpu_core_mask(0));
966} 898}
@@ -1045,7 +977,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1045 "(tell your hw vendor)\n"); 977 "(tell your hw vendor)\n");
1046 } 978 }
1047 smpboot_clear_io_apic(); 979 smpboot_clear_io_apic();
1048 arch_disable_smp_support(); 980 disable_ioapic_support();
1049 return -1; 981 return -1;
1050 } 982 }
1051 983
@@ -1089,21 +1021,19 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1089 1021
1090 preempt_disable(); 1022 preempt_disable();
1091 smp_cpu_index_default(); 1023 smp_cpu_index_default();
1092 memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info)); 1024
1093 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1094 mb();
1095 /* 1025 /*
1096 * Setup boot CPU information 1026 * Setup boot CPU information
1097 */ 1027 */
1098 smp_store_cpu_info(0); /* Final full version of the data */ 1028 smp_store_cpu_info(0); /* Final full version of the data */
1099#ifdef CONFIG_X86_32 1029 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1100 boot_cpu_logical_apicid = logical_smp_processor_id(); 1030 mb();
1101#endif 1031
1102 current_thread_info()->cpu = 0; /* needed? */ 1032 current_thread_info()->cpu = 0; /* needed? */
1103 for_each_possible_cpu(i) { 1033 for_each_possible_cpu(i) {
1104 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 1034 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1105 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 1035 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1106 zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); 1036 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1107 } 1037 }
1108 set_cpu_sibling_map(0); 1038 set_cpu_sibling_map(0);
1109 1039
@@ -1139,8 +1069,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1139 1069
1140 bsp_end_local_APIC_setup(); 1070 bsp_end_local_APIC_setup();
1141 1071
1142 map_cpu_to_logical_apicid();
1143
1144 if (apic->setup_portio_remap) 1072 if (apic->setup_portio_remap)
1145 apic->setup_portio_remap(); 1073 apic->setup_portio_remap();
1146 1074
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 938c8e10a19a..6515733a289d 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -73,7 +73,7 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
73 */ 73 */
74void save_stack_trace(struct stack_trace *trace) 74void save_stack_trace(struct stack_trace *trace)
75{ 75{
76 dump_trace(current, NULL, NULL, &save_stack_ops, trace); 76 dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
77 if (trace->nr_entries < trace->max_entries) 77 if (trace->nr_entries < trace->max_entries)
78 trace->entries[trace->nr_entries++] = ULONG_MAX; 78 trace->entries[trace->nr_entries++] = ULONG_MAX;
79} 79}
@@ -81,14 +81,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace);
81 81
82void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) 82void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
83{ 83{
84 dump_trace(current, regs, NULL, &save_stack_ops, trace); 84 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
85 if (trace->nr_entries < trace->max_entries) 85 if (trace->nr_entries < trace->max_entries)
86 trace->entries[trace->nr_entries++] = ULONG_MAX; 86 trace->entries[trace->nr_entries++] = ULONG_MAX;
87} 87}
88 88
89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
90{ 90{
91 dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); 91 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
92 if (trace->nr_entries < trace->max_entries) 92 if (trace->nr_entries < trace->max_entries)
93 trace->entries[trace->nr_entries++] = ULONG_MAX; 93 trace->entries[trace->nr_entries++] = ULONG_MAX;
94} 94}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 58de45ee08b6..7977f0cfe339 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block)
166 * Make sure block stepping (BTF) is not enabled unless it should be. 166 * Make sure block stepping (BTF) is not enabled unless it should be.
167 * Note that we don't try to worry about any is_setting_trap_flag() 167 * Note that we don't try to worry about any is_setting_trap_flag()
168 * instructions after the first when using block stepping. 168 * instructions after the first when using block stepping.
169 * So noone should try to use debugger block stepping in a program 169 * So no one should try to use debugger block stepping in a program
170 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
171 */ 171 */
172 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index b35786dc9b8f..abce34d5c79d 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -340,3 +340,7 @@ ENTRY(sys_call_table)
340 .long sys_fanotify_init 340 .long sys_fanotify_init
341 .long sys_fanotify_mark 341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */ 342 .long sys_prlimit64 /* 340 */
343 .long sys_name_to_handle_at
344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime
346 .long sys_syncfs
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 7e4515957a1c..8927486a4649 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -39,7 +39,7 @@ int __ref arch_register_cpu(int num)
39 /* 39 /*
40 * CPU0 cannot be offlined due to several 40 * CPU0 cannot be offlined due to several
41 * restrictions and assumptions in kernel. This basically 41 * restrictions and assumptions in kernel. This basically
42 * doesnt add a control file, one cannot attempt to offline 42 * doesn't add a control file, one cannot attempt to offline
43 * BSP. 43 * BSP.
44 * 44 *
45 * Also certain PCI quirks require not to enable hotplug control 45 * Also certain PCI quirks require not to enable hotplug control
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index a375616d77f7..a91ae7709b49 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -2,39 +2,41 @@
2#include <linux/memblock.h> 2#include <linux/memblock.h>
3 3
4#include <asm/trampoline.h> 4#include <asm/trampoline.h>
5#include <asm/cacheflush.h>
5#include <asm/pgtable.h> 6#include <asm/pgtable.h>
6 7
7#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) 8unsigned char *x86_trampoline_base;
8#define __trampinit
9#define __trampinitdata
10#else
11#define __trampinit __cpuinit
12#define __trampinitdata __cpuinitdata
13#endif
14 9
15/* ready for x86_64 and x86 */ 10void __init setup_trampolines(void)
16unsigned char *__trampinitdata trampoline_base;
17
18void __init reserve_trampoline_memory(void)
19{ 11{
20 phys_addr_t mem; 12 phys_addr_t mem;
13 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
21 14
22 /* Has to be in very low memory so we can execute real-mode AP code. */ 15 /* Has to be in very low memory so we can execute real-mode AP code. */
23 mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); 16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
24 if (mem == MEMBLOCK_ERROR) 17 if (mem == MEMBLOCK_ERROR)
25 panic("Cannot allocate trampoline\n"); 18 panic("Cannot allocate trampoline\n");
26 19
27 trampoline_base = __va(mem); 20 x86_trampoline_base = __va(mem);
28 memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); 21 memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE");
22
23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
24 x86_trampoline_base, (unsigned long long)mem, size);
25
26 memcpy(x86_trampoline_base, x86_trampoline_start, size);
29} 27}
30 28
31/* 29/*
32 * Currently trivial. Write the real->protected mode 30 * setup_trampolines() gets called very early, to guarantee the
33 * bootstrap into the page concerned. The caller 31 * availability of low memory. This is before the proper kernel page
34 * has made sure it's suitably aligned. 32 * tables are set up, so we cannot set page permissions in that
33 * function. Thus, we use an arch_initcall instead.
35 */ 34 */
36unsigned long __trampinit setup_trampoline(void) 35static int __init configure_trampolines(void)
37{ 36{
38 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 37 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
39 return virt_to_phys(trampoline_base); 38
39 set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
40 return 0;
40} 41}
42arch_initcall(configure_trampolines);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 8508237e8e43..451c0a7ef7fd 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -32,9 +32,11 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/page_types.h> 33#include <asm/page_types.h>
34 34
35/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 35#ifdef CONFIG_SMP
36__CPUINITRODATA 36
37.code16 37 .section ".x86_trampoline","a"
38 .balign PAGE_SIZE
39 .code16
38 40
39ENTRY(trampoline_data) 41ENTRY(trampoline_data)
40r_base = . 42r_base = .
@@ -44,7 +46,7 @@ r_base = .
44 46
45 cli # We should be safe anyway 47 cli # We should be safe anyway
46 48
47 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
48 # write marker for master knows we're running 50 # write marker for master knows we're running
49 51
50 /* GDT tables in non default location kernel can be beyond 16MB and 52 /* GDT tables in non default location kernel can be beyond 16MB and
@@ -72,5 +74,10 @@ boot_idt_descr:
72 .word 0 # idt limit = 0 74 .word 0 # idt limit = 0
73 .long 0 # idt base = 0L 75 .long 0 # idt base = 0L
74 76
77ENTRY(trampoline_status)
78 .long 0
79
75.globl trampoline_end 80.globl trampoline_end
76trampoline_end: 81trampoline_end:
82
83#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 075d130efcf9..09ff51799e96 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,13 +32,9 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34 34
35#ifdef CONFIG_ACPI_SLEEP 35 .section ".x86_trampoline","a"
36.section .rodata, "a", @progbits 36 .balign PAGE_SIZE
37#else 37 .code16
38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
39__CPUINITRODATA
40#endif
41.code16
42 38
43ENTRY(trampoline_data) 39ENTRY(trampoline_data)
44r_base = . 40r_base = .
@@ -50,7 +46,7 @@ r_base = .
50 mov %ax, %ss 46 mov %ax, %ss
51 47
52 48
53 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
54 # write marker for master knows we're running 50 # write marker for master knows we're running
55 51
56 # Setup stack 52 # Setup stack
@@ -64,10 +60,13 @@ r_base = .
64 movzx %ax, %esi # Find the 32bit trampoline location 60 movzx %ax, %esi # Find the 32bit trampoline location
65 shll $4, %esi 61 shll $4, %esi
66 62
67 # Fixup the vectors 63 # Fixup the absolute vectors
68 addl %esi, startup_32_vector - r_base 64 leal (startup_32 - r_base)(%esi), %eax
69 addl %esi, startup_64_vector - r_base 65 movl %eax, startup_32_vector - r_base
70 addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer 66 leal (startup_64 - r_base)(%esi), %eax
67 movl %eax, startup_64_vector - r_base
68 leal (tgdt - r_base)(%esi), %eax
69 movl %eax, (tgdt + 2 - r_base)
71 70
72 /* 71 /*
73 * GDT tables in non default location kernel can be beyond 16MB and 72 * GDT tables in non default location kernel can be beyond 16MB and
@@ -129,6 +128,7 @@ no_longmode:
129 jmp no_longmode 128 jmp no_longmode
130#include "verify_cpu.S" 129#include "verify_cpu.S"
131 130
131 .balign 4
132 # Careful these need to be in the same 64K segment as the above; 132 # Careful these need to be in the same 64K segment as the above;
133tidt: 133tidt:
134 .word 0 # idt limit = 0 134 .word 0 # idt limit = 0
@@ -156,6 +156,10 @@ startup_64_vector:
156 .long startup_64 - r_base 156 .long startup_64 - r_base
157 .word __KERNEL_CS, 0 157 .word __KERNEL_CS, 0
158 158
159 .balign 4
160ENTRY(trampoline_status)
161 .long 0
162
159trampoline_stack: 163trampoline_stack:
160 .org 0x1000 164 .org 0x1000
161trampoline_stack_end: 165trampoline_stack_end:
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ffe5755caa8b..9335bf7dd2e7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -427,7 +427,7 @@ unsigned long native_calibrate_tsc(void)
427 * the delta to the previous read. We keep track of the min 427 * the delta to the previous read. We keep track of the min
428 * and max values of that delta. The delta is mostly defined 428 * and max values of that delta. The delta is mostly defined
429 * by the IO time of the PIT access, so we can detect when a 429 * by the IO time of the PIT access, so we can detect when a
430 * SMI/SMM disturbance happend between the two reads. If the 430 * SMI/SMM disturbance happened between the two reads. If the
431 * maximum time is significantly larger than the minimum time, 431 * maximum time is significantly larger than the minimum time,
432 * then we discard the result and have another try. 432 * then we discard the result and have another try.
433 * 433 *
@@ -900,7 +900,7 @@ static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
900 * timer based, instead of loop based, we don't block the boot 900 * timer based, instead of loop based, we don't block the boot
901 * process while this longer calibration is done. 901 * process while this longer calibration is done.
902 * 902 *
903 * If there are any calibration anomolies (too many SMIs, etc), 903 * If there are any calibration anomalies (too many SMIs, etc),
904 * or the refined calibration is off by 1% of the fast early 904 * or the refined calibration is off by 1% of the fast early
905 * calibration, we throw out the new calibration and use the 905 * calibration, we throw out the new calibration and use the
906 * early calibration. 906 * early calibration.
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 0edefc19a113..b9242bacbe59 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -18,7 +18,7 @@
18 * This file is expected to run in 32bit code. Currently: 18 * This file is expected to run in 32bit code. Currently:
19 * 19 *
20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification 20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification
21 * arch/x86/kernel/trampoline_64.S: secondary processor verfication 21 * arch/x86/kernel/trampoline_64.S: secondary processor verification
22 * arch/x86/kernel/head_32.S: processor startup 22 * arch/x86/kernel/head_32.S: processor startup
23 * 23 *
24 * verify_cpu, returns the status of longmode and SSE in register %eax. 24 * verify_cpu, returns the status of longmode and SSE in register %eax.
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bf4700755184..624a2016198e 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -105,6 +105,7 @@ SECTIONS
105 SCHED_TEXT 105 SCHED_TEXT
106 LOCK_TEXT 106 LOCK_TEXT
107 KPROBES_TEXT 107 KPROBES_TEXT
108 ENTRY_TEXT
108 IRQENTRY_TEXT 109 IRQENTRY_TEXT
109 *(.fixup) 110 *(.fixup)
110 *(.gnu.warning) 111 *(.gnu.warning)
@@ -230,7 +231,7 @@ SECTIONS
230 * output PHDR, so the next output section - .init.text - should 231 * output PHDR, so the next output section - .init.text - should
231 * start another segment - init. 232 * start another segment - init.
232 */ 233 */
233 PERCPU_VADDR(0, :percpu) 234 PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
234#endif 235#endif
235 236
236 INIT_TEXT_SECTION(PAGE_SIZE) 237 INIT_TEXT_SECTION(PAGE_SIZE)
@@ -240,6 +241,18 @@ SECTIONS
240 241
241 INIT_DATA_SECTION(16) 242 INIT_DATA_SECTION(16)
242 243
244 /*
245 * Code and data for a variety of lowlevel trampolines, to be
246 * copied into base memory (< 1 MiB) during initialization.
247 * Since it is copied early, the main copy can be discarded
248 * afterwards.
249 */
250 .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
251 x86_trampoline_start = .;
252 *(.x86_trampoline)
253 x86_trampoline_end = .;
254 }
255
243 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 256 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
244 __x86_cpu_dev_start = .; 257 __x86_cpu_dev_start = .;
245 *(.x86_cpu_dev.init) 258 *(.x86_cpu_dev.init)
@@ -291,6 +304,7 @@ SECTIONS
291 *(.iommu_table) 304 *(.iommu_table)
292 __iommu_table_end = .; 305 __iommu_table_end = .;
293 } 306 }
307
294 . = ALIGN(8); 308 . = ALIGN(8);
295 /* 309 /*
296 * .exit.text is discard at runtime, not link time, to deal with 310 * .exit.text is discard at runtime, not link time, to deal with
@@ -305,7 +319,7 @@ SECTIONS
305 } 319 }
306 320
307#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 321#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
308 PERCPU(THREAD_SIZE) 322 PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE)
309#endif 323#endif
310 324
311 . = ALIGN(PAGE_SIZE); 325 . = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1b950d151e58..9796c2f3d074 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
52EXPORT_SYMBOL(memset); 52EXPORT_SYMBOL(memset);
53EXPORT_SYMBOL(memcpy); 53EXPORT_SYMBOL(memcpy);
54EXPORT_SYMBOL(__memcpy); 54EXPORT_SYMBOL(__memcpy);
55EXPORT_SYMBOL(memmove);
55 56
56EXPORT_SYMBOL(empty_zero_page); 57EXPORT_SYMBOL(empty_zero_page);
57#ifndef CONFIG_PARAVIRT 58#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ceb2911aa439..c11514e9128b 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = {
70 .setup_percpu_clockev = setup_boot_APIC_clock, 70 .setup_percpu_clockev = setup_boot_APIC_clock,
71 .tsc_pre_init = x86_init_noop, 71 .tsc_pre_init = x86_init_noop,
72 .timer_init = hpet_time_init, 72 .timer_init = hpet_time_init,
73 .wallclock_init = x86_init_noop,
73 }, 74 },
74 75
75 .iommu = { 76 .iommu = {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 547128546cc3..a3911343976b 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -53,7 +53,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
53 53
54 /* 54 /*
55 * None of the feature bits are in init state. So nothing else 55 * None of the feature bits are in init state. So nothing else
56 * to do for us, as the memory layout is upto date. 56 * to do for us, as the memory layout is up to date.
57 */ 57 */
58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask) 58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
59 return; 59 return;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index caf966781d25..0ad47b819a8b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -76,6 +76,7 @@
76#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 76#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
77#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 77#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
78/* Misc flags */ 78/* Misc flags */
79#define VendorSpecific (1<<22) /* Vendor specific instruction */
79#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ 80#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
80#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ 81#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
81#define Undefined (1<<25) /* No Such Instruction */ 82#define Undefined (1<<25) /* No Such Instruction */
@@ -877,7 +878,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
877 if (selector & 1 << 2) { 878 if (selector & 1 << 2) {
878 struct desc_struct desc; 879 struct desc_struct desc;
879 memset (dt, 0, sizeof *dt); 880 memset (dt, 0, sizeof *dt);
880 if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) 881 if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR,
882 ctxt->vcpu))
881 return; 883 return;
882 884
883 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ 885 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
@@ -929,6 +931,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
929 return ret; 931 return ret;
930} 932}
931 933
934/* Does not support long mode */
932static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 935static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
933 struct x86_emulate_ops *ops, 936 struct x86_emulate_ops *ops,
934 u16 selector, int seg) 937 u16 selector, int seg)
@@ -1040,7 +1043,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1040 } 1043 }
1041load: 1044load:
1042 ops->set_segment_selector(selector, seg, ctxt->vcpu); 1045 ops->set_segment_selector(selector, seg, ctxt->vcpu);
1043 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); 1046 ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
1044 return X86EMUL_CONTINUE; 1047 return X86EMUL_CONTINUE;
1045exception: 1048exception:
1046 emulate_exception(ctxt, err_vec, err_code, true); 1049 emulate_exception(ctxt, err_vec, err_code, true);
@@ -1560,7 +1563,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1560 struct desc_struct *ss) 1563 struct desc_struct *ss)
1561{ 1564{
1562 memset(cs, 0, sizeof(struct desc_struct)); 1565 memset(cs, 0, sizeof(struct desc_struct));
1563 ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); 1566 ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu);
1564 memset(ss, 0, sizeof(struct desc_struct)); 1567 memset(ss, 0, sizeof(struct desc_struct));
1565 1568
1566 cs->l = 0; /* will be adjusted later */ 1569 cs->l = 0; /* will be adjusted later */
@@ -1607,9 +1610,9 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1607 cs.d = 0; 1610 cs.d = 0;
1608 cs.l = 1; 1611 cs.l = 1;
1609 } 1612 }
1610 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1613 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1611 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1614 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1612 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1615 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1613 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1616 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1614 1617
1615 c->regs[VCPU_REGS_RCX] = c->eip; 1618 c->regs[VCPU_REGS_RCX] = c->eip;
@@ -1679,9 +1682,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1679 cs.l = 1; 1682 cs.l = 1;
1680 } 1683 }
1681 1684
1682 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1685 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1683 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1686 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1684 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1687 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1685 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1688 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1686 1689
1687 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 1690 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
@@ -1736,9 +1739,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1736 cs_sel |= SELECTOR_RPL_MASK; 1739 cs_sel |= SELECTOR_RPL_MASK;
1737 ss_sel |= SELECTOR_RPL_MASK; 1740 ss_sel |= SELECTOR_RPL_MASK;
1738 1741
1739 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1742 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1740 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1743 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1741 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1744 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1742 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1745 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1743 1746
1744 c->eip = c->regs[VCPU_REGS_RDX]; 1747 c->eip = c->regs[VCPU_REGS_RDX];
@@ -1764,24 +1767,28 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1764 u16 port, u16 len) 1767 u16 port, u16 len)
1765{ 1768{
1766 struct desc_struct tr_seg; 1769 struct desc_struct tr_seg;
1770 u32 base3;
1767 int r; 1771 int r;
1768 u16 io_bitmap_ptr; 1772 u16 io_bitmap_ptr, perm, bit_idx = port & 0x7;
1769 u8 perm, bit_idx = port & 0x7;
1770 unsigned mask = (1 << len) - 1; 1773 unsigned mask = (1 << len) - 1;
1774 unsigned long base;
1771 1775
1772 ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); 1776 ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu);
1773 if (!tr_seg.p) 1777 if (!tr_seg.p)
1774 return false; 1778 return false;
1775 if (desc_limit_scaled(&tr_seg) < 103) 1779 if (desc_limit_scaled(&tr_seg) < 103)
1776 return false; 1780 return false;
1777 r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, 1781 base = get_desc_base(&tr_seg);
1778 ctxt->vcpu, NULL); 1782#ifdef CONFIG_X86_64
1783 base |= ((u64)base3) << 32;
1784#endif
1785 r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL);
1779 if (r != X86EMUL_CONTINUE) 1786 if (r != X86EMUL_CONTINUE)
1780 return false; 1787 return false;
1781 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) 1788 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
1782 return false; 1789 return false;
1783 r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, 1790 r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu,
1784 &perm, 1, ctxt->vcpu, NULL); 1791 NULL);
1785 if (r != X86EMUL_CONTINUE) 1792 if (r != X86EMUL_CONTINUE)
1786 return false; 1793 return false;
1787 if ((perm >> bit_idx) & mask) 1794 if ((perm >> bit_idx) & mask)
@@ -2126,7 +2133,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2126 } 2133 }
2127 2134
2128 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); 2135 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
2129 ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); 2136 ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu);
2130 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); 2137 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2131 2138
2132 if (has_error_code) { 2139 if (has_error_code) {
@@ -2365,7 +2372,8 @@ static struct group_dual group7 = { {
2365 D(SrcMem16 | ModRM | Mov | Priv), 2372 D(SrcMem16 | ModRM | Mov | Priv),
2366 D(SrcMem | ModRM | ByteOp | Priv | NoAccess), 2373 D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
2367}, { 2374}, {
2368 D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), 2375 D(SrcNone | ModRM | Priv | VendorSpecific), N,
2376 N, D(SrcNone | ModRM | Priv | VendorSpecific),
2369 D(SrcNone | ModRM | DstMem | Mov), N, 2377 D(SrcNone | ModRM | DstMem | Mov), N,
2370 D(SrcMem16 | ModRM | Mov | Priv), N, 2378 D(SrcMem16 | ModRM | Mov | Priv), N,
2371} }; 2379} };
@@ -2489,7 +2497,7 @@ static struct opcode opcode_table[256] = {
2489static struct opcode twobyte_table[256] = { 2497static struct opcode twobyte_table[256] = {
2490 /* 0x00 - 0x0F */ 2498 /* 0x00 - 0x0F */
2491 N, GD(0, &group7), N, N, 2499 N, GD(0, &group7), N, N,
2492 N, D(ImplicitOps), D(ImplicitOps | Priv), N, 2500 N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N,
2493 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, 2501 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
2494 N, D(ImplicitOps | ModRM), N, N, 2502 N, D(ImplicitOps | ModRM), N, N,
2495 /* 0x10 - 0x1F */ 2503 /* 0x10 - 0x1F */
@@ -2502,7 +2510,8 @@ static struct opcode twobyte_table[256] = {
2502 /* 0x30 - 0x3F */ 2510 /* 0x30 - 0x3F */
2503 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), 2511 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
2504 D(ImplicitOps | Priv), N, 2512 D(ImplicitOps | Priv), N,
2505 D(ImplicitOps), D(ImplicitOps | Priv), N, N, 2513 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
2514 N, N,
2506 N, N, N, N, N, N, N, N, 2515 N, N, N, N, N, N, N, N,
2507 /* 0x40 - 0x4F */ 2516 /* 0x40 - 0x4F */
2508 X16(D(DstReg | SrcMem | ModRM | Mov)), 2517 X16(D(DstReg | SrcMem | ModRM | Mov)),
@@ -2741,6 +2750,9 @@ done_prefixes:
2741 if (c->d == 0 || (c->d & Undefined)) 2750 if (c->d == 0 || (c->d & Undefined))
2742 return -1; 2751 return -1;
2743 2752
2753 if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
2754 return -1;
2755
2744 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 2756 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2745 c->op_bytes = 8; 2757 c->op_bytes = 8;
2746 2758
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 3cece05e4ac4..19fe855e7953 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -62,9 +62,6 @@ static void pic_unlock(struct kvm_pic *s)
62 } 62 }
63 63
64 if (!found) 64 if (!found)
65 found = s->kvm->bsp_vcpu;
66
67 if (!found)
68 return; 65 return;
69 66
70 kvm_make_request(KVM_REQ_EVENT, found); 67 kvm_make_request(KVM_REQ_EVENT, found);
@@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s)
75static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 72static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
76{ 73{
77 s->isr &= ~(1 << irq); 74 s->isr &= ~(1 << irq);
78 s->isr_ack |= (1 << irq);
79 if (s != &s->pics_state->pics[0]) 75 if (s != &s->pics_state->pics[0])
80 irq += 8; 76 irq += 8;
81 /* 77 /*
@@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
89 pic_lock(s->pics_state); 85 pic_lock(s->pics_state);
90} 86}
91 87
92void kvm_pic_clear_isr_ack(struct kvm *kvm)
93{
94 struct kvm_pic *s = pic_irqchip(kvm);
95
96 pic_lock(s);
97 s->pics[0].isr_ack = 0xff;
98 s->pics[1].isr_ack = 0xff;
99 pic_unlock(s);
100}
101
102/* 88/*
103 * set irq level. If an edge is detected, then the IRR is set to 1 89 * set irq level. If an edge is detected, then the IRR is set to 1
104 */ 90 */
@@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
281 s->irr = 0; 267 s->irr = 0;
282 s->imr = 0; 268 s->imr = 0;
283 s->isr = 0; 269 s->isr = 0;
284 s->isr_ack = 0xff;
285 s->priority_add = 0; 270 s->priority_add = 0;
286 s->irq_base = 0; 271 s->irq_base = 0;
287 s->read_reg_select = 0; 272 s->read_reg_select = 0;
@@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this,
545 */ 530 */
546static void pic_irq_request(struct kvm *kvm, int level) 531static void pic_irq_request(struct kvm *kvm, int level)
547{ 532{
548 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
549 struct kvm_pic *s = pic_irqchip(kvm); 533 struct kvm_pic *s = pic_irqchip(kvm);
550 int irq = pic_get_irq(&s->pics[0]);
551 534
552 s->output = level; 535 if (!s->output)
553 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
554 s->pics[0].isr_ack &= ~(1 << irq);
555 s->wakeup_needed = true; 536 s->wakeup_needed = true;
556 } 537 s->output = level;
557} 538}
558 539
559static const struct kvm_io_device_ops picdev_ops = { 540static const struct kvm_io_device_ops picdev_ops = {
@@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
575 s->pics[1].elcr_mask = 0xde; 556 s->pics[1].elcr_mask = 0xde;
576 s->pics[0].pics_state = s; 557 s->pics[0].pics_state = s;
577 s->pics[1].pics_state = s; 558 s->pics[1].pics_state = s;
578 s->pics[0].isr_ack = 0xff;
579 s->pics[1].isr_ack = 0xff;
580 559
581 /* 560 /*
582 * Initialize PIO device 561 * Initialize PIO device
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93cf9d0d3653..2b2255b1f04b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
417 case APIC_DM_INIT: 417 case APIC_DM_INIT:
418 if (level) { 418 if (level) {
419 result = 1; 419 result = 1;
420 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
421 printk(KERN_DEBUG
422 "INIT on a runnable vcpu %d\n",
423 vcpu->vcpu_id);
424 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 420 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
425 kvm_make_request(KVM_REQ_EVENT, vcpu); 421 kvm_make_request(KVM_REQ_EVENT, vcpu);
426 kvm_vcpu_kick(vcpu); 422 kvm_vcpu_kick(vcpu);
@@ -875,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
875 871
876 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); 872 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
877 873
878 if (vcpu->arch.apic->regs_page) 874 if (vcpu->arch.apic->regs)
879 __free_page(vcpu->arch.apic->regs_page); 875 free_page((unsigned long)vcpu->arch.apic->regs);
880 876
881 kfree(vcpu->arch.apic); 877 kfree(vcpu->arch.apic);
882} 878}
@@ -1065,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1065 1061
1066 vcpu->arch.apic = apic; 1062 vcpu->arch.apic = apic;
1067 1063
1068 apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); 1064 apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
1069 if (apic->regs_page == NULL) { 1065 if (!apic->regs) {
1070 printk(KERN_ERR "malloc apic regs error for vcpu %x\n", 1066 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
1071 vcpu->vcpu_id); 1067 vcpu->vcpu_id);
1072 goto nomem_free_apic; 1068 goto nomem_free_apic;
1073 } 1069 }
1074 apic->regs = page_address(apic->regs_page);
1075 apic->vcpu = vcpu; 1070 apic->vcpu = vcpu;
1076 1071
1077 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, 1072 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f5fe32c5edad..52c9e6b9e725 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,7 +13,6 @@ struct kvm_lapic {
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending; 15 bool irr_pending;
16 struct page *regs_page;
17 void *regs; 16 void *regs;
18 gpa_t vapic_addr; 17 gpa_t vapic_addr;
19 struct page *vapic_page; 18 struct page *vapic_page;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f02b8edc3d44..22fae7593ee7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -111,9 +111,6 @@ module_param(oos_shadow, bool, 0644);
111#define PT64_LEVEL_SHIFT(level) \ 111#define PT64_LEVEL_SHIFT(level) \
112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) 112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
113 113
114#define PT64_LEVEL_MASK(level) \
115 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
116
117#define PT64_INDEX(address, level)\ 114#define PT64_INDEX(address, level)\
118 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) 115 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
119 116
@@ -123,8 +120,6 @@ module_param(oos_shadow, bool, 0644);
123#define PT32_LEVEL_SHIFT(level) \ 120#define PT32_LEVEL_SHIFT(level) \
124 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) 121 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
125 122
126#define PT32_LEVEL_MASK(level) \
127 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
128#define PT32_LVL_OFFSET_MASK(level) \ 123#define PT32_LVL_OFFSET_MASK(level) \
129 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ 124 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
130 * PT32_LEVEL_BITS))) - 1)) 125 * PT32_LEVEL_BITS))) - 1))
@@ -379,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
379static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 374static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
380 int min) 375 int min)
381{ 376{
382 struct page *page; 377 void *page;
383 378
384 if (cache->nobjs >= min) 379 if (cache->nobjs >= min)
385 return 0; 380 return 0;
386 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 381 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
387 page = alloc_page(GFP_KERNEL); 382 page = (void *)__get_free_page(GFP_KERNEL);
388 if (!page) 383 if (!page)
389 return -ENOMEM; 384 return -ENOMEM;
390 cache->objects[cache->nobjs++] = page_address(page); 385 cache->objects[cache->nobjs++] = page;
391 } 386 }
392 return 0; 387 return 0;
393} 388}
@@ -554,13 +549,23 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
554 return ret; 549 return ret;
555} 550}
556 551
557static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) 552static struct kvm_memory_slot *
553gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
554 bool no_dirty_log)
558{ 555{
559 struct kvm_memory_slot *slot; 556 struct kvm_memory_slot *slot;
560 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 557
561 if (slot && slot->dirty_bitmap) 558 slot = gfn_to_memslot(vcpu->kvm, gfn);
562 return true; 559 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
563 return false; 560 (no_dirty_log && slot->dirty_bitmap))
561 slot = NULL;
562
563 return slot;
564}
565
566static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
564} 569}
565 570
566static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 571static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -1032,9 +1037,9 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1032 ASSERT(is_empty_shadow_page(sp->spt)); 1037 ASSERT(is_empty_shadow_page(sp->spt));
1033 hlist_del(&sp->hash_link); 1038 hlist_del(&sp->hash_link);
1034 list_del(&sp->link); 1039 list_del(&sp->link);
1035 __free_page(virt_to_page(sp->spt)); 1040 free_page((unsigned long)sp->spt);
1036 if (!sp->role.direct) 1041 if (!sp->role.direct)
1037 __free_page(virt_to_page(sp->gfns)); 1042 free_page((unsigned long)sp->gfns);
1038 kmem_cache_free(mmu_page_header_cache, sp); 1043 kmem_cache_free(mmu_page_header_cache, sp);
1039 kvm_mod_used_mmu_pages(kvm, -1); 1044 kvm_mod_used_mmu_pages(kvm, -1);
1040} 1045}
@@ -1199,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1199{ 1204{
1200} 1205}
1201 1206
1207static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
1208 struct kvm_mmu_page *sp, u64 *spte,
1209 const void *pte, unsigned long mmu_seq)
1210{
1211 WARN_ON(1);
1212}
1213
1202#define KVM_PAGE_ARRAY_NR 16 1214#define KVM_PAGE_ARRAY_NR 16
1203 1215
1204struct kvm_mmu_pages { 1216struct kvm_mmu_pages {
@@ -2150,26 +2162,13 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2150{ 2162{
2151} 2163}
2152 2164
2153static struct kvm_memory_slot *
2154pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
2155{
2156 struct kvm_memory_slot *slot;
2157
2158 slot = gfn_to_memslot(vcpu->kvm, gfn);
2159 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
2160 (no_dirty_log && slot->dirty_bitmap))
2161 slot = NULL;
2162
2163 return slot;
2164}
2165
2166static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2165static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2167 bool no_dirty_log) 2166 bool no_dirty_log)
2168{ 2167{
2169 struct kvm_memory_slot *slot; 2168 struct kvm_memory_slot *slot;
2170 unsigned long hva; 2169 unsigned long hva;
2171 2170
2172 slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); 2171 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2173 if (!slot) { 2172 if (!slot) {
2174 get_page(bad_page); 2173 get_page(bad_page);
2175 return page_to_pfn(bad_page); 2174 return page_to_pfn(bad_page);
@@ -2190,7 +2189,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2190 gfn_t gfn; 2189 gfn_t gfn;
2191 2190
2192 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); 2191 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2193 if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) 2192 if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
2194 return -1; 2193 return -1;
2195 2194
2196 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); 2195 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
@@ -2804,6 +2803,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2804 context->prefetch_page = nonpaging_prefetch_page; 2803 context->prefetch_page = nonpaging_prefetch_page;
2805 context->sync_page = nonpaging_sync_page; 2804 context->sync_page = nonpaging_sync_page;
2806 context->invlpg = nonpaging_invlpg; 2805 context->invlpg = nonpaging_invlpg;
2806 context->update_pte = nonpaging_update_pte;
2807 context->root_level = 0; 2807 context->root_level = 0;
2808 context->shadow_root_level = PT32E_ROOT_LEVEL; 2808 context->shadow_root_level = PT32E_ROOT_LEVEL;
2809 context->root_hpa = INVALID_PAGE; 2809 context->root_hpa = INVALID_PAGE;
@@ -2933,6 +2933,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2933 context->prefetch_page = paging64_prefetch_page; 2933 context->prefetch_page = paging64_prefetch_page;
2934 context->sync_page = paging64_sync_page; 2934 context->sync_page = paging64_sync_page;
2935 context->invlpg = paging64_invlpg; 2935 context->invlpg = paging64_invlpg;
2936 context->update_pte = paging64_update_pte;
2936 context->free = paging_free; 2937 context->free = paging_free;
2937 context->root_level = level; 2938 context->root_level = level;
2938 context->shadow_root_level = level; 2939 context->shadow_root_level = level;
@@ -2961,6 +2962,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
2961 context->prefetch_page = paging32_prefetch_page; 2962 context->prefetch_page = paging32_prefetch_page;
2962 context->sync_page = paging32_sync_page; 2963 context->sync_page = paging32_sync_page;
2963 context->invlpg = paging32_invlpg; 2964 context->invlpg = paging32_invlpg;
2965 context->update_pte = paging32_update_pte;
2964 context->root_level = PT32_ROOT_LEVEL; 2966 context->root_level = PT32_ROOT_LEVEL;
2965 context->shadow_root_level = PT32E_ROOT_LEVEL; 2967 context->shadow_root_level = PT32E_ROOT_LEVEL;
2966 context->root_hpa = INVALID_PAGE; 2968 context->root_hpa = INVALID_PAGE;
@@ -2985,6 +2987,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2985 context->prefetch_page = nonpaging_prefetch_page; 2987 context->prefetch_page = nonpaging_prefetch_page;
2986 context->sync_page = nonpaging_sync_page; 2988 context->sync_page = nonpaging_sync_page;
2987 context->invlpg = nonpaging_invlpg; 2989 context->invlpg = nonpaging_invlpg;
2990 context->update_pte = nonpaging_update_pte;
2988 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2991 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2989 context->root_hpa = INVALID_PAGE; 2992 context->root_hpa = INVALID_PAGE;
2990 context->direct_map = true; 2993 context->direct_map = true;
@@ -3089,8 +3092,6 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3089 3092
3090static int init_kvm_mmu(struct kvm_vcpu *vcpu) 3093static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3091{ 3094{
3092 vcpu->arch.update_pte.pfn = bad_pfn;
3093
3094 if (mmu_is_nested(vcpu)) 3095 if (mmu_is_nested(vcpu))
3095 return init_kvm_nested_mmu(vcpu); 3096 return init_kvm_nested_mmu(vcpu);
3096 else if (tdp_enabled) 3097 else if (tdp_enabled)
@@ -3164,7 +3165,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3164static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3165 struct kvm_mmu_page *sp, 3166 struct kvm_mmu_page *sp,
3166 u64 *spte, 3167 u64 *spte,
3167 const void *new) 3168 const void *new, unsigned long mmu_seq)
3168{ 3169{
3169 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 3170 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
3170 ++vcpu->kvm->stat.mmu_pde_zapped; 3171 ++vcpu->kvm->stat.mmu_pde_zapped;
@@ -3172,10 +3173,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3172 } 3173 }
3173 3174
3174 ++vcpu->kvm->stat.mmu_pte_updated; 3175 ++vcpu->kvm->stat.mmu_pte_updated;
3175 if (!sp->role.cr4_pae) 3176 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq);
3176 paging32_update_pte(vcpu, sp, spte, new);
3177 else
3178 paging64_update_pte(vcpu, sp, spte, new);
3179} 3177}
3180 3178
3181static bool need_remote_flush(u64 old, u64 new) 3179static bool need_remote_flush(u64 old, u64 new)
@@ -3210,28 +3208,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
3210 return !!(spte && (*spte & shadow_accessed_mask)); 3208 return !!(spte && (*spte & shadow_accessed_mask));
3211} 3209}
3212 3210
3213static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3214 u64 gpte)
3215{
3216 gfn_t gfn;
3217 pfn_t pfn;
3218
3219 if (!is_present_gpte(gpte))
3220 return;
3221 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
3222
3223 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
3224 smp_rmb();
3225 pfn = gfn_to_pfn(vcpu->kvm, gfn);
3226
3227 if (is_error_pfn(pfn)) {
3228 kvm_release_pfn_clean(pfn);
3229 return;
3230 }
3231 vcpu->arch.update_pte.gfn = gfn;
3232 vcpu->arch.update_pte.pfn = pfn;
3233}
3234
3235static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) 3211static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
3236{ 3212{
3237 u64 *spte = vcpu->arch.last_pte_updated; 3213 u64 *spte = vcpu->arch.last_pte_updated;
@@ -3253,21 +3229,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3253 struct kvm_mmu_page *sp; 3229 struct kvm_mmu_page *sp;
3254 struct hlist_node *node; 3230 struct hlist_node *node;
3255 LIST_HEAD(invalid_list); 3231 LIST_HEAD(invalid_list);
3256 u64 entry, gentry; 3232 unsigned long mmu_seq;
3257 u64 *spte; 3233 u64 entry, gentry, *spte;
3258 unsigned offset = offset_in_page(gpa); 3234 unsigned pte_size, page_offset, misaligned, quadrant, offset;
3259 unsigned pte_size; 3235 int level, npte, invlpg_counter, r, flooded = 0;
3260 unsigned page_offset;
3261 unsigned misaligned;
3262 unsigned quadrant;
3263 int level;
3264 int flooded = 0;
3265 int npte;
3266 int r;
3267 int invlpg_counter;
3268 bool remote_flush, local_flush, zap_page; 3236 bool remote_flush, local_flush, zap_page;
3269 3237
3270 zap_page = remote_flush = local_flush = false; 3238 zap_page = remote_flush = local_flush = false;
3239 offset = offset_in_page(gpa);
3271 3240
3272 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 3241 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
3273 3242
@@ -3275,9 +3244,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3275 3244
3276 /* 3245 /*
3277 * Assume that the pte write on a page table of the same type 3246 * Assume that the pte write on a page table of the same type
3278 * as the current vcpu paging mode. This is nearly always true 3247 * as the current vcpu paging mode since we update the sptes only
3279 * (might be false while changing modes). Note it is verified later 3248 * when they have the same mode.
3280 * by update_pte().
3281 */ 3249 */
3282 if ((is_pae(vcpu) && bytes == 4) || !new) { 3250 if ((is_pae(vcpu) && bytes == 4) || !new) {
3283 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 3251 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
@@ -3303,15 +3271,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3303 break; 3271 break;
3304 } 3272 }
3305 3273
3306 mmu_guess_page_from_pte_write(vcpu, gpa, gentry); 3274 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3275 smp_rmb();
3276
3307 spin_lock(&vcpu->kvm->mmu_lock); 3277 spin_lock(&vcpu->kvm->mmu_lock);
3308 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) 3278 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3309 gentry = 0; 3279 gentry = 0;
3310 kvm_mmu_access_page(vcpu, gfn);
3311 kvm_mmu_free_some_pages(vcpu); 3280 kvm_mmu_free_some_pages(vcpu);
3312 ++vcpu->kvm->stat.mmu_pte_write; 3281 ++vcpu->kvm->stat.mmu_pte_write;
3313 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); 3282 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
3314 if (guest_initiated) { 3283 if (guest_initiated) {
3284 kvm_mmu_access_page(vcpu, gfn);
3315 if (gfn == vcpu->arch.last_pt_write_gfn 3285 if (gfn == vcpu->arch.last_pt_write_gfn
3316 && !last_updated_pte_accessed(vcpu)) { 3286 && !last_updated_pte_accessed(vcpu)) {
3317 ++vcpu->arch.last_pt_write_count; 3287 ++vcpu->arch.last_pt_write_count;
@@ -3375,7 +3345,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3375 if (gentry && 3345 if (gentry &&
3376 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3346 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3377 & mask.word)) 3347 & mask.word))
3378 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 3348 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry,
3349 mmu_seq);
3379 if (!remote_flush && need_remote_flush(entry, *spte)) 3350 if (!remote_flush && need_remote_flush(entry, *spte))
3380 remote_flush = true; 3351 remote_flush = true;
3381 ++spte; 3352 ++spte;
@@ -3385,10 +3356,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3385 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3356 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3386 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); 3357 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
3387 spin_unlock(&vcpu->kvm->mmu_lock); 3358 spin_unlock(&vcpu->kvm->mmu_lock);
3388 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
3389 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
3390 vcpu->arch.update_pte.pfn = bad_pfn;
3391 }
3392} 3359}
3393 3360
3394int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 3361int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3538,14 +3505,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3538 if (!test_bit(slot, sp->slot_bitmap)) 3505 if (!test_bit(slot, sp->slot_bitmap))
3539 continue; 3506 continue;
3540 3507
3541 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3542 continue;
3543
3544 pt = sp->spt; 3508 pt = sp->spt;
3545 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3509 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3510 if (!is_shadow_present_pte(pt[i]) ||
3511 !is_last_spte(pt[i], sp->role.level))
3512 continue;
3513
3514 if (is_large_pte(pt[i])) {
3515 drop_spte(kvm, &pt[i],
3516 shadow_trap_nonpresent_pte);
3517 --kvm->stat.lpages;
3518 continue;
3519 }
3520
3546 /* avoid RMW */ 3521 /* avoid RMW */
3547 if (is_writable_pte(pt[i])) 3522 if (is_writable_pte(pt[i]))
3548 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); 3523 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3524 }
3549 } 3525 }
3550 kvm_flush_remote_tlbs(kvm); 3526 kvm_flush_remote_tlbs(kvm);
3551} 3527}
@@ -3583,7 +3559,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3583 if (nr_to_scan == 0) 3559 if (nr_to_scan == 0)
3584 goto out; 3560 goto out;
3585 3561
3586 spin_lock(&kvm_lock); 3562 raw_spin_lock(&kvm_lock);
3587 3563
3588 list_for_each_entry(kvm, &vm_list, vm_list) { 3564 list_for_each_entry(kvm, &vm_list, vm_list) {
3589 int idx, freed_pages; 3565 int idx, freed_pages;
@@ -3606,7 +3582,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3606 if (kvm_freed) 3582 if (kvm_freed)
3607 list_move_tail(&kvm_freed->vm_list, &vm_list); 3583 list_move_tail(&kvm_freed->vm_list, &vm_list);
3608 3584
3609 spin_unlock(&kvm_lock); 3585 raw_spin_unlock(&kvm_lock);
3610 3586
3611out: 3587out:
3612 return percpu_counter_read_positive(&kvm_total_used_mmu_pages); 3588 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bccc24c4181..c6397795d865 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -31,7 +31,6 @@
31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) 31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) 32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
34 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
35 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
36 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
37 #define PT_MAX_FULL_LEVELS 4 36 #define PT_MAX_FULL_LEVELS 4
@@ -48,7 +47,6 @@
48 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) 47 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
49 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) 48 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
50 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
51 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
52 #define PT_LEVEL_BITS PT32_LEVEL_BITS 50 #define PT_LEVEL_BITS PT32_LEVEL_BITS
53 #define PT_MAX_FULL_LEVELS 2 51 #define PT_MAX_FULL_LEVELS 2
54 #define CMPXCHG cmpxchg 52 #define CMPXCHG cmpxchg
@@ -327,7 +325,7 @@ no_present:
327} 325}
328 326
329static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 327static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
330 u64 *spte, const void *pte) 328 u64 *spte, const void *pte, unsigned long mmu_seq)
331{ 329{
332 pt_element_t gpte; 330 pt_element_t gpte;
333 unsigned pte_access; 331 unsigned pte_access;
@@ -339,16 +337,16 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
339 337
340 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 338 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
341 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 339 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
342 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 340 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
341 if (is_error_pfn(pfn)) {
342 kvm_release_pfn_clean(pfn);
343 return; 343 return;
344 pfn = vcpu->arch.update_pte.pfn; 344 }
345 if (is_error_pfn(pfn)) 345 if (mmu_notifier_retry(vcpu, mmu_seq))
346 return;
347 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
348 return; 346 return;
349 kvm_get_pfn(pfn); 347
350 /* 348 /*
351 * we call mmu_set_spte() with host_writable = true beacuse that 349 * we call mmu_set_spte() with host_writable = true because that
352 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 350 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
353 */ 351 */
354 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 352 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -829,7 +827,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
829#undef FNAME 827#undef FNAME
830#undef PT_BASE_ADDR_MASK 828#undef PT_BASE_ADDR_MASK
831#undef PT_INDEX 829#undef PT_INDEX
832#undef PT_LEVEL_MASK
833#undef PT_LVL_ADDR_MASK 830#undef PT_LVL_ADDR_MASK
834#undef PT_LVL_OFFSET_MASK 831#undef PT_LVL_OFFSET_MASK
835#undef PT_LEVEL_BITS 832#undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 63fec1531e89..6bb15d583e47 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -135,6 +135,8 @@ struct vcpu_svm {
135 135
136 u32 *msrpm; 136 u32 *msrpm;
137 137
138 ulong nmi_iret_rip;
139
138 struct nested_state nested; 140 struct nested_state nested;
139 141
140 bool nmi_singlestep; 142 bool nmi_singlestep;
@@ -1153,8 +1155,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1153 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); 1155 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1154 load_gs_index(svm->host.gs); 1156 load_gs_index(svm->host.gs);
1155#else 1157#else
1158#ifdef CONFIG_X86_32_LAZY_GS
1156 loadsegment(gs, svm->host.gs); 1159 loadsegment(gs, svm->host.gs);
1157#endif 1160#endif
1161#endif
1158 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1162 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1159 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1163 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1160} 1164}
@@ -2653,6 +2657,7 @@ static int iret_interception(struct vcpu_svm *svm)
2653 ++svm->vcpu.stat.nmi_window_exits; 2657 ++svm->vcpu.stat.nmi_window_exits;
2654 clr_intercept(svm, INTERCEPT_IRET); 2658 clr_intercept(svm, INTERCEPT_IRET);
2655 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2659 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2660 svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2656 return 1; 2661 return 1;
2657} 2662}
2658 2663
@@ -3474,7 +3479,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3474 3479
3475 svm->int3_injected = 0; 3480 svm->int3_injected = 0;
3476 3481
3477 if (svm->vcpu.arch.hflags & HF_IRET_MASK) { 3482 /*
3483 * If we've made progress since setting HF_IRET_MASK, we've
3484 * executed an IRET and can allow NMI injection.
3485 */
3486 if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3487 && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3478 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3488 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3479 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3489 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3480 } 3490 }
@@ -3641,19 +3651,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3641 wrmsrl(MSR_GS_BASE, svm->host.gs_base); 3651 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3642#else 3652#else
3643 loadsegment(fs, svm->host.fs); 3653 loadsegment(fs, svm->host.fs);
3654#ifndef CONFIG_X86_32_LAZY_GS
3655 loadsegment(gs, svm->host.gs);
3656#endif
3644#endif 3657#endif
3645 3658
3646 reload_tss(vcpu); 3659 reload_tss(vcpu);
3647 3660
3648 local_irq_disable(); 3661 local_irq_disable();
3649 3662
3650 stgi();
3651
3652 vcpu->arch.cr2 = svm->vmcb->save.cr2; 3663 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3653 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 3664 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3654 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 3665 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3655 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 3666 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3656 3667
3668 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3669 kvm_before_handle_nmi(&svm->vcpu);
3670
3671 stgi();
3672
3673 /* Any pending NMI will happen here */
3674
3675 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3676 kvm_after_handle_nmi(&svm->vcpu);
3677
3657 sync_cr8_to_lapic(vcpu); 3678 sync_cr8_to_lapic(vcpu);
3658 3679
3659 svm->next_rip = 0; 3680 svm->next_rip = 0;
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index fc7a101c4a35..abd86e865be3 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -25,7 +25,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
25 25
26 /* 26 /*
27 * There is a race window between reading and incrementing, but we do 27 * There is a race window between reading and incrementing, but we do
28 * not care about potentially loosing timer events in the !reinject 28 * not care about potentially losing timer events in the !reinject
29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked 29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
30 * in vcpu_enter_guest. 30 * in vcpu_enter_guest.
31 */ 31 */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 1357d7cf4ec8..db932760ea82 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -62,21 +62,21 @@ TRACE_EVENT(kvm_hv_hypercall,
62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), 62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
63 63
64 TP_STRUCT__entry( 64 TP_STRUCT__entry(
65 __field( __u16, code )
66 __field( bool, fast )
67 __field( __u16, rep_cnt ) 65 __field( __u16, rep_cnt )
68 __field( __u16, rep_idx ) 66 __field( __u16, rep_idx )
69 __field( __u64, ingpa ) 67 __field( __u64, ingpa )
70 __field( __u64, outgpa ) 68 __field( __u64, outgpa )
69 __field( __u16, code )
70 __field( bool, fast )
71 ), 71 ),
72 72
73 TP_fast_assign( 73 TP_fast_assign(
74 __entry->code = code;
75 __entry->fast = fast;
76 __entry->rep_cnt = rep_cnt; 74 __entry->rep_cnt = rep_cnt;
77 __entry->rep_idx = rep_idx; 75 __entry->rep_idx = rep_idx;
78 __entry->ingpa = ingpa; 76 __entry->ingpa = ingpa;
79 __entry->outgpa = outgpa; 77 __entry->outgpa = outgpa;
78 __entry->code = code;
79 __entry->fast = fast;
80 ), 80 ),
81 81
82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", 82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bf89ec2cfb82..5b4cdcbd154c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -93,14 +93,14 @@ module_param(yield_on_hlt, bool, S_IRUGO);
93 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 93 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
94 * ple_gap: upper bound on the amount of time between two successive 94 * ple_gap: upper bound on the amount of time between two successive
95 * executions of PAUSE in a loop. Also indicate if ple enabled. 95 * executions of PAUSE in a loop. Also indicate if ple enabled.
96 * According to test, this time is usually small than 41 cycles. 96 * According to test, this time is usually smaller than 128 cycles.
97 * ple_window: upper bound on the amount of time a guest is allowed to execute 97 * ple_window: upper bound on the amount of time a guest is allowed to execute
98 * in a PAUSE loop. Tests indicate that most spinlocks are held for 98 * in a PAUSE loop. Tests indicate that most spinlocks are held for
99 * less than 2^12 cycles 99 * less than 2^12 cycles
100 * Time is measured based on a counter that runs at the same rate as the TSC, 100 * Time is measured based on a counter that runs at the same rate as the TSC,
101 * refer SDM volume 3b section 21.6.13 & 22.1.3. 101 * refer SDM volume 3b section 21.6.13 & 22.1.3.
102 */ 102 */
103#define KVM_VMX_DEFAULT_PLE_GAP 41 103#define KVM_VMX_DEFAULT_PLE_GAP 128
104#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 104#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
105static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; 105static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
106module_param(ple_gap, int, S_IRUGO); 106module_param(ple_gap, int, S_IRUGO);
@@ -176,11 +176,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
176 return container_of(vcpu, struct vcpu_vmx, vcpu); 176 return container_of(vcpu, struct vcpu_vmx, vcpu);
177} 177}
178 178
179static int init_rmode(struct kvm *kvm);
180static u64 construct_eptp(unsigned long root_hpa); 179static u64 construct_eptp(unsigned long root_hpa);
181static void kvm_cpu_vmxon(u64 addr); 180static void kvm_cpu_vmxon(u64 addr);
182static void kvm_cpu_vmxoff(void); 181static void kvm_cpu_vmxoff(void);
183static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 182static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
183static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
184 184
185static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
186static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1333,19 +1333,25 @@ static __init int vmx_disabled_by_bios(void)
1333 1333
1334 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1334 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1335 if (msr & FEATURE_CONTROL_LOCKED) { 1335 if (msr & FEATURE_CONTROL_LOCKED) {
1336 /* launched w/ TXT and VMX disabled */
1336 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 1337 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1337 && tboot_enabled()) 1338 && tboot_enabled())
1338 return 1; 1339 return 1;
1340 /* launched w/o TXT and VMX only enabled w/ TXT */
1339 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 1341 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1342 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1340 && !tboot_enabled()) { 1343 && !tboot_enabled()) {
1341 printk(KERN_WARNING "kvm: disable TXT in the BIOS or " 1344 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1342 " activate TXT before enabling KVM\n"); 1345 "activate TXT before enabling KVM\n");
1343 return 1; 1346 return 1;
1344 } 1347 }
1348 /* launched w/o TXT and VMX disabled */
1349 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1350 && !tboot_enabled())
1351 return 1;
1345 } 1352 }
1346 1353
1347 return 0; 1354 return 0;
1348 /* locked but not enabled */
1349} 1355}
1350 1356
1351static void kvm_cpu_vmxon(u64 addr) 1357static void kvm_cpu_vmxon(u64 addr)
@@ -1683,6 +1689,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1683 vmx->emulation_required = 1; 1689 vmx->emulation_required = 1;
1684 vmx->rmode.vm86_active = 0; 1690 vmx->rmode.vm86_active = 0;
1685 1691
1692 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
1686 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); 1693 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
1687 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); 1694 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
1688 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); 1695 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
@@ -1756,6 +1763,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1756 vmx->emulation_required = 1; 1763 vmx->emulation_required = 1;
1757 vmx->rmode.vm86_active = 1; 1764 vmx->rmode.vm86_active = 1;
1758 1765
1766 /*
1767 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
1768 * vcpu. Call it here with phys address pointing 16M below 4G.
1769 */
1770 if (!vcpu->kvm->arch.tss_addr) {
1771 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
1772 "called before entering vcpu\n");
1773 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
1774 vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
1775 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1776 }
1777
1778 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
1759 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1779 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1760 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1780 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1761 1781
@@ -1794,7 +1814,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1794 1814
1795continue_rmode: 1815continue_rmode:
1796 kvm_mmu_reset_context(vcpu); 1816 kvm_mmu_reset_context(vcpu);
1797 init_rmode(vcpu->kvm);
1798} 1817}
1799 1818
1800static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1819static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -2030,23 +2049,40 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2030 vmcs_writel(GUEST_CR4, hw_cr4); 2049 vmcs_writel(GUEST_CR4, hw_cr4);
2031} 2050}
2032 2051
2033static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2034{
2035 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2036
2037 return vmcs_readl(sf->base);
2038}
2039
2040static void vmx_get_segment(struct kvm_vcpu *vcpu, 2052static void vmx_get_segment(struct kvm_vcpu *vcpu,
2041 struct kvm_segment *var, int seg) 2053 struct kvm_segment *var, int seg)
2042{ 2054{
2055 struct vcpu_vmx *vmx = to_vmx(vcpu);
2043 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2056 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2057 struct kvm_save_segment *save;
2044 u32 ar; 2058 u32 ar;
2045 2059
2060 if (vmx->rmode.vm86_active
2061 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
2062 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
2063 || seg == VCPU_SREG_GS)
2064 && !emulate_invalid_guest_state) {
2065 switch (seg) {
2066 case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
2067 case VCPU_SREG_ES: save = &vmx->rmode.es; break;
2068 case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
2069 case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
2070 case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
2071 default: BUG();
2072 }
2073 var->selector = save->selector;
2074 var->base = save->base;
2075 var->limit = save->limit;
2076 ar = save->ar;
2077 if (seg == VCPU_SREG_TR
2078 || var->selector == vmcs_read16(sf->selector))
2079 goto use_saved_rmode_seg;
2080 }
2046 var->base = vmcs_readl(sf->base); 2081 var->base = vmcs_readl(sf->base);
2047 var->limit = vmcs_read32(sf->limit); 2082 var->limit = vmcs_read32(sf->limit);
2048 var->selector = vmcs_read16(sf->selector); 2083 var->selector = vmcs_read16(sf->selector);
2049 ar = vmcs_read32(sf->ar_bytes); 2084 ar = vmcs_read32(sf->ar_bytes);
2085use_saved_rmode_seg:
2050 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) 2086 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
2051 ar = 0; 2087 ar = 0;
2052 var->type = ar & 15; 2088 var->type = ar & 15;
@@ -2060,6 +2096,18 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
2060 var->unusable = (ar >> 16) & 1; 2096 var->unusable = (ar >> 16) & 1;
2061} 2097}
2062 2098
2099static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2100{
2101 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2102 struct kvm_segment s;
2103
2104 if (to_vmx(vcpu)->rmode.vm86_active) {
2105 vmx_get_segment(vcpu, &s, seg);
2106 return s.base;
2107 }
2108 return vmcs_readl(sf->base);
2109}
2110
2063static int vmx_get_cpl(struct kvm_vcpu *vcpu) 2111static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2064{ 2112{
2065 if (!is_protmode(vcpu)) 2113 if (!is_protmode(vcpu))
@@ -2101,6 +2149,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2101 u32 ar; 2149 u32 ar;
2102 2150
2103 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 2151 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
2152 vmcs_write16(sf->selector, var->selector);
2104 vmx->rmode.tr.selector = var->selector; 2153 vmx->rmode.tr.selector = var->selector;
2105 vmx->rmode.tr.base = var->base; 2154 vmx->rmode.tr.base = var->base;
2106 vmx->rmode.tr.limit = var->limit; 2155 vmx->rmode.tr.limit = var->limit;
@@ -2361,11 +2410,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
2361 2410
2362static int init_rmode_tss(struct kvm *kvm) 2411static int init_rmode_tss(struct kvm *kvm)
2363{ 2412{
2364 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 2413 gfn_t fn;
2365 u16 data = 0; 2414 u16 data = 0;
2366 int ret = 0; 2415 int r, idx, ret = 0;
2367 int r;
2368 2416
2417 idx = srcu_read_lock(&kvm->srcu);
2418 fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
2369 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 2419 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2370 if (r < 0) 2420 if (r < 0)
2371 goto out; 2421 goto out;
@@ -2389,12 +2439,13 @@ static int init_rmode_tss(struct kvm *kvm)
2389 2439
2390 ret = 1; 2440 ret = 1;
2391out: 2441out:
2442 srcu_read_unlock(&kvm->srcu, idx);
2392 return ret; 2443 return ret;
2393} 2444}
2394 2445
2395static int init_rmode_identity_map(struct kvm *kvm) 2446static int init_rmode_identity_map(struct kvm *kvm)
2396{ 2447{
2397 int i, r, ret; 2448 int i, idx, r, ret;
2398 pfn_t identity_map_pfn; 2449 pfn_t identity_map_pfn;
2399 u32 tmp; 2450 u32 tmp;
2400 2451
@@ -2409,6 +2460,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2409 return 1; 2460 return 1;
2410 ret = 0; 2461 ret = 0;
2411 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 2462 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
2463 idx = srcu_read_lock(&kvm->srcu);
2412 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 2464 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2413 if (r < 0) 2465 if (r < 0)
2414 goto out; 2466 goto out;
@@ -2424,6 +2476,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2424 kvm->arch.ept_identity_pagetable_done = true; 2476 kvm->arch.ept_identity_pagetable_done = true;
2425 ret = 1; 2477 ret = 1;
2426out: 2478out:
2479 srcu_read_unlock(&kvm->srcu, idx);
2427 return ret; 2480 return ret;
2428} 2481}
2429 2482
@@ -2699,22 +2752,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2699 return 0; 2752 return 0;
2700} 2753}
2701 2754
2702static int init_rmode(struct kvm *kvm)
2703{
2704 int idx, ret = 0;
2705
2706 idx = srcu_read_lock(&kvm->srcu);
2707 if (!init_rmode_tss(kvm))
2708 goto exit;
2709 if (!init_rmode_identity_map(kvm))
2710 goto exit;
2711
2712 ret = 1;
2713exit:
2714 srcu_read_unlock(&kvm->srcu, idx);
2715 return ret;
2716}
2717
2718static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2755static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2719{ 2756{
2720 struct vcpu_vmx *vmx = to_vmx(vcpu); 2757 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2722,10 +2759,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2722 int ret; 2759 int ret;
2723 2760
2724 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2761 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2725 if (!init_rmode(vmx->vcpu.kvm)) {
2726 ret = -ENOMEM;
2727 goto out;
2728 }
2729 2762
2730 vmx->rmode.vm86_active = 0; 2763 vmx->rmode.vm86_active = 0;
2731 2764
@@ -2805,7 +2838,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2805 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 2838 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
2806 if (vm_need_tpr_shadow(vmx->vcpu.kvm)) 2839 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
2807 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 2840 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
2808 page_to_phys(vmx->vcpu.arch.apic->regs_page)); 2841 __pa(vmx->vcpu.arch.apic->regs));
2809 vmcs_write32(TPR_THRESHOLD, 0); 2842 vmcs_write32(TPR_THRESHOLD, 0);
2810 } 2843 }
2811 2844
@@ -2971,6 +3004,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2971 if (ret) 3004 if (ret)
2972 return ret; 3005 return ret;
2973 kvm->arch.tss_addr = addr; 3006 kvm->arch.tss_addr = addr;
3007 if (!init_rmode_tss(kvm))
3008 return -ENOMEM;
3009
2974 return 0; 3010 return 0;
2975} 3011}
2976 3012
@@ -3962,7 +3998,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
3962#define Q "l" 3998#define Q "l"
3963#endif 3999#endif
3964 4000
3965static void vmx_vcpu_run(struct kvm_vcpu *vcpu) 4001static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
3966{ 4002{
3967 struct vcpu_vmx *vmx = to_vmx(vcpu); 4003 struct vcpu_vmx *vmx = to_vmx(vcpu);
3968 4004
@@ -3991,6 +4027,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3991 asm( 4027 asm(
3992 /* Store host registers */ 4028 /* Store host registers */
3993 "push %%"R"dx; push %%"R"bp;" 4029 "push %%"R"dx; push %%"R"bp;"
4030 "push %%"R"cx \n\t" /* placeholder for guest rcx */
3994 "push %%"R"cx \n\t" 4031 "push %%"R"cx \n\t"
3995 "cmp %%"R"sp, %c[host_rsp](%0) \n\t" 4032 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
3996 "je 1f \n\t" 4033 "je 1f \n\t"
@@ -4032,10 +4069,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4032 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 4069 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
4033 ".Lkvm_vmx_return: " 4070 ".Lkvm_vmx_return: "
4034 /* Save guest registers, load host registers, keep flags */ 4071 /* Save guest registers, load host registers, keep flags */
4035 "xchg %0, (%%"R"sp) \n\t" 4072 "mov %0, %c[wordsize](%%"R"sp) \n\t"
4073 "pop %0 \n\t"
4036 "mov %%"R"ax, %c[rax](%0) \n\t" 4074 "mov %%"R"ax, %c[rax](%0) \n\t"
4037 "mov %%"R"bx, %c[rbx](%0) \n\t" 4075 "mov %%"R"bx, %c[rbx](%0) \n\t"
4038 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" 4076 "pop"Q" %c[rcx](%0) \n\t"
4039 "mov %%"R"dx, %c[rdx](%0) \n\t" 4077 "mov %%"R"dx, %c[rdx](%0) \n\t"
4040 "mov %%"R"si, %c[rsi](%0) \n\t" 4078 "mov %%"R"si, %c[rsi](%0) \n\t"
4041 "mov %%"R"di, %c[rdi](%0) \n\t" 4079 "mov %%"R"di, %c[rdi](%0) \n\t"
@@ -4053,7 +4091,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4053 "mov %%cr2, %%"R"ax \n\t" 4091 "mov %%cr2, %%"R"ax \n\t"
4054 "mov %%"R"ax, %c[cr2](%0) \n\t" 4092 "mov %%"R"ax, %c[cr2](%0) \n\t"
4055 4093
4056 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" 4094 "pop %%"R"bp; pop %%"R"dx \n\t"
4057 "setbe %c[fail](%0) \n\t" 4095 "setbe %c[fail](%0) \n\t"
4058 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 4096 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
4059 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 4097 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
@@ -4076,7 +4114,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4076 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), 4114 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
4077 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), 4115 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
4078#endif 4116#endif
4079 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 4117 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
4118 [wordsize]"i"(sizeof(ulong))
4080 : "cc", "memory" 4119 : "cc", "memory"
4081 , R"ax", R"bx", R"di", R"si" 4120 , R"ax", R"bx", R"di", R"si"
4082#ifdef CONFIG_X86_64 4121#ifdef CONFIG_X86_64
@@ -4183,8 +4222,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4183 if (!kvm->arch.ept_identity_map_addr) 4222 if (!kvm->arch.ept_identity_map_addr)
4184 kvm->arch.ept_identity_map_addr = 4223 kvm->arch.ept_identity_map_addr =
4185 VMX_EPT_IDENTITY_PAGETABLE_ADDR; 4224 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4225 err = -ENOMEM;
4186 if (alloc_identity_pagetable(kvm) != 0) 4226 if (alloc_identity_pagetable(kvm) != 0)
4187 goto free_vmcs; 4227 goto free_vmcs;
4228 if (!init_rmode_identity_map(kvm))
4229 goto free_vmcs;
4188 } 4230 }
4189 4231
4190 return &vmx->vcpu; 4232 return &vmx->vcpu;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bcc0efce85bf..58f517b59645 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -81,9 +81,10 @@
81 * - enable LME and LMA per default on 64 bit KVM 81 * - enable LME and LMA per default on 64 bit KVM
82 */ 82 */
83#ifdef CONFIG_X86_64 83#ifdef CONFIG_X86_64
84static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 84static
85u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
85#else 86#else
86static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 87static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
87#endif 88#endif
88 89
89#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 90#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
@@ -360,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
360 361
361void kvm_inject_nmi(struct kvm_vcpu *vcpu) 362void kvm_inject_nmi(struct kvm_vcpu *vcpu)
362{ 363{
364 kvm_make_request(KVM_REQ_NMI, vcpu);
363 kvm_make_request(KVM_REQ_EVENT, vcpu); 365 kvm_make_request(KVM_REQ_EVENT, vcpu);
364 vcpu->arch.nmi_pending = 1;
365} 366}
366EXPORT_SYMBOL_GPL(kvm_inject_nmi); 367EXPORT_SYMBOL_GPL(kvm_inject_nmi);
367 368
@@ -525,8 +526,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
525 526
526 kvm_x86_ops->set_cr0(vcpu, cr0); 527 kvm_x86_ops->set_cr0(vcpu, cr0);
527 528
528 if ((cr0 ^ old_cr0) & X86_CR0_PG) 529 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
529 kvm_clear_async_pf_completion_queue(vcpu); 530 kvm_clear_async_pf_completion_queue(vcpu);
531 kvm_async_pf_hash_reset(vcpu);
532 }
530 533
531 if ((cr0 ^ old_cr0) & update_bits) 534 if ((cr0 ^ old_cr0) & update_bits)
532 kvm_mmu_reset_context(vcpu); 535 kvm_mmu_reset_context(vcpu);
@@ -1017,7 +1020,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1017 unsigned long flags; 1020 unsigned long flags;
1018 s64 sdiff; 1021 s64 sdiff;
1019 1022
1020 spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1023 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1021 offset = data - native_read_tsc(); 1024 offset = data - native_read_tsc();
1022 ns = get_kernel_ns(); 1025 ns = get_kernel_ns();
1023 elapsed = ns - kvm->arch.last_tsc_nsec; 1026 elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -1028,7 +1031,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1028 /* 1031 /*
1029 * Special case: close write to TSC within 5 seconds of 1032 * Special case: close write to TSC within 5 seconds of
1030 * another CPU is interpreted as an attempt to synchronize 1033 * another CPU is interpreted as an attempt to synchronize
1031 * The 5 seconds is to accomodate host load / swapping as 1034 * The 5 seconds is to accommodate host load / swapping as
1032 * well as any reset of TSC during the boot process. 1035 * well as any reset of TSC during the boot process.
1033 * 1036 *
1034 * In that case, for a reliable TSC, we can match TSC offsets, 1037 * In that case, for a reliable TSC, we can match TSC offsets,
@@ -1050,7 +1053,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1050 kvm->arch.last_tsc_write = data; 1053 kvm->arch.last_tsc_write = data;
1051 kvm->arch.last_tsc_offset = offset; 1054 kvm->arch.last_tsc_offset = offset;
1052 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1055 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1053 spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1056 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1054 1057
1055 /* Reset of TSC must disable overshoot protection below */ 1058 /* Reset of TSC must disable overshoot protection below */
1056 vcpu->arch.hv_clock.tsc_timestamp = 0; 1059 vcpu->arch.hv_clock.tsc_timestamp = 0;
@@ -1453,6 +1456,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1453 return 0; 1456 return 0;
1454} 1457}
1455 1458
1459static void kvmclock_reset(struct kvm_vcpu *vcpu)
1460{
1461 if (vcpu->arch.time_page) {
1462 kvm_release_page_dirty(vcpu->arch.time_page);
1463 vcpu->arch.time_page = NULL;
1464 }
1465}
1466
1456int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1467int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1457{ 1468{
1458 switch (msr) { 1469 switch (msr) {
@@ -1510,10 +1521,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1510 break; 1521 break;
1511 case MSR_KVM_SYSTEM_TIME_NEW: 1522 case MSR_KVM_SYSTEM_TIME_NEW:
1512 case MSR_KVM_SYSTEM_TIME: { 1523 case MSR_KVM_SYSTEM_TIME: {
1513 if (vcpu->arch.time_page) { 1524 kvmclock_reset(vcpu);
1514 kvm_release_page_dirty(vcpu->arch.time_page);
1515 vcpu->arch.time_page = NULL;
1516 }
1517 1525
1518 vcpu->arch.time = data; 1526 vcpu->arch.time = data;
1519 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 1527 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -1592,6 +1600,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1592 } else 1600 } else
1593 return set_msr_hyperv(vcpu, msr, data); 1601 return set_msr_hyperv(vcpu, msr, data);
1594 break; 1602 break;
1603 case MSR_IA32_BBL_CR_CTL3:
1604 /* Drop writes to this legacy MSR -- see rdmsr
1605 * counterpart for further detail.
1606 */
1607 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1608 break;
1595 default: 1609 default:
1596 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1610 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1597 return xen_hvm_config(vcpu, data); 1611 return xen_hvm_config(vcpu, data);
@@ -1846,6 +1860,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1846 } else 1860 } else
1847 return get_msr_hyperv(vcpu, msr, pdata); 1861 return get_msr_hyperv(vcpu, msr, pdata);
1848 break; 1862 break;
1863 case MSR_IA32_BBL_CR_CTL3:
1864 /* This legacy MSR exists but isn't fully documented in current
1865 * silicon. It is however accessed by winxp in very narrow
1866 * scenarios where it sets bit #19, itself documented as
1867 * a "reserved" bit. Best effort attempt to source coherent
1868 * read data here should the balance of the register be
1869 * interpreted by the guest:
1870 *
1871 * L2 cache control register 3: 64GB range, 256KB size,
1872 * enabled, latency 0x1, configured
1873 */
1874 data = 0xbe702111;
1875 break;
1849 default: 1876 default:
1850 if (!ignore_msrs) { 1877 if (!ignore_msrs) {
1851 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1878 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -2100,8 +2127,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2100 if (check_tsc_unstable()) { 2127 if (check_tsc_unstable()) {
2101 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); 2128 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
2102 vcpu->arch.tsc_catchup = 1; 2129 vcpu->arch.tsc_catchup = 1;
2103 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2104 } 2130 }
2131 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2105 if (vcpu->cpu != cpu) 2132 if (vcpu->cpu != cpu)
2106 kvm_migrate_timers(vcpu); 2133 kvm_migrate_timers(vcpu);
2107 vcpu->cpu = cpu; 2134 vcpu->cpu = cpu;
@@ -2575,9 +2602,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2575 if (mce->status & MCI_STATUS_UC) { 2602 if (mce->status & MCI_STATUS_UC) {
2576 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2603 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2577 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 2604 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2578 printk(KERN_DEBUG "kvm: set_mce: "
2579 "injects mce exception while "
2580 "previous one is in progress!\n");
2581 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2605 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2582 return 0; 2606 return 0;
2583 } 2607 }
@@ -2648,8 +2672,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2648 vcpu->arch.interrupt.pending = events->interrupt.injected; 2672 vcpu->arch.interrupt.pending = events->interrupt.injected;
2649 vcpu->arch.interrupt.nr = events->interrupt.nr; 2673 vcpu->arch.interrupt.nr = events->interrupt.nr;
2650 vcpu->arch.interrupt.soft = events->interrupt.soft; 2674 vcpu->arch.interrupt.soft = events->interrupt.soft;
2651 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2652 kvm_pic_clear_isr_ack(vcpu->kvm);
2653 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 2675 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2654 kvm_x86_ops->set_interrupt_shadow(vcpu, 2676 kvm_x86_ops->set_interrupt_shadow(vcpu,
2655 events->interrupt.shadow); 2677 events->interrupt.shadow);
@@ -4140,8 +4162,8 @@ static unsigned long emulator_get_cached_segment_base(int seg,
4140 return get_segment_base(vcpu, seg); 4162 return get_segment_base(vcpu, seg);
4141} 4163}
4142 4164
4143static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 4165static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
4144 struct kvm_vcpu *vcpu) 4166 int seg, struct kvm_vcpu *vcpu)
4145{ 4167{
4146 struct kvm_segment var; 4168 struct kvm_segment var;
4147 4169
@@ -4154,6 +4176,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
4154 var.limit >>= 12; 4176 var.limit >>= 12;
4155 set_desc_limit(desc, var.limit); 4177 set_desc_limit(desc, var.limit);
4156 set_desc_base(desc, (unsigned long)var.base); 4178 set_desc_base(desc, (unsigned long)var.base);
4179#ifdef CONFIG_X86_64
4180 if (base3)
4181 *base3 = var.base >> 32;
4182#endif
4157 desc->type = var.type; 4183 desc->type = var.type;
4158 desc->s = var.s; 4184 desc->s = var.s;
4159 desc->dpl = var.dpl; 4185 desc->dpl = var.dpl;
@@ -4166,8 +4192,8 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
4166 return true; 4192 return true;
4167} 4193}
4168 4194
4169static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, 4195static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
4170 struct kvm_vcpu *vcpu) 4196 int seg, struct kvm_vcpu *vcpu)
4171{ 4197{
4172 struct kvm_segment var; 4198 struct kvm_segment var;
4173 4199
@@ -4175,6 +4201,9 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
4175 kvm_get_segment(vcpu, &var, seg); 4201 kvm_get_segment(vcpu, &var, seg);
4176 4202
4177 var.base = get_desc_base(desc); 4203 var.base = get_desc_base(desc);
4204#ifdef CONFIG_X86_64
4205 var.base |= ((u64)base3) << 32;
4206#endif
4178 var.limit = get_desc_limit(desc); 4207 var.limit = get_desc_limit(desc);
4179 if (desc->g) 4208 if (desc->g)
4180 var.limit = (var.limit << 12) | 0xfff; 4209 var.limit = (var.limit << 12) | 0xfff;
@@ -4390,41 +4419,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4390 vcpu->arch.emulate_ctxt.have_exception = false; 4419 vcpu->arch.emulate_ctxt.have_exception = false;
4391 vcpu->arch.emulate_ctxt.perm_ok = false; 4420 vcpu->arch.emulate_ctxt.perm_ok = false;
4392 4421
4422 vcpu->arch.emulate_ctxt.only_vendor_specific_insn
4423 = emulation_type & EMULTYPE_TRAP_UD;
4424
4393 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); 4425 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4394 if (r == X86EMUL_PROPAGATE_FAULT)
4395 goto done;
4396 4426
4397 trace_kvm_emulate_insn_start(vcpu); 4427 trace_kvm_emulate_insn_start(vcpu);
4398
4399 /* Only allow emulation of specific instructions on #UD
4400 * (namely VMMCALL, sysenter, sysexit, syscall)*/
4401 if (emulation_type & EMULTYPE_TRAP_UD) {
4402 if (!c->twobyte)
4403 return EMULATE_FAIL;
4404 switch (c->b) {
4405 case 0x01: /* VMMCALL */
4406 if (c->modrm_mod != 3 || c->modrm_rm != 1)
4407 return EMULATE_FAIL;
4408 break;
4409 case 0x34: /* sysenter */
4410 case 0x35: /* sysexit */
4411 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4412 return EMULATE_FAIL;
4413 break;
4414 case 0x05: /* syscall */
4415 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4416 return EMULATE_FAIL;
4417 break;
4418 default:
4419 return EMULATE_FAIL;
4420 }
4421
4422 if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
4423 return EMULATE_FAIL;
4424 }
4425
4426 ++vcpu->stat.insn_emulation; 4428 ++vcpu->stat.insn_emulation;
4427 if (r) { 4429 if (r) {
4430 if (emulation_type & EMULTYPE_TRAP_UD)
4431 return EMULATE_FAIL;
4428 if (reexecute_instruction(vcpu, cr2)) 4432 if (reexecute_instruction(vcpu, cr2))
4429 return EMULATE_DONE; 4433 return EMULATE_DONE;
4430 if (emulation_type & EMULTYPE_SKIP) 4434 if (emulation_type & EMULTYPE_SKIP)
@@ -4452,7 +4456,6 @@ restart:
4452 return handle_emulation_failure(vcpu); 4456 return handle_emulation_failure(vcpu);
4453 } 4457 }
4454 4458
4455done:
4456 if (vcpu->arch.emulate_ctxt.have_exception) { 4459 if (vcpu->arch.emulate_ctxt.have_exception) {
4457 inject_emulated_exception(vcpu); 4460 inject_emulated_exception(vcpu);
4458 r = EMULATE_DONE; 4461 r = EMULATE_DONE;
@@ -4562,7 +4565,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4562 4565
4563 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); 4566 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4564 4567
4565 spin_lock(&kvm_lock); 4568 raw_spin_lock(&kvm_lock);
4566 list_for_each_entry(kvm, &vm_list, vm_list) { 4569 list_for_each_entry(kvm, &vm_list, vm_list) {
4567 kvm_for_each_vcpu(i, vcpu, kvm) { 4570 kvm_for_each_vcpu(i, vcpu, kvm) {
4568 if (vcpu->cpu != freq->cpu) 4571 if (vcpu->cpu != freq->cpu)
@@ -4572,7 +4575,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4572 send_ipi = 1; 4575 send_ipi = 1;
4573 } 4576 }
4574 } 4577 }
4575 spin_unlock(&kvm_lock); 4578 raw_spin_unlock(&kvm_lock);
4576 4579
4577 if (freq->old < freq->new && send_ipi) { 4580 if (freq->old < freq->new && send_ipi) {
4578 /* 4581 /*
@@ -5185,6 +5188,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5185 r = 1; 5188 r = 1;
5186 goto out; 5189 goto out;
5187 } 5190 }
5191 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5192 vcpu->arch.nmi_pending = true;
5188 } 5193 }
5189 5194
5190 r = kvm_mmu_reload(vcpu); 5195 r = kvm_mmu_reload(vcpu);
@@ -5213,14 +5218,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5213 kvm_load_guest_fpu(vcpu); 5218 kvm_load_guest_fpu(vcpu);
5214 kvm_load_guest_xcr0(vcpu); 5219 kvm_load_guest_xcr0(vcpu);
5215 5220
5216 atomic_set(&vcpu->guest_mode, 1); 5221 vcpu->mode = IN_GUEST_MODE;
5217 smp_wmb(); 5222
5223 /* We should set ->mode before check ->requests,
5224 * see the comment in make_all_cpus_request.
5225 */
5226 smp_mb();
5218 5227
5219 local_irq_disable(); 5228 local_irq_disable();
5220 5229
5221 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests 5230 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
5222 || need_resched() || signal_pending(current)) { 5231 || need_resched() || signal_pending(current)) {
5223 atomic_set(&vcpu->guest_mode, 0); 5232 vcpu->mode = OUTSIDE_GUEST_MODE;
5224 smp_wmb(); 5233 smp_wmb();
5225 local_irq_enable(); 5234 local_irq_enable();
5226 preempt_enable(); 5235 preempt_enable();
@@ -5256,7 +5265,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5256 5265
5257 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); 5266 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
5258 5267
5259 atomic_set(&vcpu->guest_mode, 0); 5268 vcpu->mode = OUTSIDE_GUEST_MODE;
5260 smp_wmb(); 5269 smp_wmb();
5261 local_irq_enable(); 5270 local_irq_enable();
5262 5271
@@ -5574,7 +5583,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5574 struct kvm_sregs *sregs) 5583 struct kvm_sregs *sregs)
5575{ 5584{
5576 int mmu_reset_needed = 0; 5585 int mmu_reset_needed = 0;
5577 int pending_vec, max_bits; 5586 int pending_vec, max_bits, idx;
5578 struct desc_ptr dt; 5587 struct desc_ptr dt;
5579 5588
5580 dt.size = sregs->idt.limit; 5589 dt.size = sregs->idt.limit;
@@ -5603,10 +5612,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5603 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5612 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5604 if (sregs->cr4 & X86_CR4_OSXSAVE) 5613 if (sregs->cr4 & X86_CR4_OSXSAVE)
5605 update_cpuid(vcpu); 5614 update_cpuid(vcpu);
5615
5616 idx = srcu_read_lock(&vcpu->kvm->srcu);
5606 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5617 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5607 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 5618 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5608 mmu_reset_needed = 1; 5619 mmu_reset_needed = 1;
5609 } 5620 }
5621 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5610 5622
5611 if (mmu_reset_needed) 5623 if (mmu_reset_needed)
5612 kvm_mmu_reset_context(vcpu); 5624 kvm_mmu_reset_context(vcpu);
@@ -5617,8 +5629,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5617 if (pending_vec < max_bits) { 5629 if (pending_vec < max_bits) {
5618 kvm_queue_interrupt(vcpu, pending_vec, false); 5630 kvm_queue_interrupt(vcpu, pending_vec, false);
5619 pr_debug("Set back pending irq %d\n", pending_vec); 5631 pr_debug("Set back pending irq %d\n", pending_vec);
5620 if (irqchip_in_kernel(vcpu->kvm))
5621 kvm_pic_clear_isr_ack(vcpu->kvm);
5622 } 5632 }
5623 5633
5624 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5634 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -5814,10 +5824,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5814 5824
5815void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5825void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5816{ 5826{
5817 if (vcpu->arch.time_page) { 5827 kvmclock_reset(vcpu);
5818 kvm_release_page_dirty(vcpu->arch.time_page);
5819 vcpu->arch.time_page = NULL;
5820 }
5821 5828
5822 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 5829 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
5823 fx_free(vcpu); 5830 fx_free(vcpu);
@@ -5878,6 +5885,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5878 kvm_make_request(KVM_REQ_EVENT, vcpu); 5885 kvm_make_request(KVM_REQ_EVENT, vcpu);
5879 vcpu->arch.apf.msr_val = 0; 5886 vcpu->arch.apf.msr_val = 0;
5880 5887
5888 kvmclock_reset(vcpu);
5889
5881 kvm_clear_async_pf_completion_queue(vcpu); 5890 kvm_clear_async_pf_completion_queue(vcpu);
5882 kvm_async_pf_hash_reset(vcpu); 5891 kvm_async_pf_hash_reset(vcpu);
5883 vcpu->arch.apf.halted = false; 5892 vcpu->arch.apf.halted = false;
@@ -6005,7 +6014,7 @@ int kvm_arch_init_vm(struct kvm *kvm)
6005 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6014 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
6006 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6015 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
6007 6016
6008 spin_lock_init(&kvm->arch.tsc_write_lock); 6017 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
6009 6018
6010 return 0; 6019 return 0;
6011} 6020}
@@ -6103,7 +6112,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6103 int user_alloc) 6112 int user_alloc)
6104{ 6113{
6105 6114
6106 int npages = mem->memory_size >> PAGE_SHIFT; 6115 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6107 6116
6108 if (!user_alloc && !old.user_alloc && old.rmap && !npages) { 6117 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
6109 int ret; 6118 int ret;
@@ -6118,12 +6127,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6118 "failed to munmap memory\n"); 6127 "failed to munmap memory\n");
6119 } 6128 }
6120 6129
6130 if (!kvm->arch.n_requested_mmu_pages)
6131 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6132
6121 spin_lock(&kvm->mmu_lock); 6133 spin_lock(&kvm->mmu_lock);
6122 if (!kvm->arch.n_requested_mmu_pages) { 6134 if (nr_mmu_pages)
6123 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6124 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6135 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6125 }
6126
6127 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6136 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6128 spin_unlock(&kvm->mmu_lock); 6137 spin_unlock(&kvm->mmu_lock);
6129} 6138}
@@ -6157,7 +6166,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
6157 6166
6158 me = get_cpu(); 6167 me = get_cpu();
6159 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 6168 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
6160 if (atomic_xchg(&vcpu->guest_mode, 0)) 6169 if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
6161 smp_send_reschedule(cpu); 6170 smp_send_reschedule(cpu);
6162 put_cpu(); 6171 put_cpu();
6163} 6172}
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index eba687f0cc0c..1cd608973ce5 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -397,7 +397,7 @@ static void lguest_load_tr_desc(void)
397 * instead we just use the real "cpuid" instruction. Then I pretty much turned 397 * instead we just use the real "cpuid" instruction. Then I pretty much turned
398 * off feature bits until the Guest booted. (Don't say that: you'll damage 398 * off feature bits until the Guest booted. (Don't say that: you'll damage
399 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is 399 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
400 * hardly future proof.) Noone's listening! They don't like you anyway, 400 * hardly future proof.) No one's listening! They don't like you anyway,
401 * parenthetic weirdo! 401 * parenthetic weirdo!
402 * 402 *
403 * Replacing the cpuid so we can turn features off is great for the kernel, but 403 * Replacing the cpuid so we can turn features off is great for the kernel, but
@@ -847,7 +847,7 @@ static void __init lguest_init_IRQ(void)
847void lguest_setup_irq(unsigned int irq) 847void lguest_setup_irq(unsigned int irq)
848{ 848{
849 irq_alloc_desc_at(irq, 0); 849 irq_alloc_desc_at(irq, 0);
850 set_irq_chip_and_handler_name(irq, &lguest_irq_controller, 850 irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
851 handle_level_irq, "level"); 851 handle_level_irq, "level");
852} 852}
853 853
@@ -995,7 +995,7 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
995static void lguest_time_init(void) 995static void lguest_time_init(void)
996{ 996{
997 /* Set up the timer interrupt (0) to go to our simple timer routine */ 997 /* Set up the timer interrupt (0) to go to our simple timer routine */
998 set_irq_handler(0, lguest_time_irq); 998 irq_set_handler(0, lguest_time_irq);
999 999
1000 clocksource_register(&lguest_clock); 1000 clocksource_register(&lguest_clock);
1001 1001
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index e10cf070ede0..f2479f19ddde 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -42,4 +42,5 @@ else
42 lib-y += memmove_64.o memset_64.o 42 lib-y += memmove_64.o memset_64.o
43 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o 43 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
44 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o 44 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
45 lib-y += cmpxchg16b_emu.o
45endif 46endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 2cda60a06e65..e8e7e0d06f42 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -15,14 +15,12 @@
15 15
16/* if you want SMP support, implement these with real spinlocks */ 16/* if you want SMP support, implement these with real spinlocks */
17.macro LOCK reg 17.macro LOCK reg
18 pushfl 18 pushfl_cfi
19 CFI_ADJUST_CFA_OFFSET 4
20 cli 19 cli
21.endm 20.endm
22 21
23.macro UNLOCK reg 22.macro UNLOCK reg
24 popfl 23 popfl_cfi
25 CFI_ADJUST_CFA_OFFSET -4
26.endm 24.endm
27 25
28#define BEGIN(op) \ 26#define BEGIN(op) \
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 71e080de3352..391a083674b4 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -14,14 +14,12 @@
14#include <asm/dwarf2.h> 14#include <asm/dwarf2.h>
15 15
16.macro SAVE reg 16.macro SAVE reg
17 pushl %\reg 17 pushl_cfi %\reg
18 CFI_ADJUST_CFA_OFFSET 4
19 CFI_REL_OFFSET \reg, 0 18 CFI_REL_OFFSET \reg, 0
20.endm 19.endm
21 20
22.macro RESTORE reg 21.macro RESTORE reg
23 popl %\reg 22 popl_cfi %\reg
24 CFI_ADJUST_CFA_OFFSET -4
25 CFI_RESTORE \reg 23 CFI_RESTORE \reg
26.endm 24.endm
27 25
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index adbccd0bbb78..78d16a554db0 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -50,11 +50,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
50 */ 50 */
51ENTRY(csum_partial) 51ENTRY(csum_partial)
52 CFI_STARTPROC 52 CFI_STARTPROC
53 pushl %esi 53 pushl_cfi %esi
54 CFI_ADJUST_CFA_OFFSET 4
55 CFI_REL_OFFSET esi, 0 54 CFI_REL_OFFSET esi, 0
56 pushl %ebx 55 pushl_cfi %ebx
57 CFI_ADJUST_CFA_OFFSET 4
58 CFI_REL_OFFSET ebx, 0 56 CFI_REL_OFFSET ebx, 0
59 movl 20(%esp),%eax # Function arg: unsigned int sum 57 movl 20(%esp),%eax # Function arg: unsigned int sum
60 movl 16(%esp),%ecx # Function arg: int len 58 movl 16(%esp),%ecx # Function arg: int len
@@ -132,11 +130,9 @@ ENTRY(csum_partial)
132 jz 8f 130 jz 8f
133 roll $8, %eax 131 roll $8, %eax
1348: 1328:
135 popl %ebx 133 popl_cfi %ebx
136 CFI_ADJUST_CFA_OFFSET -4
137 CFI_RESTORE ebx 134 CFI_RESTORE ebx
138 popl %esi 135 popl_cfi %esi
139 CFI_ADJUST_CFA_OFFSET -4
140 CFI_RESTORE esi 136 CFI_RESTORE esi
141 ret 137 ret
142 CFI_ENDPROC 138 CFI_ENDPROC
@@ -148,11 +144,9 @@ ENDPROC(csum_partial)
148 144
149ENTRY(csum_partial) 145ENTRY(csum_partial)
150 CFI_STARTPROC 146 CFI_STARTPROC
151 pushl %esi 147 pushl_cfi %esi
152 CFI_ADJUST_CFA_OFFSET 4
153 CFI_REL_OFFSET esi, 0 148 CFI_REL_OFFSET esi, 0
154 pushl %ebx 149 pushl_cfi %ebx
155 CFI_ADJUST_CFA_OFFSET 4
156 CFI_REL_OFFSET ebx, 0 150 CFI_REL_OFFSET ebx, 0
157 movl 20(%esp),%eax # Function arg: unsigned int sum 151 movl 20(%esp),%eax # Function arg: unsigned int sum
158 movl 16(%esp),%ecx # Function arg: int len 152 movl 16(%esp),%ecx # Function arg: int len
@@ -260,11 +254,9 @@ ENTRY(csum_partial)
260 jz 90f 254 jz 90f
261 roll $8, %eax 255 roll $8, %eax
26290: 25690:
263 popl %ebx 257 popl_cfi %ebx
264 CFI_ADJUST_CFA_OFFSET -4
265 CFI_RESTORE ebx 258 CFI_RESTORE ebx
266 popl %esi 259 popl_cfi %esi
267 CFI_ADJUST_CFA_OFFSET -4
268 CFI_RESTORE esi 260 CFI_RESTORE esi
269 ret 261 ret
270 CFI_ENDPROC 262 CFI_ENDPROC
@@ -309,14 +301,11 @@ ENTRY(csum_partial_copy_generic)
309 CFI_STARTPROC 301 CFI_STARTPROC
310 subl $4,%esp 302 subl $4,%esp
311 CFI_ADJUST_CFA_OFFSET 4 303 CFI_ADJUST_CFA_OFFSET 4
312 pushl %edi 304 pushl_cfi %edi
313 CFI_ADJUST_CFA_OFFSET 4
314 CFI_REL_OFFSET edi, 0 305 CFI_REL_OFFSET edi, 0
315 pushl %esi 306 pushl_cfi %esi
316 CFI_ADJUST_CFA_OFFSET 4
317 CFI_REL_OFFSET esi, 0 307 CFI_REL_OFFSET esi, 0
318 pushl %ebx 308 pushl_cfi %ebx
319 CFI_ADJUST_CFA_OFFSET 4
320 CFI_REL_OFFSET ebx, 0 309 CFI_REL_OFFSET ebx, 0
321 movl ARGBASE+16(%esp),%eax # sum 310 movl ARGBASE+16(%esp),%eax # sum
322 movl ARGBASE+12(%esp),%ecx # len 311 movl ARGBASE+12(%esp),%ecx # len
@@ -426,17 +415,13 @@ DST( movb %cl, (%edi) )
426 415
427.previous 416.previous
428 417
429 popl %ebx 418 popl_cfi %ebx
430 CFI_ADJUST_CFA_OFFSET -4
431 CFI_RESTORE ebx 419 CFI_RESTORE ebx
432 popl %esi 420 popl_cfi %esi
433 CFI_ADJUST_CFA_OFFSET -4
434 CFI_RESTORE esi 421 CFI_RESTORE esi
435 popl %edi 422 popl_cfi %edi
436 CFI_ADJUST_CFA_OFFSET -4
437 CFI_RESTORE edi 423 CFI_RESTORE edi
438 popl %ecx # equivalent to addl $4,%esp 424 popl_cfi %ecx # equivalent to addl $4,%esp
439 CFI_ADJUST_CFA_OFFSET -4
440 ret 425 ret
441 CFI_ENDPROC 426 CFI_ENDPROC
442ENDPROC(csum_partial_copy_generic) 427ENDPROC(csum_partial_copy_generic)
@@ -459,14 +444,11 @@ ENDPROC(csum_partial_copy_generic)
459 444
460ENTRY(csum_partial_copy_generic) 445ENTRY(csum_partial_copy_generic)
461 CFI_STARTPROC 446 CFI_STARTPROC
462 pushl %ebx 447 pushl_cfi %ebx
463 CFI_ADJUST_CFA_OFFSET 4
464 CFI_REL_OFFSET ebx, 0 448 CFI_REL_OFFSET ebx, 0
465 pushl %edi 449 pushl_cfi %edi
466 CFI_ADJUST_CFA_OFFSET 4
467 CFI_REL_OFFSET edi, 0 450 CFI_REL_OFFSET edi, 0
468 pushl %esi 451 pushl_cfi %esi
469 CFI_ADJUST_CFA_OFFSET 4
470 CFI_REL_OFFSET esi, 0 452 CFI_REL_OFFSET esi, 0
471 movl ARGBASE+4(%esp),%esi #src 453 movl ARGBASE+4(%esp),%esi #src
472 movl ARGBASE+8(%esp),%edi #dst 454 movl ARGBASE+8(%esp),%edi #dst
@@ -527,14 +509,11 @@ DST( movb %dl, (%edi) )
527 jmp 7b 509 jmp 7b
528.previous 510.previous
529 511
530 popl %esi 512 popl_cfi %esi
531 CFI_ADJUST_CFA_OFFSET -4
532 CFI_RESTORE esi 513 CFI_RESTORE esi
533 popl %edi 514 popl_cfi %edi
534 CFI_ADJUST_CFA_OFFSET -4
535 CFI_RESTORE edi 515 CFI_RESTORE edi
536 popl %ebx 516 popl_cfi %ebx
537 CFI_ADJUST_CFA_OFFSET -4
538 CFI_RESTORE ebx 517 CFI_RESTORE ebx
539 ret 518 ret
540 CFI_ENDPROC 519 CFI_ENDPROC
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
new file mode 100644
index 000000000000..3e8b08a6de2b
--- /dev/null
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -0,0 +1,59 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; version 2
5 * of the License.
6 *
7 */
8#include <linux/linkage.h>
9#include <asm/alternative-asm.h>
10#include <asm/frame.h>
11#include <asm/dwarf2.h>
12
13.text
14
15/*
16 * Inputs:
17 * %rsi : memory location to compare
18 * %rax : low 64 bits of old value
19 * %rdx : high 64 bits of old value
20 * %rbx : low 64 bits of new value
21 * %rcx : high 64 bits of new value
22 * %al : Operation successful
23 */
24ENTRY(this_cpu_cmpxchg16b_emu)
25CFI_STARTPROC
26
27#
28# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
29# via the ZF. Caller will access %al to get result.
30#
31# Note that this is only useful for a cpuops operation. Meaning that we
32# do *not* have a fully atomic operation but just an operation that is
33# *atomic* on a single cpu (as provided by the this_cpu_xx class of
34# macros).
35#
36this_cpu_cmpxchg16b_emu:
37 pushf
38 cli
39
40 cmpq %gs:(%rsi), %rax
41 jne not_same
42 cmpq %gs:8(%rsi), %rdx
43 jne not_same
44
45 movq %rbx, %gs:(%rsi)
46 movq %rcx, %gs:8(%rsi)
47
48 popf
49 mov $1, %al
50 ret
51
52 not_same:
53 popf
54 xor %al,%al
55 ret
56
57CFI_ENDPROC
58
59ENDPROC(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a460158b5ac5..99e482615195 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -117,7 +117,7 @@ ENDPROC(bad_from_user)
117 * rdx count 117 * rdx count
118 * 118 *
119 * Output: 119 * Output:
120 * eax uncopied bytes or 0 if successfull. 120 * eax uncopied bytes or 0 if successful.
121 */ 121 */
122ENTRY(copy_user_generic_unrolled) 122ENTRY(copy_user_generic_unrolled)
123 CFI_STARTPROC 123 CFI_STARTPROC
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index f0dba36578ea..fb903b758da8 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 2 * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
3 * 3 *
4 * This file is subject to the terms and conditions of the GNU General Public 4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file COPYING in the main directory of this archive 5 * License. See the file COPYING in the main directory of this archive
6 * for more details. No warranty for anything given at all. 6 * for more details. No warranty for anything given at all.
@@ -11,82 +11,82 @@
11 11
12/* 12/*
13 * Checksum copy with exception handling. 13 * Checksum copy with exception handling.
14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
15 * destination is zeroed. 15 * destination is zeroed.
16 * 16 *
17 * Input 17 * Input
18 * rdi source 18 * rdi source
19 * rsi destination 19 * rsi destination
20 * edx len (32bit) 20 * edx len (32bit)
21 * ecx sum (32bit) 21 * ecx sum (32bit)
22 * r8 src_err_ptr (int) 22 * r8 src_err_ptr (int)
23 * r9 dst_err_ptr (int) 23 * r9 dst_err_ptr (int)
24 * 24 *
25 * Output 25 * Output
26 * eax 64bit sum. undefined in case of exception. 26 * eax 64bit sum. undefined in case of exception.
27 * 27 *
28 * Wrappers need to take care of valid exception sum and zeroing. 28 * Wrappers need to take care of valid exception sum and zeroing.
29 * They also should align source or destination to 8 bytes. 29 * They also should align source or destination to 8 bytes.
30 */ 30 */
31 31
32 .macro source 32 .macro source
3310: 3310:
34 .section __ex_table,"a" 34 .section __ex_table, "a"
35 .align 8 35 .align 8
36 .quad 10b,.Lbad_source 36 .quad 10b, .Lbad_source
37 .previous 37 .previous
38 .endm 38 .endm
39 39
40 .macro dest 40 .macro dest
4120: 4120:
42 .section __ex_table,"a" 42 .section __ex_table, "a"
43 .align 8 43 .align 8
44 .quad 20b,.Lbad_dest 44 .quad 20b, .Lbad_dest
45 .previous 45 .previous
46 .endm 46 .endm
47 47
48 .macro ignore L=.Lignore 48 .macro ignore L=.Lignore
4930: 4930:
50 .section __ex_table,"a" 50 .section __ex_table, "a"
51 .align 8 51 .align 8
52 .quad 30b,\L 52 .quad 30b, \L
53 .previous 53 .previous
54 .endm 54 .endm
55 55
56 56
57ENTRY(csum_partial_copy_generic) 57ENTRY(csum_partial_copy_generic)
58 CFI_STARTPROC 58 CFI_STARTPROC
59 cmpl $3*64,%edx 59 cmpl $3*64, %edx
60 jle .Lignore 60 jle .Lignore
61 61
62.Lignore: 62.Lignore:
63 subq $7*8,%rsp 63 subq $7*8, %rsp
64 CFI_ADJUST_CFA_OFFSET 7*8 64 CFI_ADJUST_CFA_OFFSET 7*8
65 movq %rbx,2*8(%rsp) 65 movq %rbx, 2*8(%rsp)
66 CFI_REL_OFFSET rbx, 2*8 66 CFI_REL_OFFSET rbx, 2*8
67 movq %r12,3*8(%rsp) 67 movq %r12, 3*8(%rsp)
68 CFI_REL_OFFSET r12, 3*8 68 CFI_REL_OFFSET r12, 3*8
69 movq %r14,4*8(%rsp) 69 movq %r14, 4*8(%rsp)
70 CFI_REL_OFFSET r14, 4*8 70 CFI_REL_OFFSET r14, 4*8
71 movq %r13,5*8(%rsp) 71 movq %r13, 5*8(%rsp)
72 CFI_REL_OFFSET r13, 5*8 72 CFI_REL_OFFSET r13, 5*8
73 movq %rbp,6*8(%rsp) 73 movq %rbp, 6*8(%rsp)
74 CFI_REL_OFFSET rbp, 6*8 74 CFI_REL_OFFSET rbp, 6*8
75 75
76 movq %r8,(%rsp) 76 movq %r8, (%rsp)
77 movq %r9,1*8(%rsp) 77 movq %r9, 1*8(%rsp)
78
79 movl %ecx,%eax
80 movl %edx,%ecx
81 78
82 xorl %r9d,%r9d 79 movl %ecx, %eax
83 movq %rcx,%r12 80 movl %edx, %ecx
84 81
85 shrq $6,%r12 82 xorl %r9d, %r9d
86 jz .Lhandle_tail /* < 64 */ 83 movq %rcx, %r12
84
85 shrq $6, %r12
86 jz .Lhandle_tail /* < 64 */
87 87
88 clc 88 clc
89 89
90 /* main loop. clear in 64 byte blocks */ 90 /* main loop. clear in 64 byte blocks */
91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ 91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
92 /* r11: temp3, rdx: temp4, r12 loopcnt */ 92 /* r11: temp3, rdx: temp4, r12 loopcnt */
@@ -94,156 +94,156 @@ ENTRY(csum_partial_copy_generic)
94 .p2align 4 94 .p2align 4
95.Lloop: 95.Lloop:
96 source 96 source
97 movq (%rdi),%rbx 97 movq (%rdi), %rbx
98 source 98 source
99 movq 8(%rdi),%r8 99 movq 8(%rdi), %r8
100 source 100 source
101 movq 16(%rdi),%r11 101 movq 16(%rdi), %r11
102 source 102 source
103 movq 24(%rdi),%rdx 103 movq 24(%rdi), %rdx
104 104
105 source 105 source
106 movq 32(%rdi),%r10 106 movq 32(%rdi), %r10
107 source 107 source
108 movq 40(%rdi),%rbp 108 movq 40(%rdi), %rbp
109 source 109 source
110 movq 48(%rdi),%r14 110 movq 48(%rdi), %r14
111 source 111 source
112 movq 56(%rdi),%r13 112 movq 56(%rdi), %r13
113 113
114 ignore 2f 114 ignore 2f
115 prefetcht0 5*64(%rdi) 115 prefetcht0 5*64(%rdi)
1162: 1162:
117 adcq %rbx,%rax 117 adcq %rbx, %rax
118 adcq %r8,%rax 118 adcq %r8, %rax
119 adcq %r11,%rax 119 adcq %r11, %rax
120 adcq %rdx,%rax 120 adcq %rdx, %rax
121 adcq %r10,%rax 121 adcq %r10, %rax
122 adcq %rbp,%rax 122 adcq %rbp, %rax
123 adcq %r14,%rax 123 adcq %r14, %rax
124 adcq %r13,%rax 124 adcq %r13, %rax
125 125
126 decl %r12d 126 decl %r12d
127 127
128 dest 128 dest
129 movq %rbx,(%rsi) 129 movq %rbx, (%rsi)
130 dest 130 dest
131 movq %r8,8(%rsi) 131 movq %r8, 8(%rsi)
132 dest 132 dest
133 movq %r11,16(%rsi) 133 movq %r11, 16(%rsi)
134 dest 134 dest
135 movq %rdx,24(%rsi) 135 movq %rdx, 24(%rsi)
136 136
137 dest 137 dest
138 movq %r10,32(%rsi) 138 movq %r10, 32(%rsi)
139 dest 139 dest
140 movq %rbp,40(%rsi) 140 movq %rbp, 40(%rsi)
141 dest 141 dest
142 movq %r14,48(%rsi) 142 movq %r14, 48(%rsi)
143 dest 143 dest
144 movq %r13,56(%rsi) 144 movq %r13, 56(%rsi)
145 145
1463: 1463:
147
148 leaq 64(%rdi),%rdi
149 leaq 64(%rsi),%rsi
150 147
151 jnz .Lloop 148 leaq 64(%rdi), %rdi
149 leaq 64(%rsi), %rsi
152 150
153 adcq %r9,%rax 151 jnz .Lloop
154 152
155 /* do last upto 56 bytes */ 153 adcq %r9, %rax
154
155 /* do last up to 56 bytes */
156.Lhandle_tail: 156.Lhandle_tail:
157 /* ecx: count */ 157 /* ecx: count */
158 movl %ecx,%r10d 158 movl %ecx, %r10d
159 andl $63,%ecx 159 andl $63, %ecx
160 shrl $3,%ecx 160 shrl $3, %ecx
161 jz .Lfold 161 jz .Lfold
162 clc 162 clc
163 .p2align 4 163 .p2align 4
164.Lloop_8: 164.Lloop_8:
165 source 165 source
166 movq (%rdi),%rbx 166 movq (%rdi), %rbx
167 adcq %rbx,%rax 167 adcq %rbx, %rax
168 decl %ecx 168 decl %ecx
169 dest 169 dest
170 movq %rbx,(%rsi) 170 movq %rbx, (%rsi)
171 leaq 8(%rsi),%rsi /* preserve carry */ 171 leaq 8(%rsi), %rsi /* preserve carry */
172 leaq 8(%rdi),%rdi 172 leaq 8(%rdi), %rdi
173 jnz .Lloop_8 173 jnz .Lloop_8
174 adcq %r9,%rax /* add in carry */ 174 adcq %r9, %rax /* add in carry */
175 175
176.Lfold: 176.Lfold:
177 /* reduce checksum to 32bits */ 177 /* reduce checksum to 32bits */
178 movl %eax,%ebx 178 movl %eax, %ebx
179 shrq $32,%rax 179 shrq $32, %rax
180 addl %ebx,%eax 180 addl %ebx, %eax
181 adcl %r9d,%eax 181 adcl %r9d, %eax
182 182
183 /* do last upto 6 bytes */ 183 /* do last up to 6 bytes */
184.Lhandle_7: 184.Lhandle_7:
185 movl %r10d,%ecx 185 movl %r10d, %ecx
186 andl $7,%ecx 186 andl $7, %ecx
187 shrl $1,%ecx 187 shrl $1, %ecx
188 jz .Lhandle_1 188 jz .Lhandle_1
189 movl $2,%edx 189 movl $2, %edx
190 xorl %ebx,%ebx 190 xorl %ebx, %ebx
191 clc 191 clc
192 .p2align 4 192 .p2align 4
193.Lloop_1: 193.Lloop_1:
194 source 194 source
195 movw (%rdi),%bx 195 movw (%rdi), %bx
196 adcl %ebx,%eax 196 adcl %ebx, %eax
197 decl %ecx 197 decl %ecx
198 dest 198 dest
199 movw %bx,(%rsi) 199 movw %bx, (%rsi)
200 leaq 2(%rdi),%rdi 200 leaq 2(%rdi), %rdi
201 leaq 2(%rsi),%rsi 201 leaq 2(%rsi), %rsi
202 jnz .Lloop_1 202 jnz .Lloop_1
203 adcl %r9d,%eax /* add in carry */ 203 adcl %r9d, %eax /* add in carry */
204 204
205 /* handle last odd byte */ 205 /* handle last odd byte */
206.Lhandle_1: 206.Lhandle_1:
207 testl $1,%r10d 207 testl $1, %r10d
208 jz .Lende 208 jz .Lende
209 xorl %ebx,%ebx 209 xorl %ebx, %ebx
210 source 210 source
211 movb (%rdi),%bl 211 movb (%rdi), %bl
212 dest 212 dest
213 movb %bl,(%rsi) 213 movb %bl, (%rsi)
214 addl %ebx,%eax 214 addl %ebx, %eax
215 adcl %r9d,%eax /* carry */ 215 adcl %r9d, %eax /* carry */
216 216
217 CFI_REMEMBER_STATE 217 CFI_REMEMBER_STATE
218.Lende: 218.Lende:
219 movq 2*8(%rsp),%rbx 219 movq 2*8(%rsp), %rbx
220 CFI_RESTORE rbx 220 CFI_RESTORE rbx
221 movq 3*8(%rsp),%r12 221 movq 3*8(%rsp), %r12
222 CFI_RESTORE r12 222 CFI_RESTORE r12
223 movq 4*8(%rsp),%r14 223 movq 4*8(%rsp), %r14
224 CFI_RESTORE r14 224 CFI_RESTORE r14
225 movq 5*8(%rsp),%r13 225 movq 5*8(%rsp), %r13
226 CFI_RESTORE r13 226 CFI_RESTORE r13
227 movq 6*8(%rsp),%rbp 227 movq 6*8(%rsp), %rbp
228 CFI_RESTORE rbp 228 CFI_RESTORE rbp
229 addq $7*8,%rsp 229 addq $7*8, %rsp
230 CFI_ADJUST_CFA_OFFSET -7*8 230 CFI_ADJUST_CFA_OFFSET -7*8
231 ret 231 ret
232 CFI_RESTORE_STATE 232 CFI_RESTORE_STATE
233 233
234 /* Exception handlers. Very simple, zeroing is done in the wrappers */ 234 /* Exception handlers. Very simple, zeroing is done in the wrappers */
235.Lbad_source: 235.Lbad_source:
236 movq (%rsp),%rax 236 movq (%rsp), %rax
237 testq %rax,%rax 237 testq %rax, %rax
238 jz .Lende 238 jz .Lende
239 movl $-EFAULT,(%rax) 239 movl $-EFAULT, (%rax)
240 jmp .Lende 240 jmp .Lende
241 241
242.Lbad_dest: 242.Lbad_dest:
243 movq 8(%rsp),%rax 243 movq 8(%rsp), %rax
244 testq %rax,%rax 244 testq %rax, %rax
245 jz .Lende 245 jz .Lende
246 movl $-EFAULT,(%rax) 246 movl $-EFAULT, (%rax)
247 jmp .Lende 247 jmp .Lende
248 CFI_ENDPROC 248 CFI_ENDPROC
249ENDPROC(csum_partial_copy_generic) 249ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index bf51144d97e1..9845371c5c36 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
84 count64--; 84 count64--;
85 } 85 }
86 86
87 /* last upto 7 8byte blocks */ 87 /* last up to 7 8byte blocks */
88 count %= 8; 88 count %= 8;
89 while (count) { 89 while (count) {
90 asm("addq %1,%0\n\t" 90 asm("addq %1,%0\n\t"
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 000000000000..0ecb8433e5a8
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,197 @@
1/*
2 * Normally compiler builtins are used, but sometimes the compiler calls out
3 * of line code. Based on asm-i386/string.h.
4 *
5 * This assembly file is re-written from memmove_64.c file.
6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
7 */
8#define _STRING_C
9#include <linux/linkage.h>
10#include <asm/dwarf2.h>
11
12#undef memmove
13
14/*
15 * Implement memmove(). This can handle overlap between src and dst.
16 *
17 * Input:
18 * rdi: dest
19 * rsi: src
20 * rdx: count
21 *
22 * Output:
23 * rax: dest
24 */
25ENTRY(memmove)
26 CFI_STARTPROC
27 /* Handle more 32bytes in loop */
28 mov %rdi, %rax
29 cmp $0x20, %rdx
30 jb 1f
31
32 /* Decide forward/backward copy mode */
33 cmp %rdi, %rsi
34 jb 2f
35
36 /*
37 * movsq instruction have many startup latency
38 * so we handle small size by general register.
39 */
40 cmp $680, %rdx
41 jb 3f
42 /*
43 * movsq instruction is only good for aligned case.
44 */
45
46 cmpb %dil, %sil
47 je 4f
483:
49 sub $0x20, %rdx
50 /*
51 * We gobble 32byts forward in each loop.
52 */
535:
54 sub $0x20, %rdx
55 movq 0*8(%rsi), %r11
56 movq 1*8(%rsi), %r10
57 movq 2*8(%rsi), %r9
58 movq 3*8(%rsi), %r8
59 leaq 4*8(%rsi), %rsi
60
61 movq %r11, 0*8(%rdi)
62 movq %r10, 1*8(%rdi)
63 movq %r9, 2*8(%rdi)
64 movq %r8, 3*8(%rdi)
65 leaq 4*8(%rdi), %rdi
66 jae 5b
67 addq $0x20, %rdx
68 jmp 1f
69 /*
70 * Handle data forward by movsq.
71 */
72 .p2align 4
734:
74 movq %rdx, %rcx
75 movq -8(%rsi, %rdx), %r11
76 lea -8(%rdi, %rdx), %r10
77 shrq $3, %rcx
78 rep movsq
79 movq %r11, (%r10)
80 jmp 13f
81 /*
82 * Handle data backward by movsq.
83 */
84 .p2align 4
857:
86 movq %rdx, %rcx
87 movq (%rsi), %r11
88 movq %rdi, %r10
89 leaq -8(%rsi, %rdx), %rsi
90 leaq -8(%rdi, %rdx), %rdi
91 shrq $3, %rcx
92 std
93 rep movsq
94 cld
95 movq %r11, (%r10)
96 jmp 13f
97
98 /*
99 * Start to prepare for backward copy.
100 */
101 .p2align 4
1022:
103 cmp $680, %rdx
104 jb 6f
105 cmp %dil, %sil
106 je 7b
1076:
108 /*
109 * Calculate copy position to tail.
110 */
111 addq %rdx, %rsi
112 addq %rdx, %rdi
113 subq $0x20, %rdx
114 /*
115 * We gobble 32byts backward in each loop.
116 */
1178:
118 subq $0x20, %rdx
119 movq -1*8(%rsi), %r11
120 movq -2*8(%rsi), %r10
121 movq -3*8(%rsi), %r9
122 movq -4*8(%rsi), %r8
123 leaq -4*8(%rsi), %rsi
124
125 movq %r11, -1*8(%rdi)
126 movq %r10, -2*8(%rdi)
127 movq %r9, -3*8(%rdi)
128 movq %r8, -4*8(%rdi)
129 leaq -4*8(%rdi), %rdi
130 jae 8b
131 /*
132 * Calculate copy position to head.
133 */
134 addq $0x20, %rdx
135 subq %rdx, %rsi
136 subq %rdx, %rdi
1371:
138 cmpq $16, %rdx
139 jb 9f
140 /*
141 * Move data from 16 bytes to 31 bytes.
142 */
143 movq 0*8(%rsi), %r11
144 movq 1*8(%rsi), %r10
145 movq -2*8(%rsi, %rdx), %r9
146 movq -1*8(%rsi, %rdx), %r8
147 movq %r11, 0*8(%rdi)
148 movq %r10, 1*8(%rdi)
149 movq %r9, -2*8(%rdi, %rdx)
150 movq %r8, -1*8(%rdi, %rdx)
151 jmp 13f
152 .p2align 4
1539:
154 cmpq $8, %rdx
155 jb 10f
156 /*
157 * Move data from 8 bytes to 15 bytes.
158 */
159 movq 0*8(%rsi), %r11
160 movq -1*8(%rsi, %rdx), %r10
161 movq %r11, 0*8(%rdi)
162 movq %r10, -1*8(%rdi, %rdx)
163 jmp 13f
16410:
165 cmpq $4, %rdx
166 jb 11f
167 /*
168 * Move data from 4 bytes to 7 bytes.
169 */
170 movl (%rsi), %r11d
171 movl -4(%rsi, %rdx), %r10d
172 movl %r11d, (%rdi)
173 movl %r10d, -4(%rdi, %rdx)
174 jmp 13f
17511:
176 cmp $2, %rdx
177 jb 12f
178 /*
179 * Move data from 2 bytes to 3 bytes.
180 */
181 movw (%rsi), %r11w
182 movw -2(%rsi, %rdx), %r10w
183 movw %r11w, (%rdi)
184 movw %r10w, -2(%rdi, %rdx)
185 jmp 13f
18612:
187 cmp $1, %rdx
188 jb 13f
189 /*
190 * Move data for 1 byte.
191 */
192 movb (%rsi), %r11b
193 movb %r11b, (%rdi)
19413:
195 retq
196 CFI_ENDPROC
197ENDPROC(memmove)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 6d0f0ec41b34..000000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,192 +0,0 @@
1/* Normally compiler builtins are used, but sometimes the compiler calls out
2 of line code. Based on asm-i386/string.h.
3 */
4#define _STRING_C
5#include <linux/string.h>
6#include <linux/module.h>
7
8#undef memmove
9void *memmove(void *dest, const void *src, size_t count)
10{
11 unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
12 char *ret;
13
14 __asm__ __volatile__(
15 /* Handle more 32bytes in loop */
16 "mov %2, %3\n\t"
17 "cmp $0x20, %0\n\t"
18 "jb 1f\n\t"
19
20 /* Decide forward/backward copy mode */
21 "cmp %2, %1\n\t"
22 "jb 2f\n\t"
23
24 /*
25 * movsq instruction have many startup latency
26 * so we handle small size by general register.
27 */
28 "cmp $680, %0\n\t"
29 "jb 3f\n\t"
30 /*
31 * movsq instruction is only good for aligned case.
32 */
33 "cmpb %%dil, %%sil\n\t"
34 "je 4f\n\t"
35 "3:\n\t"
36 "sub $0x20, %0\n\t"
37 /*
38 * We gobble 32byts forward in each loop.
39 */
40 "5:\n\t"
41 "sub $0x20, %0\n\t"
42 "movq 0*8(%1), %4\n\t"
43 "movq 1*8(%1), %5\n\t"
44 "movq 2*8(%1), %6\n\t"
45 "movq 3*8(%1), %7\n\t"
46 "leaq 4*8(%1), %1\n\t"
47
48 "movq %4, 0*8(%2)\n\t"
49 "movq %5, 1*8(%2)\n\t"
50 "movq %6, 2*8(%2)\n\t"
51 "movq %7, 3*8(%2)\n\t"
52 "leaq 4*8(%2), %2\n\t"
53 "jae 5b\n\t"
54 "addq $0x20, %0\n\t"
55 "jmp 1f\n\t"
56 /*
57 * Handle data forward by movsq.
58 */
59 ".p2align 4\n\t"
60 "4:\n\t"
61 "movq %0, %8\n\t"
62 "movq -8(%1, %0), %4\n\t"
63 "lea -8(%2, %0), %5\n\t"
64 "shrq $3, %8\n\t"
65 "rep movsq\n\t"
66 "movq %4, (%5)\n\t"
67 "jmp 13f\n\t"
68 /*
69 * Handle data backward by movsq.
70 */
71 ".p2align 4\n\t"
72 "7:\n\t"
73 "movq %0, %8\n\t"
74 "movq (%1), %4\n\t"
75 "movq %2, %5\n\t"
76 "leaq -8(%1, %0), %1\n\t"
77 "leaq -8(%2, %0), %2\n\t"
78 "shrq $3, %8\n\t"
79 "std\n\t"
80 "rep movsq\n\t"
81 "cld\n\t"
82 "movq %4, (%5)\n\t"
83 "jmp 13f\n\t"
84
85 /*
86 * Start to prepare for backward copy.
87 */
88 ".p2align 4\n\t"
89 "2:\n\t"
90 "cmp $680, %0\n\t"
91 "jb 6f \n\t"
92 "cmp %%dil, %%sil\n\t"
93 "je 7b \n\t"
94 "6:\n\t"
95 /*
96 * Calculate copy position to tail.
97 */
98 "addq %0, %1\n\t"
99 "addq %0, %2\n\t"
100 "subq $0x20, %0\n\t"
101 /*
102 * We gobble 32byts backward in each loop.
103 */
104 "8:\n\t"
105 "subq $0x20, %0\n\t"
106 "movq -1*8(%1), %4\n\t"
107 "movq -2*8(%1), %5\n\t"
108 "movq -3*8(%1), %6\n\t"
109 "movq -4*8(%1), %7\n\t"
110 "leaq -4*8(%1), %1\n\t"
111
112 "movq %4, -1*8(%2)\n\t"
113 "movq %5, -2*8(%2)\n\t"
114 "movq %6, -3*8(%2)\n\t"
115 "movq %7, -4*8(%2)\n\t"
116 "leaq -4*8(%2), %2\n\t"
117 "jae 8b\n\t"
118 /*
119 * Calculate copy position to head.
120 */
121 "addq $0x20, %0\n\t"
122 "subq %0, %1\n\t"
123 "subq %0, %2\n\t"
124 "1:\n\t"
125 "cmpq $16, %0\n\t"
126 "jb 9f\n\t"
127 /*
128 * Move data from 16 bytes to 31 bytes.
129 */
130 "movq 0*8(%1), %4\n\t"
131 "movq 1*8(%1), %5\n\t"
132 "movq -2*8(%1, %0), %6\n\t"
133 "movq -1*8(%1, %0), %7\n\t"
134 "movq %4, 0*8(%2)\n\t"
135 "movq %5, 1*8(%2)\n\t"
136 "movq %6, -2*8(%2, %0)\n\t"
137 "movq %7, -1*8(%2, %0)\n\t"
138 "jmp 13f\n\t"
139 ".p2align 4\n\t"
140 "9:\n\t"
141 "cmpq $8, %0\n\t"
142 "jb 10f\n\t"
143 /*
144 * Move data from 8 bytes to 15 bytes.
145 */
146 "movq 0*8(%1), %4\n\t"
147 "movq -1*8(%1, %0), %5\n\t"
148 "movq %4, 0*8(%2)\n\t"
149 "movq %5, -1*8(%2, %0)\n\t"
150 "jmp 13f\n\t"
151 "10:\n\t"
152 "cmpq $4, %0\n\t"
153 "jb 11f\n\t"
154 /*
155 * Move data from 4 bytes to 7 bytes.
156 */
157 "movl (%1), %4d\n\t"
158 "movl -4(%1, %0), %5d\n\t"
159 "movl %4d, (%2)\n\t"
160 "movl %5d, -4(%2, %0)\n\t"
161 "jmp 13f\n\t"
162 "11:\n\t"
163 "cmp $2, %0\n\t"
164 "jb 12f\n\t"
165 /*
166 * Move data from 2 bytes to 3 bytes.
167 */
168 "movw (%1), %4w\n\t"
169 "movw -2(%1, %0), %5w\n\t"
170 "movw %4w, (%2)\n\t"
171 "movw %5w, -2(%2, %0)\n\t"
172 "jmp 13f\n\t"
173 "12:\n\t"
174 "cmp $1, %0\n\t"
175 "jb 13f\n\t"
176 /*
177 * Move data for 1 byte.
178 */
179 "movb (%1), %4b\n\t"
180 "movb %4b, (%2)\n\t"
181 "13:\n\t"
182 : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
183 "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
184 :"0" (count),
185 "1" (src),
186 "2" (dest)
187 :"memory");
188
189 return ret;
190
191}
192EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
index 41fcf00e49df..67743977398b 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem_64.S
@@ -23,43 +23,50 @@
23#include <asm/dwarf2.h> 23#include <asm/dwarf2.h>
24 24
25#define save_common_regs \ 25#define save_common_regs \
26 pushq %rdi; \ 26 pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
27 pushq %rsi; \ 27 pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
28 pushq %rcx; \ 28 pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
29 pushq %r8; \ 29 pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \
30 pushq %r9; \ 30 pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \
31 pushq %r10; \ 31 pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
32 pushq %r11 32 pushq_cfi %r11; CFI_REL_OFFSET r11, 0
33 33
34#define restore_common_regs \ 34#define restore_common_regs \
35 popq %r11; \ 35 popq_cfi %r11; CFI_RESTORE r11; \
36 popq %r10; \ 36 popq_cfi %r10; CFI_RESTORE r10; \
37 popq %r9; \ 37 popq_cfi %r9; CFI_RESTORE r9; \
38 popq %r8; \ 38 popq_cfi %r8; CFI_RESTORE r8; \
39 popq %rcx; \ 39 popq_cfi %rcx; CFI_RESTORE rcx; \
40 popq %rsi; \ 40 popq_cfi %rsi; CFI_RESTORE rsi; \
41 popq %rdi 41 popq_cfi %rdi; CFI_RESTORE rdi
42 42
43/* Fix up special calling conventions */ 43/* Fix up special calling conventions */
44ENTRY(call_rwsem_down_read_failed) 44ENTRY(call_rwsem_down_read_failed)
45 CFI_STARTPROC
45 save_common_regs 46 save_common_regs
46 pushq %rdx 47 pushq_cfi %rdx
48 CFI_REL_OFFSET rdx, 0
47 movq %rax,%rdi 49 movq %rax,%rdi
48 call rwsem_down_read_failed 50 call rwsem_down_read_failed
49 popq %rdx 51 popq_cfi %rdx
52 CFI_RESTORE rdx
50 restore_common_regs 53 restore_common_regs
51 ret 54 ret
52 ENDPROC(call_rwsem_down_read_failed) 55 CFI_ENDPROC
56ENDPROC(call_rwsem_down_read_failed)
53 57
54ENTRY(call_rwsem_down_write_failed) 58ENTRY(call_rwsem_down_write_failed)
59 CFI_STARTPROC
55 save_common_regs 60 save_common_regs
56 movq %rax,%rdi 61 movq %rax,%rdi
57 call rwsem_down_write_failed 62 call rwsem_down_write_failed
58 restore_common_regs 63 restore_common_regs
59 ret 64 ret
60 ENDPROC(call_rwsem_down_write_failed) 65 CFI_ENDPROC
66ENDPROC(call_rwsem_down_write_failed)
61 67
62ENTRY(call_rwsem_wake) 68ENTRY(call_rwsem_wake)
69 CFI_STARTPROC
63 decl %edx /* do nothing if still outstanding active readers */ 70 decl %edx /* do nothing if still outstanding active readers */
64 jnz 1f 71 jnz 1f
65 save_common_regs 72 save_common_regs
@@ -67,15 +74,20 @@ ENTRY(call_rwsem_wake)
67 call rwsem_wake 74 call rwsem_wake
68 restore_common_regs 75 restore_common_regs
691: ret 761: ret
70 ENDPROC(call_rwsem_wake) 77 CFI_ENDPROC
78ENDPROC(call_rwsem_wake)
71 79
72/* Fix up special calling conventions */ 80/* Fix up special calling conventions */
73ENTRY(call_rwsem_downgrade_wake) 81ENTRY(call_rwsem_downgrade_wake)
82 CFI_STARTPROC
74 save_common_regs 83 save_common_regs
75 pushq %rdx 84 pushq_cfi %rdx
85 CFI_REL_OFFSET rdx, 0
76 movq %rax,%rdi 86 movq %rax,%rdi
77 call rwsem_downgrade_wake 87 call rwsem_downgrade_wake
78 popq %rdx 88 popq_cfi %rdx
89 CFI_RESTORE rdx
79 restore_common_regs 90 restore_common_regs
80 ret 91 ret
81 ENDPROC(call_rwsem_downgrade_wake) 92 CFI_ENDPROC
93ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 648fe4741782..06691daa4108 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -36,7 +36,7 @@
36 */ 36 */
37#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
38ENTRY(__write_lock_failed) 38ENTRY(__write_lock_failed)
39 CFI_STARTPROC simple 39 CFI_STARTPROC
40 FRAME 40 FRAME
412: LOCK_PREFIX 412: LOCK_PREFIX
42 addl $ RW_LOCK_BIAS,(%eax) 42 addl $ RW_LOCK_BIAS,(%eax)
@@ -74,29 +74,23 @@ ENTRY(__read_lock_failed)
74/* Fix up special calling conventions */ 74/* Fix up special calling conventions */
75ENTRY(call_rwsem_down_read_failed) 75ENTRY(call_rwsem_down_read_failed)
76 CFI_STARTPROC 76 CFI_STARTPROC
77 push %ecx 77 pushl_cfi %ecx
78 CFI_ADJUST_CFA_OFFSET 4
79 CFI_REL_OFFSET ecx,0 78 CFI_REL_OFFSET ecx,0
80 push %edx 79 pushl_cfi %edx
81 CFI_ADJUST_CFA_OFFSET 4
82 CFI_REL_OFFSET edx,0 80 CFI_REL_OFFSET edx,0
83 call rwsem_down_read_failed 81 call rwsem_down_read_failed
84 pop %edx 82 popl_cfi %edx
85 CFI_ADJUST_CFA_OFFSET -4 83 popl_cfi %ecx
86 pop %ecx
87 CFI_ADJUST_CFA_OFFSET -4
88 ret 84 ret
89 CFI_ENDPROC 85 CFI_ENDPROC
90 ENDPROC(call_rwsem_down_read_failed) 86 ENDPROC(call_rwsem_down_read_failed)
91 87
92ENTRY(call_rwsem_down_write_failed) 88ENTRY(call_rwsem_down_write_failed)
93 CFI_STARTPROC 89 CFI_STARTPROC
94 push %ecx 90 pushl_cfi %ecx
95 CFI_ADJUST_CFA_OFFSET 4
96 CFI_REL_OFFSET ecx,0 91 CFI_REL_OFFSET ecx,0
97 calll rwsem_down_write_failed 92 calll rwsem_down_write_failed
98 pop %ecx 93 popl_cfi %ecx
99 CFI_ADJUST_CFA_OFFSET -4
100 ret 94 ret
101 CFI_ENDPROC 95 CFI_ENDPROC
102 ENDPROC(call_rwsem_down_write_failed) 96 ENDPROC(call_rwsem_down_write_failed)
@@ -105,12 +99,10 @@ ENTRY(call_rwsem_wake)
105 CFI_STARTPROC 99 CFI_STARTPROC
106 decw %dx /* do nothing if still outstanding active readers */ 100 decw %dx /* do nothing if still outstanding active readers */
107 jnz 1f 101 jnz 1f
108 push %ecx 102 pushl_cfi %ecx
109 CFI_ADJUST_CFA_OFFSET 4
110 CFI_REL_OFFSET ecx,0 103 CFI_REL_OFFSET ecx,0
111 call rwsem_wake 104 call rwsem_wake
112 pop %ecx 105 popl_cfi %ecx
113 CFI_ADJUST_CFA_OFFSET -4
1141: ret 1061: ret
115 CFI_ENDPROC 107 CFI_ENDPROC
116 ENDPROC(call_rwsem_wake) 108 ENDPROC(call_rwsem_wake)
@@ -118,17 +110,13 @@ ENTRY(call_rwsem_wake)
118/* Fix up special calling conventions */ 110/* Fix up special calling conventions */
119ENTRY(call_rwsem_downgrade_wake) 111ENTRY(call_rwsem_downgrade_wake)
120 CFI_STARTPROC 112 CFI_STARTPROC
121 push %ecx 113 pushl_cfi %ecx
122 CFI_ADJUST_CFA_OFFSET 4
123 CFI_REL_OFFSET ecx,0 114 CFI_REL_OFFSET ecx,0
124 push %edx 115 pushl_cfi %edx
125 CFI_ADJUST_CFA_OFFSET 4
126 CFI_REL_OFFSET edx,0 116 CFI_REL_OFFSET edx,0
127 call rwsem_downgrade_wake 117 call rwsem_downgrade_wake
128 pop %edx 118 popl_cfi %edx
129 CFI_ADJUST_CFA_OFFSET -4 119 popl_cfi %ecx
130 pop %ecx
131 CFI_ADJUST_CFA_OFFSET -4
132 ret 120 ret
133 CFI_ENDPROC 121 CFI_ENDPROC
134 ENDPROC(call_rwsem_downgrade_wake) 122 ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 650b11e00ecc..2930ae05d773 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -7,24 +7,6 @@
7 7
8 #include <linux/linkage.h> 8 #include <linux/linkage.h>
9 9
10#define ARCH_TRACE_IRQS_ON \
11 pushl %eax; \
12 pushl %ecx; \
13 pushl %edx; \
14 call trace_hardirqs_on; \
15 popl %edx; \
16 popl %ecx; \
17 popl %eax;
18
19#define ARCH_TRACE_IRQS_OFF \
20 pushl %eax; \
21 pushl %ecx; \
22 pushl %edx; \
23 call trace_hardirqs_off; \
24 popl %edx; \
25 popl %ecx; \
26 popl %eax;
27
28#ifdef CONFIG_TRACE_IRQFLAGS 10#ifdef CONFIG_TRACE_IRQFLAGS
29 /* put return address in eax (arg1) */ 11 /* put return address in eax (arg1) */
30 .macro thunk_ra name,func 12 .macro thunk_ra name,func
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index bf9a7d5a5428..782b082c9ff7 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -22,26 +22,6 @@
22 CFI_ENDPROC 22 CFI_ENDPROC
23 .endm 23 .endm
24 24
25 /* rdi: arg1 ... normal C conventions. rax is passed from C. */
26 .macro thunk_retrax name,func
27 .globl \name
28\name:
29 CFI_STARTPROC
30 SAVE_ARGS
31 call \func
32 jmp restore_norax
33 CFI_ENDPROC
34 .endm
35
36
37 .section .sched.text, "ax"
38#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
39 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
40 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
41 thunk rwsem_wake_thunk,rwsem_wake
42 thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
43#endif
44
45#ifdef CONFIG_TRACE_IRQFLAGS 25#ifdef CONFIG_TRACE_IRQFLAGS
46 /* put return address in rdi (arg1) */ 26 /* put return address in rdi (arg1) */
47 .macro thunk_ra name,func 27 .macro thunk_ra name,func
@@ -72,10 +52,3 @@ restore:
72 RESTORE_ARGS 52 RESTORE_ARGS
73 ret 53 ret
74 CFI_ENDPROC 54 CFI_ENDPROC
75
76 CFI_STARTPROC
77 SAVE_ARGS
78restore_norax:
79 RESTORE_ARGS 1
80 ret
81 CFI_ENDPROC
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 09df2f9a3d69..3e608edf9958 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
28 29
29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
30 31
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index f21962c435ed..0919c26820d4 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -26,9 +26,7 @@
26#include <asm/apic.h> 26#include <asm/apic.h>
27#include <asm/amd_nb.h> 27#include <asm/amd_nb.h>
28 28
29static struct bootnode __initdata nodes[8];
30static unsigned char __initdata nodeids[8]; 29static unsigned char __initdata nodeids[8];
31static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
32 30
33static __init int find_northbridge(void) 31static __init int find_northbridge(void)
34{ 32{
@@ -51,7 +49,7 @@ static __init int find_northbridge(void)
51 return num; 49 return num;
52 } 50 }
53 51
54 return -1; 52 return -ENOENT;
55} 53}
56 54
57static __init void early_get_boot_cpu_id(void) 55static __init void early_get_boot_cpu_id(void)
@@ -69,17 +67,18 @@ static __init void early_get_boot_cpu_id(void)
69#endif 67#endif
70} 68}
71 69
72int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) 70int __init amd_numa_init(void)
73{ 71{
74 unsigned long start = PFN_PHYS(start_pfn); 72 unsigned long start = PFN_PHYS(0);
75 unsigned long end = PFN_PHYS(end_pfn); 73 unsigned long end = PFN_PHYS(max_pfn);
76 unsigned numnodes; 74 unsigned numnodes;
77 unsigned long prevbase; 75 unsigned long prevbase;
78 int i, nb, found = 0; 76 int i, j, nb;
79 u32 nodeid, reg; 77 u32 nodeid, reg;
78 unsigned int bits, cores, apicid_base;
80 79
81 if (!early_pci_allowed()) 80 if (!early_pci_allowed())
82 return -1; 81 return -EINVAL;
83 82
84 nb = find_northbridge(); 83 nb = find_northbridge();
85 if (nb < 0) 84 if (nb < 0)
@@ -90,7 +89,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
90 reg = read_pci_config(0, nb, 0, 0x60); 89 reg = read_pci_config(0, nb, 0, 0x60);
91 numnodes = ((reg >> 4) & 0xF) + 1; 90 numnodes = ((reg >> 4) & 0xF) + 1;
92 if (numnodes <= 1) 91 if (numnodes <= 1)
93 return -1; 92 return -ENOENT;
94 93
95 pr_info("Number of physical nodes %d\n", numnodes); 94 pr_info("Number of physical nodes %d\n", numnodes);
96 95
@@ -121,9 +120,9 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
121 if ((base >> 8) & 3 || (limit >> 8) & 3) { 120 if ((base >> 8) & 3 || (limit >> 8) & 3) {
122 pr_err("Node %d using interleaving mode %lx/%lx\n", 121 pr_err("Node %d using interleaving mode %lx/%lx\n",
123 nodeid, (base >> 8) & 3, (limit >> 8) & 3); 122 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
124 return -1; 123 return -EINVAL;
125 } 124 }
126 if (node_isset(nodeid, nodes_parsed)) { 125 if (node_isset(nodeid, numa_nodes_parsed)) {
127 pr_info("Node %d already present, skipping\n", 126 pr_info("Node %d already present, skipping\n",
128 nodeid); 127 nodeid);
129 continue; 128 continue;
@@ -160,117 +159,28 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
160 if (prevbase > base) { 159 if (prevbase > base) {
161 pr_err("Node map not sorted %lx,%lx\n", 160 pr_err("Node map not sorted %lx,%lx\n",
162 prevbase, base); 161 prevbase, base);
163 return -1; 162 return -EINVAL;
164 } 163 }
165 164
166 pr_info("Node %d MemBase %016lx Limit %016lx\n", 165 pr_info("Node %d MemBase %016lx Limit %016lx\n",
167 nodeid, base, limit); 166 nodeid, base, limit);
168 167
169 found++;
170
171 nodes[nodeid].start = base;
172 nodes[nodeid].end = limit;
173
174 prevbase = base; 168 prevbase = base;
175 169 numa_add_memblk(nodeid, base, limit);
176 node_set(nodeid, nodes_parsed); 170 node_set(nodeid, numa_nodes_parsed);
177 }
178
179 if (!found)
180 return -1;
181 return 0;
182}
183
184#ifdef CONFIG_NUMA_EMU
185static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
186 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
187};
188
189void __init amd_get_nodes(struct bootnode *physnodes)
190{
191 int i;
192
193 for_each_node_mask(i, nodes_parsed) {
194 physnodes[i].start = nodes[i].start;
195 physnodes[i].end = nodes[i].end;
196 } 171 }
197}
198
199static int __init find_node_by_addr(unsigned long addr)
200{
201 int ret = NUMA_NO_NODE;
202 int i;
203
204 for (i = 0; i < 8; i++)
205 if (addr >= nodes[i].start && addr < nodes[i].end) {
206 ret = i;
207 break;
208 }
209 return ret;
210}
211 172
212/* 173 if (!nodes_weight(numa_nodes_parsed))
213 * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be 174 return -ENOENT;
214 * setup to represent the physical topology but reflect the emulated
215 * environment. For each emulated node, the real node which it appears on is
216 * found and a fake pxm to nid mapping is created which mirrors the actual
217 * locality. node_distance() then represents the correct distances between
218 * emulated nodes by using the fake acpi mappings to pxms.
219 */
220void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
221{
222 unsigned int bits;
223 unsigned int cores;
224 unsigned int apicid_base = 0;
225 int i;
226 175
176 /*
177 * We seem to have valid NUMA configuration. Map apicids to nodes
178 * using the coreid bits from early_identify_cpu.
179 */
227 bits = boot_cpu_data.x86_coreid_bits; 180 bits = boot_cpu_data.x86_coreid_bits;
228 cores = 1 << bits; 181 cores = 1 << bits;
229 early_get_boot_cpu_id();
230 if (boot_cpu_physical_apicid > 0)
231 apicid_base = boot_cpu_physical_apicid;
232
233 for (i = 0; i < nr_nodes; i++) {
234 int index;
235 int nid;
236 int j;
237
238 nid = find_node_by_addr(nodes[i].start);
239 if (nid == NUMA_NO_NODE)
240 continue;
241
242 index = nodeids[nid] << bits;
243 if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
244 for (j = apicid_base; j < cores + apicid_base; j++)
245 fake_apicid_to_node[index + j] = i;
246#ifdef CONFIG_ACPI_NUMA
247 __acpi_map_pxm_to_node(nid, i);
248#endif
249 }
250 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
251}
252#endif /* CONFIG_NUMA_EMU */
253
254int __init amd_scan_nodes(void)
255{
256 unsigned int bits;
257 unsigned int cores;
258 unsigned int apicid_base;
259 int i;
260
261 BUG_ON(nodes_empty(nodes_parsed));
262 node_possible_map = nodes_parsed;
263 memnode_shift = compute_hash_shift(nodes, 8, NULL);
264 if (memnode_shift < 0) {
265 pr_err("No NUMA node hash function found. Contact maintainer\n");
266 return -1;
267 }
268 pr_info("Using node hash shift of %d\n", memnode_shift);
269
270 /* use the coreid bits from early_identify_cpu */
271 bits = boot_cpu_data.x86_coreid_bits;
272 cores = (1<<bits);
273 apicid_base = 0; 182 apicid_base = 0;
183
274 /* get the APIC ID of the BSP early for systems with apicid lifting */ 184 /* get the APIC ID of the BSP early for systems with apicid lifting */
275 early_get_boot_cpu_id(); 185 early_get_boot_cpu_id();
276 if (boot_cpu_physical_apicid > 0) { 186 if (boot_cpu_physical_apicid > 0) {
@@ -278,17 +188,9 @@ int __init amd_scan_nodes(void)
278 apicid_base = boot_cpu_physical_apicid; 188 apicid_base = boot_cpu_physical_apicid;
279 } 189 }
280 190
281 for_each_node_mask(i, node_possible_map) { 191 for_each_node_mask(i, numa_nodes_parsed)
282 int j;
283
284 memblock_x86_register_active_regions(i,
285 nodes[i].start >> PAGE_SHIFT,
286 nodes[i].end >> PAGE_SHIFT);
287 for (j = apicid_base; j < cores + apicid_base; j++) 192 for (j = apicid_base; j < cores + apicid_base; j++)
288 apicid_to_node[(i << bits) + j] = i; 193 set_apicid_to_node((i << bits) + j, i);
289 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
290 }
291 194
292 numa_init_array();
293 return 0; 195 return 0;
294} 196}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 069ce7c37c01..d4203988504a 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -326,7 +326,7 @@ try_again:
326 if (mm->free_area_cache < len) 326 if (mm->free_area_cache < len)
327 goto fail; 327 goto fail;
328 328
329 /* either no address requested or cant fit in requested address hole */ 329 /* either no address requested or can't fit in requested address hole */
330 addr = (mm->free_area_cache - len) & huge_page_mask(h); 330 addr = (mm->free_area_cache - len) & huge_page_mask(h);
331 do { 331 do {
332 /* 332 /*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 947f42abe820..286d289b039b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,9 +18,9 @@
18 18
19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
20 20
21unsigned long __initdata e820_table_start; 21unsigned long __initdata pgt_buf_start;
22unsigned long __meminitdata e820_table_end; 22unsigned long __meminitdata pgt_buf_end;
23unsigned long __meminitdata e820_table_top; 23unsigned long __meminitdata pgt_buf_top;
24 24
25int after_bootmem; 25int after_bootmem;
26 26
@@ -33,7 +33,7 @@ int direct_gbpages
33static void __init find_early_table_space(unsigned long end, int use_pse, 33static void __init find_early_table_space(unsigned long end, int use_pse,
34 int use_gbpages) 34 int use_gbpages)
35{ 35{
36 unsigned long puds, pmds, ptes, tables, start; 36 unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
37 phys_addr_t base; 37 phys_addr_t base;
38 38
39 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 39 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
@@ -65,29 +65,20 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
65#ifdef CONFIG_X86_32 65#ifdef CONFIG_X86_32
66 /* for fixmap */ 66 /* for fixmap */
67 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 67 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
68#endif
69 68
70 /* 69 good_end = max_pfn_mapped << PAGE_SHIFT;
71 * RED-PEN putting page tables only on node 0 could
72 * cause a hotspot and fill up ZONE_DMA. The page tables
73 * need roughly 0.5KB per GB.
74 */
75#ifdef CONFIG_X86_32
76 start = 0x7000;
77#else
78 start = 0x8000;
79#endif 70#endif
80 base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, 71
81 tables, PAGE_SIZE); 72 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
82 if (base == MEMBLOCK_ERROR) 73 if (base == MEMBLOCK_ERROR)
83 panic("Cannot find space for the kernel page tables"); 74 panic("Cannot find space for the kernel page tables");
84 75
85 e820_table_start = base >> PAGE_SHIFT; 76 pgt_buf_start = base >> PAGE_SHIFT;
86 e820_table_end = e820_table_start; 77 pgt_buf_end = pgt_buf_start;
87 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); 78 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
88 79
89 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", 80 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
90 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); 81 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
91} 82}
92 83
93struct map_range { 84struct map_range {
@@ -279,30 +270,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
279 load_cr3(swapper_pg_dir); 270 load_cr3(swapper_pg_dir);
280#endif 271#endif
281 272
282#ifdef CONFIG_X86_64
283 if (!after_bootmem && !start) {
284 pud_t *pud;
285 pmd_t *pmd;
286
287 mmu_cr4_features = read_cr4();
288
289 /*
290 * _brk_end cannot change anymore, but it and _end may be
291 * located on different 2M pages. cleanup_highmap(), however,
292 * can only consider _end when it runs, so destroy any
293 * mappings beyond _brk_end here.
294 */
295 pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
296 pmd = pmd_offset(pud, _brk_end - 1);
297 while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
298 pmd_clear(pmd);
299 }
300#endif
301 __flush_tlb_all(); 273 __flush_tlb_all();
302 274
303 if (!after_bootmem && e820_table_end > e820_table_start) 275 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
304 memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, 276 memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
305 e820_table_end << PAGE_SHIFT, "PGTABLE"); 277 pgt_buf_end << PAGE_SHIFT, "PGTABLE");
306 278
307 if (!after_bootmem) 279 if (!after_bootmem)
308 early_memtest(start, end); 280 early_memtest(start, end);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c821074b7f0b..80088f994193 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false;
62 62
63static __init void *alloc_low_page(void) 63static __init void *alloc_low_page(void)
64{ 64{
65 unsigned long pfn = e820_table_end++; 65 unsigned long pfn = pgt_buf_end++;
66 void *adr; 66 void *adr;
67 67
68 if (pfn >= e820_table_top) 68 if (pfn >= pgt_buf_top)
69 panic("alloc_low_page: ran out of memory"); 69 panic("alloc_low_page: ran out of memory");
70 70
71 adr = __va(pfn * PAGE_SIZE); 71 adr = __va(pfn * PAGE_SIZE);
@@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
166 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start 166 && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
167 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { 167 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
168 pte_t *newpte; 168 pte_t *newpte;
169 int i; 169 int i;
170 170
@@ -644,8 +644,7 @@ void __init find_low_pfn_range(void)
644} 644}
645 645
646#ifndef CONFIG_NEED_MULTIPLE_NODES 646#ifndef CONFIG_NEED_MULTIPLE_NODES
647void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 647void __init initmem_init(void)
648 int acpi, int k8)
649{ 648{
650#ifdef CONFIG_HIGHMEM 649#ifdef CONFIG_HIGHMEM
651 highstart_pfn = highend_pfn = max_pfn; 650 highstart_pfn = highend_pfn = max_pfn;
@@ -918,7 +917,7 @@ static void mark_nxdata_nx(void)
918{ 917{
919 /* 918 /*
920 * When this called, init has already been executed and released, 919 * When this called, init has already been executed and released,
921 * so everything past _etext sould be NX. 920 * so everything past _etext should be NX.
922 */ 921 */
923 unsigned long start = PFN_ALIGN(_etext); 922 unsigned long start = PFN_ALIGN(_etext);
924 /* 923 /*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index c14a5422e152..2362b646178e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -51,6 +51,8 @@
51#include <asm/numa.h> 51#include <asm/numa.h>
52#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
53#include <asm/init.h> 53#include <asm/init.h>
54#include <asm/uv/uv.h>
55#include <asm/setup.h>
54 56
55static int __init parse_direct_gbpages_off(char *arg) 57static int __init parse_direct_gbpages_off(char *arg)
56{ 58{
@@ -293,18 +295,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
293 * to the compile time generated pmds. This results in invalid pmds up 295 * to the compile time generated pmds. This results in invalid pmds up
294 * to the point where we hit the physaddr 0 mapping. 296 * to the point where we hit the physaddr 0 mapping.
295 * 297 *
296 * We limit the mappings to the region from _text to _end. _end is 298 * We limit the mappings to the region from _text to _brk_end. _brk_end
297 * rounded up to the 2MB boundary. This catches the invalid pmds as 299 * is rounded up to the 2MB boundary. This catches the invalid pmds as
298 * well, as they are located before _text: 300 * well, as they are located before _text:
299 */ 301 */
300void __init cleanup_highmap(void) 302void __init cleanup_highmap(void)
301{ 303{
302 unsigned long vaddr = __START_KERNEL_map; 304 unsigned long vaddr = __START_KERNEL_map;
303 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; 305 unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
306 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
304 pmd_t *pmd = level2_kernel_pgt; 307 pmd_t *pmd = level2_kernel_pgt;
305 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
306 308
307 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { 309 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
308 if (pmd_none(*pmd)) 310 if (pmd_none(*pmd))
309 continue; 311 continue;
310 if (vaddr < (unsigned long) _text || vaddr > end) 312 if (vaddr < (unsigned long) _text || vaddr > end)
@@ -314,7 +316,7 @@ void __init cleanup_highmap(void)
314 316
315static __ref void *alloc_low_page(unsigned long *phys) 317static __ref void *alloc_low_page(unsigned long *phys)
316{ 318{
317 unsigned long pfn = e820_table_end++; 319 unsigned long pfn = pgt_buf_end++;
318 void *adr; 320 void *adr;
319 321
320 if (after_bootmem) { 322 if (after_bootmem) {
@@ -324,7 +326,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
324 return adr; 326 return adr;
325 } 327 }
326 328
327 if (pfn >= e820_table_top) 329 if (pfn >= pgt_buf_top)
328 panic("alloc_low_page: ran out of memory"); 330 panic("alloc_low_page: ran out of memory");
329 331
330 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 332 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -333,12 +335,28 @@ static __ref void *alloc_low_page(unsigned long *phys)
333 return adr; 335 return adr;
334} 336}
335 337
338static __ref void *map_low_page(void *virt)
339{
340 void *adr;
341 unsigned long phys, left;
342
343 if (after_bootmem)
344 return virt;
345
346 phys = __pa(virt);
347 left = phys & (PAGE_SIZE - 1);
348 adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
349 adr = (void *)(((unsigned long)adr) | left);
350
351 return adr;
352}
353
336static __ref void unmap_low_page(void *adr) 354static __ref void unmap_low_page(void *adr)
337{ 355{
338 if (after_bootmem) 356 if (after_bootmem)
339 return; 357 return;
340 358
341 early_iounmap(adr, PAGE_SIZE); 359 early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
342} 360}
343 361
344static unsigned long __meminit 362static unsigned long __meminit
@@ -386,15 +404,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
386} 404}
387 405
388static unsigned long __meminit 406static unsigned long __meminit
389phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
390 pgprot_t prot)
391{
392 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
393
394 return phys_pte_init(pte, address, end, prot);
395}
396
397static unsigned long __meminit
398phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 407phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
399 unsigned long page_size_mask, pgprot_t prot) 408 unsigned long page_size_mask, pgprot_t prot)
400{ 409{
@@ -420,8 +429,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
420 if (pmd_val(*pmd)) { 429 if (pmd_val(*pmd)) {
421 if (!pmd_large(*pmd)) { 430 if (!pmd_large(*pmd)) {
422 spin_lock(&init_mm.page_table_lock); 431 spin_lock(&init_mm.page_table_lock);
423 last_map_addr = phys_pte_update(pmd, address, 432 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
433 last_map_addr = phys_pte_init(pte, address,
424 end, prot); 434 end, prot);
435 unmap_low_page(pte);
425 spin_unlock(&init_mm.page_table_lock); 436 spin_unlock(&init_mm.page_table_lock);
426 continue; 437 continue;
427 } 438 }
@@ -468,18 +479,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
468} 479}
469 480
470static unsigned long __meminit 481static unsigned long __meminit
471phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
472 unsigned long page_size_mask, pgprot_t prot)
473{
474 pmd_t *pmd = pmd_offset(pud, 0);
475 unsigned long last_map_addr;
476
477 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
478 __flush_tlb_all();
479 return last_map_addr;
480}
481
482static unsigned long __meminit
483phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, 482phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
484 unsigned long page_size_mask) 483 unsigned long page_size_mask)
485{ 484{
@@ -504,8 +503,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
504 503
505 if (pud_val(*pud)) { 504 if (pud_val(*pud)) {
506 if (!pud_large(*pud)) { 505 if (!pud_large(*pud)) {
507 last_map_addr = phys_pmd_update(pud, addr, end, 506 pmd = map_low_page(pmd_offset(pud, 0));
507 last_map_addr = phys_pmd_init(pmd, addr, end,
508 page_size_mask, prot); 508 page_size_mask, prot);
509 unmap_low_page(pmd);
510 __flush_tlb_all();
509 continue; 511 continue;
510 } 512 }
511 /* 513 /*
@@ -553,17 +555,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
553 return last_map_addr; 555 return last_map_addr;
554} 556}
555 557
556static unsigned long __meminit
557phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
558 unsigned long page_size_mask)
559{
560 pud_t *pud;
561
562 pud = (pud_t *)pgd_page_vaddr(*pgd);
563
564 return phys_pud_init(pud, addr, end, page_size_mask);
565}
566
567unsigned long __meminit 558unsigned long __meminit
568kernel_physical_mapping_init(unsigned long start, 559kernel_physical_mapping_init(unsigned long start,
569 unsigned long end, 560 unsigned long end,
@@ -587,8 +578,10 @@ kernel_physical_mapping_init(unsigned long start,
587 next = end; 578 next = end;
588 579
589 if (pgd_val(*pgd)) { 580 if (pgd_val(*pgd)) {
590 last_map_addr = phys_pud_update(pgd, __pa(start), 581 pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
582 last_map_addr = phys_pud_init(pud, __pa(start),
591 __pa(end), page_size_mask); 583 __pa(end), page_size_mask);
584 unmap_low_page(pud);
592 continue; 585 continue;
593 } 586 }
594 587
@@ -612,10 +605,9 @@ kernel_physical_mapping_init(unsigned long start,
612} 605}
613 606
614#ifndef CONFIG_NUMA 607#ifndef CONFIG_NUMA
615void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 608void __init initmem_init(void)
616 int acpi, int k8)
617{ 609{
618 memblock_x86_register_active_regions(0, start_pfn, end_pfn); 610 memblock_x86_register_active_regions(0, 0, max_pfn);
619} 611}
620#endif 612#endif
621 613
@@ -908,6 +900,19 @@ const char *arch_vma_name(struct vm_area_struct *vma)
908 return NULL; 900 return NULL;
909} 901}
910 902
903#ifdef CONFIG_X86_UV
904#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS)
905
906unsigned long memory_block_size_bytes(void)
907{
908 if (is_uv_system()) {
909 printk(KERN_INFO "UV: memory block size 2GB\n");
910 return 2UL * 1024 * 1024 * 1024;
911 }
912 return MIN_MEMORY_BLOCK_SIZE;
913}
914#endif
915
911#ifdef CONFIG_SPARSEMEM_VMEMMAP 916#ifdef CONFIG_SPARSEMEM_VMEMMAP
912/* 917/*
913 * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 918 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ebf6d7887a38..9559d360fde7 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -26,12 +26,50 @@ static __init int numa_setup(char *opt)
26early_param("numa", numa_setup); 26early_param("numa", numa_setup);
27 27
28/* 28/*
29 * Which logical CPUs are on which nodes 29 * apicid, cpu, node mappings
30 */ 30 */
31s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
32 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
33};
34
31cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 35cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
32EXPORT_SYMBOL(node_to_cpumask_map); 36EXPORT_SYMBOL(node_to_cpumask_map);
33 37
34/* 38/*
39 * Map cpu index to node index
40 */
41DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
42EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44void __cpuinit numa_set_node(int cpu, int node)
45{
46 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
47
48 /* early setting, no percpu area yet */
49 if (cpu_to_node_map) {
50 cpu_to_node_map[cpu] = node;
51 return;
52 }
53
54#ifdef CONFIG_DEBUG_PER_CPU_MAPS
55 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
56 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
57 dump_stack();
58 return;
59 }
60#endif
61 per_cpu(x86_cpu_to_node_map, cpu) = node;
62
63 if (node != NUMA_NO_NODE)
64 set_cpu_numa_node(cpu, node);
65}
66
67void __cpuinit numa_clear_node(int cpu)
68{
69 numa_set_node(cpu, NUMA_NO_NODE);
70}
71
72/*
35 * Allocate node_to_cpumask_map based on number of available nodes 73 * Allocate node_to_cpumask_map based on number of available nodes
36 * Requires node_possible_map to be valid. 74 * Requires node_possible_map to be valid.
37 * 75 *
@@ -57,7 +95,174 @@ void __init setup_node_to_cpumask_map(void)
57 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 95 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
58} 96}
59 97
60#ifdef CONFIG_DEBUG_PER_CPU_MAPS 98/*
99 * There are unfortunately some poorly designed mainboards around that
100 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
101 * mapping. To avoid this fill in the mapping for all possible CPUs,
102 * as the number of CPUs is not known yet. We round robin the existing
103 * nodes.
104 */
105void __init numa_init_array(void)
106{
107 int rr, i;
108
109 rr = first_node(node_online_map);
110 for (i = 0; i < nr_cpu_ids; i++) {
111 if (early_cpu_to_node(i) != NUMA_NO_NODE)
112 continue;
113 numa_set_node(i, rr);
114 rr = next_node(rr, node_online_map);
115 if (rr == MAX_NUMNODES)
116 rr = first_node(node_online_map);
117 }
118}
119
120static __init int find_near_online_node(int node)
121{
122 int n, val;
123 int min_val = INT_MAX;
124 int best_node = -1;
125
126 for_each_online_node(n) {
127 val = node_distance(node, n);
128
129 if (val < min_val) {
130 min_val = val;
131 best_node = n;
132 }
133 }
134
135 return best_node;
136}
137
138/*
139 * Setup early cpu_to_node.
140 *
141 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
142 * and apicid_to_node[] tables have valid entries for a CPU.
143 * This means we skip cpu_to_node[] initialisation for NUMA
144 * emulation and faking node case (when running a kernel compiled
145 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
146 * is already initialized in a round robin manner at numa_init_array,
147 * prior to this call, and this initialization is good enough
148 * for the fake NUMA cases.
149 *
150 * Called before the per_cpu areas are setup.
151 */
152void __init init_cpu_to_node(void)
153{
154 int cpu;
155 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
156
157 BUG_ON(cpu_to_apicid == NULL);
158
159 for_each_possible_cpu(cpu) {
160 int node = numa_cpu_node(cpu);
161
162 if (node == NUMA_NO_NODE)
163 continue;
164 if (!node_online(node))
165 node = find_near_online_node(node);
166 numa_set_node(cpu, node);
167 }
168}
169
170#ifndef CONFIG_DEBUG_PER_CPU_MAPS
171
172# ifndef CONFIG_NUMA_EMU
173void __cpuinit numa_add_cpu(int cpu)
174{
175 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
176}
177
178void __cpuinit numa_remove_cpu(int cpu)
179{
180 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
181}
182# endif /* !CONFIG_NUMA_EMU */
183
184#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
185
186int __cpu_to_node(int cpu)
187{
188 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
189 printk(KERN_WARNING
190 "cpu_to_node(%d): usage too early!\n", cpu);
191 dump_stack();
192 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
193 }
194 return per_cpu(x86_cpu_to_node_map, cpu);
195}
196EXPORT_SYMBOL(__cpu_to_node);
197
198/*
199 * Same function as cpu_to_node() but used if called before the
200 * per_cpu areas are setup.
201 */
202int early_cpu_to_node(int cpu)
203{
204 if (early_per_cpu_ptr(x86_cpu_to_node_map))
205 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
206
207 if (!cpu_possible(cpu)) {
208 printk(KERN_WARNING
209 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
210 dump_stack();
211 return NUMA_NO_NODE;
212 }
213 return per_cpu(x86_cpu_to_node_map, cpu);
214}
215
216struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
217{
218 int node = early_cpu_to_node(cpu);
219 struct cpumask *mask;
220 char buf[64];
221
222 if (node == NUMA_NO_NODE) {
223 /* early_cpu_to_node() already emits a warning and trace */
224 return NULL;
225 }
226 mask = node_to_cpumask_map[node];
227 if (!mask) {
228 pr_err("node_to_cpumask_map[%i] NULL\n", node);
229 dump_stack();
230 return NULL;
231 }
232
233 cpulist_scnprintf(buf, sizeof(buf), mask);
234 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
235 enable ? "numa_add_cpu" : "numa_remove_cpu",
236 cpu, node, buf);
237 return mask;
238}
239
240# ifndef CONFIG_NUMA_EMU
241static void __cpuinit numa_set_cpumask(int cpu, int enable)
242{
243 struct cpumask *mask;
244
245 mask = debug_cpumask_set_cpu(cpu, enable);
246 if (!mask)
247 return;
248
249 if (enable)
250 cpumask_set_cpu(cpu, mask);
251 else
252 cpumask_clear_cpu(cpu, mask);
253}
254
255void __cpuinit numa_add_cpu(int cpu)
256{
257 numa_set_cpumask(cpu, 1);
258}
259
260void __cpuinit numa_remove_cpu(int cpu)
261{
262 numa_set_cpumask(cpu, 0);
263}
264# endif /* !CONFIG_NUMA_EMU */
265
61/* 266/*
62 * Returns a pointer to the bitmask of CPUs on Node 'node'. 267 * Returns a pointer to the bitmask of CPUs on Node 'node'.
63 */ 268 */
@@ -80,4 +285,5 @@ const struct cpumask *cpumask_of_node(int node)
80 return node_to_cpumask_map[node]; 285 return node_to_cpumask_map[node];
81} 286}
82EXPORT_SYMBOL(cpumask_of_node); 287EXPORT_SYMBOL(cpumask_of_node);
83#endif 288
289#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 84a3e4c9f277..bde3906420df 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
110 110
111static unsigned long kva_start_pfn; 111static unsigned long kva_start_pfn;
112static unsigned long kva_pages; 112static unsigned long kva_pages;
113
114int __cpuinit numa_cpu_node(int cpu)
115{
116 return apic->x86_32_numa_cpu_node(cpu);
117}
118
113/* 119/*
114 * FLAT - support for basic PC memory model with discontig enabled, essentially 120 * FLAT - support for basic PC memory model with discontig enabled, essentially
115 * a single node with all available processors in it with a flat 121 * a single node with all available processors in it with a flat
@@ -346,8 +352,7 @@ static void init_remap_allocator(int nid)
346 (ulong) node_remap_end_vaddr[nid]); 352 (ulong) node_remap_end_vaddr[nid]);
347} 353}
348 354
349void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 355void __init initmem_init(void)
350 int acpi, int k8)
351{ 356{
352 int nid; 357 int nid;
353 long kva_target_pfn; 358 long kva_target_pfn;
@@ -361,6 +366,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
361 */ 366 */
362 367
363 get_memcfg_numa(); 368 get_memcfg_numa();
369 numa_init_array();
364 370
365 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); 371 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
366 372
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 1337c51b07d7..e8c00cc72033 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -13,31 +13,30 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/nodemask.h> 14#include <linux/nodemask.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/acpi.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/proto.h> 19#include <asm/proto.h>
19#include <asm/dma.h> 20#include <asm/dma.h>
20#include <asm/numa.h>
21#include <asm/acpi.h> 21#include <asm/acpi.h>
22#include <asm/amd_nb.h> 22#include <asm/amd_nb.h>
23 23
24#include "numa_internal.h"
25
24struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
25EXPORT_SYMBOL(node_data); 27EXPORT_SYMBOL(node_data);
26 28
27struct memnode memnode; 29nodemask_t numa_nodes_parsed __initdata;
28 30
29s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 31struct memnode memnode;
30 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
31};
32 32
33static unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
35 35
36/* 36static struct numa_meminfo numa_meminfo __initdata;
37 * Map cpu index to node index 37
38 */ 38static int numa_distance_cnt;
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 39static u8 *numa_distance;
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
41 40
42/* 41/*
43 * Given a shift value, try to populate memnodemap[] 42 * Given a shift value, try to populate memnodemap[]
@@ -46,16 +45,15 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
46 * 0 if memnodmap[] too small (of shift too small) 45 * 0 if memnodmap[] too small (of shift too small)
47 * -1 if node overlap or lost ram (shift too big) 46 * -1 if node overlap or lost ram (shift too big)
48 */ 47 */
49static int __init populate_memnodemap(const struct bootnode *nodes, 48static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
50 int numnodes, int shift, int *nodeids)
51{ 49{
52 unsigned long addr, end; 50 unsigned long addr, end;
53 int i, res = -1; 51 int i, res = -1;
54 52
55 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); 53 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
56 for (i = 0; i < numnodes; i++) { 54 for (i = 0; i < mi->nr_blks; i++) {
57 addr = nodes[i].start; 55 addr = mi->blk[i].start;
58 end = nodes[i].end; 56 end = mi->blk[i].end;
59 if (addr >= end) 57 if (addr >= end)
60 continue; 58 continue;
61 if ((end >> shift) >= memnodemapsize) 59 if ((end >> shift) >= memnodemapsize)
@@ -63,12 +61,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
63 do { 61 do {
64 if (memnodemap[addr >> shift] != NUMA_NO_NODE) 62 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
65 return -1; 63 return -1;
66 64 memnodemap[addr >> shift] = mi->blk[i].nid;
67 if (!nodeids)
68 memnodemap[addr >> shift] = i;
69 else
70 memnodemap[addr >> shift] = nodeids[i];
71
72 addr += (1UL << shift); 65 addr += (1UL << shift);
73 } while (addr < end); 66 } while (addr < end);
74 res = 1; 67 res = 1;
@@ -86,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
86 79
87 addr = 0x8000; 80 addr = 0x8000;
88 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 81 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
89 nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, 82 nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
90 nodemap_size, L1_CACHE_BYTES); 83 nodemap_size, L1_CACHE_BYTES);
91 if (nodemap_addr == MEMBLOCK_ERROR) { 84 if (nodemap_addr == MEMBLOCK_ERROR) {
92 printk(KERN_ERR 85 printk(KERN_ERR
@@ -106,16 +99,15 @@ static int __init allocate_cachealigned_memnodemap(void)
106 * The LSB of all start and end addresses in the node map is the value of the 99 * The LSB of all start and end addresses in the node map is the value of the
107 * maximum possible shift. 100 * maximum possible shift.
108 */ 101 */
109static int __init extract_lsb_from_nodes(const struct bootnode *nodes, 102static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
110 int numnodes)
111{ 103{
112 int i, nodes_used = 0; 104 int i, nodes_used = 0;
113 unsigned long start, end; 105 unsigned long start, end;
114 unsigned long bitfield = 0, memtop = 0; 106 unsigned long bitfield = 0, memtop = 0;
115 107
116 for (i = 0; i < numnodes; i++) { 108 for (i = 0; i < mi->nr_blks; i++) {
117 start = nodes[i].start; 109 start = mi->blk[i].start;
118 end = nodes[i].end; 110 end = mi->blk[i].end;
119 if (start >= end) 111 if (start >= end)
120 continue; 112 continue;
121 bitfield |= start; 113 bitfield |= start;
@@ -131,18 +123,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
131 return i; 123 return i;
132} 124}
133 125
134int __init compute_hash_shift(struct bootnode *nodes, int numnodes, 126static int __init compute_hash_shift(const struct numa_meminfo *mi)
135 int *nodeids)
136{ 127{
137 int shift; 128 int shift;
138 129
139 shift = extract_lsb_from_nodes(nodes, numnodes); 130 shift = extract_lsb_from_nodes(mi);
140 if (allocate_cachealigned_memnodemap()) 131 if (allocate_cachealigned_memnodemap())
141 return -1; 132 return -1;
142 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", 133 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
143 shift); 134 shift);
144 135
145 if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { 136 if (populate_memnodemap(mi, shift) != 1) {
146 printk(KERN_INFO "Your memory is not aligned you need to " 137 printk(KERN_INFO "Your memory is not aligned you need to "
147 "rebuild your kernel with a bigger NODEMAPSIZE " 138 "rebuild your kernel with a bigger NODEMAPSIZE "
148 "shift=%d\n", shift); 139 "shift=%d\n", shift);
@@ -188,6 +179,63 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
188 return NULL; 179 return NULL;
189} 180}
190 181
182static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
183 struct numa_meminfo *mi)
184{
185 /* ignore zero length blks */
186 if (start == end)
187 return 0;
188
189 /* whine about and ignore invalid blks */
190 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
191 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
192 nid, start, end);
193 return 0;
194 }
195
196 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
197 pr_err("NUMA: too many memblk ranges\n");
198 return -EINVAL;
199 }
200
201 mi->blk[mi->nr_blks].start = start;
202 mi->blk[mi->nr_blks].end = end;
203 mi->blk[mi->nr_blks].nid = nid;
204 mi->nr_blks++;
205 return 0;
206}
207
208/**
209 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
210 * @idx: Index of memblk to remove
211 * @mi: numa_meminfo to remove memblk from
212 *
213 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
214 * decrementing @mi->nr_blks.
215 */
216void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
217{
218 mi->nr_blks--;
219 memmove(&mi->blk[idx], &mi->blk[idx + 1],
220 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
221}
222
223/**
224 * numa_add_memblk - Add one numa_memblk to numa_meminfo
225 * @nid: NUMA node ID of the new memblk
226 * @start: Start address of the new memblk
227 * @end: End address of the new memblk
228 *
229 * Add a new memblk to the default numa_meminfo.
230 *
231 * RETURNS:
232 * 0 on success, -errno on failure.
233 */
234int __init numa_add_memblk(int nid, u64 start, u64 end)
235{
236 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
237}
238
191/* Initialize bootmem allocator for a node */ 239/* Initialize bootmem allocator for a node */
192void __init 240void __init
193setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 241setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
@@ -234,692 +282,386 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
234 node_set_online(nodeid); 282 node_set_online(nodeid);
235} 283}
236 284
237/* 285/**
238 * There are unfortunately some poorly designed mainboards around that 286 * numa_cleanup_meminfo - Cleanup a numa_meminfo
239 * only connect memory to a single CPU. This breaks the 1:1 cpu->node 287 * @mi: numa_meminfo to clean up
240 * mapping. To avoid this fill in the mapping for all possible CPUs, 288 *
241 * as the number of CPUs is not known yet. We round robin the existing 289 * Sanitize @mi by merging and removing unncessary memblks. Also check for
242 * nodes. 290 * conflicts and clear unused memblks.
291 *
292 * RETURNS:
293 * 0 on success, -errno on failure.
243 */ 294 */
244void __init numa_init_array(void) 295int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
245{ 296{
246 int rr, i; 297 const u64 low = 0;
298 const u64 high = (u64)max_pfn << PAGE_SHIFT;
299 int i, j, k;
247 300
248 rr = first_node(node_online_map); 301 for (i = 0; i < mi->nr_blks; i++) {
249 for (i = 0; i < nr_cpu_ids; i++) { 302 struct numa_memblk *bi = &mi->blk[i];
250 if (early_cpu_to_node(i) != NUMA_NO_NODE)
251 continue;
252 numa_set_node(i, rr);
253 rr = next_node(rr, node_online_map);
254 if (rr == MAX_NUMNODES)
255 rr = first_node(node_online_map);
256 }
257}
258
259#ifdef CONFIG_NUMA_EMU
260/* Numa emulation */
261static struct bootnode nodes[MAX_NUMNODES] __initdata;
262static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
263static char *cmdline __initdata;
264 303
265void __init numa_emu_cmdline(char *str) 304 /* make sure all blocks are inside the limits */
266{ 305 bi->start = max(bi->start, low);
267 cmdline = str; 306 bi->end = min(bi->end, high);
268}
269 307
270static int __init setup_physnodes(unsigned long start, unsigned long end, 308 /* and there's no empty block */
271 int acpi, int amd) 309 if (bi->start == bi->end) {
272{ 310 numa_remove_memblk_from(i--, mi);
273 int ret = 0;
274 int i;
275
276 memset(physnodes, 0, sizeof(physnodes));
277#ifdef CONFIG_ACPI_NUMA
278 if (acpi)
279 acpi_get_nodes(physnodes, start, end);
280#endif
281#ifdef CONFIG_AMD_NUMA
282 if (amd)
283 amd_get_nodes(physnodes);
284#endif
285 /*
286 * Basic sanity checking on the physical node map: there may be errors
287 * if the SRAT or AMD code incorrectly reported the topology or the mem=
288 * kernel parameter is used.
289 */
290 for (i = 0; i < MAX_NUMNODES; i++) {
291 if (physnodes[i].start == physnodes[i].end)
292 continue;
293 if (physnodes[i].start > end) {
294 physnodes[i].end = physnodes[i].start;
295 continue;
296 }
297 if (physnodes[i].end < start) {
298 physnodes[i].start = physnodes[i].end;
299 continue; 311 continue;
300 } 312 }
301 if (physnodes[i].start < start)
302 physnodes[i].start = start;
303 if (physnodes[i].end > end)
304 physnodes[i].end = end;
305 ret++;
306 }
307 313
308 /* 314 for (j = i + 1; j < mi->nr_blks; j++) {
309 * If no physical topology was detected, a single node is faked to cover 315 struct numa_memblk *bj = &mi->blk[j];
310 * the entire address space. 316 unsigned long start, end;
311 */
312 if (!ret) {
313 physnodes[ret].start = start;
314 physnodes[ret].end = end;
315 ret = 1;
316 }
317 return ret;
318}
319
320static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
321{
322 int i;
323
324 BUG_ON(acpi && amd);
325#ifdef CONFIG_ACPI_NUMA
326 if (acpi)
327 acpi_fake_nodes(nodes, nr_nodes);
328#endif
329#ifdef CONFIG_AMD_NUMA
330 if (amd)
331 amd_fake_nodes(nodes, nr_nodes);
332#endif
333 if (!acpi && !amd)
334 for (i = 0; i < nr_cpu_ids; i++)
335 numa_set_node(i, 0);
336}
337
338/*
339 * Setups up nid to range from addr to addr + size. If the end
340 * boundary is greater than max_addr, then max_addr is used instead.
341 * The return value is 0 if there is additional memory left for
342 * allocation past addr and -1 otherwise. addr is adjusted to be at
343 * the end of the node.
344 */
345static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
346{
347 int ret = 0;
348 nodes[nid].start = *addr;
349 *addr += size;
350 if (*addr >= max_addr) {
351 *addr = max_addr;
352 ret = -1;
353 }
354 nodes[nid].end = *addr;
355 node_set(nid, node_possible_map);
356 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
357 nodes[nid].start, nodes[nid].end,
358 (nodes[nid].end - nodes[nid].start) >> 20);
359 return ret;
360}
361
362/*
363 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
364 * to max_addr. The return value is the number of nodes allocated.
365 */
366static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
367{
368 nodemask_t physnode_mask = NODE_MASK_NONE;
369 u64 size;
370 int big;
371 int ret = 0;
372 int i;
373
374 if (nr_nodes <= 0)
375 return -1;
376 if (nr_nodes > MAX_NUMNODES) {
377 pr_info("numa=fake=%d too large, reducing to %d\n",
378 nr_nodes, MAX_NUMNODES);
379 nr_nodes = MAX_NUMNODES;
380 }
381
382 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
383 /*
384 * Calculate the number of big nodes that can be allocated as a result
385 * of consolidating the remainder.
386 */
387 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
388 FAKE_NODE_MIN_SIZE;
389
390 size &= FAKE_NODE_MIN_HASH_MASK;
391 if (!size) {
392 pr_err("Not enough memory for each node. "
393 "NUMA emulation disabled.\n");
394 return -1;
395 }
396
397 for (i = 0; i < MAX_NUMNODES; i++)
398 if (physnodes[i].start != physnodes[i].end)
399 node_set(i, physnode_mask);
400
401 /*
402 * Continue to fill physical nodes with fake nodes until there is no
403 * memory left on any of them.
404 */
405 while (nodes_weight(physnode_mask)) {
406 for_each_node_mask(i, physnode_mask) {
407 u64 end = physnodes[i].start + size;
408 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
409
410 if (ret < big)
411 end += FAKE_NODE_MIN_SIZE;
412 317
413 /* 318 /*
414 * Continue to add memory to this fake node if its 319 * See whether there are overlapping blocks. Whine
415 * non-reserved memory is less than the per-node size. 320 * about but allow overlaps of the same nid. They
321 * will be merged below.
416 */ 322 */
417 while (end - physnodes[i].start - 323 if (bi->end > bj->start && bi->start < bj->end) {
418 memblock_x86_hole_size(physnodes[i].start, end) < size) { 324 if (bi->nid != bj->nid) {
419 end += FAKE_NODE_MIN_SIZE; 325 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
420 if (end > physnodes[i].end) { 326 bi->nid, bi->start, bi->end,
421 end = physnodes[i].end; 327 bj->nid, bj->start, bj->end);
422 break; 328 return -EINVAL;
423 } 329 }
330 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
331 bi->nid, bi->start, bi->end,
332 bj->start, bj->end);
424 } 333 }
425 334
426 /* 335 /*
427 * If there won't be at least FAKE_NODE_MIN_SIZE of 336 * Join together blocks on the same node, holes
428 * non-reserved memory in ZONE_DMA32 for the next node, 337 * between which don't overlap with memory on other
429 * this one must extend to the boundary. 338 * nodes.
430 */
431 if (end < dma32_end && dma32_end - end -
432 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
433 end = dma32_end;
434
435 /*
436 * If there won't be enough non-reserved memory for the
437 * next node, this one must extend to the end of the
438 * physical node.
439 */ 339 */
440 if (physnodes[i].end - end - 340 if (bi->nid != bj->nid)
441 memblock_x86_hole_size(end, physnodes[i].end) < size) 341 continue;
442 end = physnodes[i].end; 342 start = max(min(bi->start, bj->start), low);
443 343 end = min(max(bi->end, bj->end), high);
444 /* 344 for (k = 0; k < mi->nr_blks; k++) {
445 * Avoid allocating more nodes than requested, which can 345 struct numa_memblk *bk = &mi->blk[k];
446 * happen as a result of rounding down each node's size 346
447 * to FAKE_NODE_MIN_SIZE. 347 if (bi->nid == bk->nid)
448 */ 348 continue;
449 if (nodes_weight(physnode_mask) + ret >= nr_nodes) 349 if (start < bk->end && end > bk->start)
450 end = physnodes[i].end; 350 break;
451 351 }
452 if (setup_node_range(ret++, &physnodes[i].start, 352 if (k < mi->nr_blks)
453 end - physnodes[i].start, 353 continue;
454 physnodes[i].end) < 0) 354 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
455 node_clear(i, physnode_mask); 355 bi->nid, bi->start, bi->end, bj->start, bj->end,
356 start, end);
357 bi->start = start;
358 bi->end = end;
359 numa_remove_memblk_from(j--, mi);
456 } 360 }
457 } 361 }
458 return ret;
459}
460
461/*
462 * Returns the end address of a node so that there is at least `size' amount of
463 * non-reserved memory or `max_addr' is reached.
464 */
465static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
466{
467 u64 end = start + size;
468 362
469 while (end - start - memblock_x86_hole_size(start, end) < size) { 363 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
470 end += FAKE_NODE_MIN_SIZE; 364 mi->blk[i].start = mi->blk[i].end = 0;
471 if (end > max_addr) { 365 mi->blk[i].nid = NUMA_NO_NODE;
472 end = max_addr;
473 break;
474 }
475 } 366 }
476 return end; 367
368 return 0;
477} 369}
478 370
479/* 371/*
480 * Sets up fake nodes of `size' interleaved over physical nodes ranging from 372 * Set nodes, which have memory in @mi, in *@nodemask.
481 * `addr' to `max_addr'. The return value is the number of nodes allocated.
482 */ 373 */
483static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) 374static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
375 const struct numa_meminfo *mi)
484{ 376{
485 nodemask_t physnode_mask = NODE_MASK_NONE;
486 u64 min_size;
487 int ret = 0;
488 int i; 377 int i;
489 378
490 if (!size) 379 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
491 return -1; 380 if (mi->blk[i].start != mi->blk[i].end &&
492 /* 381 mi->blk[i].nid != NUMA_NO_NODE)
493 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is 382 node_set(mi->blk[i].nid, *nodemask);
494 * increased accordingly if the requested size is too small. This 383}
495 * creates a uniform distribution of node sizes across the entire
496 * machine (but not necessarily over physical nodes).
497 */
498 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
499 MAX_NUMNODES;
500 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
501 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
502 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
503 FAKE_NODE_MIN_HASH_MASK;
504 if (size < min_size) {
505 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
506 size >> 20, min_size >> 20);
507 size = min_size;
508 }
509 size &= FAKE_NODE_MIN_HASH_MASK;
510
511 for (i = 0; i < MAX_NUMNODES; i++)
512 if (physnodes[i].start != physnodes[i].end)
513 node_set(i, physnode_mask);
514 /*
515 * Fill physical nodes with fake nodes of size until there is no memory
516 * left on any of them.
517 */
518 while (nodes_weight(physnode_mask)) {
519 for_each_node_mask(i, physnode_mask) {
520 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
521 u64 end;
522
523 end = find_end_of_node(physnodes[i].start,
524 physnodes[i].end, size);
525 /*
526 * If there won't be at least FAKE_NODE_MIN_SIZE of
527 * non-reserved memory in ZONE_DMA32 for the next node,
528 * this one must extend to the boundary.
529 */
530 if (end < dma32_end && dma32_end - end -
531 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
532 end = dma32_end;
533 384
534 /* 385/**
535 * If there won't be enough non-reserved memory for the 386 * numa_reset_distance - Reset NUMA distance table
536 * next node, this one must extend to the end of the 387 *
537 * physical node. 388 * The current table is freed. The next numa_set_distance() call will
538 */ 389 * create a new one.
539 if (physnodes[i].end - end - 390 */
540 memblock_x86_hole_size(end, physnodes[i].end) < size) 391void __init numa_reset_distance(void)
541 end = physnodes[i].end; 392{
393 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
542 394
543 /* 395 /* numa_distance could be 1LU marking allocation failure, test cnt */
544 * Setup the fake node that will be allocated as bootmem 396 if (numa_distance_cnt)
545 * later. If setup_node_range() returns non-zero, there 397 memblock_x86_free_range(__pa(numa_distance),
546 * is no more memory available on this physical node. 398 __pa(numa_distance) + size);
547 */ 399 numa_distance_cnt = 0;
548 if (setup_node_range(ret++, &physnodes[i].start, 400 numa_distance = NULL; /* enable table creation */
549 end - physnodes[i].start,
550 physnodes[i].end) < 0)
551 node_clear(i, physnode_mask);
552 }
553 }
554 return ret;
555} 401}
556 402
557/* 403static int __init numa_alloc_distance(void)
558 * Sets up the system RAM area from start_pfn to last_pfn according to the
559 * numa=fake command-line option.
560 */
561static int __init numa_emulation(unsigned long start_pfn,
562 unsigned long last_pfn, int acpi, int amd)
563{ 404{
564 u64 addr = start_pfn << PAGE_SHIFT; 405 nodemask_t nodes_parsed;
565 u64 max_addr = last_pfn << PAGE_SHIFT; 406 size_t size;
566 int num_nodes; 407 int i, j, cnt = 0;
567 int i; 408 u64 phys;
568 409
569 /* 410 /* size the new table and allocate it */
570 * If the numa=fake command-line contains a 'M' or 'G', it represents 411 nodes_parsed = numa_nodes_parsed;
571 * the fixed node size. Otherwise, if it is just a single number N, 412 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
572 * split the system RAM into N fake nodes.
573 */
574 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
575 u64 size;
576 413
577 size = memparse(cmdline, &cmdline); 414 for_each_node_mask(i, nodes_parsed)
578 num_nodes = split_nodes_size_interleave(addr, max_addr, size); 415 cnt = i;
579 } else { 416 cnt++;
580 unsigned long n; 417 size = cnt * cnt * sizeof(numa_distance[0]);
581 418
582 n = simple_strtoul(cmdline, NULL, 0); 419 phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
583 num_nodes = split_nodes_interleave(addr, max_addr, n); 420 size, PAGE_SIZE);
421 if (phys == MEMBLOCK_ERROR) {
422 pr_warning("NUMA: Warning: can't allocate distance table!\n");
423 /* don't retry until explicitly reset */
424 numa_distance = (void *)1LU;
425 return -ENOMEM;
584 } 426 }
427 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
585 428
586 if (num_nodes < 0) 429 numa_distance = __va(phys);
587 return num_nodes; 430 numa_distance_cnt = cnt;
588 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); 431
589 if (memnode_shift < 0) { 432 /* fill with the default distances */
590 memnode_shift = 0; 433 for (i = 0; i < cnt; i++)
591 printk(KERN_ERR "No NUMA hash function found. NUMA emulation " 434 for (j = 0; j < cnt; j++)
592 "disabled.\n"); 435 numa_distance[i * cnt + j] = i == j ?
593 return -1; 436 LOCAL_DISTANCE : REMOTE_DISTANCE;
594 } 437 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
595 438
596 /*
597 * We need to vacate all active ranges that may have been registered for
598 * the e820 memory map.
599 */
600 remove_all_active_ranges();
601 for_each_node_mask(i, node_possible_map) {
602 memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
603 nodes[i].end >> PAGE_SHIFT);
604 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
605 }
606 setup_physnodes(addr, max_addr, acpi, amd);
607 fake_physnodes(acpi, amd, num_nodes);
608 numa_init_array();
609 return 0; 439 return 0;
610} 440}
611#endif /* CONFIG_NUMA_EMU */
612 441
613void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, 442/**
614 int acpi, int amd) 443 * numa_set_distance - Set NUMA distance from one NUMA to another
444 * @from: the 'from' node to set distance
445 * @to: the 'to' node to set distance
446 * @distance: NUMA distance
447 *
448 * Set the distance from node @from to @to to @distance. If distance table
449 * doesn't exist, one which is large enough to accommodate all the currently
450 * known nodes will be created.
451 *
452 * If such table cannot be allocated, a warning is printed and further
453 * calls are ignored until the distance table is reset with
454 * numa_reset_distance().
455 *
456 * If @from or @to is higher than the highest known node at the time of
457 * table creation or @distance doesn't make sense, the call is ignored.
458 * This is to allow simplification of specific NUMA config implementations.
459 */
460void __init numa_set_distance(int from, int to, int distance)
615{ 461{
616 int i; 462 if (!numa_distance && numa_alloc_distance() < 0)
617
618 nodes_clear(node_possible_map);
619 nodes_clear(node_online_map);
620
621#ifdef CONFIG_NUMA_EMU
622 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
623 acpi, amd);
624 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
625 return; 463 return;
626 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
627 acpi, amd);
628 nodes_clear(node_possible_map);
629 nodes_clear(node_online_map);
630#endif
631 464
632#ifdef CONFIG_ACPI_NUMA 465 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
633 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 466 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
634 last_pfn << PAGE_SHIFT)) 467 from, to, distance);
635 return; 468 return;
636 nodes_clear(node_possible_map); 469 }
637 nodes_clear(node_online_map);
638#endif
639 470
640#ifdef CONFIG_AMD_NUMA 471 if ((u8)distance != distance ||
641 if (!numa_off && amd && !amd_scan_nodes()) 472 (from == to && distance != LOCAL_DISTANCE)) {
473 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
474 from, to, distance);
642 return; 475 return;
643 nodes_clear(node_possible_map); 476 }
644 nodes_clear(node_online_map);
645#endif
646 printk(KERN_INFO "%s\n",
647 numa_off ? "NUMA turned off" : "No NUMA configuration found");
648 477
649 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 478 numa_distance[from * numa_distance_cnt + to] = distance;
650 start_pfn << PAGE_SHIFT,
651 last_pfn << PAGE_SHIFT);
652 /* setup dummy node covering all memory */
653 memnode_shift = 63;
654 memnodemap = memnode.embedded_map;
655 memnodemap[0] = 0;
656 node_set_online(0);
657 node_set(0, node_possible_map);
658 for (i = 0; i < nr_cpu_ids; i++)
659 numa_set_node(i, 0);
660 memblock_x86_register_active_regions(0, start_pfn, last_pfn);
661 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
662} 479}
663 480
664unsigned long __init numa_free_all_bootmem(void) 481int __node_distance(int from, int to)
665{ 482{
666 unsigned long pages = 0; 483 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
667 int i; 484 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
485 return numa_distance[from * numa_distance_cnt + to];
486}
487EXPORT_SYMBOL(__node_distance);
668 488
669 for_each_online_node(i) 489/*
670 pages += free_all_bootmem_node(NODE_DATA(i)); 490 * Sanity check to catch more bad NUMA configurations (they are amazingly
491 * common). Make sure the nodes cover all memory.
492 */
493static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
494{
495 unsigned long numaram, e820ram;
496 int i;
671 497
672 pages += free_all_memory_core_early(MAX_NUMNODES); 498 numaram = 0;
499 for (i = 0; i < mi->nr_blks; i++) {
500 unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
501 unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
502 numaram += e - s;
503 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
504 if ((long)numaram < 0)
505 numaram = 0;
506 }
673 507
674 return pages; 508 e820ram = max_pfn - (memblock_x86_hole_size(0,
509 max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
510 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
511 if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
512 printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
513 (numaram << PAGE_SHIFT) >> 20,
514 (e820ram << PAGE_SHIFT) >> 20);
515 return false;
516 }
517 return true;
675} 518}
676 519
677#ifdef CONFIG_NUMA 520static int __init numa_register_memblks(struct numa_meminfo *mi)
678
679static __init int find_near_online_node(int node)
680{ 521{
681 int n, val; 522 int i, nid;
682 int min_val = INT_MAX;
683 int best_node = -1;
684 523
685 for_each_online_node(n) { 524 /* Account for nodes with cpus and no memory */
686 val = node_distance(node, n); 525 node_possible_map = numa_nodes_parsed;
526 numa_nodemask_from_meminfo(&node_possible_map, mi);
527 if (WARN_ON(nodes_empty(node_possible_map)))
528 return -EINVAL;
687 529
688 if (val < min_val) { 530 memnode_shift = compute_hash_shift(mi);
689 min_val = val; 531 if (memnode_shift < 0) {
690 best_node = n; 532 printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
533 return -EINVAL;
534 }
535
536 for (i = 0; i < mi->nr_blks; i++)
537 memblock_x86_register_active_regions(mi->blk[i].nid,
538 mi->blk[i].start >> PAGE_SHIFT,
539 mi->blk[i].end >> PAGE_SHIFT);
540
541 /* for out of order entries */
542 sort_node_map();
543 if (!numa_meminfo_cover_memory(mi))
544 return -EINVAL;
545
546 /* Finally register nodes. */
547 for_each_node_mask(nid, node_possible_map) {
548 u64 start = (u64)max_pfn << PAGE_SHIFT;
549 u64 end = 0;
550
551 for (i = 0; i < mi->nr_blks; i++) {
552 if (nid != mi->blk[i].nid)
553 continue;
554 start = min(mi->blk[i].start, start);
555 end = max(mi->blk[i].end, end);
691 } 556 }
557
558 if (start < end)
559 setup_node_bootmem(nid, start, end);
692 } 560 }
693 561
694 return best_node; 562 return 0;
695} 563}
696 564
697/* 565/**
698 * Setup early cpu_to_node. 566 * dummy_numma_init - Fallback dummy NUMA init
699 * 567 *
700 * Populate cpu_to_node[] only if x86_cpu_to_apicid[], 568 * Used if there's no underlying NUMA architecture, NUMA initialization
701 * and apicid_to_node[] tables have valid entries for a CPU. 569 * fails, or NUMA is disabled on the command line.
702 * This means we skip cpu_to_node[] initialisation for NUMA
703 * emulation and faking node case (when running a kernel compiled
704 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
705 * is already initialized in a round robin manner at numa_init_array,
706 * prior to this call, and this initialization is good enough
707 * for the fake NUMA cases.
708 * 570 *
709 * Called before the per_cpu areas are setup. 571 * Must online at least one node and add memory blocks that cover all
572 * allowed memory. This function must not fail.
710 */ 573 */
711void __init init_cpu_to_node(void) 574static int __init dummy_numa_init(void)
712{ 575{
713 int cpu; 576 printk(KERN_INFO "%s\n",
714 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 577 numa_off ? "NUMA turned off" : "No NUMA configuration found");
715 578 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
716 BUG_ON(cpu_to_apicid == NULL); 579 0LU, max_pfn << PAGE_SHIFT);
717 580
718 for_each_possible_cpu(cpu) { 581 node_set(0, numa_nodes_parsed);
719 int node; 582 numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
720 u16 apicid = cpu_to_apicid[cpu];
721 583
722 if (apicid == BAD_APICID) 584 return 0;
723 continue;
724 node = apicid_to_node[apicid];
725 if (node == NUMA_NO_NODE)
726 continue;
727 if (!node_online(node))
728 node = find_near_online_node(node);
729 numa_set_node(cpu, node);
730 }
731} 585}
732#endif
733 586
734 587static int __init numa_init(int (*init_func)(void))
735void __cpuinit numa_set_node(int cpu, int node)
736{ 588{
737 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 589 int i;
738 590 int ret;
739 /* early setting, no percpu area yet */
740 if (cpu_to_node_map) {
741 cpu_to_node_map[cpu] = node;
742 return;
743 }
744
745#ifdef CONFIG_DEBUG_PER_CPU_MAPS
746 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
747 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
748 dump_stack();
749 return;
750 }
751#endif
752 per_cpu(x86_cpu_to_node_map, cpu) = node;
753 591
754 if (node != NUMA_NO_NODE) 592 for (i = 0; i < MAX_LOCAL_APIC; i++)
755 set_cpu_numa_node(cpu, node); 593 set_apicid_to_node(i, NUMA_NO_NODE);
756}
757 594
758void __cpuinit numa_clear_node(int cpu) 595 nodes_clear(numa_nodes_parsed);
759{ 596 nodes_clear(node_possible_map);
760 numa_set_node(cpu, NUMA_NO_NODE); 597 nodes_clear(node_online_map);
761} 598 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
762 599 remove_all_active_ranges();
763#ifndef CONFIG_DEBUG_PER_CPU_MAPS 600 numa_reset_distance();
764 601
765#ifndef CONFIG_NUMA_EMU 602 ret = init_func();
766void __cpuinit numa_add_cpu(int cpu) 603 if (ret < 0)
767{ 604 return ret;
768 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 605 ret = numa_cleanup_meminfo(&numa_meminfo);
769} 606 if (ret < 0)
607 return ret;
770 608
771void __cpuinit numa_remove_cpu(int cpu) 609 numa_emulation(&numa_meminfo, numa_distance_cnt);
772{
773 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
774}
775#else
776void __cpuinit numa_add_cpu(int cpu)
777{
778 unsigned long addr;
779 u16 apicid;
780 int physnid;
781 int nid = NUMA_NO_NODE;
782 610
783 nid = early_cpu_to_node(cpu); 611 ret = numa_register_memblks(&numa_meminfo);
784 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); 612 if (ret < 0)
613 return ret;
785 614
786 /* 615 for (i = 0; i < nr_cpu_ids; i++) {
787 * Use the starting address of the emulated node to find which physical 616 int nid = early_cpu_to_node(i);
788 * node it is allocated on.
789 */
790 addr = node_start_pfn(nid) << PAGE_SHIFT;
791 for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
792 if (addr >= physnodes[physnid].start &&
793 addr < physnodes[physnid].end)
794 break;
795 617
796 /* 618 if (nid == NUMA_NO_NODE)
797 * Map the cpu to each emulated node that is allocated on the physical 619 continue;
798 * node of the cpu's apic id. 620 if (!node_online(nid))
799 */ 621 numa_clear_node(i);
800 for_each_online_node(nid) {
801 addr = node_start_pfn(nid) << PAGE_SHIFT;
802 if (addr >= physnodes[physnid].start &&
803 addr < physnodes[physnid].end)
804 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
805 } 622 }
623 numa_init_array();
624 return 0;
806} 625}
807 626
808void __cpuinit numa_remove_cpu(int cpu) 627void __init initmem_init(void)
809{ 628{
810 int i; 629 int ret;
811 630
812 for_each_online_node(i) 631 if (!numa_off) {
813 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 632#ifdef CONFIG_ACPI_NUMA
814} 633 ret = numa_init(x86_acpi_numa_init);
815#endif /* !CONFIG_NUMA_EMU */ 634 if (!ret)
816 635 return;
817#else /* CONFIG_DEBUG_PER_CPU_MAPS */ 636#endif
818static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) 637#ifdef CONFIG_AMD_NUMA
819{ 638 ret = numa_init(amd_numa_init);
820 int node = early_cpu_to_node(cpu); 639 if (!ret)
821 struct cpumask *mask; 640 return;
822 char buf[64]; 641#endif
823
824 mask = node_to_cpumask_map[node];
825 if (!mask) {
826 pr_err("node_to_cpumask_map[%i] NULL\n", node);
827 dump_stack();
828 return NULL;
829 } 642 }
830 643
831 cpulist_scnprintf(buf, sizeof(buf), mask); 644 numa_init(dummy_numa_init);
832 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
833 enable ? "numa_add_cpu" : "numa_remove_cpu",
834 cpu, node, buf);
835 return mask;
836} 645}
837 646
838/* 647unsigned long __init numa_free_all_bootmem(void)
839 * --------- debug versions of the numa functions ---------
840 */
841#ifndef CONFIG_NUMA_EMU
842static void __cpuinit numa_set_cpumask(int cpu, int enable)
843{
844 struct cpumask *mask;
845
846 mask = debug_cpumask_set_cpu(cpu, enable);
847 if (!mask)
848 return;
849
850 if (enable)
851 cpumask_set_cpu(cpu, mask);
852 else
853 cpumask_clear_cpu(cpu, mask);
854}
855#else
856static void __cpuinit numa_set_cpumask(int cpu, int enable)
857{ 648{
858 int node = early_cpu_to_node(cpu); 649 unsigned long pages = 0;
859 struct cpumask *mask;
860 int i; 650 int i;
861 651
862 for_each_online_node(i) { 652 for_each_online_node(i)
863 unsigned long addr; 653 pages += free_all_bootmem_node(NODE_DATA(i));
864
865 addr = node_start_pfn(i) << PAGE_SHIFT;
866 if (addr < physnodes[node].start ||
867 addr >= physnodes[node].end)
868 continue;
869 mask = debug_cpumask_set_cpu(cpu, enable);
870 if (!mask)
871 return;
872
873 if (enable)
874 cpumask_set_cpu(cpu, mask);
875 else
876 cpumask_clear_cpu(cpu, mask);
877 }
878}
879#endif /* CONFIG_NUMA_EMU */
880 654
881void __cpuinit numa_add_cpu(int cpu) 655 pages += free_all_memory_core_early(MAX_NUMNODES);
882{
883 numa_set_cpumask(cpu, 1);
884}
885 656
886void __cpuinit numa_remove_cpu(int cpu) 657 return pages;
887{
888 numa_set_cpumask(cpu, 0);
889} 658}
890 659
891int __cpu_to_node(int cpu) 660int __cpuinit numa_cpu_node(int cpu)
892{ 661{
893 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 662 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
894 printk(KERN_WARNING
895 "cpu_to_node(%d): usage too early!\n", cpu);
896 dump_stack();
897 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
898 }
899 return per_cpu(x86_cpu_to_node_map, cpu);
900}
901EXPORT_SYMBOL(__cpu_to_node);
902 663
903/* 664 if (apicid != BAD_APICID)
904 * Same function as cpu_to_node() but used if called before the 665 return __apicid_to_node[apicid];
905 * per_cpu areas are setup. 666 return NUMA_NO_NODE;
906 */
907int early_cpu_to_node(int cpu)
908{
909 if (early_per_cpu_ptr(x86_cpu_to_node_map))
910 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
911
912 if (!cpu_possible(cpu)) {
913 printk(KERN_WARNING
914 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
915 dump_stack();
916 return NUMA_NO_NODE;
917 }
918 return per_cpu(x86_cpu_to_node_map, cpu);
919} 667}
920
921/*
922 * --------- end of debug versions of the numa functions ---------
923 */
924
925#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..ad091e4cff17
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,494 @@
1/*
2 * NUMA emulation
3 */
4#include <linux/kernel.h>
5#include <linux/errno.h>
6#include <linux/topology.h>
7#include <linux/memblock.h>
8#include <asm/dma.h>
9
10#include "numa_internal.h"
11
12static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
13static char *emu_cmdline __initdata;
14
15void __init numa_emu_cmdline(char *str)
16{
17 emu_cmdline = str;
18}
19
20static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
21{
22 int i;
23
24 for (i = 0; i < mi->nr_blks; i++)
25 if (mi->blk[i].nid == nid)
26 return i;
27 return -ENOENT;
28}
29
30/*
31 * Sets up nid to range from @start to @end. The return value is -errno if
32 * something went wrong, 0 otherwise.
33 */
34static int __init emu_setup_memblk(struct numa_meminfo *ei,
35 struct numa_meminfo *pi,
36 int nid, int phys_blk, u64 size)
37{
38 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
39 struct numa_memblk *pb = &pi->blk[phys_blk];
40
41 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
42 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
43 return -EINVAL;
44 }
45
46 ei->nr_blks++;
47 eb->start = pb->start;
48 eb->end = pb->start + size;
49 eb->nid = nid;
50
51 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
52 emu_nid_to_phys[nid] = pb->nid;
53
54 pb->start += size;
55 if (pb->start >= pb->end) {
56 WARN_ON_ONCE(pb->start > pb->end);
57 numa_remove_memblk_from(phys_blk, pi);
58 }
59
60 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
61 eb->start, eb->end, (eb->end - eb->start) >> 20);
62 return 0;
63}
64
65/*
66 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
67 * to max_addr. The return value is the number of nodes allocated.
68 */
69static int __init split_nodes_interleave(struct numa_meminfo *ei,
70 struct numa_meminfo *pi,
71 u64 addr, u64 max_addr, int nr_nodes)
72{
73 nodemask_t physnode_mask = NODE_MASK_NONE;
74 u64 size;
75 int big;
76 int nid = 0;
77 int i, ret;
78
79 if (nr_nodes <= 0)
80 return -1;
81 if (nr_nodes > MAX_NUMNODES) {
82 pr_info("numa=fake=%d too large, reducing to %d\n",
83 nr_nodes, MAX_NUMNODES);
84 nr_nodes = MAX_NUMNODES;
85 }
86
87 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
88 /*
89 * Calculate the number of big nodes that can be allocated as a result
90 * of consolidating the remainder.
91 */
92 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
93 FAKE_NODE_MIN_SIZE;
94
95 size &= FAKE_NODE_MIN_HASH_MASK;
96 if (!size) {
97 pr_err("Not enough memory for each node. "
98 "NUMA emulation disabled.\n");
99 return -1;
100 }
101
102 for (i = 0; i < pi->nr_blks; i++)
103 node_set(pi->blk[i].nid, physnode_mask);
104
105 /*
106 * Continue to fill physical nodes with fake nodes until there is no
107 * memory left on any of them.
108 */
109 while (nodes_weight(physnode_mask)) {
110 for_each_node_mask(i, physnode_mask) {
111 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
112 u64 start, limit, end;
113 int phys_blk;
114
115 phys_blk = emu_find_memblk_by_nid(i, pi);
116 if (phys_blk < 0) {
117 node_clear(i, physnode_mask);
118 continue;
119 }
120 start = pi->blk[phys_blk].start;
121 limit = pi->blk[phys_blk].end;
122 end = start + size;
123
124 if (nid < big)
125 end += FAKE_NODE_MIN_SIZE;
126
127 /*
128 * Continue to add memory to this fake node if its
129 * non-reserved memory is less than the per-node size.
130 */
131 while (end - start -
132 memblock_x86_hole_size(start, end) < size) {
133 end += FAKE_NODE_MIN_SIZE;
134 if (end > limit) {
135 end = limit;
136 break;
137 }
138 }
139
140 /*
141 * If there won't be at least FAKE_NODE_MIN_SIZE of
142 * non-reserved memory in ZONE_DMA32 for the next node,
143 * this one must extend to the boundary.
144 */
145 if (end < dma32_end && dma32_end - end -
146 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
147 end = dma32_end;
148
149 /*
150 * If there won't be enough non-reserved memory for the
151 * next node, this one must extend to the end of the
152 * physical node.
153 */
154 if (limit - end -
155 memblock_x86_hole_size(end, limit) < size)
156 end = limit;
157
158 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
159 phys_blk,
160 min(end, limit) - start);
161 if (ret < 0)
162 return ret;
163 }
164 }
165 return 0;
166}
167
168/*
169 * Returns the end address of a node so that there is at least `size' amount of
170 * non-reserved memory or `max_addr' is reached.
171 */
172static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
173{
174 u64 end = start + size;
175
176 while (end - start - memblock_x86_hole_size(start, end) < size) {
177 end += FAKE_NODE_MIN_SIZE;
178 if (end > max_addr) {
179 end = max_addr;
180 break;
181 }
182 }
183 return end;
184}
185
186/*
187 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
188 * `addr' to `max_addr'. The return value is the number of nodes allocated.
189 */
190static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
191 struct numa_meminfo *pi,
192 u64 addr, u64 max_addr, u64 size)
193{
194 nodemask_t physnode_mask = NODE_MASK_NONE;
195 u64 min_size;
196 int nid = 0;
197 int i, ret;
198
199 if (!size)
200 return -1;
201 /*
202 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
203 * increased accordingly if the requested size is too small. This
204 * creates a uniform distribution of node sizes across the entire
205 * machine (but not necessarily over physical nodes).
206 */
207 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
208 MAX_NUMNODES;
209 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
210 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
211 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
212 FAKE_NODE_MIN_HASH_MASK;
213 if (size < min_size) {
214 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
215 size >> 20, min_size >> 20);
216 size = min_size;
217 }
218 size &= FAKE_NODE_MIN_HASH_MASK;
219
220 for (i = 0; i < pi->nr_blks; i++)
221 node_set(pi->blk[i].nid, physnode_mask);
222
223 /*
224 * Fill physical nodes with fake nodes of size until there is no memory
225 * left on any of them.
226 */
227 while (nodes_weight(physnode_mask)) {
228 for_each_node_mask(i, physnode_mask) {
229 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
230 u64 start, limit, end;
231 int phys_blk;
232
233 phys_blk = emu_find_memblk_by_nid(i, pi);
234 if (phys_blk < 0) {
235 node_clear(i, physnode_mask);
236 continue;
237 }
238 start = pi->blk[phys_blk].start;
239 limit = pi->blk[phys_blk].end;
240
241 end = find_end_of_node(start, limit, size);
242 /*
243 * If there won't be at least FAKE_NODE_MIN_SIZE of
244 * non-reserved memory in ZONE_DMA32 for the next node,
245 * this one must extend to the boundary.
246 */
247 if (end < dma32_end && dma32_end - end -
248 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
249 end = dma32_end;
250
251 /*
252 * If there won't be enough non-reserved memory for the
253 * next node, this one must extend to the end of the
254 * physical node.
255 */
256 if (limit - end -
257 memblock_x86_hole_size(end, limit) < size)
258 end = limit;
259
260 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
261 phys_blk,
262 min(end, limit) - start);
263 if (ret < 0)
264 return ret;
265 }
266 }
267 return 0;
268}
269
270/**
271 * numa_emulation - Emulate NUMA nodes
272 * @numa_meminfo: NUMA configuration to massage
273 * @numa_dist_cnt: The size of the physical NUMA distance table
274 *
275 * Emulate NUMA nodes according to the numa=fake kernel parameter.
276 * @numa_meminfo contains the physical memory configuration and is modified
277 * to reflect the emulated configuration on success. @numa_dist_cnt is
278 * used to determine the size of the physical distance table.
279 *
280 * On success, the following modifications are made.
281 *
282 * - @numa_meminfo is updated to reflect the emulated nodes.
283 *
284 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
285 * emulated nodes.
286 *
287 * - NUMA distance table is rebuilt to represent distances between emulated
288 * nodes. The distances are determined considering how emulated nodes
289 * are mapped to physical nodes and match the actual distances.
290 *
291 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
292 * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
293 *
294 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
295 * identity mapping and no other modification is made.
296 */
297void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
298{
299 static struct numa_meminfo ei __initdata;
300 static struct numa_meminfo pi __initdata;
301 const u64 max_addr = max_pfn << PAGE_SHIFT;
302 u8 *phys_dist = NULL;
303 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
304 int max_emu_nid, dfl_phys_nid;
305 int i, j, ret;
306
307 if (!emu_cmdline)
308 goto no_emu;
309
310 memset(&ei, 0, sizeof(ei));
311 pi = *numa_meminfo;
312
313 for (i = 0; i < MAX_NUMNODES; i++)
314 emu_nid_to_phys[i] = NUMA_NO_NODE;
315
316 /*
317 * If the numa=fake command-line contains a 'M' or 'G', it represents
318 * the fixed node size. Otherwise, if it is just a single number N,
319 * split the system RAM into N fake nodes.
320 */
321 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
322 u64 size;
323
324 size = memparse(emu_cmdline, &emu_cmdline);
325 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
326 } else {
327 unsigned long n;
328
329 n = simple_strtoul(emu_cmdline, NULL, 0);
330 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
331 }
332
333 if (ret < 0)
334 goto no_emu;
335
336 if (numa_cleanup_meminfo(&ei) < 0) {
337 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
338 goto no_emu;
339 }
340
341 /* copy the physical distance table */
342 if (numa_dist_cnt) {
343 u64 phys;
344
345 phys = memblock_find_in_range(0,
346 (u64)max_pfn_mapped << PAGE_SHIFT,
347 phys_size, PAGE_SIZE);
348 if (phys == MEMBLOCK_ERROR) {
349 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
350 goto no_emu;
351 }
352 memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
353 phys_dist = __va(phys);
354
355 for (i = 0; i < numa_dist_cnt; i++)
356 for (j = 0; j < numa_dist_cnt; j++)
357 phys_dist[i * numa_dist_cnt + j] =
358 node_distance(i, j);
359 }
360
361 /*
362 * Determine the max emulated nid and the default phys nid to use
363 * for unmapped nodes.
364 */
365 max_emu_nid = 0;
366 dfl_phys_nid = NUMA_NO_NODE;
367 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
368 if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
369 max_emu_nid = i;
370 if (dfl_phys_nid == NUMA_NO_NODE)
371 dfl_phys_nid = emu_nid_to_phys[i];
372 }
373 }
374 if (dfl_phys_nid == NUMA_NO_NODE) {
375 pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
376 goto no_emu;
377 }
378
379 /* commit */
380 *numa_meminfo = ei;
381
382 /*
383 * Transform __apicid_to_node table to use emulated nids by
384 * reverse-mapping phys_nid. The maps should always exist but fall
385 * back to zero just in case.
386 */
387 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
388 if (__apicid_to_node[i] == NUMA_NO_NODE)
389 continue;
390 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
391 if (__apicid_to_node[i] == emu_nid_to_phys[j])
392 break;
393 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
394 }
395
396 /* make sure all emulated nodes are mapped to a physical node */
397 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
398 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
399 emu_nid_to_phys[i] = dfl_phys_nid;
400
401 /* transform distance table */
402 numa_reset_distance();
403 for (i = 0; i < max_emu_nid + 1; i++) {
404 for (j = 0; j < max_emu_nid + 1; j++) {
405 int physi = emu_nid_to_phys[i];
406 int physj = emu_nid_to_phys[j];
407 int dist;
408
409 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
410 dist = physi == physj ?
411 LOCAL_DISTANCE : REMOTE_DISTANCE;
412 else
413 dist = phys_dist[physi * numa_dist_cnt + physj];
414
415 numa_set_distance(i, j, dist);
416 }
417 }
418
419 /* free the copied physical distance table */
420 if (phys_dist)
421 memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
422 return;
423
424no_emu:
425 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
426 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
427 emu_nid_to_phys[i] = i;
428}
429
430#ifndef CONFIG_DEBUG_PER_CPU_MAPS
431void __cpuinit numa_add_cpu(int cpu)
432{
433 int physnid, nid;
434
435 nid = early_cpu_to_node(cpu);
436 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
437
438 physnid = emu_nid_to_phys[nid];
439
440 /*
441 * Map the cpu to each emulated node that is allocated on the physical
442 * node of the cpu's apic id.
443 */
444 for_each_online_node(nid)
445 if (emu_nid_to_phys[nid] == physnid)
446 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
447}
448
449void __cpuinit numa_remove_cpu(int cpu)
450{
451 int i;
452
453 for_each_online_node(i)
454 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
455}
456#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
457static void __cpuinit numa_set_cpumask(int cpu, int enable)
458{
459 struct cpumask *mask;
460 int nid, physnid, i;
461
462 nid = early_cpu_to_node(cpu);
463 if (nid == NUMA_NO_NODE) {
464 /* early_cpu_to_node() already emits a warning and trace */
465 return;
466 }
467
468 physnid = emu_nid_to_phys[nid];
469
470 for_each_online_node(i) {
471 if (emu_nid_to_phys[nid] != physnid)
472 continue;
473
474 mask = debug_cpumask_set_cpu(cpu, enable);
475 if (!mask)
476 return;
477
478 if (enable)
479 cpumask_set_cpu(cpu, mask);
480 else
481 cpumask_clear_cpu(cpu, mask);
482 }
483}
484
485void __cpuinit numa_add_cpu(int cpu)
486{
487 numa_set_cpumask(cpu, 1);
488}
489
490void __cpuinit numa_remove_cpu(int cpu)
491{
492 numa_set_cpumask(cpu, 0);
493}
494#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000000..ef2d97377d7c
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,31 @@
1#ifndef __X86_MM_NUMA_INTERNAL_H
2#define __X86_MM_NUMA_INTERNAL_H
3
4#include <linux/types.h>
5#include <asm/numa.h>
6
7struct numa_memblk {
8 u64 start;
9 u64 end;
10 int nid;
11};
12
13struct numa_meminfo {
14 int nr_blks;
15 struct numa_memblk blk[NR_NODE_MEMBLKS];
16};
17
18void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void);
21
22#ifdef CONFIG_NUMA_EMU
23void __init numa_emulation(struct numa_meminfo *numa_meminfo,
24 int numa_dist_cnt);
25#else
26static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
27 int numa_dist_cnt)
28{ }
29#endif
30
31#endif /* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 90825f2eb0f4..f9e526742fa1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -310,7 +310,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
310 * these shared mappings are made of small page mappings. 310 * these shared mappings are made of small page mappings.
311 * Thus this don't enforce !RW mapping for small page kernel 311 * Thus this don't enforce !RW mapping for small page kernel
312 * text mapping logic will help Linux Xen parvirt guest boot 312 * text mapping logic will help Linux Xen parvirt guest boot
313 * aswell. 313 * as well.
314 */ 314 */
315 if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) 315 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
316 pgprot_val(forbidden) |= _PAGE_RW; 316 pgprot_val(forbidden) |= _PAGE_RW;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 0113d19c8aa6..8573b83a63d0 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -168,8 +168,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
168 * section 8.1: in PAE mode we explicitly have to flush the 168 * section 8.1: in PAE mode we explicitly have to flush the
169 * TLB via cr3 if the top-level pgd is changed... 169 * TLB via cr3 if the top-level pgd is changed...
170 */ 170 */
171 if (mm == current->active_mm) 171 flush_tlb_mm(mm);
172 write_cr3(read_cr3());
173} 172}
174#else /* !CONFIG_X86_PAE */ 173#else /* !CONFIG_X86_PAE */
175 174
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index ae96e7b8051d..48651c6f657d 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -57,7 +57,7 @@ struct node_memory_chunk_s {
57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; 57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
58 58
59static int __initdata num_memory_chunks; /* total number of memory chunks */ 59static int __initdata num_memory_chunks; /* total number of memory chunks */
60static u8 __initdata apicid_to_pxm[MAX_APICID]; 60static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
61 61
62int acpi_numa __initdata; 62int acpi_numa __initdata;
63 63
@@ -254,8 +254,8 @@ int __init get_memcfg_from_srat(void)
254 printk(KERN_DEBUG "Number of memory chunks in system = %d\n", 254 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
255 num_memory_chunks); 255 num_memory_chunks);
256 256
257 for (i = 0; i < MAX_APICID; i++) 257 for (i = 0; i < MAX_LOCAL_APIC; i++)
258 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); 258 set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
259 259
260 for (j = 0; j < num_memory_chunks; j++){ 260 for (j = 0; j < num_memory_chunks; j++){
261 struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; 261 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 603d285d1daa..8e9d3394f6d4 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -26,88 +26,34 @@
26 26
27int acpi_numa __initdata; 27int acpi_numa __initdata;
28 28
29static struct acpi_table_slit *acpi_slit;
30
31static nodemask_t nodes_parsed __initdata;
32static nodemask_t cpu_nodes_parsed __initdata;
33static struct bootnode nodes[MAX_NUMNODES] __initdata;
34static struct bootnode nodes_add[MAX_NUMNODES]; 29static struct bootnode nodes_add[MAX_NUMNODES];
35 30
36static int num_node_memblks __initdata;
37static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
38static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
39
40static __init int setup_node(int pxm) 31static __init int setup_node(int pxm)
41{ 32{
42 return acpi_map_pxm_to_node(pxm); 33 return acpi_map_pxm_to_node(pxm);
43} 34}
44 35
45static __init int conflicting_memblks(unsigned long start, unsigned long end)
46{
47 int i;
48 for (i = 0; i < num_node_memblks; i++) {
49 struct bootnode *nd = &node_memblk_range[i];
50 if (nd->start == nd->end)
51 continue;
52 if (nd->end > start && nd->start < end)
53 return memblk_nodeid[i];
54 if (nd->end == end && nd->start == start)
55 return memblk_nodeid[i];
56 }
57 return -1;
58}
59
60static __init void cutoff_node(int i, unsigned long start, unsigned long end)
61{
62 struct bootnode *nd = &nodes[i];
63
64 if (nd->start < start) {
65 nd->start = start;
66 if (nd->end < nd->start)
67 nd->start = nd->end;
68 }
69 if (nd->end > end) {
70 nd->end = end;
71 if (nd->start > nd->end)
72 nd->start = nd->end;
73 }
74}
75
76static __init void bad_srat(void) 36static __init void bad_srat(void)
77{ 37{
78 int i;
79 printk(KERN_ERR "SRAT: SRAT not used.\n"); 38 printk(KERN_ERR "SRAT: SRAT not used.\n");
80 acpi_numa = -1; 39 acpi_numa = -1;
81 for (i = 0; i < MAX_LOCAL_APIC; i++) 40 memset(nodes_add, 0, sizeof(nodes_add));
82 apicid_to_node[i] = NUMA_NO_NODE;
83 for (i = 0; i < MAX_NUMNODES; i++) {
84 nodes[i].start = nodes[i].end = 0;
85 nodes_add[i].start = nodes_add[i].end = 0;
86 }
87 remove_all_active_ranges();
88} 41}
89 42
90static __init inline int srat_disabled(void) 43static __init inline int srat_disabled(void)
91{ 44{
92 return numa_off || acpi_numa < 0; 45 return acpi_numa < 0;
93} 46}
94 47
95/* Callback for SLIT parsing */ 48/* Callback for SLIT parsing */
96void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 49void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
97{ 50{
98 unsigned length; 51 int i, j;
99 unsigned long phys;
100
101 length = slit->header.length;
102 phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
103 PAGE_SIZE);
104
105 if (phys == MEMBLOCK_ERROR)
106 panic(" Can not save slit!\n");
107 52
108 acpi_slit = __va(phys); 53 for (i = 0; i < slit->locality_count; i++)
109 memcpy(acpi_slit, slit, length); 54 for (j = 0; j < slit->locality_count; j++)
110 memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); 55 numa_set_distance(pxm_to_node(i), pxm_to_node(j),
56 slit->entry[slit->locality_count * i + j]);
111} 57}
112 58
113/* Callback for Proximity Domain -> x2APIC mapping */ 59/* Callback for Proximity Domain -> x2APIC mapping */
@@ -138,8 +84,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
138 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); 84 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
139 return; 85 return;
140 } 86 }
141 apicid_to_node[apic_id] = node; 87 set_apicid_to_node(apic_id, node);
142 node_set(node, cpu_nodes_parsed); 88 node_set(node, numa_nodes_parsed);
143 acpi_numa = 1; 89 acpi_numa = 1;
144 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", 90 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
145 pxm, apic_id, node); 91 pxm, apic_id, node);
@@ -178,8 +124,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
178 return; 124 return;
179 } 125 }
180 126
181 apicid_to_node[apic_id] = node; 127 set_apicid_to_node(apic_id, node);
182 node_set(node, cpu_nodes_parsed); 128 node_set(node, numa_nodes_parsed);
183 acpi_numa = 1; 129 acpi_numa = 1;
184 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", 130 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
185 pxm, apic_id, node); 131 pxm, apic_id, node);
@@ -241,7 +187,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
241 } 187 }
242 188
243 if (changed) { 189 if (changed) {
244 node_set(node, cpu_nodes_parsed); 190 node_set(node, numa_nodes_parsed);
245 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", 191 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
246 nd->start, nd->end); 192 nd->start, nd->end);
247 } 193 }
@@ -251,10 +197,8 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
251void __init 197void __init
252acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 198acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
253{ 199{
254 struct bootnode *nd, oldnode;
255 unsigned long start, end; 200 unsigned long start, end;
256 int node, pxm; 201 int node, pxm;
257 int i;
258 202
259 if (srat_disabled()) 203 if (srat_disabled())
260 return; 204 return;
@@ -276,300 +220,31 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
276 bad_srat(); 220 bad_srat();
277 return; 221 return;
278 } 222 }
279 i = conflicting_memblks(start, end); 223
280 if (i == node) { 224 if (numa_add_memblk(node, start, end) < 0) {
281 printk(KERN_WARNING
282 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
283 pxm, start, end, nodes[i].start, nodes[i].end);
284 } else if (i >= 0) {
285 printk(KERN_ERR
286 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
287 pxm, start, end, node_to_pxm(i),
288 nodes[i].start, nodes[i].end);
289 bad_srat(); 225 bad_srat();
290 return; 226 return;
291 } 227 }
292 nd = &nodes[node];
293 oldnode = *nd;
294 if (!node_test_and_set(node, nodes_parsed)) {
295 nd->start = start;
296 nd->end = end;
297 } else {
298 if (start < nd->start)
299 nd->start = start;
300 if (nd->end < end)
301 nd->end = end;
302 }
303 228
304 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 229 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
305 start, end); 230 start, end);
306 231
307 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { 232 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
308 update_nodes_add(node, start, end); 233 update_nodes_add(node, start, end);
309 /* restore nodes[node] */
310 *nd = oldnode;
311 if ((nd->start | nd->end) == 0)
312 node_clear(node, nodes_parsed);
313 }
314
315 node_memblk_range[num_node_memblks].start = start;
316 node_memblk_range[num_node_memblks].end = end;
317 memblk_nodeid[num_node_memblks] = node;
318 num_node_memblks++;
319}
320
321/* Sanity check to catch more bad SRATs (they are amazingly common).
322 Make sure the PXMs cover all memory. */
323static int __init nodes_cover_memory(const struct bootnode *nodes)
324{
325 int i;
326 unsigned long pxmram, e820ram;
327
328 pxmram = 0;
329 for_each_node_mask(i, nodes_parsed) {
330 unsigned long s = nodes[i].start >> PAGE_SHIFT;
331 unsigned long e = nodes[i].end >> PAGE_SHIFT;
332 pxmram += e - s;
333 pxmram -= __absent_pages_in_range(i, s, e);
334 if ((long)pxmram < 0)
335 pxmram = 0;
336 }
337
338 e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
339 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
340 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
341 printk(KERN_ERR
342 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
343 (pxmram << PAGE_SHIFT) >> 20,
344 (e820ram << PAGE_SHIFT) >> 20);
345 return 0;
346 }
347 return 1;
348} 234}
349 235
350void __init acpi_numa_arch_fixup(void) {} 236void __init acpi_numa_arch_fixup(void) {}
351 237
352#ifdef CONFIG_NUMA_EMU 238int __init x86_acpi_numa_init(void)
353void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
354 unsigned long end)
355{
356 int i;
357
358 for_each_node_mask(i, nodes_parsed) {
359 cutoff_node(i, start, end);
360 physnodes[i].start = nodes[i].start;
361 physnodes[i].end = nodes[i].end;
362 }
363}
364#endif /* CONFIG_NUMA_EMU */
365
366/* Use the information discovered above to actually set up the nodes. */
367int __init acpi_scan_nodes(unsigned long start, unsigned long end)
368{ 239{
369 int i; 240 int ret;
370
371 if (acpi_numa <= 0)
372 return -1;
373
374 /* First clean up the node list */
375 for (i = 0; i < MAX_NUMNODES; i++)
376 cutoff_node(i, start, end);
377
378 /*
379 * Join together blocks on the same node, holes between
380 * which don't overlap with memory on other nodes.
381 */
382 for (i = 0; i < num_node_memblks; ++i) {
383 int j, k;
384
385 for (j = i + 1; j < num_node_memblks; ++j) {
386 unsigned long start, end;
387
388 if (memblk_nodeid[i] != memblk_nodeid[j])
389 continue;
390 start = min(node_memblk_range[i].end,
391 node_memblk_range[j].end);
392 end = max(node_memblk_range[i].start,
393 node_memblk_range[j].start);
394 for (k = 0; k < num_node_memblks; ++k) {
395 if (memblk_nodeid[i] == memblk_nodeid[k])
396 continue;
397 if (start < node_memblk_range[k].end &&
398 end > node_memblk_range[k].start)
399 break;
400 }
401 if (k < num_node_memblks)
402 continue;
403 start = min(node_memblk_range[i].start,
404 node_memblk_range[j].start);
405 end = max(node_memblk_range[i].end,
406 node_memblk_range[j].end);
407 printk(KERN_INFO "SRAT: Node %d "
408 "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
409 memblk_nodeid[i],
410 node_memblk_range[i].start,
411 node_memblk_range[i].end,
412 node_memblk_range[j].start,
413 node_memblk_range[j].end,
414 start, end);
415 node_memblk_range[i].start = start;
416 node_memblk_range[i].end = end;
417 k = --num_node_memblks - j;
418 memmove(memblk_nodeid + j, memblk_nodeid + j+1,
419 k * sizeof(*memblk_nodeid));
420 memmove(node_memblk_range + j, node_memblk_range + j+1,
421 k * sizeof(*node_memblk_range));
422 --j;
423 }
424 }
425
426 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
427 memblk_nodeid);
428 if (memnode_shift < 0) {
429 printk(KERN_ERR
430 "SRAT: No NUMA node hash function found. Contact maintainer\n");
431 bad_srat();
432 return -1;
433 }
434
435 for (i = 0; i < num_node_memblks; i++)
436 memblock_x86_register_active_regions(memblk_nodeid[i],
437 node_memblk_range[i].start >> PAGE_SHIFT,
438 node_memblk_range[i].end >> PAGE_SHIFT);
439
440 /* for out of order entries in SRAT */
441 sort_node_map();
442 if (!nodes_cover_memory(nodes)) {
443 bad_srat();
444 return -1;
445 }
446 241
447 /* Account for nodes with cpus and no memory */ 242 ret = acpi_numa_init();
448 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); 243 if (ret < 0)
449 244 return ret;
450 /* Finally register nodes */ 245 return srat_disabled() ? -EINVAL : 0;
451 for_each_node_mask(i, node_possible_map)
452 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
453 /* Try again in case setup_node_bootmem missed one due
454 to missing bootmem */
455 for_each_node_mask(i, node_possible_map)
456 if (!node_online(i))
457 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
458
459 for (i = 0; i < nr_cpu_ids; i++) {
460 int node = early_cpu_to_node(i);
461
462 if (node == NUMA_NO_NODE)
463 continue;
464 if (!node_online(node))
465 numa_clear_node(i);
466 }
467 numa_init_array();
468 return 0;
469}
470
471#ifdef CONFIG_NUMA_EMU
472static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
473 [0 ... MAX_NUMNODES-1] = PXM_INVAL
474};
475static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
476 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
477};
478static int __init find_node_by_addr(unsigned long addr)
479{
480 int ret = NUMA_NO_NODE;
481 int i;
482
483 for_each_node_mask(i, nodes_parsed) {
484 /*
485 * Find the real node that this emulated node appears on. For
486 * the sake of simplicity, we only use a real node's starting
487 * address to determine which emulated node it appears on.
488 */
489 if (addr >= nodes[i].start && addr < nodes[i].end) {
490 ret = i;
491 break;
492 }
493 }
494 return ret;
495} 246}
496 247
497/*
498 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
499 * mappings that respect the real ACPI topology but reflect our emulated
500 * environment. For each emulated node, we find which real node it appears on
501 * and create PXM to NID mappings for those fake nodes which mirror that
502 * locality. SLIT will now represent the correct distances between emulated
503 * nodes as a result of the real topology.
504 */
505void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
506{
507 int i, j;
508
509 for (i = 0; i < num_nodes; i++) {
510 int nid, pxm;
511
512 nid = find_node_by_addr(fake_nodes[i].start);
513 if (nid == NUMA_NO_NODE)
514 continue;
515 pxm = node_to_pxm(nid);
516 if (pxm == PXM_INVAL)
517 continue;
518 fake_node_to_pxm_map[i] = pxm;
519 /*
520 * For each apicid_to_node mapping that exists for this real
521 * node, it must now point to the fake node ID.
522 */
523 for (j = 0; j < MAX_LOCAL_APIC; j++)
524 if (apicid_to_node[j] == nid &&
525 fake_apicid_to_node[j] == NUMA_NO_NODE)
526 fake_apicid_to_node[j] = i;
527 }
528
529 /*
530 * If there are apicid-to-node mappings for physical nodes that do not
531 * have a corresponding emulated node, it should default to a guaranteed
532 * value.
533 */
534 for (i = 0; i < MAX_LOCAL_APIC; i++)
535 if (apicid_to_node[i] != NUMA_NO_NODE &&
536 fake_apicid_to_node[i] == NUMA_NO_NODE)
537 fake_apicid_to_node[i] = 0;
538
539 for (i = 0; i < num_nodes; i++)
540 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
541 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
542
543 nodes_clear(nodes_parsed);
544 for (i = 0; i < num_nodes; i++)
545 if (fake_nodes[i].start != fake_nodes[i].end)
546 node_set(i, nodes_parsed);
547}
548
549static int null_slit_node_compare(int a, int b)
550{
551 return node_to_pxm(a) == node_to_pxm(b);
552}
553#else
554static int null_slit_node_compare(int a, int b)
555{
556 return a == b;
557}
558#endif /* CONFIG_NUMA_EMU */
559
560int __node_distance(int a, int b)
561{
562 int index;
563
564 if (!acpi_slit)
565 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
566 REMOTE_DISTANCE;
567 index = acpi_slit->locality_count * node_to_pxm(a);
568 return acpi_slit->entry[index + node_to_pxm(b)];
569}
570
571EXPORT_SYMBOL(__node_distance);
572
573#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) 248#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
574int memory_add_physaddr_to_nid(u64 start) 249int memory_add_physaddr_to_nid(u64 start)
575{ 250{
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6acc724d5d8f..d6c0418c3e47 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
179 sender = this_cpu_read(tlb_vector_offset); 179 sender = this_cpu_read(tlb_vector_offset);
180 f = &flush_state[sender]; 180 f = &flush_state[sender];
181 181
182 /* 182 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
183 * Could avoid this lock when 183 raw_spin_lock(&f->tlbstate_lock);
184 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
185 * probably not worth checking this for a cache-hot lock.
186 */
187 raw_spin_lock(&f->tlbstate_lock);
188 184
189 f->flush_mm = mm; 185 f->flush_mm = mm;
190 f->flush_va = va; 186 f->flush_va = va;
@@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
202 198
203 f->flush_mm = NULL; 199 f->flush_mm = NULL;
204 f->flush_va = 0; 200 f->flush_va = 0;
205 raw_spin_unlock(&f->tlbstate_lock); 201 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
202 raw_spin_unlock(&f->tlbstate_lock);
206} 203}
207 204
208void native_flush_tlb_others(const struct cpumask *cpumask, 205void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -211,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
211 if (is_uv_system()) { 208 if (is_uv_system()) {
212 unsigned int cpu; 209 unsigned int cpu;
213 210
214 cpu = get_cpu(); 211 cpu = smp_processor_id();
215 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 212 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
216 if (cpumask) 213 if (cpumask)
217 flush_tlb_others_ipi(cpumask, mm, va); 214 flush_tlb_others_ipi(cpumask, mm, va);
218 put_cpu();
219 return; 215 return;
220 } 216 }
221 flush_tlb_others_ipi(cpumask, mm, va); 217 flush_tlb_others_ipi(cpumask, mm, va);
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 72cbec14d783..2d49d4e19a36 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
126 if (!user_mode_vm(regs)) { 126 if (!user_mode_vm(regs)) {
127 unsigned long stack = kernel_stack_pointer(regs); 127 unsigned long stack = kernel_stack_pointer(regs);
128 if (depth) 128 if (depth)
129 dump_trace(NULL, regs, (unsigned long *)stack, 129 dump_trace(NULL, regs, (unsigned long *)stack, 0,
130 &backtrace_ops, &depth); 130 &backtrace_ops, &depth);
131 return; 131 return;
132 } 132 }
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 9fadec074142..98ab13058f89 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -50,7 +50,7 @@ static inline void setup_num_counters(void)
50#endif 50#endif
51} 51}
52 52
53static int inline addr_increment(void) 53static inline int addr_increment(void)
54{ 54{
55#ifdef CONFIG_SMP 55#ifdef CONFIG_SMP
56 return smp_num_siblings == 2 ? 2 : 1; 56 return smp_num_siblings == 2 ? 2 : 1;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index e27dffbbb1a7..026e4931d162 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -350,7 +350,7 @@ static int __init early_fill_mp_bus_info(void)
350 350
351#define ENABLE_CF8_EXT_CFG (1ULL << 46) 351#define ENABLE_CF8_EXT_CFG (1ULL << 46)
352 352
353static void enable_pci_io_ecs(void *unused) 353static void __cpuinit enable_pci_io_ecs(void *unused)
354{ 354{
355 u64 reg; 355 u64 reg;
356 rdmsrl(MSR_AMD64_NB_CFG, reg); 356 rdmsrl(MSR_AMD64_NB_CFG, reg);
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
index 9260b3eb18d4..67858be4b52b 100644
--- a/arch/x86/pci/ce4100.c
+++ b/arch/x86/pci/ce4100.c
@@ -255,7 +255,7 @@ int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
255static int ce4100_conf_read(unsigned int seg, unsigned int bus, 255static int ce4100_conf_read(unsigned int seg, unsigned int bus,
256 unsigned int devfn, int reg, int len, u32 *value) 256 unsigned int devfn, int reg, int len, u32 *value)
257{ 257{
258 int i, retval = 1; 258 int i;
259 259
260 if (bus == 1) { 260 if (bus == 1) {
261 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { 261 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index b1805b78842f..494f2e7ea2b4 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -241,7 +241,7 @@ void __init pcibios_resource_survey(void)
241 e820_reserve_resources_late(); 241 e820_reserve_resources_late();
242 /* 242 /*
243 * Insert the IO APIC resources after PCI initialization has 243 * Insert the IO APIC resources after PCI initialization has
244 * occured to handle IO APICS that are mapped in on a BAR in 244 * occurred to handle IO APICS that are mapped in on a BAR in
245 * PCI space, but before trying to assign unassigned pci res. 245 * PCI space, but before trying to assign unassigned pci res.
246 */ 246 */
247 ioapic_insert_resources(); 247 ioapic_insert_resources();
@@ -304,7 +304,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
304 /* 304 /*
305 * ioremap() and ioremap_nocache() defaults to UC MINUS for now. 305 * ioremap() and ioremap_nocache() defaults to UC MINUS for now.
306 * To avoid attribute conflicts, request UC MINUS here 306 * To avoid attribute conflicts, request UC MINUS here
307 * aswell. 307 * as well.
308 */ 308 */
309 prot |= _PAGE_CACHE_UC_MINUS; 309 prot |= _PAGE_CACHE_UC_MINUS;
310 310
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 87e6c8323117..8201165bae28 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -597,21 +597,18 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
597 return 1; 597 return 1;
598 } 598 }
599 599
600 if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) && 600 if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN &&
601 (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) { 601 device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)
602 || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN &&
603 device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)
604 || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN &&
605 device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)) {
602 r->name = "PIIX/ICH"; 606 r->name = "PIIX/ICH";
603 r->get = pirq_piix_get; 607 r->get = pirq_piix_get;
604 r->set = pirq_piix_set; 608 r->set = pirq_piix_set;
605 return 1; 609 return 1;
606 } 610 }
607 611
608 if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) &&
609 (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) {
610 r->name = "PIIX/ICH";
611 r->get = pirq_piix_get;
612 r->set = pirq_piix_set;
613 return 1;
614 }
615 return 0; 612 return 0;
616} 613}
617 614
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 25cd4a07d09f..e37b407a0ee8 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -20,7 +20,8 @@
20#include <asm/xen/pci.h> 20#include <asm/xen/pci.h>
21 21
22#ifdef CONFIG_ACPI 22#ifdef CONFIG_ACPI
23static int xen_hvm_register_pirq(u32 gsi, int triggering) 23static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
24 int trigger, int polarity)
24{ 25{
25 int rc, irq; 26 int rc, irq;
26 struct physdev_map_pirq map_irq; 27 struct physdev_map_pirq map_irq;
@@ -41,7 +42,7 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
41 return -1; 42 return -1;
42 } 43 }
43 44
44 if (triggering == ACPI_EDGE_SENSITIVE) { 45 if (trigger == ACPI_EDGE_SENSITIVE) {
45 shareable = 0; 46 shareable = 0;
46 name = "ioapic-edge"; 47 name = "ioapic-edge";
47 } else { 48 } else {
@@ -49,18 +50,12 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
49 name = "ioapic-level"; 50 name = "ioapic-level";
50 } 51 }
51 52
52 irq = xen_map_pirq_gsi(map_irq.pirq, gsi, shareable, name); 53 irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
53 54
54 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); 55 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
55 56
56 return irq; 57 return irq;
57} 58}
58
59static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
60 int trigger, int polarity)
61{
62 return xen_hvm_register_pirq(gsi, trigger);
63}
64#endif 59#endif
65 60
66#if defined(CONFIG_PCI_MSI) 61#if defined(CONFIG_PCI_MSI)
@@ -91,7 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
91 86
92static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 87static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
93{ 88{
94 int irq, pirq, ret = 0; 89 int irq, pirq;
95 struct msi_desc *msidesc; 90 struct msi_desc *msidesc;
96 struct msi_msg msg; 91 struct msi_msg msg;
97 92
@@ -99,39 +94,32 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
99 __read_msi_msg(msidesc, &msg); 94 __read_msi_msg(msidesc, &msg);
100 pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | 95 pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
101 ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); 96 ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
102 if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { 97 if (msg.data != XEN_PIRQ_MSI_DATA ||
103 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 98 xen_irq_from_pirq(pirq) < 0) {
104 "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ); 99 pirq = xen_allocate_pirq_msi(dev, msidesc);
105 if (irq < 0) 100 if (pirq < 0)
106 goto error; 101 goto error;
107 ret = set_irq_msi(irq, msidesc); 102 xen_msi_compose_msg(dev, pirq, &msg);
108 if (ret < 0) 103 __write_msi_msg(msidesc, &msg);
109 goto error_while; 104 dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq);
110 printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d" 105 } else {
111 " pirq=%d\n", irq, pirq); 106 dev_dbg(&dev->dev,
112 return 0; 107 "xen: msi already bound to pirq=%d\n", pirq);
113 } 108 }
114 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 109 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
115 "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ)); 110 (type == PCI_CAP_ID_MSIX) ?
116 if (irq < 0 || pirq < 0) 111 "msi-x" : "msi");
112 if (irq < 0)
117 goto error; 113 goto error;
118 printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); 114 dev_dbg(&dev->dev,
119 xen_msi_compose_msg(dev, pirq, &msg); 115 "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq);
120 ret = set_irq_msi(irq, msidesc);
121 if (ret < 0)
122 goto error_while;
123 write_msi_msg(irq, &msg);
124 } 116 }
125 return 0; 117 return 0;
126 118
127error_while:
128 unbind_from_irqhandler(irq, NULL);
129error: 119error:
130 if (ret == -ENODEV) 120 dev_err(&dev->dev,
131 dev_err(&dev->dev, "Xen PCI frontend has not registered" \ 121 "Xen PCI frontend has not registered MSI/MSI-X support!\n");
132 " MSI/MSI-X support!\n"); 122 return -ENODEV;
133
134 return ret;
135} 123}
136 124
137/* 125/*
@@ -150,35 +138,26 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
150 return -ENOMEM; 138 return -ENOMEM;
151 139
152 if (type == PCI_CAP_ID_MSIX) 140 if (type == PCI_CAP_ID_MSIX)
153 ret = xen_pci_frontend_enable_msix(dev, &v, nvec); 141 ret = xen_pci_frontend_enable_msix(dev, v, nvec);
154 else 142 else
155 ret = xen_pci_frontend_enable_msi(dev, &v); 143 ret = xen_pci_frontend_enable_msi(dev, v);
156 if (ret) 144 if (ret)
157 goto error; 145 goto error;
158 i = 0; 146 i = 0;
159 list_for_each_entry(msidesc, &dev->msi_list, list) { 147 list_for_each_entry(msidesc, &dev->msi_list, list) {
160 irq = xen_allocate_pirq(v[i], 0, /* not sharable */ 148 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
161 (type == PCI_CAP_ID_MSIX) ? 149 (type == PCI_CAP_ID_MSIX) ?
162 "pcifront-msi-x" : "pcifront-msi"); 150 "pcifront-msi-x" :
163 if (irq < 0) { 151 "pcifront-msi");
164 ret = -1; 152 if (irq < 0)
165 goto free; 153 goto free;
166 }
167
168 ret = set_irq_msi(irq, msidesc);
169 if (ret)
170 goto error_while;
171 i++; 154 i++;
172 } 155 }
173 kfree(v); 156 kfree(v);
174 return 0; 157 return 0;
175 158
176error_while:
177 unbind_from_irqhandler(irq, NULL);
178error: 159error:
179 if (ret == -ENODEV) 160 dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
180 dev_err(&dev->dev, "Xen PCI frontend has not registered" \
181 " MSI/MSI-X support!\n");
182free: 161free:
183 kfree(v); 162 kfree(v);
184 return ret; 163 return ret;
@@ -193,6 +172,9 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
193 xen_pci_frontend_disable_msix(dev); 172 xen_pci_frontend_disable_msix(dev);
194 else 173 else
195 xen_pci_frontend_disable_msi(dev); 174 xen_pci_frontend_disable_msi(dev);
175
176 /* Free the IRQ's and the msidesc using the generic code. */
177 default_teardown_msi_irqs(dev);
196} 178}
197 179
198static void xen_teardown_msi_irq(unsigned int irq) 180static void xen_teardown_msi_irq(unsigned int irq)
@@ -200,47 +182,91 @@ static void xen_teardown_msi_irq(unsigned int irq)
200 xen_destroy_irq(irq); 182 xen_destroy_irq(irq);
201} 183}
202 184
185#ifdef CONFIG_XEN_DOM0
203static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 186static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
204{ 187{
205 int irq, ret; 188 int ret = 0;
206 struct msi_desc *msidesc; 189 struct msi_desc *msidesc;
207 190
208 list_for_each_entry(msidesc, &dev->msi_list, list) { 191 list_for_each_entry(msidesc, &dev->msi_list, list) {
209 irq = xen_create_msi_irq(dev, msidesc, type); 192 struct physdev_map_pirq map_irq;
210 if (irq < 0)
211 return -1;
212 193
213 ret = set_irq_msi(irq, msidesc); 194 memset(&map_irq, 0, sizeof(map_irq));
214 if (ret) 195 map_irq.domid = DOMID_SELF;
215 goto error; 196 map_irq.type = MAP_PIRQ_TYPE_MSI;
216 } 197 map_irq.index = -1;
217 return 0; 198 map_irq.pirq = -1;
199 map_irq.bus = dev->bus->number;
200 map_irq.devfn = dev->devfn;
218 201
219error: 202 if (type == PCI_CAP_ID_MSIX) {
220 xen_destroy_irq(irq); 203 int pos;
204 u32 table_offset, bir;
205
206 pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
207
208 pci_read_config_dword(dev, pos + PCI_MSIX_TABLE,
209 &table_offset);
210 bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
211
212 map_irq.table_base = pci_resource_start(dev, bir);
213 map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
214 }
215
216 ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
217 if (ret) {
218 dev_warn(&dev->dev, "xen map irq failed %d\n", ret);
219 goto out;
220 }
221
222 ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
223 map_irq.pirq, map_irq.index,
224 (type == PCI_CAP_ID_MSIX) ?
225 "msi-x" : "msi");
226 if (ret < 0)
227 goto out;
228 }
229 ret = 0;
230out:
221 return ret; 231 return ret;
222} 232}
223#endif 233#endif
234#endif
224 235
225static int xen_pcifront_enable_irq(struct pci_dev *dev) 236static int xen_pcifront_enable_irq(struct pci_dev *dev)
226{ 237{
227 int rc; 238 int rc;
228 int share = 1; 239 int share = 1;
240 int pirq;
241 u8 gsi;
229 242
230 dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq); 243 rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
244 if (rc < 0) {
245 dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
246 rc);
247 return rc;
248 }
231 249
232 if (dev->irq < 0) 250 rc = xen_allocate_pirq_gsi(gsi);
233 return -EINVAL; 251 if (rc < 0) {
252 dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n",
253 gsi, rc);
254 return rc;
255 }
256 pirq = rc;
234 257
235 if (dev->irq < NR_IRQS_LEGACY) 258 if (gsi < NR_IRQS_LEGACY)
236 share = 0; 259 share = 0;
237 260
238 rc = xen_allocate_pirq(dev->irq, share, "pcifront"); 261 rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
239 if (rc < 0) { 262 if (rc < 0) {
240 dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n", 263 dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
241 dev->irq, rc); 264 gsi, pirq, rc);
242 return rc; 265 return rc;
243 } 266 }
267
268 dev->irq = rc;
269 dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
244 return 0; 270 return 0;
245} 271}
246 272
@@ -292,7 +318,7 @@ int __init pci_xen_hvm_init(void)
292#ifdef CONFIG_XEN_DOM0 318#ifdef CONFIG_XEN_DOM0
293static int xen_register_pirq(u32 gsi, int triggering) 319static int xen_register_pirq(u32 gsi, int triggering)
294{ 320{
295 int rc, irq; 321 int rc, pirq, irq = -1;
296 struct physdev_map_pirq map_irq; 322 struct physdev_map_pirq map_irq;
297 int shareable = 0; 323 int shareable = 0;
298 char *name; 324 char *name;
@@ -308,17 +334,20 @@ static int xen_register_pirq(u32 gsi, int triggering)
308 name = "ioapic-level"; 334 name = "ioapic-level";
309 } 335 }
310 336
311 irq = xen_allocate_pirq(gsi, shareable, name); 337 pirq = xen_allocate_pirq_gsi(gsi);
312 338 if (pirq < 0)
313 printk(KERN_DEBUG "xen: --> irq=%d\n", irq); 339 goto out;
314 340
341 irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
315 if (irq < 0) 342 if (irq < 0)
316 goto out; 343 goto out;
317 344
345 printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d\n", pirq, irq);
346
318 map_irq.domid = DOMID_SELF; 347 map_irq.domid = DOMID_SELF;
319 map_irq.type = MAP_PIRQ_TYPE_GSI; 348 map_irq.type = MAP_PIRQ_TYPE_GSI;
320 map_irq.index = gsi; 349 map_irq.index = gsi;
321 map_irq.pirq = irq; 350 map_irq.pirq = pirq;
322 351
323 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); 352 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
324 if (rc) { 353 if (rc) {
@@ -405,13 +434,18 @@ static int __init pci_xen_initial_domain(void)
405 434
406void __init xen_setup_pirqs(void) 435void __init xen_setup_pirqs(void)
407{ 436{
408 int irq; 437 int pirq, irq;
409 438
410 pci_xen_initial_domain(); 439 pci_xen_initial_domain();
411 440
412 if (0 == nr_ioapics) { 441 if (0 == nr_ioapics) {
413 for (irq = 0; irq < NR_IRQS_LEGACY; irq++) 442 for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
414 xen_allocate_pirq(irq, 0, "xt-pic"); 443 pirq = xen_allocate_pirq_gsi(irq);
444 if (WARN(pirq < 0,
445 "Could not allocate PIRQ for legacy interrupt\n"))
446 break;
447 irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic");
448 }
415 return; 449 return;
416 } 450 }
417 451
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index cd6f184c3b3f..28071bb31db7 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -16,21 +16,19 @@
16#include <linux/serial_8250.h> 16#include <linux/serial_8250.h>
17 17
18#include <asm/ce4100.h> 18#include <asm/ce4100.h>
19#include <asm/prom.h>
19#include <asm/setup.h> 20#include <asm/setup.h>
21#include <asm/i8259.h>
20#include <asm/io.h> 22#include <asm/io.h>
23#include <asm/io_apic.h>
21 24
22static int ce4100_i8042_detect(void) 25static int ce4100_i8042_detect(void)
23{ 26{
24 return 0; 27 return 0;
25} 28}
26 29
27static void __init sdv_find_smp_config(void)
28{
29}
30
31#ifdef CONFIG_SERIAL_8250 30#ifdef CONFIG_SERIAL_8250
32 31
33
34static unsigned int mem_serial_in(struct uart_port *p, int offset) 32static unsigned int mem_serial_in(struct uart_port *p, int offset)
35{ 33{
36 offset = offset << p->regshift; 34 offset = offset << p->regshift;
@@ -119,6 +117,15 @@ static void __init sdv_arch_setup(void)
119 sdv_serial_fixup(); 117 sdv_serial_fixup();
120} 118}
121 119
120#ifdef CONFIG_X86_IO_APIC
121static void __cpuinit sdv_pci_init(void)
122{
123 x86_of_pci_init();
124 /* We can't set this earlier, because we need to calibrate the timer */
125 legacy_pic = &null_legacy_pic;
126}
127#endif
128
122/* 129/*
123 * CE4100 specific x86_init function overrides and early setup 130 * CE4100 specific x86_init function overrides and early setup
124 * calls. 131 * calls.
@@ -129,6 +136,11 @@ void __init x86_ce4100_early_setup(void)
129 x86_platform.i8042_detect = ce4100_i8042_detect; 136 x86_platform.i8042_detect = ce4100_i8042_detect;
130 x86_init.resources.probe_roms = x86_init_noop; 137 x86_init.resources.probe_roms = x86_init_noop;
131 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 138 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
132 x86_init.mpparse.find_smp_config = sdv_find_smp_config; 139 x86_init.mpparse.find_smp_config = x86_init_noop;
133 x86_init.pci.init = ce4100_pci_init; 140 x86_init.pci.init = ce4100_pci_init;
141
142#ifdef CONFIG_X86_IO_APIC
143 x86_init.pci.init_irq = sdv_pci_init;
144 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
145#endif
134} 146}
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
new file mode 100644
index 000000000000..dc701ea58546
--- /dev/null
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -0,0 +1,428 @@
1/*
2 * CE4100 on Falcon Falls
3 *
4 * (c) Copyright 2010 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; version 2 of the License.
9 */
10/dts-v1/;
11/ {
12 model = "intel,falconfalls";
13 compatible = "intel,falconfalls";
14 #address-cells = <1>;
15 #size-cells = <1>;
16
17 cpus {
18 #address-cells = <1>;
19 #size-cells = <0>;
20
21 cpu@0 {
22 device_type = "cpu";
23 compatible = "intel,ce4100";
24 reg = <0>;
25 lapic = <&lapic0>;
26 };
27 };
28
29 soc@0 {
30 #address-cells = <1>;
31 #size-cells = <1>;
32 compatible = "intel,ce4100-cp";
33 ranges;
34
35 ioapic1: interrupt-controller@fec00000 {
36 #interrupt-cells = <2>;
37 compatible = "intel,ce4100-ioapic";
38 interrupt-controller;
39 reg = <0xfec00000 0x1000>;
40 };
41
42 timer@fed00000 {
43 compatible = "intel,ce4100-hpet";
44 reg = <0xfed00000 0x200>;
45 };
46
47 lapic0: interrupt-controller@fee00000 {
48 compatible = "intel,ce4100-lapic";
49 reg = <0xfee00000 0x1000>;
50 };
51
52 pci@3fc {
53 #address-cells = <3>;
54 #size-cells = <2>;
55 compatible = "intel,ce4100-pci", "pci";
56 device_type = "pci";
57 bus-range = <0 0>;
58 ranges = <0x2000000 0 0xbffff000 0xbffff000 0 0x1000
59 0x2000000 0 0xdffe0000 0xdffe0000 0 0x1000
60 0x0000000 0 0x0 0x0 0 0x100>;
61
62 /* Secondary IO-APIC */
63 ioapic2: interrupt-controller@0,1 {
64 #interrupt-cells = <2>;
65 compatible = "intel,ce4100-ioapic";
66 interrupt-controller;
67 reg = <0x100 0x0 0x0 0x0 0x0>;
68 assigned-addresses = <0x02000000 0x0 0xbffff000 0x0 0x1000>;
69 };
70
71 pci@1,0 {
72 #address-cells = <3>;
73 #size-cells = <2>;
74 compatible = "intel,ce4100-pci", "pci";
75 device_type = "pci";
76 bus-range = <1 1>;
77 ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>;
78
79 interrupt-parent = <&ioapic2>;
80
81 display@2,0 {
82 compatible = "pci8086,2e5b.2",
83 "pci8086,2e5b",
84 "pciclass038000",
85 "pciclass0380";
86
87 reg = <0x11000 0x0 0x0 0x0 0x0>;
88 interrupts = <0 1>;
89 };
90
91 multimedia@3,0 {
92 compatible = "pci8086,2e5c.2",
93 "pci8086,2e5c",
94 "pciclass048000",
95 "pciclass0480";
96
97 reg = <0x11800 0x0 0x0 0x0 0x0>;
98 interrupts = <2 1>;
99 };
100
101 multimedia@4,0 {
102 compatible = "pci8086,2e5d.2",
103 "pci8086,2e5d",
104 "pciclass048000",
105 "pciclass0480";
106
107 reg = <0x12000 0x0 0x0 0x0 0x0>;
108 interrupts = <4 1>;
109 };
110
111 multimedia@4,1 {
112 compatible = "pci8086,2e5e.2",
113 "pci8086,2e5e",
114 "pciclass048000",
115 "pciclass0480";
116
117 reg = <0x12100 0x0 0x0 0x0 0x0>;
118 interrupts = <5 1>;
119 };
120
121 sound@6,0 {
122 compatible = "pci8086,2e5f.2",
123 "pci8086,2e5f",
124 "pciclass040100",
125 "pciclass0401";
126
127 reg = <0x13000 0x0 0x0 0x0 0x0>;
128 interrupts = <6 1>;
129 };
130
131 sound@6,1 {
132 compatible = "pci8086,2e5f.2",
133 "pci8086,2e5f",
134 "pciclass040100",
135 "pciclass0401";
136
137 reg = <0x13100 0x0 0x0 0x0 0x0>;
138 interrupts = <7 1>;
139 };
140
141 sound@6,2 {
142 compatible = "pci8086,2e60.2",
143 "pci8086,2e60",
144 "pciclass040100",
145 "pciclass0401";
146
147 reg = <0x13200 0x0 0x0 0x0 0x0>;
148 interrupts = <8 1>;
149 };
150
151 display@8,0 {
152 compatible = "pci8086,2e61.2",
153 "pci8086,2e61",
154 "pciclass038000",
155 "pciclass0380";
156
157 reg = <0x14000 0x0 0x0 0x0 0x0>;
158 interrupts = <9 1>;
159 };
160
161 display@8,1 {
162 compatible = "pci8086,2e62.2",
163 "pci8086,2e62",
164 "pciclass038000",
165 "pciclass0380";
166
167 reg = <0x14100 0x0 0x0 0x0 0x0>;
168 interrupts = <10 1>;
169 };
170
171 multimedia@8,2 {
172 compatible = "pci8086,2e63.2",
173 "pci8086,2e63",
174 "pciclass048000",
175 "pciclass0480";
176
177 reg = <0x14200 0x0 0x0 0x0 0x0>;
178 interrupts = <11 1>;
179 };
180
181 entertainment-encryption@9,0 {
182 compatible = "pci8086,2e64.2",
183 "pci8086,2e64",
184 "pciclass101000",
185 "pciclass1010";
186
187 reg = <0x14800 0x0 0x0 0x0 0x0>;
188 interrupts = <12 1>;
189 };
190
191 localbus@a,0 {
192 compatible = "pci8086,2e65.2",
193 "pci8086,2e65",
194 "pciclassff0000",
195 "pciclassff00";
196
197 reg = <0x15000 0x0 0x0 0x0 0x0>;
198 };
199
200 serial@b,0 {
201 compatible = "pci8086,2e66.2",
202 "pci8086,2e66",
203 "pciclass070003",
204 "pciclass0700";
205
206 reg = <0x15800 0x0 0x0 0x0 0x0>;
207 interrupts = <14 1>;
208 };
209
210 gpio@b,1 {
211 compatible = "pci8086,2e67.2",
212 "pci8086,2e67",
213 "pciclassff0000",
214 "pciclassff00";
215
216 #gpio-cells = <2>;
217 reg = <0x15900 0x0 0x0 0x0 0x0>;
218 interrupts = <15 1>;
219 gpio-controller;
220 };
221
222 i2c-controller@b,2 {
223 #address-cells = <2>;
224 #size-cells = <1>;
225 compatible = "pci8086,2e68.2",
226 "pci8086,2e68",
227 "pciclass,ff0000",
228 "pciclass,ff00";
229
230 reg = <0x15a00 0x0 0x0 0x0 0x0>;
231 interrupts = <16 1>;
232 ranges = <0 0 0x02000000 0 0xdffe0500 0x100
233 1 0 0x02000000 0 0xdffe0600 0x100
234 2 0 0x02000000 0 0xdffe0700 0x100>;
235
236 i2c@0 {
237 #address-cells = <1>;
238 #size-cells = <0>;
239 compatible = "intel,ce4100-i2c-controller";
240 reg = <0 0 0x100>;
241 };
242
243 i2c@1 {
244 #address-cells = <1>;
245 #size-cells = <0>;
246 compatible = "intel,ce4100-i2c-controller";
247 reg = <1 0 0x100>;
248
249 gpio@26 {
250 #gpio-cells = <2>;
251 compatible = "ti,pcf8575";
252 reg = <0x26>;
253 gpio-controller;
254 };
255 };
256
257 i2c@2 {
258 #address-cells = <1>;
259 #size-cells = <0>;
260 compatible = "intel,ce4100-i2c-controller";
261 reg = <2 0 0x100>;
262
263 gpio@26 {
264 #gpio-cells = <2>;
265 compatible = "ti,pcf8575";
266 reg = <0x26>;
267 gpio-controller;
268 };
269 };
270 };
271
272 smard-card@b,3 {
273 compatible = "pci8086,2e69.2",
274 "pci8086,2e69",
275 "pciclass070500",
276 "pciclass0705";
277
278 reg = <0x15b00 0x0 0x0 0x0 0x0>;
279 interrupts = <15 1>;
280 };
281
282 spi-controller@b,4 {
283 #address-cells = <1>;
284 #size-cells = <0>;
285 compatible =
286 "pci8086,2e6a.2",
287 "pci8086,2e6a",
288 "pciclass,ff0000",
289 "pciclass,ff00";
290
291 reg = <0x15c00 0x0 0x0 0x0 0x0>;
292 interrupts = <15 1>;
293
294 dac@0 {
295 compatible = "ti,pcm1755";
296 reg = <0>;
297 spi-max-frequency = <115200>;
298 };
299
300 dac@1 {
301 compatible = "ti,pcm1609a";
302 reg = <1>;
303 spi-max-frequency = <115200>;
304 };
305
306 eeprom@2 {
307 compatible = "atmel,at93c46";
308 reg = <2>;
309 spi-max-frequency = <115200>;
310 };
311 };
312
313 multimedia@b,7 {
314 compatible = "pci8086,2e6d.2",
315 "pci8086,2e6d",
316 "pciclassff0000",
317 "pciclassff00";
318
319 reg = <0x15f00 0x0 0x0 0x0 0x0>;
320 };
321
322 ethernet@c,0 {
323 compatible = "pci8086,2e6e.2",
324 "pci8086,2e6e",
325 "pciclass020000",
326 "pciclass0200";
327
328 reg = <0x16000 0x0 0x0 0x0 0x0>;
329 interrupts = <21 1>;
330 };
331
332 clock@c,1 {
333 compatible = "pci8086,2e6f.2",
334 "pci8086,2e6f",
335 "pciclassff0000",
336 "pciclassff00";
337
338 reg = <0x16100 0x0 0x0 0x0 0x0>;
339 interrupts = <3 1>;
340 };
341
342 usb@d,0 {
343 compatible = "pci8086,2e70.2",
344 "pci8086,2e70",
345 "pciclass0c0320",
346 "pciclass0c03";
347
348 reg = <0x16800 0x0 0x0 0x0 0x0>;
349 interrupts = <22 3>;
350 };
351
352 usb@d,1 {
353 compatible = "pci8086,2e70.2",
354 "pci8086,2e70",
355 "pciclass0c0320",
356 "pciclass0c03";
357
358 reg = <0x16900 0x0 0x0 0x0 0x0>;
359 interrupts = <22 3>;
360 };
361
362 sata@e,0 {
363 compatible = "pci8086,2e71.0",
364 "pci8086,2e71",
365 "pciclass010601",
366 "pciclass0106";
367
368 reg = <0x17000 0x0 0x0 0x0 0x0>;
369 interrupts = <23 3>;
370 };
371
372 flash@f,0 {
373 compatible = "pci8086,701.1",
374 "pci8086,701",
375 "pciclass050100",
376 "pciclass0501";
377
378 reg = <0x17800 0x0 0x0 0x0 0x0>;
379 interrupts = <13 1>;
380 };
381
382 entertainment-encryption@10,0 {
383 compatible = "pci8086,702.1",
384 "pci8086,702",
385 "pciclass101000",
386 "pciclass1010";
387
388 reg = <0x18000 0x0 0x0 0x0 0x0>;
389 };
390
391 co-processor@11,0 {
392 compatible = "pci8086,703.1",
393 "pci8086,703",
394 "pciclass0b4000",
395 "pciclass0b40";
396
397 reg = <0x18800 0x0 0x0 0x0 0x0>;
398 interrupts = <1 1>;
399 };
400
401 multimedia@12,0 {
402 compatible = "pci8086,704.0",
403 "pci8086,704",
404 "pciclass048000",
405 "pciclass0480";
406
407 reg = <0x19000 0x0 0x0 0x0 0x0>;
408 };
409 };
410
411 isa@1f,0 {
412 #address-cells = <2>;
413 #size-cells = <1>;
414 compatible = "isa";
415 ranges = <1 0 0 0 0 0x100>;
416
417 rtc@70 {
418 compatible = "intel,ce4100-rtc", "motorola,mc146818";
419 interrupts = <8 3>;
420 interrupt-parent = <&ioapic1>;
421 ctrl-reg = <2>;
422 freq-reg = <0x26>;
423 reg = <1 0x70 2>;
424 };
425 };
426 };
427 };
428};
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index ea6529e93c6f..5c0207bf959b 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -31,6 +31,7 @@
31#include <asm/apic.h> 31#include <asm/apic.h>
32#include <asm/io_apic.h> 32#include <asm/io_apic.h>
33#include <asm/mrst.h> 33#include <asm/mrst.h>
34#include <asm/mrst-vrtc.h>
34#include <asm/io.h> 35#include <asm/io.h>
35#include <asm/i8259.h> 36#include <asm/i8259.h>
36#include <asm/intel_scu_ipc.h> 37#include <asm/intel_scu_ipc.h>
@@ -268,6 +269,7 @@ void __init x86_mrst_early_setup(void)
268 269
269 x86_platform.calibrate_tsc = mrst_calibrate_tsc; 270 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
270 x86_platform.i8042_detect = mrst_i8042_detect; 271 x86_platform.i8042_detect = mrst_i8042_detect;
272 x86_init.timers.wallclock_init = mrst_rtc_init;
271 x86_init.pci.init = pci_mrst_init; 273 x86_init.pci.init = pci_mrst_init;
272 x86_init.pci.fixup_irqs = x86_init_noop; 274 x86_init.pci.fixup_irqs = x86_init_noop;
273 275
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
index 32cd7edd71a0..04cf645feb92 100644
--- a/arch/x86/platform/mrst/vrtc.c
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -100,22 +100,14 @@ int vrtc_set_mmss(unsigned long nowtime)
100 100
101void __init mrst_rtc_init(void) 101void __init mrst_rtc_init(void)
102{ 102{
103 unsigned long rtc_paddr; 103 unsigned long vrtc_paddr = sfi_mrtc_array[0].phys_addr;
104 void __iomem *virt_base;
105 104
106 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); 105 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
107 if (!sfi_mrtc_num) 106 if (!sfi_mrtc_num || !vrtc_paddr)
108 return; 107 return;
109 108
110 rtc_paddr = sfi_mrtc_array[0].phys_addr; 109 vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC,
111 110 vrtc_paddr);
112 /* vRTC's register address may not be page aligned */
113 set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr);
114
115 virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC);
116 virt_base += rtc_paddr & ~PAGE_MASK;
117 vrtc_virt_base = virt_base;
118
119 x86_platform.get_wallclock = vrtc_get_time; 111 x86_platform.get_wallclock = vrtc_get_time;
120 x86_platform.set_wallclock = vrtc_set_mmss; 112 x86_platform.set_wallclock = vrtc_set_mmss;
121} 113}
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index e797428b163b..c2a8cab65e5d 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_OLPC) += olpc.o 1obj-$(CONFIG_OLPC) += olpc.o
2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o 2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
3obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o 3obj-$(CONFIG_OLPC) += olpc_ofw.o
4obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o 4obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 7b24460917d5..374a05d8ad22 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -131,7 +131,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
131 unsigned long mmr_offset, int limit) 131 unsigned long mmr_offset, int limit)
132{ 132{
133 const struct cpumask *eligible_cpu = cpumask_of(cpu); 133 const struct cpumask *eligible_cpu = cpumask_of(cpu);
134 struct irq_cfg *cfg = get_irq_chip_data(irq); 134 struct irq_cfg *cfg = irq_get_chip_data(irq);
135 unsigned long mmr_value; 135 unsigned long mmr_value;
136 struct uv_IO_APIC_route_entry *entry; 136 struct uv_IO_APIC_route_entry *entry;
137 int mmr_pnode, err; 137 int mmr_pnode, err;
@@ -148,7 +148,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
148 else 148 else
149 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 149 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
150 150
151 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, 151 irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
152 irq_name); 152 irq_name);
153 153
154 mmr_value = 0; 154 mmr_value = 0;
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index 632037671746..fe4cf8294878 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -569,11 +569,13 @@ out_unlock:
569static struct irqaction master_action = { 569static struct irqaction master_action = {
570 .handler = piix4_master_intr, 570 .handler = piix4_master_intr,
571 .name = "PIIX4-8259", 571 .name = "PIIX4-8259",
572 .flags = IRQF_NO_THREAD,
572}; 573};
573 574
574static struct irqaction cascade_action = { 575static struct irqaction cascade_action = {
575 .handler = no_action, 576 .handler = no_action,
576 .name = "cascade", 577 .name = "cascade",
578 .flags = IRQF_NO_THREAD,
577}; 579};
578 580
579static inline void set_piix4_virtual_irq_type(void) 581static inline void set_piix4_virtual_irq_type(void)
@@ -606,7 +608,7 @@ static void __init visws_pre_intr_init(void)
606 chip = &cobalt_irq_type; 608 chip = &cobalt_irq_type;
607 609
608 if (chip) 610 if (chip)
609 set_irq_chip(i, chip); 611 irq_set_chip(i, chip);
610 } 612 }
611 613
612 setup_irq(CO_IRQ_8259, &master_action); 614 setup_irq(CO_IRQ_8259, &master_action);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 5b54892e4bc3..1c7121ba18ff 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -38,7 +38,7 @@ config XEN_MAX_DOMAIN_MEMORY
38 38
39config XEN_SAVE_RESTORE 39config XEN_SAVE_RESTORE
40 bool 40 bool
41 depends on XEN && PM 41 depends on XEN
42 default y 42 default y
43 43
44config XEN_DEBUG_FS 44config XEN_DEBUG_FS
@@ -48,3 +48,11 @@ config XEN_DEBUG_FS
48 help 48 help
49 Enable statistics output and various tuning options in debugfs. 49 Enable statistics output and various tuning options in debugfs.
50 Enabling this option may incur a significant performance overhead. 50 Enabling this option may incur a significant performance overhead.
51
52config XEN_DEBUG
53 bool "Enable Xen debug checks"
54 depends on XEN
55 default n
56 help
57 Enable various WARN_ON checks in the Xen MMU code.
58 Enabling this option WILL incur a significant performance overhead.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 50542efe45fb..49dbd78ec3cb 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1284,15 +1284,14 @@ static int init_hvm_pv_info(int *major, int *minor)
1284 1284
1285 xen_setup_features(); 1285 xen_setup_features();
1286 1286
1287 pv_info = xen_info; 1287 pv_info.name = "Xen HVM";
1288 pv_info.kernel_rpl = 0;
1289 1288
1290 xen_domain_type = XEN_HVM_DOMAIN; 1289 xen_domain_type = XEN_HVM_DOMAIN;
1291 1290
1292 return 0; 1291 return 0;
1293} 1292}
1294 1293
1295void xen_hvm_init_shared_info(void) 1294void __ref xen_hvm_init_shared_info(void)
1296{ 1295{
1297 int cpu; 1296 int cpu;
1298 struct xen_add_to_physmap xatp; 1297 struct xen_add_to_physmap xatp;
@@ -1331,6 +1330,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1331 switch (action) { 1330 switch (action) {
1332 case CPU_UP_PREPARE: 1331 case CPU_UP_PREPARE:
1333 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1332 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1333 if (xen_have_vector_callback)
1334 xen_init_lock_cpu(cpu);
1334 break; 1335 break;
1335 default: 1336 default:
1336 break; 1337 break;
@@ -1355,6 +1356,7 @@ static void __init xen_hvm_guest_init(void)
1355 1356
1356 if (xen_feature(XENFEAT_hvm_callback_vector)) 1357 if (xen_feature(XENFEAT_hvm_callback_vector))
1357 xen_have_vector_callback = 1; 1358 xen_have_vector_callback = 1;
1359 xen_hvm_smp_init();
1358 register_cpu_notifier(&xen_hvm_cpu_notifier); 1360 register_cpu_notifier(&xen_hvm_cpu_notifier);
1359 xen_unplug_emulated_devices(); 1361 xen_unplug_emulated_devices();
1360 have_vcpu_info_placement = 0; 1362 have_vcpu_info_placement = 0;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index f6089421147a..c82df6c9c0f0 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -46,6 +46,7 @@
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/gfp.h> 47#include <linux/gfp.h>
48#include <linux/memblock.h> 48#include <linux/memblock.h>
49#include <linux/seq_file.h>
49 50
50#include <asm/pgtable.h> 51#include <asm/pgtable.h>
51#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
@@ -78,8 +79,7 @@
78 79
79/* 80/*
80 * Protects atomic reservation decrease/increase against concurrent increases. 81 * Protects atomic reservation decrease/increase against concurrent increases.
81 * Also protects non-atomic updates of current_pages and driver_pages, and 82 * Also protects non-atomic updates of current_pages and balloon lists.
82 * balloon lists.
83 */ 83 */
84DEFINE_SPINLOCK(xen_reservation_lock); 84DEFINE_SPINLOCK(xen_reservation_lock);
85 85
@@ -416,8 +416,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
416 if (val & _PAGE_PRESENT) { 416 if (val & _PAGE_PRESENT) {
417 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 417 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
418 pteval_t flags = val & PTE_FLAGS_MASK; 418 pteval_t flags = val & PTE_FLAGS_MASK;
419 unsigned long mfn = pfn_to_mfn(pfn); 419 unsigned long mfn;
420 420
421 if (!xen_feature(XENFEAT_auto_translated_physmap))
422 mfn = get_phys_to_machine(pfn);
423 else
424 mfn = pfn;
421 /* 425 /*
422 * If there's no mfn for the pfn, then just create an 426 * If there's no mfn for the pfn, then just create an
423 * empty non-present pte. Unfortunately this loses 427 * empty non-present pte. Unfortunately this loses
@@ -427,8 +431,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
427 if (unlikely(mfn == INVALID_P2M_ENTRY)) { 431 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
428 mfn = 0; 432 mfn = 0;
429 flags = 0; 433 flags = 0;
434 } else {
435 /*
436 * Paramount to do this test _after_ the
437 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
438 * IDENTITY_FRAME_BIT resolves to true.
439 */
440 mfn &= ~FOREIGN_FRAME_BIT;
441 if (mfn & IDENTITY_FRAME_BIT) {
442 mfn &= ~IDENTITY_FRAME_BIT;
443 flags |= _PAGE_IOMAP;
444 }
430 } 445 }
431
432 val = ((pteval_t)mfn << PAGE_SHIFT) | flags; 446 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
433 } 447 }
434 448
@@ -532,6 +546,41 @@ pte_t xen_make_pte(pteval_t pte)
532} 546}
533PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 547PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
534 548
549#ifdef CONFIG_XEN_DEBUG
550pte_t xen_make_pte_debug(pteval_t pte)
551{
552 phys_addr_t addr = (pte & PTE_PFN_MASK);
553 phys_addr_t other_addr;
554 bool io_page = false;
555 pte_t _pte;
556
557 if (pte & _PAGE_IOMAP)
558 io_page = true;
559
560 _pte = xen_make_pte(pte);
561
562 if (!addr)
563 return _pte;
564
565 if (io_page &&
566 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
567 other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
568 WARN(addr != other_addr,
569 "0x%lx is using VM_IO, but it is 0x%lx!\n",
570 (unsigned long)addr, (unsigned long)other_addr);
571 } else {
572 pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
573 other_addr = (_pte.pte & PTE_PFN_MASK);
574 WARN((addr == other_addr) && (!io_page) && (!iomap_set),
575 "0x%lx is missing VM_IO (and wasn't fixed)!\n",
576 (unsigned long)addr);
577 }
578
579 return _pte;
580}
581PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
582#endif
583
535pgd_t xen_make_pgd(pgdval_t pgd) 584pgd_t xen_make_pgd(pgdval_t pgd)
536{ 585{
537 pgd = pte_pfn_to_mfn(pgd); 586 pgd = pte_pfn_to_mfn(pgd);
@@ -1438,10 +1487,12 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1438 /* 1487 /*
1439 * If the new pfn is within the range of the newly allocated 1488 * If the new pfn is within the range of the newly allocated
1440 * kernel pagetable, and it isn't being mapped into an 1489 * kernel pagetable, and it isn't being mapped into an
1441 * early_ioremap fixmap slot, make sure it is RO. 1490 * early_ioremap fixmap slot as a freshly allocated page, make sure
1491 * it is RO.
1442 */ 1492 */
1443 if (!is_early_ioremap_ptep(ptep) && 1493 if (((!is_early_ioremap_ptep(ptep) &&
1444 pfn >= e820_table_start && pfn < e820_table_end) 1494 pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
1495 (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
1445 pte = pte_wrprotect(pte); 1496 pte = pte_wrprotect(pte);
1446 1497
1447 return pte; 1498 return pte;
@@ -1651,9 +1702,6 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1651 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { 1702 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1652 pte_t pte; 1703 pte_t pte;
1653 1704
1654 if (pfn > max_pfn_mapped)
1655 max_pfn_mapped = pfn;
1656
1657 if (!pte_none(pte_page[pteidx])) 1705 if (!pte_none(pte_page[pteidx]))
1658 continue; 1706 continue;
1659 1707
@@ -1695,7 +1743,7 @@ static void convert_pfn_mfn(void *v)
1695} 1743}
1696 1744
1697/* 1745/*
1698 * Set up the inital kernel pagetable. 1746 * Set up the initial kernel pagetable.
1699 * 1747 *
1700 * We can construct this by grafting the Xen provided pagetable into 1748 * We can construct this by grafting the Xen provided pagetable into
1701 * head_64.S's preconstructed pagetables. We copy the Xen L2's into 1749 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
@@ -1711,6 +1759,12 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1711 pud_t *l3; 1759 pud_t *l3;
1712 pmd_t *l2; 1760 pmd_t *l2;
1713 1761
1762 /* max_pfn_mapped is the last pfn mapped in the initial memory
1763 * mappings. Considering that on Xen after the kernel mappings we
1764 * have the mappings of some pages that don't exist in pfn space, we
1765 * set max_pfn_mapped to the last real pfn mapped. */
1766 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1767
1714 /* Zap identity mapping */ 1768 /* Zap identity mapping */
1715 init_level4_pgt[0] = __pgd(0); 1769 init_level4_pgt[0] = __pgd(0);
1716 1770
@@ -1815,9 +1869,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1815 initial_kernel_pmd = 1869 initial_kernel_pmd =
1816 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 1870 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1817 1871
1818 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 1872 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1819 xen_start_info->nr_pt_frames * PAGE_SIZE +
1820 512*1024);
1821 1873
1822 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1874 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1823 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); 1875 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
@@ -1940,6 +1992,9 @@ __init void xen_ident_map_ISA(void)
1940 1992
1941static __init void xen_post_allocator_init(void) 1993static __init void xen_post_allocator_init(void)
1942{ 1994{
1995#ifdef CONFIG_XEN_DEBUG
1996 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
1997#endif
1943 pv_mmu_ops.set_pte = xen_set_pte; 1998 pv_mmu_ops.set_pte = xen_set_pte;
1944 pv_mmu_ops.set_pmd = xen_set_pmd; 1999 pv_mmu_ops.set_pmd = xen_set_pmd;
1945 pv_mmu_ops.set_pud = xen_set_pud; 2000 pv_mmu_ops.set_pud = xen_set_pud;
@@ -2072,7 +2127,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2072 in_frames[i] = virt_to_mfn(vaddr); 2127 in_frames[i] = virt_to_mfn(vaddr);
2073 2128
2074 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); 2129 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2075 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); 2130 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2076 2131
2077 if (out_frames) 2132 if (out_frames)
2078 out_frames[i] = virt_to_pfn(vaddr); 2133 out_frames[i] = virt_to_pfn(vaddr);
@@ -2351,6 +2406,18 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2351 2406
2352#ifdef CONFIG_XEN_DEBUG_FS 2407#ifdef CONFIG_XEN_DEBUG_FS
2353 2408
2409static int p2m_dump_open(struct inode *inode, struct file *filp)
2410{
2411 return single_open(filp, p2m_dump_show, NULL);
2412}
2413
2414static const struct file_operations p2m_dump_fops = {
2415 .open = p2m_dump_open,
2416 .read = seq_read,
2417 .llseek = seq_lseek,
2418 .release = single_release,
2419};
2420
2354static struct dentry *d_mmu_debug; 2421static struct dentry *d_mmu_debug;
2355 2422
2356static int __init xen_mmu_debugfs(void) 2423static int __init xen_mmu_debugfs(void)
@@ -2406,6 +2473,7 @@ static int __init xen_mmu_debugfs(void)
2406 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, 2473 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2407 &mmu_stats.prot_commit_batched); 2474 &mmu_stats.prot_commit_batched);
2408 2475
2476 debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
2409 return 0; 2477 return 0;
2410} 2478}
2411fs_initcall(xen_mmu_debugfs); 2479fs_initcall(xen_mmu_debugfs);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index fd12d7ce7ff9..215a3ce61068 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -23,6 +23,129 @@
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always 23 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to 24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively. 25 * 512 and 1024 entries respectively.
26 *
27 * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
28 *
29 * However not all entries are filled with MFNs. Specifically for all other
30 * leaf entries, or for the top root, or middle one, for which there is a void
31 * entry, we assume it is "missing". So (for example)
32 * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
33 *
34 * We also have the possibility of setting 1-1 mappings on certain regions, so
35 * that:
36 * pfn_to_mfn(0xc0000)=0xc0000
37 *
38 * The benefit of this is, that we can assume for non-RAM regions (think
39 * PCI BARs, or ACPI spaces), we can create mappings easily b/c we
40 * get the PFN value to match the MFN.
41 *
42 * For this to work efficiently we have one new page p2m_identity and
43 * allocate (via reserved_brk) any other pages we need to cover the sides
44 * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
45 * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
46 * no other fancy value).
47 *
48 * On lookup we spot that the entry points to p2m_identity and return the
49 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
50 * If the entry points to an allocated page, we just proceed as before and
51 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
52 * appropriate functions (pfn_to_mfn).
53 *
54 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
55 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
56 * non-identity pfn. To protect ourselves against we elect to set (and get) the
57 * IDENTITY_FRAME_BIT on all identity mapped PFNs.
58 *
59 * This simplistic diagram is used to explain the more subtle piece of code.
60 * There is also a digram of the P2M at the end that can help.
61 * Imagine your E820 looking as so:
62 *
63 * 1GB 2GB
64 * /-------------------+---------\/----\ /----------\ /---+-----\
65 * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
66 * \-------------------+---------/\----/ \----------/ \---+-----/
67 * ^- 1029MB ^- 2001MB
68 *
69 * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
70 * 2048MB = 524288 (0x80000)]
71 *
72 * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
73 * is actually not present (would have to kick the balloon driver to put it in).
74 *
75 * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
76 * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
77 * of the PFN and the end PFN (263424 and 512256 respectively). The first step
78 * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
79 * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
80 * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
81 * to end pfn. We reserve_brk top leaf pages if they are missing (means they
82 * point to p2m_mid_missing).
83 *
84 * With the E820 example above, 263424 is not 1GB aligned so we allocate a
85 * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
86 * Each entry in the allocate page is "missing" (points to p2m_missing).
87 *
88 * Next stage is to determine if we need to do a more granular boundary check
89 * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
90 * We check if the start pfn and end pfn violate that boundary check, and if
91 * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
92 * granularity of setting which PFNs are missing and which ones are identity.
93 * In our example 263424 and 512256 both fail the check so we reserve_brk two
94 * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
95 * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
96 *
97 * At this point we would at minimum reserve_brk one page, but could be up to
98 * three. Each call to set_phys_range_identity has at maximum a three page
99 * cost. If we were to query the P2M at this stage, all those entries from
100 * start PFN through end PFN (so 1029MB -> 2001MB) would return
101 * INVALID_P2M_ENTRY ("missing").
102 *
103 * The next step is to walk from the start pfn to the end pfn setting
104 * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
105 * If we find that the middle leaf is pointing to p2m_missing we can swap it
106 * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this
107 * point we do not need to worry about boundary aligment (so no need to
108 * reserve_brk a middle page, figure out which PFNs are "missing" and which
109 * ones are identity), as that has been done earlier. If we find that the
110 * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
111 * that page (which covers 512 PFNs) and set the appropriate PFN with
112 * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
113 * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
114 * IDENTITY_FRAME_BIT set.
115 *
116 * All other regions that are void (or not filled) either point to p2m_missing
117 * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
118 * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
119 * contain the INVALID_P2M_ENTRY value and are considered "missing."
120 *
121 * This is what the p2m ends up looking (for the E820 above) with this
122 * fabulous drawing:
123 *
124 * p2m /--------------\
125 * /-----\ | &mfn_list[0],| /-----------------\
126 * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
127 * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
128 * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
129 * |-----| \ | [p2m_identity]+\\ | .... |
130 * | 2 |--\ \-------------------->| ... | \\ \----------------/
131 * |-----| \ \---------------/ \\
132 * | 3 |\ \ \\ p2m_identity
133 * |-----| \ \-------------------->/---------------\ /-----------------\
134 * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... |
135 * \-----/ / | [p2m_identity]+-->| ..., ~0 |
136 * / /---------------\ | .... | \-----------------/
137 * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. |
138 * / | IDENTITY[@256]|<----/ \---------------/
139 * / | ~0, ~0, .... |
140 * | \---------------/
141 * |
142 * p2m_missing p2m_missing
143 * /------------------\ /------------\
144 * | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
145 * | [p2m_mid_missing]+---->| ..., ~0 |
146 * \------------------/ \------------/
147 *
148 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
26 */ 149 */
27 150
28#include <linux/init.h> 151#include <linux/init.h>
@@ -30,6 +153,7 @@
30#include <linux/list.h> 153#include <linux/list.h>
31#include <linux/hash.h> 154#include <linux/hash.h>
32#include <linux/sched.h> 155#include <linux/sched.h>
156#include <linux/seq_file.h>
33 157
34#include <asm/cache.h> 158#include <asm/cache.h>
35#include <asm/setup.h> 159#include <asm/setup.h>
@@ -59,9 +183,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
59static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); 183static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
60static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); 184static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
61 185
186static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
187
62RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); 188RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
63RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); 189RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
64 190
191/* We might hit two boundary violations at the start and end, at max each
192 * boundary violation will require three middle nodes. */
193RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
194
65static inline unsigned p2m_top_index(unsigned long pfn) 195static inline unsigned p2m_top_index(unsigned long pfn)
66{ 196{
67 BUG_ON(pfn >= MAX_P2M_PFN); 197 BUG_ON(pfn >= MAX_P2M_PFN);
@@ -136,7 +266,7 @@ static void p2m_init(unsigned long *p2m)
136 * - After resume we're called from within stop_machine, but the mfn 266 * - After resume we're called from within stop_machine, but the mfn
137 * tree should alreay be completely allocated. 267 * tree should alreay be completely allocated.
138 */ 268 */
139void xen_build_mfn_list_list(void) 269void __ref xen_build_mfn_list_list(void)
140{ 270{
141 unsigned long pfn; 271 unsigned long pfn;
142 272
@@ -221,6 +351,9 @@ void __init xen_build_dynamic_phys_to_machine(void)
221 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); 351 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
222 p2m_top_init(p2m_top); 352 p2m_top_init(p2m_top);
223 353
354 p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
355 p2m_init(p2m_identity);
356
224 /* 357 /*
225 * The domain builder gives us a pre-constructed p2m array in 358 * The domain builder gives us a pre-constructed p2m array in
226 * mfn_list for all the pages initially given to us, so we just 359 * mfn_list for all the pages initially given to us, so we just
@@ -266,6 +399,14 @@ unsigned long get_phys_to_machine(unsigned long pfn)
266 mididx = p2m_mid_index(pfn); 399 mididx = p2m_mid_index(pfn);
267 idx = p2m_index(pfn); 400 idx = p2m_index(pfn);
268 401
402 /*
403 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
404 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
405 * would be wrong.
406 */
407 if (p2m_top[topidx][mididx] == p2m_identity)
408 return IDENTITY_FRAME(pfn);
409
269 return p2m_top[topidx][mididx][idx]; 410 return p2m_top[topidx][mididx][idx];
270} 411}
271EXPORT_SYMBOL_GPL(get_phys_to_machine); 412EXPORT_SYMBOL_GPL(get_phys_to_machine);
@@ -335,9 +476,11 @@ static bool alloc_p2m(unsigned long pfn)
335 p2m_top_mfn_p[topidx] = mid_mfn; 476 p2m_top_mfn_p[topidx] = mid_mfn;
336 } 477 }
337 478
338 if (p2m_top[topidx][mididx] == p2m_missing) { 479 if (p2m_top[topidx][mididx] == p2m_identity ||
480 p2m_top[topidx][mididx] == p2m_missing) {
339 /* p2m leaf page is missing */ 481 /* p2m leaf page is missing */
340 unsigned long *p2m; 482 unsigned long *p2m;
483 unsigned long *p2m_orig = p2m_top[topidx][mididx];
341 484
342 p2m = alloc_p2m_page(); 485 p2m = alloc_p2m_page();
343 if (!p2m) 486 if (!p2m)
@@ -345,7 +488,7 @@ static bool alloc_p2m(unsigned long pfn)
345 488
346 p2m_init(p2m); 489 p2m_init(p2m);
347 490
348 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) 491 if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
349 free_p2m_page(p2m); 492 free_p2m_page(p2m);
350 else 493 else
351 mid_mfn[mididx] = virt_to_mfn(p2m); 494 mid_mfn[mididx] = virt_to_mfn(p2m);
@@ -354,11 +497,91 @@ static bool alloc_p2m(unsigned long pfn)
354 return true; 497 return true;
355} 498}
356 499
500bool __early_alloc_p2m(unsigned long pfn)
501{
502 unsigned topidx, mididx, idx;
503
504 topidx = p2m_top_index(pfn);
505 mididx = p2m_mid_index(pfn);
506 idx = p2m_index(pfn);
507
508 /* Pfff.. No boundary cross-over, lets get out. */
509 if (!idx)
510 return false;
511
512 WARN(p2m_top[topidx][mididx] == p2m_identity,
513 "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
514 topidx, mididx);
515
516 /*
517 * Could be done by xen_build_dynamic_phys_to_machine..
518 */
519 if (p2m_top[topidx][mididx] != p2m_missing)
520 return false;
521
522 /* Boundary cross-over for the edges: */
523 if (idx) {
524 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
525
526 p2m_init(p2m);
527
528 p2m_top[topidx][mididx] = p2m;
529
530 }
531 return idx != 0;
532}
533unsigned long set_phys_range_identity(unsigned long pfn_s,
534 unsigned long pfn_e)
535{
536 unsigned long pfn;
537
538 if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
539 return 0;
540
541 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
542 return pfn_e - pfn_s;
543
544 if (pfn_s > pfn_e)
545 return 0;
546
547 for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
548 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
549 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
550 {
551 unsigned topidx = p2m_top_index(pfn);
552 if (p2m_top[topidx] == p2m_mid_missing) {
553 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
554
555 p2m_mid_init(mid);
556
557 p2m_top[topidx] = mid;
558 }
559 }
560
561 __early_alloc_p2m(pfn_s);
562 __early_alloc_p2m(pfn_e);
563
564 for (pfn = pfn_s; pfn < pfn_e; pfn++)
565 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
566 break;
567
568 if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
569 "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
570 (pfn_e - pfn_s) - (pfn - pfn_s)))
571 printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
572
573 return pfn - pfn_s;
574}
575
357/* Try to install p2m mapping; fail if intermediate bits missing */ 576/* Try to install p2m mapping; fail if intermediate bits missing */
358bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) 577bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
359{ 578{
360 unsigned topidx, mididx, idx; 579 unsigned topidx, mididx, idx;
361 580
581 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
582 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
583 return true;
584 }
362 if (unlikely(pfn >= MAX_P2M_PFN)) { 585 if (unlikely(pfn >= MAX_P2M_PFN)) {
363 BUG_ON(mfn != INVALID_P2M_ENTRY); 586 BUG_ON(mfn != INVALID_P2M_ENTRY);
364 return true; 587 return true;
@@ -368,6 +591,21 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
368 mididx = p2m_mid_index(pfn); 591 mididx = p2m_mid_index(pfn);
369 idx = p2m_index(pfn); 592 idx = p2m_index(pfn);
370 593
594 /* For sparse holes were the p2m leaf has real PFN along with
595 * PCI holes, stick in the PFN as the MFN value.
596 */
597 if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
598 if (p2m_top[topidx][mididx] == p2m_identity)
599 return true;
600
601 /* Swap over from MISSING to IDENTITY if needed. */
602 if (p2m_top[topidx][mididx] == p2m_missing) {
603 WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
604 p2m_identity) != p2m_missing);
605 return true;
606 }
607 }
608
371 if (p2m_top[topidx][mididx] == p2m_missing) 609 if (p2m_top[topidx][mididx] == p2m_missing)
372 return mfn == INVALID_P2M_ENTRY; 610 return mfn == INVALID_P2M_ENTRY;
373 611
@@ -378,11 +616,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
378 616
379bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) 617bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
380{ 618{
381 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
382 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
383 return true;
384 }
385
386 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 619 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
387 if (!alloc_p2m(pfn)) 620 if (!alloc_p2m(pfn))
388 return false; 621 return false;
@@ -421,7 +654,7 @@ int m2p_add_override(unsigned long mfn, struct page *page)
421{ 654{
422 unsigned long flags; 655 unsigned long flags;
423 unsigned long pfn; 656 unsigned long pfn;
424 unsigned long address; 657 unsigned long uninitialized_var(address);
425 unsigned level; 658 unsigned level;
426 pte_t *ptep = NULL; 659 pte_t *ptep = NULL;
427 660
@@ -455,7 +688,7 @@ int m2p_remove_override(struct page *page)
455 unsigned long flags; 688 unsigned long flags;
456 unsigned long mfn; 689 unsigned long mfn;
457 unsigned long pfn; 690 unsigned long pfn;
458 unsigned long address; 691 unsigned long uninitialized_var(address);
459 unsigned level; 692 unsigned level;
460 pte_t *ptep = NULL; 693 pte_t *ptep = NULL;
461 694
@@ -520,3 +753,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
520 return ret; 753 return ret;
521} 754}
522EXPORT_SYMBOL_GPL(m2p_find_override_pfn); 755EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
756
757#ifdef CONFIG_XEN_DEBUG_FS
758
759int p2m_dump_show(struct seq_file *m, void *v)
760{
761 static const char * const level_name[] = { "top", "middle",
762 "entry", "abnormal" };
763 static const char * const type_name[] = { "identity", "missing",
764 "pfn", "abnormal"};
765#define TYPE_IDENTITY 0
766#define TYPE_MISSING 1
767#define TYPE_PFN 2
768#define TYPE_UNKNOWN 3
769 unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
770 unsigned int uninitialized_var(prev_level);
771 unsigned int uninitialized_var(prev_type);
772
773 if (!p2m_top)
774 return 0;
775
776 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
777 unsigned topidx = p2m_top_index(pfn);
778 unsigned mididx = p2m_mid_index(pfn);
779 unsigned idx = p2m_index(pfn);
780 unsigned lvl, type;
781
782 lvl = 4;
783 type = TYPE_UNKNOWN;
784 if (p2m_top[topidx] == p2m_mid_missing) {
785 lvl = 0; type = TYPE_MISSING;
786 } else if (p2m_top[topidx] == NULL) {
787 lvl = 0; type = TYPE_UNKNOWN;
788 } else if (p2m_top[topidx][mididx] == NULL) {
789 lvl = 1; type = TYPE_UNKNOWN;
790 } else if (p2m_top[topidx][mididx] == p2m_identity) {
791 lvl = 1; type = TYPE_IDENTITY;
792 } else if (p2m_top[topidx][mididx] == p2m_missing) {
793 lvl = 1; type = TYPE_MISSING;
794 } else if (p2m_top[topidx][mididx][idx] == 0) {
795 lvl = 2; type = TYPE_UNKNOWN;
796 } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
797 lvl = 2; type = TYPE_IDENTITY;
798 } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
799 lvl = 2; type = TYPE_MISSING;
800 } else if (p2m_top[topidx][mididx][idx] == pfn) {
801 lvl = 2; type = TYPE_PFN;
802 } else if (p2m_top[topidx][mididx][idx] != pfn) {
803 lvl = 2; type = TYPE_PFN;
804 }
805 if (pfn == 0) {
806 prev_level = lvl;
807 prev_type = type;
808 }
809 if (pfn == MAX_DOMAIN_PAGES-1) {
810 lvl = 3;
811 type = TYPE_UNKNOWN;
812 }
813 if (prev_type != type) {
814 seq_printf(m, " [0x%lx->0x%lx] %s\n",
815 prev_pfn_type, pfn, type_name[prev_type]);
816 prev_pfn_type = pfn;
817 prev_type = type;
818 }
819 if (prev_level != lvl) {
820 seq_printf(m, " [0x%lx->0x%lx] level %s\n",
821 prev_pfn_level, pfn, level_name[prev_level]);
822 prev_pfn_level = pfn;
823 prev_level = lvl;
824 }
825 }
826 return 0;
827#undef TYPE_IDENTITY
828#undef TYPE_MISSING
829#undef TYPE_PFN
830#undef TYPE_UNKNOWN
831}
832#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a8a66a50d446..fa0269a99377 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -52,6 +52,8 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
52 52
53static __init void xen_add_extra_mem(unsigned long pages) 53static __init void xen_add_extra_mem(unsigned long pages)
54{ 54{
55 unsigned long pfn;
56
55 u64 size = (u64)pages * PAGE_SIZE; 57 u64 size = (u64)pages * PAGE_SIZE;
56 u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; 58 u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
57 59
@@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages)
66 xen_extra_mem_size += size; 68 xen_extra_mem_size += size;
67 69
68 xen_max_p2m_pfn = PFN_DOWN(extra_start + size); 70 xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
71
72 for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
73 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
69} 74}
70 75
71static unsigned long __init xen_release_chunk(phys_addr_t start_addr, 76static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
@@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
104 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", 109 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
105 start, end, ret); 110 start, end, ret);
106 if (ret == 1) { 111 if (ret == 1) {
107 set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 112 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
108 len++; 113 len++;
109 } 114 }
110 } 115 }
@@ -138,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
138 return released; 143 return released;
139} 144}
140 145
146static unsigned long __init xen_set_identity(const struct e820entry *list,
147 ssize_t map_size)
148{
149 phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
150 phys_addr_t start_pci = last;
151 const struct e820entry *entry;
152 unsigned long identity = 0;
153 int i;
154
155 for (i = 0, entry = list; i < map_size; i++, entry++) {
156 phys_addr_t start = entry->addr;
157 phys_addr_t end = start + entry->size;
158
159 if (start < last)
160 start = last;
161
162 if (end <= start)
163 continue;
164
165 /* Skip over the 1MB region. */
166 if (last > end)
167 continue;
168
169 if (entry->type == E820_RAM) {
170 if (start > start_pci)
171 identity += set_phys_range_identity(
172 PFN_UP(start_pci), PFN_DOWN(start));
173
174 /* Without saving 'last' we would gooble RAM too
175 * at the end of the loop. */
176 last = end;
177 start_pci = end;
178 continue;
179 }
180 start_pci = min(start, start_pci);
181 last = end;
182 }
183 if (last > start_pci)
184 identity += set_phys_range_identity(
185 PFN_UP(start_pci), PFN_DOWN(last));
186 return identity;
187}
141/** 188/**
142 * machine_specific_memory_setup - Hook for machine specific memory setup. 189 * machine_specific_memory_setup - Hook for machine specific memory setup.
143 **/ 190 **/
144char * __init xen_memory_setup(void) 191char * __init xen_memory_setup(void)
145{ 192{
146 static struct e820entry map[E820MAX] __initdata; 193 static struct e820entry map[E820MAX] __initdata;
194 static struct e820entry map_raw[E820MAX] __initdata;
147 195
148 unsigned long max_pfn = xen_start_info->nr_pages; 196 unsigned long max_pfn = xen_start_info->nr_pages;
149 unsigned long long mem_end; 197 unsigned long long mem_end;
@@ -151,6 +199,7 @@ char * __init xen_memory_setup(void)
151 struct xen_memory_map memmap; 199 struct xen_memory_map memmap;
152 unsigned long extra_pages = 0; 200 unsigned long extra_pages = 0;
153 unsigned long extra_limit; 201 unsigned long extra_limit;
202 unsigned long identity_pages = 0;
154 int i; 203 int i;
155 int op; 204 int op;
156 205
@@ -176,6 +225,7 @@ char * __init xen_memory_setup(void)
176 } 225 }
177 BUG_ON(rc); 226 BUG_ON(rc);
178 227
228 memcpy(map_raw, map, sizeof(map));
179 e820.nr_map = 0; 229 e820.nr_map = 0;
180 xen_extra_mem_start = mem_end; 230 xen_extra_mem_start = mem_end;
181 for (i = 0; i < memmap.nr_entries; i++) { 231 for (i = 0; i < memmap.nr_entries; i++) {
@@ -194,6 +244,15 @@ char * __init xen_memory_setup(void)
194 end -= delta; 244 end -= delta;
195 245
196 extra_pages += PFN_DOWN(delta); 246 extra_pages += PFN_DOWN(delta);
247 /*
248 * Set RAM below 4GB that is not for us to be unusable.
249 * This prevents "System RAM" address space from being
250 * used as potential resource for I/O address (happens
251 * when 'allocate_resource' is called).
252 */
253 if (delta &&
254 (xen_initial_domain() && end < 0x100000000ULL))
255 e820_add_region(end, delta, E820_UNUSABLE);
197 } 256 }
198 257
199 if (map[i].size > 0 && end > xen_extra_mem_start) 258 if (map[i].size > 0 && end > xen_extra_mem_start)
@@ -251,6 +310,13 @@ char * __init xen_memory_setup(void)
251 310
252 xen_add_extra_mem(extra_pages); 311 xen_add_extra_mem(extra_pages);
253 312
313 /*
314 * Set P2M for all non-RAM pages and E820 gaps to be identity
315 * type PFNs. We supply it with the non-sanitized version
316 * of the E820.
317 */
318 identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
319 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
254 return "Xen"; 320 return "Xen";
255} 321}
256 322
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 72a4c7959045..30612441ed99 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -509,3 +509,41 @@ void __init xen_smp_init(void)
509 xen_fill_possible_map(); 509 xen_fill_possible_map();
510 xen_init_spinlocks(); 510 xen_init_spinlocks();
511} 511}
512
513static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
514{
515 native_smp_prepare_cpus(max_cpus);
516 WARN_ON(xen_smp_intr_init(0));
517
518 if (!xen_have_vector_callback)
519 return;
520 xen_init_lock_cpu(0);
521 xen_init_spinlocks();
522}
523
524static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
525{
526 int rc;
527 rc = native_cpu_up(cpu);
528 WARN_ON (xen_smp_intr_init(cpu));
529 return rc;
530}
531
532static void xen_hvm_cpu_die(unsigned int cpu)
533{
534 unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
535 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
536 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
537 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
538 native_cpu_die(cpu);
539}
540
541void __init xen_hvm_smp_init(void)
542{
543 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
544 smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
545 smp_ops.cpu_up = xen_hvm_cpu_up;
546 smp_ops.cpu_die = xen_hvm_cpu_die;
547 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
548 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
549}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 9bbd63a129b5..45329c8c226e 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
12#include "xen-ops.h" 12#include "xen-ops.h"
13#include "mmu.h" 13#include "mmu.h"
14 14
15void xen_pre_suspend(void) 15void xen_arch_pre_suspend(void)
16{ 16{
17 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); 17 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
18 xen_start_info->console.domU.mfn = 18 xen_start_info->console.domU.mfn =
@@ -26,8 +26,9 @@ void xen_pre_suspend(void)
26 BUG(); 26 BUG();
27} 27}
28 28
29void xen_hvm_post_suspend(int suspend_cancelled) 29void xen_arch_hvm_post_suspend(int suspend_cancelled)
30{ 30{
31#ifdef CONFIG_XEN_PVHVM
31 int cpu; 32 int cpu;
32 xen_hvm_init_shared_info(); 33 xen_hvm_init_shared_info();
33 xen_callback_vector(); 34 xen_callback_vector();
@@ -37,9 +38,10 @@ void xen_hvm_post_suspend(int suspend_cancelled)
37 xen_setup_runstate_info(cpu); 38 xen_setup_runstate_info(cpu);
38 } 39 }
39 } 40 }
41#endif
40} 42}
41 43
42void xen_post_suspend(int suspend_cancelled) 44void xen_arch_post_suspend(int suspend_cancelled)
43{ 45{
44 xen_build_mfn_list_list(); 46 xen_build_mfn_list_list();
45 47
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 067759e3d6a5..2e2d370a47b1 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -397,7 +397,9 @@ void xen_setup_timer(int cpu)
397 name = "<timer kasprintf failed>"; 397 name = "<timer kasprintf failed>";
398 398
399 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, 399 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
400 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, 400 IRQF_DISABLED|IRQF_PERCPU|
401 IRQF_NOBALANCING|IRQF_TIMER|
402 IRQF_FORCE_RESUME,
401 name, NULL); 403 name, NULL);
402 404
403 evt = &per_cpu(xen_clock_events, cpu); 405 evt = &per_cpu(xen_clock_events, cpu);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 1a5ff24e29c0..aaa7291c9259 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -28,9 +28,9 @@ ENTRY(startup_xen)
28 __FINIT 28 __FINIT
29 29
30.pushsection .text 30.pushsection .text
31 .align PAGE_SIZE_asm 31 .align PAGE_SIZE
32ENTRY(hypercall_page) 32ENTRY(hypercall_page)
33 .skip PAGE_SIZE_asm 33 .skip PAGE_SIZE
34.popsection 34.popsection
35 35
36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9d41bf985757..3112f55638c4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -64,10 +64,12 @@ void xen_setup_vcpu_info_placement(void);
64 64
65#ifdef CONFIG_SMP 65#ifdef CONFIG_SMP
66void xen_smp_init(void); 66void xen_smp_init(void);
67void __init xen_hvm_smp_init(void);
67 68
68extern cpumask_var_t xen_cpu_initialized_map; 69extern cpumask_var_t xen_cpu_initialized_map;
69#else 70#else
70static inline void xen_smp_init(void) {} 71static inline void xen_smp_init(void) {}
72static inline void xen_hvm_smp_init(void) {}
71#endif 73#endif
72 74
73#ifdef CONFIG_PARAVIRT_SPINLOCKS 75#ifdef CONFIG_PARAVIRT_SPINLOCKS