aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2011-05-14 06:06:36 -0400
committerThomas Gleixner <tglx@linutronix.de>2011-05-14 06:06:36 -0400
commita18f22a968de17b29f2310cdb7ba69163e65ec15 (patch)
treea7d56d88fad5e444d7661484109758a2f436129e /arch/x86
parenta1c57e0fec53defe745e64417eacdbd3618c3e66 (diff)
parent798778b8653f64b7b2162ac70eca10367cff6ce8 (diff)
Merge branch 'consolidate-clksrc-i8253' of master.kernel.org:~rmk/linux-2.6-arm into timers/clocksource
Conflicts: arch/ia64/kernel/cyclone.c arch/mips/kernel/i8253.c arch/x86/kernel/i8253.c Reason: Resolve conflicts so further cleanups do not conflict further Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig63
-rw-r--r--arch/x86/Kconfig.cpu9
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c7
-rw-r--r--arch/x86/boot/memory.c2
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S11
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c38
-rw-r--r--arch/x86/ia32/ia32_aout.c1
-rw-r--r--arch/x86/ia32/ia32entry.S33
-rw-r--r--arch/x86/include/asm/acpi.h20
-rw-r--r--arch/x86/include/asm/amd_nb.h24
-rw-r--r--arch/x86/include/asm/apic.h43
-rw-r--r--arch/x86/include/asm/apicdef.h12
-rw-r--r--arch/x86/include/asm/bitops.h4
-rw-r--r--arch/x86/include/asm/bootparam.h1
-rw-r--r--arch/x86/include/asm/cacheflush.h2
-rw-r--r--arch/x86/include/asm/ce4100.h6
-rw-r--r--arch/x86/include/asm/cpufeature.h2
-rw-r--r--arch/x86/include/asm/dma.h7
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h5
-rw-r--r--arch/x86/include/asm/frame.h6
-rw-r--r--arch/x86/include/asm/futex.h22
-rw-r--r--arch/x86/include/asm/gart.h24
-rw-r--r--arch/x86/include/asm/hw_irq.h24
-rw-r--r--arch/x86/include/asm/i387.h2
-rw-r--r--arch/x86/include/asm/i8253.h2
-rw-r--r--arch/x86/include/asm/init.h6
-rw-r--r--arch/x86/include/asm/io_apic.h44
-rw-r--r--arch/x86/include/asm/ipi.h8
-rw-r--r--arch/x86/include/asm/irq.h3
-rw-r--r--arch/x86/include/asm/irq_controller.h12
-rw-r--r--arch/x86/include/asm/irq_vectors.h45
-rw-r--r--arch/x86/include/asm/kdebug.h3
-rw-r--r--arch/x86/include/asm/kvm_emulate.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h12
-rw-r--r--arch/x86/include/asm/mmu.h6
-rw-r--r--arch/x86/include/asm/mpspec.h3
-rw-r--r--arch/x86/include/asm/msr-index.h13
-rw-r--r--arch/x86/include/asm/nmi.h5
-rw-r--r--arch/x86/include/asm/nops.h2
-rw-r--r--arch/x86/include/asm/numa.h52
-rw-r--r--arch/x86/include/asm/numa_32.h7
-rw-r--r--arch/x86/include/asm/numa_64.h23
-rw-r--r--arch/x86/include/asm/olpc.h2
-rw-r--r--arch/x86/include/asm/olpc_ofw.h14
-rw-r--r--arch/x86/include/asm/page_types.h9
-rw-r--r--arch/x86/include/asm/percpu.h54
-rw-r--r--arch/x86/include/asm/perf_event_p4.h5
-rw-r--r--arch/x86/include/asm/pgtable-3level.h11
-rw-r--r--arch/x86/include/asm/processor-flags.h2
-rw-r--r--arch/x86/include/asm/processor.h4
-rw-r--r--arch/x86/include/asm/prom.h70
-rw-r--r--arch/x86/include/asm/ptrace-abi.h2
-rw-r--r--arch/x86/include/asm/ptrace.h4
-rw-r--r--arch/x86/include/asm/reboot.h5
-rw-r--r--arch/x86/include/asm/rwsem.h80
-rw-r--r--arch/x86/include/asm/segment.h12
-rw-r--r--arch/x86/include/asm/smp.h20
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h2
-rw-r--r--arch/x86/include/asm/stacktrace.h6
-rw-r--r--arch/x86/include/asm/system.h2
-rw-r--r--arch/x86/include/asm/thread_info.h10
-rw-r--r--arch/x86/include/asm/topology.h19
-rw-r--r--arch/x86/include/asm/trampoline.h33
-rw-r--r--arch/x86/include/asm/tsc.h2
-rw-r--r--arch/x86/include/asm/types.h16
-rw-r--r--arch/x86/include/asm/unistd_32.h6
-rw-r--r--arch/x86/include/asm/unistd_64.h8
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h2
-rw-r--r--arch/x86/include/asm/x86_init.h2
-rw-r--r--arch/x86/include/asm/xen/hypercall.h15
-rw-r--r--arch/x86/include/asm/xen/interface.h2
-rw-r--r--arch/x86/include/asm/xen/page.h47
-rw-r--r--arch/x86/include/asm/xen/pci.h8
-rw-r--r--arch/x86/kernel/Makefile12
-rw-r--r--arch/x86/kernel/acpi/boot.c22
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S21
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h5
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S28
-rw-r--r--arch/x86/kernel/acpi/sleep.c77
-rw-r--r--arch/x86/kernel/acpi/sleep.h5
-rw-r--r--arch/x86/kernel/acpi/wakeup_rm.S12
-rw-r--r--arch/x86/kernel/alternative.c9
-rw-r--r--arch/x86/kernel/amd_iommu_init.c26
-rw-r--r--arch/x86/kernel/amd_nb.c102
-rw-r--r--arch/x86/kernel/apb_timer.c64
-rw-r--r--arch/x86/kernel/aperture_64.c37
-rw-r--r--arch/x86/kernel/apic/apic.c183
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/apic_noop.c26
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c34
-rw-r--r--arch/x86/kernel/apic/es7000_32.c35
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c491
-rw-r--r--arch/x86/kernel/apic/ipi.c12
-rw-r--r--arch/x86/kernel/apic/numaq_32.c21
-rw-r--r--arch/x86/kernel/apic/probe_32.c10
-rw-r--r--arch/x86/kernel/apic/summit_32.c47
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c12
-rw-r--r--arch/x86/kernel/apm_32.c24
-rw-r--r--arch/x86/kernel/asm-offsets.c65
-rw-r--r--arch/x86/kernel/asm-offsets_32.c69
-rw-r--r--arch/x86/kernel/asm-offsets_64.c90
-rw-r--r--arch/x86/kernel/check.c8
-rw-r--r--arch/x86/kernel/cpu/amd.c86
-rw-r--r--arch/x86/kernel/cpu/common.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c18
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c4
-rw-r--r--arch/x86/kernel/cpu/intel.c5
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c80
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c42
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c25
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c30
-rw-r--r--arch/x86/kernel/cpu/perf_event.c199
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c191
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c472
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c83
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c47
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c4
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/crash_dump_32.c3
-rw-r--r--arch/x86/kernel/crash_dump_64.c3
-rw-r--r--arch/x86/kernel/devicetree.c439
-rw-r--r--arch/x86/kernel/dumpstack.c51
-rw-r--r--arch/x86/kernel/dumpstack_32.c15
-rw-r--r--arch/x86/kernel/dumpstack_64.c14
-rw-r--r--arch/x86/kernel/e820.c19
-rw-r--r--arch/x86/kernel/early-quirks.c21
-rw-r--r--arch/x86/kernel/entry_32.S13
-rw-r--r--arch/x86/kernel/entry_64.S17
-rw-r--r--arch/x86/kernel/ftrace.c15
-rw-r--r--arch/x86/kernel/head32.c9
-rw-r--r--arch/x86/kernel/head64.c3
-rw-r--r--arch/x86/kernel/head_32.S10
-rw-r--r--arch/x86/kernel/head_64.S3
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/i8237.c30
-rw-r--r--arch/x86/kernel/i8253.c76
-rw-r--r--arch/x86/kernel/i8259.c35
-rw-r--r--arch/x86/kernel/ioport.c20
-rw-r--r--arch/x86/kernel/irq.c92
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irqinit.c92
-rw-r--r--arch/x86/kernel/kgdb.c15
-rw-r--r--arch/x86/kernel/kprobes.c8
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/mca_32.c2
-rw-r--r--arch/x86/kernel/microcode_amd.c188
-rw-r--r--arch/x86/kernel/microcode_core.c41
-rw-r--r--arch/x86/kernel/mpparse.c12
-rw-r--r--arch/x86/kernel/pci-calgary_64.c4
-rw-r--r--arch/x86/kernel/pci-gart_64.c41
-rw-r--r--arch/x86/kernel/process.c11
-rw-r--r--arch/x86/kernel/process_64.c8
-rw-r--r--arch/x86/kernel/ptrace.c36
-rw-r--r--arch/x86/kernel/reboot.c129
-rw-r--r--arch/x86/kernel/reboot_32.S135
-rw-r--r--arch/x86/kernel/rtc.c3
-rw-r--r--arch/x86/kernel/setup.c93
-rw-r--r--arch/x86/kernel/setup_percpu.c11
-rw-r--r--arch/x86/kernel/smpboot.c134
-rw-r--r--arch/x86/kernel/stacktrace.c6
-rw-r--r--arch/x86/kernel/step.c2
-rw-r--r--arch/x86/kernel/syscall_table_32.S4
-rw-r--r--arch/x86/kernel/topology.c2
-rw-r--r--arch/x86/kernel/trampoline.c42
-rw-r--r--arch/x86/kernel/trampoline_32.S15
-rw-r--r--arch/x86/kernel/trampoline_64.S28
-rw-r--r--arch/x86/kernel/tsc.c4
-rw-r--r--arch/x86/kernel/verify_cpu.S2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S18
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c1
-rw-r--r--arch/x86/kernel/x86_init.c1
-rw-r--r--arch/x86/kernel/xsave.c2
-rw-r--r--arch/x86/kvm/emulate.c52
-rw-r--r--arch/x86/kvm/i8259.c25
-rw-r--r--arch/x86/kvm/lapic.c13
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c150
-rw-r--r--arch/x86/kvm/paging_tmpl.h19
-rw-r--r--arch/x86/kvm/svm.c29
-rw-r--r--arch/x86/kvm/timer.c2
-rw-r--r--arch/x86/kvm/trace.h8
-rw-r--r--arch/x86/kvm/vmx.c128
-rw-r--r--arch/x86/kvm/x86.c192
-rw-r--r--arch/x86/lguest/boot.c6
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_386_32.S6
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S6
-rw-r--r--arch/x86/lib/checksum_32.S63
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S65
-rw-r--r--arch/x86/lib/copy_user_64.S2
-rw-r--r--arch/x86/lib/csum-copy_64.S242
-rw-r--r--arch/x86/lib/csum-partial_64.c2
-rw-r--r--arch/x86/lib/memmove_64.S197
-rw-r--r--arch/x86/lib/memmove_64.c192
-rw-r--r--arch/x86/lib/rwsem_64.S56
-rw-r--r--arch/x86/lib/semaphore_32.S38
-rw-r--r--arch/x86/lib/thunk_32.S18
-rw-r--r--arch/x86/lib/thunk_64.S27
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/amdtopology_64.c142
-rw-r--r--arch/x86/mm/fault.c14
-rw-r--r--arch/x86/mm/hugetlbpage.c2
-rw-r--r--arch/x86/mm/init.c56
-rw-r--r--arch/x86/mm/init_32.c13
-rw-r--r--arch/x86/mm/init_64.c119
-rw-r--r--arch/x86/mm/numa.c207
-rw-r--r--arch/x86/mm/numa_32.c10
-rw-r--r--arch/x86/mm/numa_64.c988
-rw-r--r--arch/x86/mm/numa_emulation.c486
-rw-r--r--arch/x86/mm/numa_internal.h31
-rw-r--r--arch/x86/mm/pageattr.c20
-rw-r--r--arch/x86/mm/pgtable.c14
-rw-r--r--arch/x86/mm/srat_32.c10
-rw-r--r--arch/x86/mm/srat_64.c367
-rw-r--r--arch/x86/mm/tlb.c14
-rw-r--r--arch/x86/oprofile/backtrace.c2
-rw-r--r--arch/x86/oprofile/nmi_int.c49
-rw-r--r--arch/x86/oprofile/op_counter.h1
-rw-r--r--arch/x86/oprofile/op_model_p4.c2
-rw-r--r--arch/x86/pci/amd_bus.c2
-rw-r--r--arch/x86/pci/ce4100.c9
-rw-r--r--arch/x86/pci/i386.c4
-rw-r--r--arch/x86/pci/irq.c15
-rw-r--r--arch/x86/pci/xen.c192
-rw-r--r--arch/x86/platform/ce4100/ce4100.c26
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts430
-rw-r--r--arch/x86/platform/mrst/mrst.c12
-rw-r--r--arch/x86/platform/mrst/vrtc.c18
-rw-r--r--arch/x86/platform/olpc/Makefile4
-rw-r--r--arch/x86/platform/olpc/olpc-xo1.c25
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c3
-rw-r--r--arch/x86/platform/uv/tlb_uv.c5
-rw-r--r--arch/x86/platform/uv/uv_irq.c4
-rw-r--r--arch/x86/platform/visws/visws_quirks.c24
-rw-r--r--arch/x86/vdso/vdso32-setup.c15
-rw-r--r--arch/x86/xen/Kconfig11
-rw-r--r--arch/x86/xen/enlighten.c29
-rw-r--r--arch/x86/xen/mmu.c244
-rw-r--r--arch/x86/xen/p2m.c336
-rw-r--r--arch/x86/xen/setup.c70
-rw-r--r--arch/x86/xen/smp.c38
-rw-r--r--arch/x86/xen/suspend.c8
-rw-r--r--arch/x86/xen/time.c4
-rw-r--r--arch/x86/xen/xen-head.S4
-rw-r--r--arch/x86/xen/xen-ops.h2
257 files changed, 7012 insertions, 4571 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d5ed94d30aad..01115d54c5be 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -8,6 +8,7 @@ config 64BIT
8 8
9config X86_32 9config X86_32
10 def_bool !64BIT 10 def_bool !64BIT
11 select CLKSRC_I8253
11 12
12config X86_64 13config X86_64
13 def_bool 64BIT 14 def_bool 64BIT
@@ -64,9 +65,14 @@ config X86
64 select HAVE_TEXT_POKE_SMP 65 select HAVE_TEXT_POKE_SMP
65 select HAVE_GENERIC_HARDIRQS 66 select HAVE_GENERIC_HARDIRQS
66 select HAVE_SPARSE_IRQ 67 select HAVE_SPARSE_IRQ
68 select GENERIC_FIND_FIRST_BIT
69 select GENERIC_FIND_NEXT_BIT
67 select GENERIC_IRQ_PROBE 70 select GENERIC_IRQ_PROBE
68 select GENERIC_PENDING_IRQ if SMP 71 select GENERIC_PENDING_IRQ if SMP
72 select GENERIC_IRQ_SHOW
73 select IRQ_FORCED_THREADING
69 select USE_GENERIC_SMP_HELPERS if SMP 74 select USE_GENERIC_SMP_HELPERS if SMP
75 select ARCH_NO_SYSDEV_OPS
70 76
71config INSTRUCTION_DECODER 77config INSTRUCTION_DECODER
72 def_bool (KPROBES || PERF_EVENTS) 78 def_bool (KPROBES || PERF_EVENTS)
@@ -119,7 +125,7 @@ config NEED_SG_DMA_LENGTH
119 def_bool y 125 def_bool y
120 126
121config GENERIC_ISA_DMA 127config GENERIC_ISA_DMA
122 def_bool y 128 def_bool ISA_DMA_API
123 129
124config GENERIC_IOMAP 130config GENERIC_IOMAP
125 def_bool y 131 def_bool y
@@ -139,7 +145,7 @@ config GENERIC_GPIO
139 bool 145 bool
140 146
141config ARCH_MAY_HAVE_PC_FDC 147config ARCH_MAY_HAVE_PC_FDC
142 def_bool y 148 def_bool ISA_DMA_API
143 149
144config RWSEM_GENERIC_SPINLOCK 150config RWSEM_GENERIC_SPINLOCK
145 def_bool !X86_XADD 151 def_bool !X86_XADD
@@ -217,10 +223,6 @@ config X86_HT
217 def_bool y 223 def_bool y
218 depends on SMP 224 depends on SMP
219 225
220config X86_TRAMPOLINE
221 def_bool y
222 depends on SMP || (64BIT && ACPI_SLEEP)
223
224config X86_32_LAZY_GS 226config X86_32_LAZY_GS
225 def_bool y 227 def_bool y
226 depends on X86_32 && !CC_STACKPROTECTOR 228 depends on X86_32 && !CC_STACKPROTECTOR
@@ -382,6 +384,8 @@ config X86_INTEL_CE
382 depends on X86_32 384 depends on X86_32
383 depends on X86_EXTENDED_PLATFORM 385 depends on X86_EXTENDED_PLATFORM
384 select X86_REBOOTFIXUPS 386 select X86_REBOOTFIXUPS
387 select OF
388 select OF_EARLY_FLATTREE
385 ---help--- 389 ---help---
386 Select for the Intel CE media processor (CE4100) SOC. 390 Select for the Intel CE media processor (CE4100) SOC.
387 This option compiles in support for the CE4100 SOC for settop 391 This option compiles in support for the CE4100 SOC for settop
@@ -811,7 +815,7 @@ config X86_LOCAL_APIC
811 815
812config X86_IO_APIC 816config X86_IO_APIC
813 def_bool y 817 def_bool y
814 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 818 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
815 819
816config X86_VISWS_APIC 820config X86_VISWS_APIC
817 def_bool y 821 def_bool y
@@ -1705,7 +1709,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
1705 depends on NUMA 1709 depends on NUMA
1706 1710
1707config USE_PERCPU_NUMA_NODE_ID 1711config USE_PERCPU_NUMA_NODE_ID
1708 def_bool X86_64 1712 def_bool y
1709 depends on NUMA 1713 depends on NUMA
1710 1714
1711menu "Power management and ACPI options" 1715menu "Power management and ACPI options"
@@ -2000,9 +2004,13 @@ source "drivers/pci/pcie/Kconfig"
2000 2004
2001source "drivers/pci/Kconfig" 2005source "drivers/pci/Kconfig"
2002 2006
2003# x86_64 have no ISA slots, but do have ISA-style DMA. 2007# x86_64 have no ISA slots, but can have ISA-style DMA.
2004config ISA_DMA_API 2008config ISA_DMA_API
2005 def_bool y 2009 bool "ISA-style DMA support" if (X86_64 && EXPERT)
2010 default y
2011 help
2012 Enables ISA-style DMA support for devices requiring such controllers.
2013 If unsure, say Y.
2006 2014
2007if X86_32 2015if X86_32
2008 2016
@@ -2066,9 +2074,10 @@ config SCx200HR_TIMER
2066 2074
2067config OLPC 2075config OLPC
2068 bool "One Laptop Per Child support" 2076 bool "One Laptop Per Child support"
2077 depends on !X86_PAE
2069 select GPIOLIB 2078 select GPIOLIB
2070 select OLPC_OPENFIRMWARE 2079 select OF
2071 depends on !X86_64 && !X86_PAE 2080 select OF_PROMTREE if PROC_DEVICETREE
2072 ---help--- 2081 ---help---
2073 Add support for detecting the unique features of the OLPC 2082 Add support for detecting the unique features of the OLPC
2074 XO hardware. 2083 XO hardware.
@@ -2079,21 +2088,6 @@ config OLPC_XO1
2079 ---help--- 2088 ---help---
2080 Add support for non-essential features of the OLPC XO-1 laptop. 2089 Add support for non-essential features of the OLPC XO-1 laptop.
2081 2090
2082config OLPC_OPENFIRMWARE
2083 bool "Support for OLPC's Open Firmware"
2084 depends on !X86_64 && !X86_PAE
2085 default n
2086 select OF
2087 help
2088 This option adds support for the implementation of Open Firmware
2089 that is used on the OLPC XO-1 Children's Machine.
2090 If unsure, say N here.
2091
2092config OLPC_OPENFIRMWARE_DT
2093 bool
2094 default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE
2095 select OF_PROMTREE
2096
2097endif # X86_32 2091endif # X86_32
2098 2092
2099config AMD_NB 2093config AMD_NB
@@ -2104,6 +2098,16 @@ source "drivers/pcmcia/Kconfig"
2104 2098
2105source "drivers/pci/hotplug/Kconfig" 2099source "drivers/pci/hotplug/Kconfig"
2106 2100
2101config RAPIDIO
2102 bool "RapidIO support"
2103 depends on PCI
2104 default n
2105 help
2106 If you say Y here, the kernel will include drivers and
2107 infrastructure code to support RapidIO interconnect devices.
2108
2109source "drivers/rapidio/Kconfig"
2110
2107endmenu 2111endmenu
2108 2112
2109 2113
@@ -2138,6 +2142,11 @@ config SYSVIPC_COMPAT
2138 def_bool y 2142 def_bool y
2139 depends on COMPAT && SYSVIPC 2143 depends on COMPAT && SYSVIPC
2140 2144
2145config KEYS_COMPAT
2146 bool
2147 depends on COMPAT && KEYS
2148 default y
2149
2141endmenu 2150endmenu
2142 2151
2143 2152
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 283c5a6a03a6..d161e939df62 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -294,11 +294,6 @@ config X86_GENERIC
294 294
295endif 295endif
296 296
297config X86_CPU
298 def_bool y
299 select GENERIC_FIND_FIRST_BIT
300 select GENERIC_FIND_NEXT_BIT
301
302# 297#
303# Define implied options from the CPU selection here 298# Define implied options from the CPU selection here
304config X86_INTERNODE_CACHE_SHIFT 299config X86_INTERNODE_CACHE_SHIFT
@@ -331,7 +326,7 @@ config X86_PPRO_FENCE
331 Old PentiumPro multiprocessor systems had errata that could cause 326 Old PentiumPro multiprocessor systems had errata that could cause
332 memory operations to violate the x86 ordering standard in rare cases. 327 memory operations to violate the x86 ordering standard in rare cases.
333 Enabling this option will attempt to work around some (but not all) 328 Enabling this option will attempt to work around some (but not all)
334 occurances of this problem, at the cost of much heavier spinlock and 329 occurrences of this problem, at the cost of much heavier spinlock and
335 memory barrier operations. 330 memory barrier operations.
336 331
337 If unsure, say n here. Even distro kernels should think twice before 332 If unsure, say n here. Even distro kernels should think twice before
@@ -371,7 +366,7 @@ config X86_INTEL_USERCOPY
371 366
372config X86_USE_PPRO_CHECKSUM 367config X86_USE_PPRO_CHECKSUM
373 def_bool y 368 def_bool y
374 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM 369 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
375 370
376config X86_USE_3DNOW 371config X86_USE_3DNOW
377 def_bool y 372 def_bool y
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 646aa78ba5fd..46a823882437 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -62,7 +62,12 @@ int main(int argc, char *argv[])
62 if (fseek(f, -4L, SEEK_END)) { 62 if (fseek(f, -4L, SEEK_END)) {
63 perror(argv[1]); 63 perror(argv[1]);
64 } 64 }
65 fread(&olen, sizeof olen, 1, f); 65
66 if (fread(&olen, sizeof(olen), 1, f) != 1) {
67 perror(argv[1]);
68 return 1;
69 }
70
66 ilen = ftell(f); 71 ilen = ftell(f);
67 olen = getle32(&olen); 72 olen = getle32(&olen);
68 fclose(f); 73 fclose(f);
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index cae3feb1035e..db75d07c3645 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -91,7 +91,7 @@ static int detect_memory_e801(void)
91 if (oreg.ax > 15*1024) { 91 if (oreg.ax > 15*1024) {
92 return -1; /* Bogus! */ 92 return -1; /* Bogus! */
93 } else if (oreg.ax == 15*1024) { 93 } else if (oreg.ax == 15*1024) {
94 boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax; 94 boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
95 } else { 95 } else {
96 /* 96 /*
97 * This ignores memory above 16MB if we have a memory 97 * This ignores memory above 16MB if we have a memory
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 8fe2a4966b7a..be6d9e365a80 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1346,7 +1346,7 @@ _zero_cipher_left_decrypt:
1346 and $15, %r13 # %r13 = arg4 (mod 16) 1346 and $15, %r13 # %r13 = arg4 (mod 16)
1347 je _multiple_of_16_bytes_decrypt 1347 je _multiple_of_16_bytes_decrypt
1348 1348
1349 # Handle the last <16 byte block seperately 1349 # Handle the last <16 byte block separately
1350 1350
1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn 1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352 movdqa SHUF_MASK(%rip), %xmm10 1352 movdqa SHUF_MASK(%rip), %xmm10
@@ -1355,7 +1355,7 @@ _zero_cipher_left_decrypt:
1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) 1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356 sub $16, %r11 1356 sub $16, %r11
1357 add %r13, %r11 1357 add %r13, %r11
1358 movdqu (%arg3,%r11,1), %xmm1 # recieve the last <16 byte block 1358 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1359 lea SHIFT_MASK+16(%rip), %r12 1359 lea SHIFT_MASK+16(%rip), %r12
1360 sub %r13, %r12 1360 sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes 1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
@@ -1607,11 +1607,12 @@ _zero_cipher_left_encrypt:
1607 and $15, %r13 # %r13 = arg4 (mod 16) 1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt 1608 je _multiple_of_16_bytes_encrypt
1609 1609
1610 # Handle the last <16 Byte block seperately 1610 # Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10 1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0 1613 PSHUFB_XMM %xmm10, %xmm0
1614 1614
1615
1615 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 1616 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1616 sub $16, %r11 1617 sub $16, %r11
1617 add %r13, %r11 1618 add %r13, %r11
@@ -1634,7 +1635,9 @@ _zero_cipher_left_encrypt:
1634 # GHASH computation for the last <16 byte block 1635 # GHASH computation for the last <16 byte block
1635 sub %r13, %r11 1636 sub %r13, %r11
1636 add $16, %r11 1637 add $16, %r11
1637 PSHUFB_XMM %xmm10, %xmm1 1638
1639 movdqa SHUF_MASK(%rip), %xmm10
1640 PSHUFB_XMM %xmm10, %xmm0
1638 1641
1639 # shuffle xmm0 back to output as ciphertext 1642 # shuffle xmm0 back to output as ciphertext
1640 1643
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index e1e60c7d5813..2577613fb32b 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -828,9 +828,15 @@ static int rfc4106_init(struct crypto_tfm *tfm)
828 struct cryptd_aead *cryptd_tfm; 828 struct cryptd_aead *cryptd_tfm;
829 struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *) 829 struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
830 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); 830 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
831 struct crypto_aead *cryptd_child;
832 struct aesni_rfc4106_gcm_ctx *child_ctx;
831 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); 833 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
832 if (IS_ERR(cryptd_tfm)) 834 if (IS_ERR(cryptd_tfm))
833 return PTR_ERR(cryptd_tfm); 835 return PTR_ERR(cryptd_tfm);
836
837 cryptd_child = cryptd_aead_child(cryptd_tfm);
838 child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child);
839 memcpy(child_ctx, ctx, sizeof(*ctx));
834 ctx->cryptd_tfm = cryptd_tfm; 840 ctx->cryptd_tfm = cryptd_tfm;
835 tfm->crt_aead.reqsize = sizeof(struct aead_request) 841 tfm->crt_aead.reqsize = sizeof(struct aead_request)
836 + crypto_aead_reqsize(&cryptd_tfm->base); 842 + crypto_aead_reqsize(&cryptd_tfm->base);
@@ -873,22 +879,18 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
873 crypto_ablkcipher_clear_flags(ctr_tfm, ~0); 879 crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
874 880
875 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); 881 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
876 if (ret) { 882 if (ret)
877 crypto_free_ablkcipher(ctr_tfm); 883 goto out_free_ablkcipher;
878 return ret;
879 }
880 884
885 ret = -ENOMEM;
881 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); 886 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
882 if (!req) { 887 if (!req)
883 crypto_free_ablkcipher(ctr_tfm); 888 goto out_free_ablkcipher;
884 return -EINVAL;
885 }
886 889
887 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); 890 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
888 if (!req_data) { 891 if (!req_data)
889 crypto_free_ablkcipher(ctr_tfm); 892 goto out_free_request;
890 return -ENOMEM; 893
891 }
892 memset(req_data->iv, 0, sizeof(req_data->iv)); 894 memset(req_data->iv, 0, sizeof(req_data->iv));
893 895
894 /* Clear the data in the hash sub key container to zero.*/ 896 /* Clear the data in the hash sub key container to zero.*/
@@ -913,8 +915,10 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
913 if (!ret) 915 if (!ret)
914 ret = req_data->result.err; 916 ret = req_data->result.err;
915 } 917 }
916 ablkcipher_request_free(req);
917 kfree(req_data); 918 kfree(req_data);
919out_free_request:
920 ablkcipher_request_free(req);
921out_free_ablkcipher:
918 crypto_free_ablkcipher(ctr_tfm); 922 crypto_free_ablkcipher(ctr_tfm);
919 return ret; 923 return ret;
920} 924}
@@ -925,6 +929,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
925 int ret = 0; 929 int ret = 0;
926 struct crypto_tfm *tfm = crypto_aead_tfm(parent); 930 struct crypto_tfm *tfm = crypto_aead_tfm(parent);
927 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); 931 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
932 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
933 struct aesni_rfc4106_gcm_ctx *child_ctx =
934 aesni_rfc4106_gcm_ctx_get(cryptd_child);
928 u8 *new_key_mem = NULL; 935 u8 *new_key_mem = NULL;
929 936
930 if (key_len < 4) { 937 if (key_len < 4) {
@@ -968,6 +975,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
968 goto exit; 975 goto exit;
969 } 976 }
970 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); 977 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
978 memcpy(child_ctx, ctx, sizeof(*ctx));
971exit: 979exit:
972 kfree(new_key_mem); 980 kfree(new_key_mem);
973 return ret; 981 return ret;
@@ -999,7 +1007,6 @@ static int rfc4106_encrypt(struct aead_request *req)
999 int ret; 1007 int ret;
1000 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 1008 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1001 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); 1009 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1002 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1003 1010
1004 if (!irq_fpu_usable()) { 1011 if (!irq_fpu_usable()) {
1005 struct aead_request *cryptd_req = 1012 struct aead_request *cryptd_req =
@@ -1008,6 +1015,7 @@ static int rfc4106_encrypt(struct aead_request *req)
1008 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); 1015 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1009 return crypto_aead_encrypt(cryptd_req); 1016 return crypto_aead_encrypt(cryptd_req);
1010 } else { 1017 } else {
1018 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1011 kernel_fpu_begin(); 1019 kernel_fpu_begin();
1012 ret = cryptd_child->base.crt_aead.encrypt(req); 1020 ret = cryptd_child->base.crt_aead.encrypt(req);
1013 kernel_fpu_end(); 1021 kernel_fpu_end();
@@ -1020,7 +1028,6 @@ static int rfc4106_decrypt(struct aead_request *req)
1020 int ret; 1028 int ret;
1021 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 1029 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1022 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); 1030 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1023 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1024 1031
1025 if (!irq_fpu_usable()) { 1032 if (!irq_fpu_usable()) {
1026 struct aead_request *cryptd_req = 1033 struct aead_request *cryptd_req =
@@ -1029,6 +1036,7 @@ static int rfc4106_decrypt(struct aead_request *req)
1029 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); 1036 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1030 return crypto_aead_decrypt(cryptd_req); 1037 return crypto_aead_decrypt(cryptd_req);
1031 } else { 1038 } else {
1039 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1032 kernel_fpu_begin(); 1040 kernel_fpu_begin();
1033 ret = cryptd_child->base.crt_aead.decrypt(req); 1041 ret = cryptd_child->base.crt_aead.decrypt(req);
1034 kernel_fpu_end(); 1042 kernel_fpu_end();
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 2d93bdbc9ac0..fd843877e841 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -298,6 +298,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
298 /* OK, This is the point of no return */ 298 /* OK, This is the point of no return */
299 set_personality(PER_LINUX); 299 set_personality(PER_LINUX);
300 set_thread_flag(TIF_IA32); 300 set_thread_flag(TIF_IA32);
301 current->mm->context.ia32_compat = 1;
301 302
302 setup_new_exec(bprm); 303 setup_new_exec(bprm);
303 304
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 518bb99c3394..849a9d23c71d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
25#define sysretl_audit ia32_ret_from_sys_call 25#define sysretl_audit ia32_ret_from_sys_call
26#endif 26#endif
27 27
28 .section .entry.text, "ax"
29
28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 30#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
29 31
30 .macro IA32_ARG_FIXUP noebp=0 32 .macro IA32_ARG_FIXUP noebp=0
@@ -126,26 +128,20 @@ ENTRY(ia32_sysenter_target)
126 */ 128 */
127 ENABLE_INTERRUPTS(CLBR_NONE) 129 ENABLE_INTERRUPTS(CLBR_NONE)
128 movl %ebp,%ebp /* zero extension */ 130 movl %ebp,%ebp /* zero extension */
129 pushq $__USER32_DS 131 pushq_cfi $__USER32_DS
130 CFI_ADJUST_CFA_OFFSET 8
131 /*CFI_REL_OFFSET ss,0*/ 132 /*CFI_REL_OFFSET ss,0*/
132 pushq %rbp 133 pushq_cfi %rbp
133 CFI_ADJUST_CFA_OFFSET 8
134 CFI_REL_OFFSET rsp,0 134 CFI_REL_OFFSET rsp,0
135 pushfq 135 pushfq_cfi
136 CFI_ADJUST_CFA_OFFSET 8
137 /*CFI_REL_OFFSET rflags,0*/ 136 /*CFI_REL_OFFSET rflags,0*/
138 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d 137 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
139 CFI_REGISTER rip,r10 138 CFI_REGISTER rip,r10
140 pushq $__USER32_CS 139 pushq_cfi $__USER32_CS
141 CFI_ADJUST_CFA_OFFSET 8
142 /*CFI_REL_OFFSET cs,0*/ 140 /*CFI_REL_OFFSET cs,0*/
143 movl %eax, %eax 141 movl %eax, %eax
144 pushq %r10 142 pushq_cfi %r10
145 CFI_ADJUST_CFA_OFFSET 8
146 CFI_REL_OFFSET rip,0 143 CFI_REL_OFFSET rip,0
147 pushq %rax 144 pushq_cfi %rax
148 CFI_ADJUST_CFA_OFFSET 8
149 cld 145 cld
150 SAVE_ARGS 0,0,1 146 SAVE_ARGS 0,0,1
151 /* no need to do an access_ok check here because rbp has been 147 /* no need to do an access_ok check here because rbp has been
@@ -182,11 +178,9 @@ sysexit_from_sys_call:
182 xorq %r9,%r9 178 xorq %r9,%r9
183 xorq %r10,%r10 179 xorq %r10,%r10
184 xorq %r11,%r11 180 xorq %r11,%r11
185 popfq 181 popfq_cfi
186 CFI_ADJUST_CFA_OFFSET -8
187 /*CFI_RESTORE rflags*/ 182 /*CFI_RESTORE rflags*/
188 popq %rcx /* User %esp */ 183 popq_cfi %rcx /* User %esp */
189 CFI_ADJUST_CFA_OFFSET -8
190 CFI_REGISTER rsp,rcx 184 CFI_REGISTER rsp,rcx
191 TRACE_IRQS_ON 185 TRACE_IRQS_ON
192 ENABLE_INTERRUPTS_SYSEXIT32 186 ENABLE_INTERRUPTS_SYSEXIT32
@@ -421,8 +415,7 @@ ENTRY(ia32_syscall)
421 */ 415 */
422 ENABLE_INTERRUPTS(CLBR_NONE) 416 ENABLE_INTERRUPTS(CLBR_NONE)
423 movl %eax,%eax 417 movl %eax,%eax
424 pushq %rax 418 pushq_cfi %rax
425 CFI_ADJUST_CFA_OFFSET 8
426 cld 419 cld
427 /* note the registers are not zero extended to the sf. 420 /* note the registers are not zero extended to the sf.
428 this could be a problem. */ 421 this could be a problem. */
@@ -851,4 +844,8 @@ ia32_sys_call_table:
851 .quad sys_fanotify_init 844 .quad sys_fanotify_init
852 .quad sys32_fanotify_mark 845 .quad sys32_fanotify_mark
853 .quad sys_prlimit64 /* 340 */ 846 .quad sys_prlimit64 /* 340 */
847 .quad sys_name_to_handle_at
848 .quad compat_sys_open_by_handle_at
849 .quad compat_sys_clock_adjtime
850 .quad sys_syncfs
854ia32_syscall_end: 851ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 211ca3f7fd16..12e0e7dd869c 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -29,6 +29,7 @@
29#include <asm/processor.h> 29#include <asm/processor.h>
30#include <asm/mmu.h> 30#include <asm/mmu.h>
31#include <asm/mpspec.h> 31#include <asm/mpspec.h>
32#include <asm/trampoline.h>
32 33
33#define COMPILER_DEPENDENT_INT64 long long 34#define COMPILER_DEPENDENT_INT64 long long
34#define COMPILER_DEPENDENT_UINT64 unsigned long long 35#define COMPILER_DEPENDENT_UINT64 unsigned long long
@@ -88,6 +89,7 @@ extern int acpi_disabled;
88extern int acpi_pci_disabled; 89extern int acpi_pci_disabled;
89extern int acpi_skip_timer_override; 90extern int acpi_skip_timer_override;
90extern int acpi_use_timer_override; 91extern int acpi_use_timer_override;
92extern int acpi_fix_pin2_polarity;
91 93
92extern u8 acpi_sci_flags; 94extern u8 acpi_sci_flags;
93extern int acpi_sci_override_gsi; 95extern int acpi_sci_override_gsi;
@@ -112,11 +114,11 @@ static inline void acpi_disable_pci(void)
112 acpi_noirq_set(); 114 acpi_noirq_set();
113} 115}
114 116
115/* routines for saving/restoring kernel state */ 117/* Low-level suspend routine. */
116extern int acpi_save_state_mem(void); 118extern int acpi_suspend_lowlevel(void);
117extern void acpi_restore_state_mem(void);
118 119
119extern unsigned long acpi_wakeup_address; 120extern const unsigned char acpi_wakeup_code[];
121#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))
120 122
121/* early initialization routine */ 123/* early initialization routine */
122extern void acpi_reserve_wakeup_memory(void); 124extern void acpi_reserve_wakeup_memory(void);
@@ -185,15 +187,7 @@ struct bootnode;
185 187
186#ifdef CONFIG_ACPI_NUMA 188#ifdef CONFIG_ACPI_NUMA
187extern int acpi_numa; 189extern int acpi_numa;
188extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start, 190extern int x86_acpi_numa_init(void);
189 unsigned long end);
190extern int acpi_scan_nodes(unsigned long start, unsigned long end);
191#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
192
193#ifdef CONFIG_NUMA_EMU
194extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
195 int num_nodes);
196#endif
197#endif /* CONFIG_ACPI_NUMA */ 191#endif /* CONFIG_ACPI_NUMA */
198 192
199#define acpi_unlazy_tlb(x) leave_mm(x) 193#define acpi_unlazy_tlb(x) leave_mm(x)
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 64dc82ee19f0..331682231bb4 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -9,23 +9,20 @@ struct amd_nb_bus_dev_range {
9 u8 dev_limit; 9 u8 dev_limit;
10}; 10};
11 11
12extern struct pci_device_id amd_nb_misc_ids[]; 12extern const struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; 13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
14struct bootnode; 14struct bootnode;
15 15
16extern int early_is_amd_nb(u32 value); 16extern bool early_is_amd_nb(u32 value);
17extern int amd_cache_northbridges(void); 17extern int amd_cache_northbridges(void);
18extern void amd_flush_garts(void); 18extern void amd_flush_garts(void);
19extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); 19extern int amd_numa_init(void);
20extern int amd_scan_nodes(void); 20extern int amd_get_subcaches(int);
21 21extern int amd_set_subcaches(int, int);
22#ifdef CONFIG_NUMA_EMU
23extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
24extern void amd_get_nodes(struct bootnode *nodes);
25#endif
26 22
27struct amd_northbridge { 23struct amd_northbridge {
28 struct pci_dev *misc; 24 struct pci_dev *misc;
25 struct pci_dev *link;
29}; 26};
30 27
31struct amd_northbridge_info { 28struct amd_northbridge_info {
@@ -35,17 +32,18 @@ struct amd_northbridge_info {
35}; 32};
36extern struct amd_northbridge_info amd_northbridges; 33extern struct amd_northbridge_info amd_northbridges;
37 34
38#define AMD_NB_GART 0x1 35#define AMD_NB_GART BIT(0)
39#define AMD_NB_L3_INDEX_DISABLE 0x2 36#define AMD_NB_L3_INDEX_DISABLE BIT(1)
37#define AMD_NB_L3_PARTITIONING BIT(2)
40 38
41#ifdef CONFIG_AMD_NB 39#ifdef CONFIG_AMD_NB
42 40
43static inline int amd_nb_num(void) 41static inline u16 amd_nb_num(void)
44{ 42{
45 return amd_northbridges.num; 43 return amd_northbridges.num;
46} 44}
47 45
48static inline int amd_nb_has_feature(int feature) 46static inline bool amd_nb_has_feature(unsigned feature)
49{ 47{
50 return ((amd_northbridges.flags & feature) == feature); 48 return ((amd_northbridges.flags & feature) == feature);
51} 49}
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3c896946f4cc..2b7d573be549 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -2,7 +2,6 @@
2#define _ASM_X86_APIC_H 2#define _ASM_X86_APIC_H
3 3
4#include <linux/cpumask.h> 4#include <linux/cpumask.h>
5#include <linux/delay.h>
6#include <linux/pm.h> 5#include <linux/pm.h>
7 6
8#include <asm/alternative.h> 7#include <asm/alternative.h>
@@ -220,7 +219,6 @@ extern void enable_IR_x2apic(void);
220 219
221extern int get_physical_broadcast(void); 220extern int get_physical_broadcast(void);
222 221
223extern void apic_disable(void);
224extern int lapic_get_maxlvt(void); 222extern int lapic_get_maxlvt(void);
225extern void clear_local_APIC(void); 223extern void clear_local_APIC(void);
226extern void connect_bsp_APIC(void); 224extern void connect_bsp_APIC(void);
@@ -228,7 +226,6 @@ extern void disconnect_bsp_APIC(int virt_wire_setup);
228extern void disable_local_APIC(void); 226extern void disable_local_APIC(void);
229extern void lapic_shutdown(void); 227extern void lapic_shutdown(void);
230extern int verify_local_APIC(void); 228extern int verify_local_APIC(void);
231extern void cache_APIC_registers(void);
232extern void sync_Arb_IDs(void); 229extern void sync_Arb_IDs(void);
233extern void init_bsp_APIC(void); 230extern void init_bsp_APIC(void);
234extern void setup_local_APIC(void); 231extern void setup_local_APIC(void);
@@ -239,8 +236,7 @@ void register_lapic_address(unsigned long address);
239extern void setup_boot_APIC_clock(void); 236extern void setup_boot_APIC_clock(void);
240extern void setup_secondary_APIC_clock(void); 237extern void setup_secondary_APIC_clock(void);
241extern int APIC_init_uniprocessor(void); 238extern int APIC_init_uniprocessor(void);
242extern void enable_NMI_through_LVT0(void); 239extern int apic_force_enable(unsigned long addr);
243extern int apic_force_enable(void);
244 240
245/* 241/*
246 * On 32bit this is mach-xxx local 242 * On 32bit this is mach-xxx local
@@ -261,7 +257,6 @@ static inline void lapic_shutdown(void) { }
261#define local_apic_timer_c2_ok 1 257#define local_apic_timer_c2_ok 1
262static inline void init_apic_mappings(void) { } 258static inline void init_apic_mappings(void) { }
263static inline void disable_local_APIC(void) { } 259static inline void disable_local_APIC(void) { }
264static inline void apic_disable(void) { }
265# define setup_boot_APIC_clock x86_init_noop 260# define setup_boot_APIC_clock x86_init_noop
266# define setup_secondary_APIC_clock x86_init_noop 261# define setup_secondary_APIC_clock x86_init_noop
267#endif /* !CONFIG_X86_LOCAL_APIC */ 262#endif /* !CONFIG_X86_LOCAL_APIC */
@@ -307,8 +302,6 @@ struct apic {
307 302
308 void (*setup_apic_routing)(void); 303 void (*setup_apic_routing)(void);
309 int (*multi_timer_check)(int apic, int irq); 304 int (*multi_timer_check)(int apic, int irq);
310 int (*apicid_to_node)(int logical_apicid);
311 int (*cpu_to_logical_apicid)(int cpu);
312 int (*cpu_present_to_apicid)(int mps_cpu); 305 int (*cpu_present_to_apicid)(int mps_cpu);
313 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); 306 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
314 void (*setup_portio_remap)(void); 307 void (*setup_portio_remap)(void);
@@ -356,6 +349,23 @@ struct apic {
356 void (*icr_write)(u32 low, u32 high); 349 void (*icr_write)(u32 low, u32 high);
357 void (*wait_icr_idle)(void); 350 void (*wait_icr_idle)(void);
358 u32 (*safe_wait_icr_idle)(void); 351 u32 (*safe_wait_icr_idle)(void);
352
353#ifdef CONFIG_X86_32
354 /*
355 * Called very early during boot from get_smp_config(). It should
356 * return the logical apicid. x86_[bios]_cpu_to_apicid is
357 * initialized before this function is called.
358 *
359 * If logical apicid can't be determined that early, the function
360 * may return BAD_APICID. Logical apicid will be configured after
361 * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity
362 * won't be applied properly during early boot in this case.
363 */
364 int (*x86_32_early_logical_apicid)(int cpu);
365
366 /* determine CPU -> NUMA node mapping */
367 int (*x86_32_numa_cpu_node)(int cpu);
368#endif
359}; 369};
360 370
361/* 371/*
@@ -503,6 +513,11 @@ extern struct apic apic_noop;
503 513
504extern struct apic apic_default; 514extern struct apic apic_default;
505 515
516static inline int noop_x86_32_early_logical_apicid(int cpu)
517{
518 return BAD_APICID;
519}
520
506/* 521/*
507 * Set up the logical destination ID. 522 * Set up the logical destination ID.
508 * 523 *
@@ -522,7 +537,7 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
522 return cpuid_apic >> index_msb; 537 return cpuid_apic >> index_msb;
523} 538}
524 539
525extern int default_apicid_to_node(int logical_apicid); 540extern int default_x86_32_numa_cpu_node(int cpu);
526 541
527#endif 542#endif
528 543
@@ -558,12 +573,6 @@ static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_ma
558 *retmap = *phys_map; 573 *retmap = *phys_map;
559} 574}
560 575
561/* Mapping from cpu number to logical apicid */
562static inline int default_cpu_to_logical_apicid(int cpu)
563{
564 return 1 << cpu;
565}
566
567static inline int __default_cpu_present_to_apicid(int mps_cpu) 576static inline int __default_cpu_present_to_apicid(int mps_cpu)
568{ 577{
569 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) 578 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
@@ -596,8 +605,4 @@ extern int default_check_phys_apicid_present(int phys_apicid);
596 605
597#endif /* CONFIG_X86_LOCAL_APIC */ 606#endif /* CONFIG_X86_LOCAL_APIC */
598 607
599#ifdef CONFIG_X86_32
600extern u8 cpu_2_logical_apicid[NR_CPUS];
601#endif
602
603#endif /* _ASM_X86_APIC_H */ 608#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 47a30ff8e517..d87988bacf3e 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -426,4 +426,16 @@ struct local_apic {
426#else 426#else
427 #define BAD_APICID 0xFFFFu 427 #define BAD_APICID 0xFFFFu
428#endif 428#endif
429
430enum ioapic_irq_destination_types {
431 dest_Fixed = 0,
432 dest_LowestPrio = 1,
433 dest_SMI = 2,
434 dest__reserved_1 = 3,
435 dest_NMI = 4,
436 dest_INIT = 5,
437 dest__reserved_2 = 6,
438 dest_ExtINT = 7
439};
440
429#endif /* _ASM_X86_APICDEF_H */ 441#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 903683b07e42..69d58131bc8e 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -456,14 +456,12 @@ static inline int fls(int x)
456 456
457#ifdef __KERNEL__ 457#ifdef __KERNEL__
458 458
459#include <asm-generic/bitops/ext2-non-atomic.h> 459#include <asm-generic/bitops/le.h>
460 460
461#define ext2_set_bit_atomic(lock, nr, addr) \ 461#define ext2_set_bit_atomic(lock, nr, addr) \
462 test_and_set_bit((nr), (unsigned long *)(addr)) 462 test_and_set_bit((nr), (unsigned long *)(addr))
463#define ext2_clear_bit_atomic(lock, nr, addr) \ 463#define ext2_clear_bit_atomic(lock, nr, addr) \
464 test_and_clear_bit((nr), (unsigned long *)(addr)) 464 test_and_clear_bit((nr), (unsigned long *)(addr))
465 465
466#include <asm-generic/bitops/minix.h>
467
468#endif /* __KERNEL__ */ 466#endif /* __KERNEL__ */
469#endif /* _ASM_X86_BITOPS_H */ 467#endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index c8bfe63a06de..e020d88ec02d 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -12,6 +12,7 @@
12/* setup data types */ 12/* setup data types */
13#define SETUP_NONE 0 13#define SETUP_NONE 0
14#define SETUP_E820_EXT 1 14#define SETUP_E820_EXT 1
15#define SETUP_DTB 2
15 16
16/* extensible setup data list node */ 17/* extensible setup data list node */
17struct setup_data { 18struct setup_data {
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 62f084478f7e..4e12668711e5 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -71,7 +71,7 @@ static inline void set_page_memtype(struct page *pg, unsigned long memtype) { }
71 * Read/Write : ReadOnly, ReadWrite 71 * Read/Write : ReadOnly, ReadWrite
72 * Presence : NotPresent 72 * Presence : NotPresent
73 * 73 *
74 * Within a catagory, the attributes are mutually exclusive. 74 * Within a category, the attributes are mutually exclusive.
75 * 75 *
76 * The implementation of this API will take care of various aspects that 76 * The implementation of this API will take care of various aspects that
77 * are associated with changing such attributes, such as: 77 * are associated with changing such attributes, such as:
diff --git a/arch/x86/include/asm/ce4100.h b/arch/x86/include/asm/ce4100.h
new file mode 100644
index 000000000000..e656ad8c0a2e
--- /dev/null
+++ b/arch/x86/include/asm/ce4100.h
@@ -0,0 +1,6 @@
1#ifndef _ASM_CE4100_H_
2#define _ASM_CE4100_H_
3
4int ce4100_pci_init(void);
5
6#endif
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 220e2ea08e80..91f3e087cf21 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -160,6 +160,7 @@
160#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ 160#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
161#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ 161#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
162#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ 162#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
163#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */
163 164
164/* 165/*
165 * Auxiliary flags: Linux defined - For features scattered in various 166 * Auxiliary flags: Linux defined - For features scattered in various
@@ -279,6 +280,7 @@ extern const char * const x86_power_flags[32];
279#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 280#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
280#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 281#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
281#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) 282#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
283#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
282 284
283#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 285#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
284# define cpu_has_invlpg 1 286# define cpu_has_invlpg 1
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index ca1098a7e580..057099e5faba 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -10,7 +10,6 @@
10 10
11#include <linux/spinlock.h> /* And spinlocks */ 11#include <linux/spinlock.h> /* And spinlocks */
12#include <asm/io.h> /* need byte IO */ 12#include <asm/io.h> /* need byte IO */
13#include <linux/delay.h>
14 13
15#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER 14#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
16#define dma_outb outb_p 15#define dma_outb outb_p
@@ -151,6 +150,7 @@
151#define DMA_AUTOINIT 0x10 150#define DMA_AUTOINIT 0x10
152 151
153 152
153#ifdef CONFIG_ISA_DMA_API
154extern spinlock_t dma_spin_lock; 154extern spinlock_t dma_spin_lock;
155 155
156static inline unsigned long claim_dma_lock(void) 156static inline unsigned long claim_dma_lock(void)
@@ -164,6 +164,7 @@ static inline void release_dma_lock(unsigned long flags)
164{ 164{
165 spin_unlock_irqrestore(&dma_spin_lock, flags); 165 spin_unlock_irqrestore(&dma_spin_lock, flags);
166} 166}
167#endif /* CONFIG_ISA_DMA_API */
167 168
168/* enable/disable a specific DMA channel */ 169/* enable/disable a specific DMA channel */
169static inline void enable_dma(unsigned int dmanr) 170static inline void enable_dma(unsigned int dmanr)
@@ -303,9 +304,11 @@ static inline int get_dma_residue(unsigned int dmanr)
303} 304}
304 305
305 306
306/* These are in kernel/dma.c: */ 307/* These are in kernel/dma.c because x86 uses CONFIG_GENERIC_ISA_DMA */
308#ifdef CONFIG_ISA_DMA_API
307extern int request_dma(unsigned int dmanr, const char *device_id); 309extern int request_dma(unsigned int dmanr, const char *device_id);
308extern void free_dma(unsigned int dmanr); 310extern void free_dma(unsigned int dmanr);
311#endif
309 312
310/* From PCI */ 313/* From PCI */
311 314
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index e99d55d74df5..908b96957d88 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -96,7 +96,7 @@ extern void e820_setup_gap(void);
96extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, 96extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
97 unsigned long start_addr, unsigned long long end_addr); 97 unsigned long start_addr, unsigned long long end_addr);
98struct setup_data; 98struct setup_data;
99extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); 99extern void parse_e820_ext(struct setup_data *data);
100 100
101#if defined(CONFIG_X86_64) || \ 101#if defined(CONFIG_X86_64) || \
102 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) 102 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 57650ab4a5f5..1cd6d26a0a8d 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -16,10 +16,13 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) 17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
18 18
19.irpc idx, "01234567" 19.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
20 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
21.if NUM_INVALIDATE_TLB_VECTORS > \idx
20BUILD_INTERRUPT3(invalidate_interrupt\idx, 22BUILD_INTERRUPT3(invalidate_interrupt\idx,
21 (INVALIDATE_TLB_VECTOR_START)+\idx, 23 (INVALIDATE_TLB_VECTOR_START)+\idx,
22 smp_invalidate_interrupt) 24 smp_invalidate_interrupt)
25.endif
23.endr 26.endr
24#endif 27#endif
25 28
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 06850a7194e1..2c6fc9e62812 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -7,14 +7,12 @@
7 frame pointer later */ 7 frame pointer later */
8#ifdef CONFIG_FRAME_POINTER 8#ifdef CONFIG_FRAME_POINTER
9 .macro FRAME 9 .macro FRAME
10 pushl %ebp 10 pushl_cfi %ebp
11 CFI_ADJUST_CFA_OFFSET 4
12 CFI_REL_OFFSET ebp,0 11 CFI_REL_OFFSET ebp,0
13 movl %esp,%ebp 12 movl %esp,%ebp
14 .endm 13 .endm
15 .macro ENDFRAME 14 .macro ENDFRAME
16 popl %ebp 15 popl_cfi %ebp
17 CFI_ADJUST_CFA_OFFSET -4
18 CFI_RESTORE ebp 16 CFI_RESTORE ebp
19 .endm 17 .endm
20#else 18#else
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 1f11ce44e956..d09bb03653f0 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -37,7 +37,7 @@
37 "+m" (*uaddr), "=&r" (tem) \ 37 "+m" (*uaddr), "=&r" (tem) \
38 : "r" (oparg), "i" (-EFAULT), "1" (0)) 38 : "r" (oparg), "i" (-EFAULT), "1" (0))
39 39
40static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 40static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
41{ 41{
42 int op = (encoded_op >> 28) & 7; 42 int op = (encoded_op >> 28) & 7;
43 int cmp = (encoded_op >> 24) & 15; 43 int cmp = (encoded_op >> 24) & 15;
@@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
49 oparg = 1 << oparg; 49 oparg = 1 << oparg;
50 50
51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
52 return -EFAULT; 52 return -EFAULT;
53 53
54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) 54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
@@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
109 return ret; 109 return ret;
110} 110}
111 111
112static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, 112static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
113 int newval) 113 u32 oldval, u32 newval)
114{ 114{
115 int ret = 0;
115 116
116#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) 117#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
117 /* Real i386 machines have no cmpxchg instruction */ 118 /* Real i386 machines have no cmpxchg instruction */
@@ -119,21 +120,22 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
119 return -ENOSYS; 120 return -ENOSYS;
120#endif 121#endif
121 122
122 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 123 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
123 return -EFAULT; 124 return -EFAULT;
124 125
125 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" 126 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
126 "2:\t.section .fixup, \"ax\"\n" 127 "2:\t.section .fixup, \"ax\"\n"
127 "3:\tmov %2, %0\n" 128 "3:\tmov %3, %0\n"
128 "\tjmp 2b\n" 129 "\tjmp 2b\n"
129 "\t.previous\n" 130 "\t.previous\n"
130 _ASM_EXTABLE(1b, 3b) 131 _ASM_EXTABLE(1b, 3b)
131 : "=a" (oldval), "+m" (*uaddr) 132 : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
132 : "i" (-EFAULT), "r" (newval), "0" (oldval) 133 : "i" (-EFAULT), "r" (newval), "1" (oldval)
133 : "memory" 134 : "memory"
134 ); 135 );
135 136
136 return oldval; 137 *uval = oldval;
138 return ret;
137} 139}
138 140
139#endif 141#endif
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 43085bfc99c3..156cd5d18d2a 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -66,7 +66,7 @@ static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
66 * Don't enable translation but enable GART IO and CPU accesses. 66 * Don't enable translation but enable GART IO and CPU accesses.
67 * Also, set DISTLBWALKPRB since GART tables memory is UC. 67 * Also, set DISTLBWALKPRB since GART tables memory is UC.
68 */ 68 */
69 ctl = DISTLBWALKPRB | order << 1; 69 ctl = order << 1;
70 70
71 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); 71 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
72} 72}
@@ -75,17 +75,17 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
75{ 75{
76 u32 tmp, ctl; 76 u32 tmp, ctl;
77 77
78 /* address of the mappings table */ 78 /* address of the mappings table */
79 addr >>= 12; 79 addr >>= 12;
80 tmp = (u32) addr<<4; 80 tmp = (u32) addr<<4;
81 tmp &= ~0xf; 81 tmp &= ~0xf;
82 pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); 82 pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
83 83
84 /* Enable GART translation for this hammer. */ 84 /* Enable GART translation for this hammer. */
85 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 85 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
86 ctl |= GARTEN; 86 ctl |= GARTEN | DISTLBWALKPRB;
87 ctl &= ~(DISGARTCPU | DISGARTIO); 87 ctl &= ~(DISGARTCPU | DISGARTIO);
88 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); 88 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
89} 89}
90 90
91static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) 91static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 0274ec5a7e62..bb9efe8706e2 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -45,6 +45,30 @@ extern void invalidate_interrupt4(void);
45extern void invalidate_interrupt5(void); 45extern void invalidate_interrupt5(void);
46extern void invalidate_interrupt6(void); 46extern void invalidate_interrupt6(void);
47extern void invalidate_interrupt7(void); 47extern void invalidate_interrupt7(void);
48extern void invalidate_interrupt8(void);
49extern void invalidate_interrupt9(void);
50extern void invalidate_interrupt10(void);
51extern void invalidate_interrupt11(void);
52extern void invalidate_interrupt12(void);
53extern void invalidate_interrupt13(void);
54extern void invalidate_interrupt14(void);
55extern void invalidate_interrupt15(void);
56extern void invalidate_interrupt16(void);
57extern void invalidate_interrupt17(void);
58extern void invalidate_interrupt18(void);
59extern void invalidate_interrupt19(void);
60extern void invalidate_interrupt20(void);
61extern void invalidate_interrupt21(void);
62extern void invalidate_interrupt22(void);
63extern void invalidate_interrupt23(void);
64extern void invalidate_interrupt24(void);
65extern void invalidate_interrupt25(void);
66extern void invalidate_interrupt26(void);
67extern void invalidate_interrupt27(void);
68extern void invalidate_interrupt28(void);
69extern void invalidate_interrupt29(void);
70extern void invalidate_interrupt30(void);
71extern void invalidate_interrupt31(void);
48 72
49extern void irq_move_cleanup_interrupt(void); 73extern void irq_move_cleanup_interrupt(void);
50extern void reboot_interrupt(void); 74extern void reboot_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index ef328901c802..c9e09ea05644 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -237,7 +237,7 @@ static inline void fpu_save_init(struct fpu *fpu)
237 } else if (use_fxsr()) { 237 } else if (use_fxsr()) {
238 fpu_fxsave(fpu); 238 fpu_fxsave(fpu);
239 } else { 239 } else {
240 asm volatile("fsave %[fx]; fwait" 240 asm volatile("fnsave %[fx]; fwait"
241 : [fx] "=m" (fpu->state->fsave)); 241 : [fx] "=m" (fpu->state->fsave));
242 return; 242 return;
243 } 243 }
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index fc1f579fb965..65aaa91d5850 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -6,6 +6,8 @@
6#define PIT_CH0 0x40 6#define PIT_CH0 0x40
7#define PIT_CH2 0x42 7#define PIT_CH2 0x42
8 8
9#define PIT_LATCH LATCH
10
9extern raw_spinlock_t i8253_lock; 11extern raw_spinlock_t i8253_lock;
10 12
11extern struct clock_event_device *global_clock_event; 13extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 36fb1a6a5109..8dbe353e41e1 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -11,8 +11,8 @@ kernel_physical_mapping_init(unsigned long start,
11 unsigned long page_size_mask); 11 unsigned long page_size_mask);
12 12
13 13
14extern unsigned long __initdata e820_table_start; 14extern unsigned long __initdata pgt_buf_start;
15extern unsigned long __meminitdata e820_table_end; 15extern unsigned long __meminitdata pgt_buf_end;
16extern unsigned long __meminitdata e820_table_top; 16extern unsigned long __meminitdata pgt_buf_top;
17 17
18#endif /* _ASM_X86_INIT_32_H */ 18#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index f327d386d6cc..a97a240f67f3 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -63,17 +63,6 @@ union IO_APIC_reg_03 {
63 } __attribute__ ((packed)) bits; 63 } __attribute__ ((packed)) bits;
64}; 64};
65 65
66enum ioapic_irq_destination_types {
67 dest_Fixed = 0,
68 dest_LowestPrio = 1,
69 dest_SMI = 2,
70 dest__reserved_1 = 3,
71 dest_NMI = 4,
72 dest_INIT = 5,
73 dest__reserved_2 = 6,
74 dest_ExtINT = 7
75};
76
77struct IO_APIC_route_entry { 66struct IO_APIC_route_entry {
78 __u32 vector : 8, 67 __u32 vector : 8,
79 delivery_mode : 3, /* 000: FIXED 68 delivery_mode : 3, /* 000: FIXED
@@ -106,6 +95,10 @@ struct IR_IO_APIC_route_entry {
106 index : 15; 95 index : 15;
107} __attribute__ ((packed)); 96} __attribute__ ((packed));
108 97
98#define IOAPIC_AUTO -1
99#define IOAPIC_EDGE 0
100#define IOAPIC_LEVEL 1
101
109#ifdef CONFIG_X86_IO_APIC 102#ifdef CONFIG_X86_IO_APIC
110 103
111/* 104/*
@@ -150,11 +143,6 @@ extern int timer_through_8259;
150#define io_apic_assign_pci_irqs \ 143#define io_apic_assign_pci_irqs \
151 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) 144 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
152 145
153extern u8 io_apic_unique_id(u8 id);
154extern int io_apic_get_unique_id(int ioapic, int apic_id);
155extern int io_apic_get_version(int ioapic);
156extern int io_apic_get_redir_entries(int ioapic);
157
158struct io_apic_irq_attr; 146struct io_apic_irq_attr;
159extern int io_apic_set_pci_routing(struct device *dev, int irq, 147extern int io_apic_set_pci_routing(struct device *dev, int irq,
160 struct io_apic_irq_attr *irq_attr); 148 struct io_apic_irq_attr *irq_attr);
@@ -162,6 +150,8 @@ void setup_IO_APIC_irq_extra(u32 gsi);
162extern void ioapic_and_gsi_init(void); 150extern void ioapic_and_gsi_init(void);
163extern void ioapic_insert_resources(void); 151extern void ioapic_insert_resources(void);
164 152
153int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
154
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 155extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 156extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
167extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 157extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
@@ -186,6 +176,8 @@ extern void __init pre_init_apic_IRQ0(void);
186 176
187extern void mp_save_irq(struct mpc_intsrc *m); 177extern void mp_save_irq(struct mpc_intsrc *m);
188 178
179extern void disable_ioapic_support(void);
180
189#else /* !CONFIG_X86_IO_APIC */ 181#else /* !CONFIG_X86_IO_APIC */
190 182
191#define io_apic_assign_pci_irqs 0 183#define io_apic_assign_pci_irqs 0
@@ -199,6 +191,26 @@ static inline int mp_find_ioapic(u32 gsi) { return 0; }
199struct io_apic_irq_attr; 191struct io_apic_irq_attr;
200static inline int io_apic_set_pci_routing(struct device *dev, int irq, 192static inline int io_apic_set_pci_routing(struct device *dev, int irq,
201 struct io_apic_irq_attr *irq_attr) { return 0; } 193 struct io_apic_irq_attr *irq_attr) { return 0; }
194
195static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void)
196{
197 return NULL;
198}
199
200static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { }
201static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent)
202{
203 return -ENOMEM;
204}
205
206static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { }
207static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent)
208{
209 return -ENOMEM;
210}
211
212static inline void mp_save_irq(struct mpc_intsrc *m) { };
213static inline void disable_ioapic_support(void) { }
202#endif 214#endif
203 215
204#endif /* _ASM_X86_IO_APIC_H */ 216#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index 0b7228268a63..615fa9061b57 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -123,10 +123,6 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
123 int vector); 123 int vector);
124extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, 124extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
125 int vector); 125 int vector);
126extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
127 int vector);
128extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
129 int vector);
130 126
131/* Avoid include hell */ 127/* Avoid include hell */
132#define NMI_VECTOR 0x02 128#define NMI_VECTOR 0x02
@@ -150,6 +146,10 @@ static inline void __default_local_send_IPI_all(int vector)
150} 146}
151 147
152#ifdef CONFIG_X86_32 148#ifdef CONFIG_X86_32
149extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
150 int vector);
151extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
152 int vector);
153extern void default_send_IPI_mask_logical(const struct cpumask *mask, 153extern void default_send_IPI_mask_logical(const struct cpumask *mask,
154 int vector); 154 int vector);
155extern void default_send_IPI_allbutself(int vector); 155extern void default_send_IPI_allbutself(int vector);
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index c704b38c57a2..ba870bb6dd8e 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -10,9 +10,6 @@
10#include <asm/apicdef.h> 10#include <asm/apicdef.h>
11#include <asm/irq_vectors.h> 11#include <asm/irq_vectors.h>
12 12
13/* Even though we don't support this, supply it to appease OF */
14static inline void irq_dispose_mapping(unsigned int virq) { }
15
16static inline int irq_canonicalize(int irq) 13static inline int irq_canonicalize(int irq)
17{ 14{
18 return ((irq == 2) ? 9 : irq); 15 return ((irq == 2) ? 9 : irq);
diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h
new file mode 100644
index 000000000000..423bbbddf36d
--- /dev/null
+++ b/arch/x86/include/asm/irq_controller.h
@@ -0,0 +1,12 @@
1#ifndef __IRQ_CONTROLLER__
2#define __IRQ_CONTROLLER__
3
4struct irq_domain {
5 int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize,
6 u32 *out_hwirq, u32 *out_type);
7 void *priv;
8 struct device_node *controller;
9 struct list_head l;
10};
11
12#endif
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6af0894dafb4..6e976ee3b3ef 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_IRQ_VECTORS_H 1#ifndef _ASM_X86_IRQ_VECTORS_H
2#define _ASM_X86_IRQ_VECTORS_H 2#define _ASM_X86_IRQ_VECTORS_H
3 3
4#include <linux/threads.h>
4/* 5/*
5 * Linux IRQ vector layout. 6 * Linux IRQ vector layout.
6 * 7 *
@@ -16,8 +17,8 @@
16 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events 17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
17 * Vectors 32 ... 127 : device interrupts 18 * Vectors 32 ... 127 : device interrupts
18 * Vector 128 : legacy int80 syscall interface 19 * Vector 128 : legacy int80 syscall interface
19 * Vectors 129 ... 237 : device interrupts 20 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
20 * Vectors 238 ... 255 : special interrupts 21 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
21 * 22 *
22 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. 23 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
23 * 24 *
@@ -96,37 +97,43 @@
96#define THRESHOLD_APIC_VECTOR 0xf9 97#define THRESHOLD_APIC_VECTOR 0xf9
97#define REBOOT_VECTOR 0xf8 98#define REBOOT_VECTOR 0xf8
98 99
99/* f0-f7 used for spreading out TLB flushes: */
100#define INVALIDATE_TLB_VECTOR_END 0xf7
101#define INVALIDATE_TLB_VECTOR_START 0xf0
102#define NUM_INVALIDATE_TLB_VECTORS 8
103
104/*
105 * Local APIC timer IRQ vector is on a different priority level,
106 * to work around the 'lost local interrupt if more than 2 IRQ
107 * sources per level' errata.
108 */
109#define LOCAL_TIMER_VECTOR 0xef
110
111/* 100/*
112 * Generic system vector for platform specific use 101 * Generic system vector for platform specific use
113 */ 102 */
114#define X86_PLATFORM_IPI_VECTOR 0xed 103#define X86_PLATFORM_IPI_VECTOR 0xf7
115 104
116/* 105/*
117 * IRQ work vector: 106 * IRQ work vector:
118 */ 107 */
119#define IRQ_WORK_VECTOR 0xec 108#define IRQ_WORK_VECTOR 0xf6
120 109
121#define UV_BAU_MESSAGE 0xea 110#define UV_BAU_MESSAGE 0xf5
122 111
123/* 112/*
124 * Self IPI vector for machine checks 113 * Self IPI vector for machine checks
125 */ 114 */
126#define MCE_SELF_VECTOR 0xeb 115#define MCE_SELF_VECTOR 0xf4
127 116
128/* Xen vector callback to receive events in a HVM domain */ 117/* Xen vector callback to receive events in a HVM domain */
129#define XEN_HVM_EVTCHN_CALLBACK 0xe9 118#define XEN_HVM_EVTCHN_CALLBACK 0xf3
119
120/*
121 * Local APIC timer IRQ vector is on a different priority level,
122 * to work around the 'lost local interrupt if more than 2 IRQ
123 * sources per level' errata.
124 */
125#define LOCAL_TIMER_VECTOR 0xef
126
127/* up to 32 vectors used for spreading out TLB flushes: */
128#if NR_CPUS <= 32
129# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS)
130#else
131# define NUM_INVALIDATE_TLB_VECTORS (32)
132#endif
133
134#define INVALIDATE_TLB_VECTOR_END (0xee)
135#define INVALIDATE_TLB_VECTOR_START \
136 (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
130 137
131#define NR_VECTORS 256 138#define NR_VECTORS 256
132 139
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index ca242d35e873..fe2cc6e105fa 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -13,7 +13,6 @@ enum die_val {
13 DIE_PANIC, 13 DIE_PANIC,
14 DIE_NMI, 14 DIE_NMI,
15 DIE_DIE, 15 DIE_DIE,
16 DIE_NMIWATCHDOG,
17 DIE_KERNELDEBUG, 16 DIE_KERNELDEBUG,
18 DIE_TRAP, 17 DIE_TRAP,
19 DIE_GPF, 18 DIE_GPF,
@@ -27,7 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
27extern int __must_check __die(const char *, struct pt_regs *, long); 26extern int __must_check __die(const char *, struct pt_regs *, long);
28extern void show_registers(struct pt_regs *regs); 27extern void show_registers(struct pt_regs *regs);
29extern void show_trace(struct task_struct *t, struct pt_regs *regs, 28extern void show_trace(struct task_struct *t, struct pt_regs *regs,
30 unsigned long *sp); 29 unsigned long *sp, unsigned long bp);
31extern void __show_regs(struct pt_regs *regs, int all); 30extern void __show_regs(struct pt_regs *regs, int all);
32extern void show_regs(struct pt_regs *regs); 31extern void show_regs(struct pt_regs *regs);
33extern unsigned long oops_begin(void); 32extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 8e37deb1eb38..0f5213564326 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -142,9 +142,9 @@ struct x86_emulate_ops {
142 int (*pio_out_emulated)(int size, unsigned short port, const void *val, 142 int (*pio_out_emulated)(int size, unsigned short port, const void *val,
143 unsigned int count, struct kvm_vcpu *vcpu); 143 unsigned int count, struct kvm_vcpu *vcpu);
144 144
145 bool (*get_cached_descriptor)(struct desc_struct *desc, 145 bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3,
146 int seg, struct kvm_vcpu *vcpu); 146 int seg, struct kvm_vcpu *vcpu);
147 void (*set_cached_descriptor)(struct desc_struct *desc, 147 void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3,
148 int seg, struct kvm_vcpu *vcpu); 148 int seg, struct kvm_vcpu *vcpu);
149 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 149 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
150 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 150 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
@@ -239,6 +239,7 @@ struct x86_emulate_ctxt {
239 int interruptibility; 239 int interruptibility;
240 240
241 bool perm_ok; /* do not check permissions if true */ 241 bool perm_ok; /* do not check permissions if true */
242 bool only_vendor_specific_insn;
242 243
243 bool have_exception; 244 bool have_exception;
244 struct x86_exception exception; 245 struct x86_exception exception;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ffd7f8d29187..c8af0991fdf0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -85,7 +85,7 @@
85 85
86#define ASYNC_PF_PER_VCPU 64 86#define ASYNC_PF_PER_VCPU 64
87 87
88extern spinlock_t kvm_lock; 88extern raw_spinlock_t kvm_lock;
89extern struct list_head vm_list; 89extern struct list_head vm_list;
90 90
91struct kvm_vcpu; 91struct kvm_vcpu;
@@ -255,6 +255,8 @@ struct kvm_mmu {
255 int (*sync_page)(struct kvm_vcpu *vcpu, 255 int (*sync_page)(struct kvm_vcpu *vcpu,
256 struct kvm_mmu_page *sp); 256 struct kvm_mmu_page *sp);
257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
258 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
259 u64 *spte, const void *pte, unsigned long mmu_seq);
258 hpa_t root_hpa; 260 hpa_t root_hpa;
259 int root_level; 261 int root_level;
260 int shadow_root_level; 262 int shadow_root_level;
@@ -335,12 +337,6 @@ struct kvm_vcpu_arch {
335 u64 *last_pte_updated; 337 u64 *last_pte_updated;
336 gfn_t last_pte_gfn; 338 gfn_t last_pte_gfn;
337 339
338 struct {
339 gfn_t gfn; /* presumed gfn during guest pte update */
340 pfn_t pfn; /* pfn corresponding to that gfn */
341 unsigned long mmu_seq;
342 } update_pte;
343
344 struct fpu guest_fpu; 340 struct fpu guest_fpu;
345 u64 xcr0; 341 u64 xcr0;
346 342
@@ -448,7 +444,7 @@ struct kvm_arch {
448 444
449 unsigned long irq_sources_bitmap; 445 unsigned long irq_sources_bitmap;
450 s64 kvmclock_offset; 446 s64 kvmclock_offset;
451 spinlock_t tsc_write_lock; 447 raw_spinlock_t tsc_write_lock;
452 u64 last_tsc_nsec; 448 u64 last_tsc_nsec;
453 u64 last_tsc_offset; 449 u64 last_tsc_offset;
454 u64 last_tsc_write; 450 u64 last_tsc_write;
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 80a1dee5bea5..aeff3e89b222 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -13,6 +13,12 @@ typedef struct {
13 int size; 13 int size;
14 struct mutex lock; 14 struct mutex lock;
15 void *vdso; 15 void *vdso;
16
17#ifdef CONFIG_X86_64
18 /* True if mm supports a task running in 32 bit compatibility mode. */
19 unsigned short ia32_compat;
20#endif
21
16} mm_context_t; 22} mm_context_t;
17 23
18#ifdef CONFIG_SMP 24#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 0c90dd9f0505..9c7d95f6174b 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -25,7 +25,6 @@ extern int pic_mode;
25#define MAX_IRQ_SOURCES 256 25#define MAX_IRQ_SOURCES 256
26 26
27extern unsigned int def_to_bigsmp; 27extern unsigned int def_to_bigsmp;
28extern u8 apicid_2_node[];
29 28
30#ifdef CONFIG_X86_NUMAQ 29#ifdef CONFIG_X86_NUMAQ
31extern int mp_bus_id_to_node[MAX_MP_BUSSES]; 30extern int mp_bus_id_to_node[MAX_MP_BUSSES];
@@ -33,8 +32,6 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES];
33extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; 32extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
34#endif 33#endif
35 34
36#define MAX_APICID 256
37
38#else /* CONFIG_X86_64: */ 35#else /* CONFIG_X86_64: */
39 36
40#define MAX_MP_BUSSES 256 37#define MAX_MP_BUSSES 256
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4d0dfa0d998e..3cce71413d0b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -36,8 +36,14 @@
36#define MSR_IA32_PERFCTR1 0x000000c2 36#define MSR_IA32_PERFCTR1 0x000000c2
37#define MSR_FSB_FREQ 0x000000cd 37#define MSR_FSB_FREQ 0x000000cd
38 38
39#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2
40#define NHM_C3_AUTO_DEMOTE (1UL << 25)
41#define NHM_C1_AUTO_DEMOTE (1UL << 26)
42#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25)
43
39#define MSR_MTRRcap 0x000000fe 44#define MSR_MTRRcap 0x000000fe
40#define MSR_IA32_BBL_CR_CTL 0x00000119 45#define MSR_IA32_BBL_CR_CTL 0x00000119
46#define MSR_IA32_BBL_CR_CTL3 0x0000011e
41 47
42#define MSR_IA32_SYSENTER_CS 0x00000174 48#define MSR_IA32_SYSENTER_CS 0x00000174
43#define MSR_IA32_SYSENTER_ESP 0x00000175 49#define MSR_IA32_SYSENTER_ESP 0x00000175
@@ -47,6 +53,9 @@
47#define MSR_IA32_MCG_STATUS 0x0000017a 53#define MSR_IA32_MCG_STATUS 0x0000017a
48#define MSR_IA32_MCG_CTL 0x0000017b 54#define MSR_IA32_MCG_CTL 0x0000017b
49 55
56#define MSR_OFFCORE_RSP_0 0x000001a6
57#define MSR_OFFCORE_RSP_1 0x000001a7
58
50#define MSR_IA32_PEBS_ENABLE 0x000003f1 59#define MSR_IA32_PEBS_ENABLE 0x000003f1
51#define MSR_IA32_DS_AREA 0x00000600 60#define MSR_IA32_DS_AREA 0x00000600
52#define MSR_IA32_PERF_CAPABILITIES 0x00000345 61#define MSR_IA32_PERF_CAPABILITIES 0x00000345
@@ -87,11 +96,15 @@
87#define MSR_IA32_MC0_ADDR 0x00000402 96#define MSR_IA32_MC0_ADDR 0x00000402
88#define MSR_IA32_MC0_MISC 0x00000403 97#define MSR_IA32_MC0_MISC 0x00000403
89 98
99#define MSR_AMD64_MC0_MASK 0xc0010044
100
90#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) 101#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
91#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) 102#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
92#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) 103#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
93#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) 104#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
94 105
106#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x))
107
95/* These are consecutive and not in the normal 4er MCE bank block */ 108/* These are consecutive and not in the normal 4er MCE bank block */
96#define MSR_IA32_MC0_CTL2 0x00000280 109#define MSR_IA32_MC0_CTL2 0x00000280
97#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) 110#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c76f5b92b840..4886a68f267e 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -7,7 +7,6 @@
7 7
8#ifdef CONFIG_X86_LOCAL_APIC 8#ifdef CONFIG_X86_LOCAL_APIC
9 9
10extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
11extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 10extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
12extern int reserve_perfctr_nmi(unsigned int); 11extern int reserve_perfctr_nmi(unsigned int);
13extern void release_perfctr_nmi(unsigned int); 12extern void release_perfctr_nmi(unsigned int);
@@ -30,8 +29,8 @@ void arch_trigger_all_cpu_backtrace(void);
30 * external nmis, because the local ones are more frequent. 29 * external nmis, because the local ones are more frequent.
31 * 30 *
32 * Also setup some default high/normal/low settings for 31 * Also setup some default high/normal/low settings for
33 * subsystems to registers with. Using 4 bits to seperate 32 * subsystems to registers with. Using 4 bits to separate
34 * the priorities. This can go alot higher if needed be. 33 * the priorities. This can go a lot higher if needed be.
35 */ 34 */
36 35
37#define NMI_LOCAL_SHIFT 16 /* randomly picked */ 36#define NMI_LOCAL_SHIFT 16 /* randomly picked */
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index 6d8723a766cc..af788496020b 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -38,7 +38,7 @@
38#define K8_NOP8 K8_NOP4 K8_NOP4 38#define K8_NOP8 K8_NOP4 K8_NOP4
39 39
40/* K7 nops 40/* K7 nops
41 uses eax dependencies (arbitary choice) 41 uses eax dependencies (arbitrary choice)
42 1: nop 42 1: nop
43 2: movl %eax,%eax 43 2: movl %eax,%eax
44 3: leal (,%eax,1),%eax 44 3: leal (,%eax,1),%eax
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 27da400d3138..a50fc9f493b3 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,5 +1,57 @@
1#ifndef _ASM_X86_NUMA_H
2#define _ASM_X86_NUMA_H
3
4#include <asm/topology.h>
5#include <asm/apicdef.h>
6
7#ifdef CONFIG_NUMA
8
9#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
10
11/*
12 * __apicid_to_node[] stores the raw mapping between physical apicid and
13 * node and is used to initialize cpu_to_node mapping.
14 *
15 * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
16 * should be accessed by the accessors - set_apicid_to_node() and
17 * numa_cpu_node().
18 */
19extern s16 __apicid_to_node[MAX_LOCAL_APIC];
20
21static inline void set_apicid_to_node(int apicid, s16 node)
22{
23 __apicid_to_node[apicid] = node;
24}
25#else /* CONFIG_NUMA */
26static inline void set_apicid_to_node(int apicid, s16 node)
27{
28}
29#endif /* CONFIG_NUMA */
30
1#ifdef CONFIG_X86_32 31#ifdef CONFIG_X86_32
2# include "numa_32.h" 32# include "numa_32.h"
3#else 33#else
4# include "numa_64.h" 34# include "numa_64.h"
5#endif 35#endif
36
37#ifdef CONFIG_NUMA
38extern void __cpuinit numa_set_node(int cpu, int node);
39extern void __cpuinit numa_clear_node(int cpu);
40extern void __init numa_init_array(void);
41extern void __init init_cpu_to_node(void);
42extern void __cpuinit numa_add_cpu(int cpu);
43extern void __cpuinit numa_remove_cpu(int cpu);
44#else /* CONFIG_NUMA */
45static inline void numa_set_node(int cpu, int node) { }
46static inline void numa_clear_node(int cpu) { }
47static inline void numa_init_array(void) { }
48static inline void init_cpu_to_node(void) { }
49static inline void numa_add_cpu(int cpu) { }
50static inline void numa_remove_cpu(int cpu) { }
51#endif /* CONFIG_NUMA */
52
53#ifdef CONFIG_DEBUG_PER_CPU_MAPS
54void debug_cpumask_set_cpu(int cpu, int node, bool enable);
55#endif
56
57#endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index b0ef2b449a9d..c6beed1ef103 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -4,7 +4,12 @@
4extern int numa_off; 4extern int numa_off;
5 5
6extern int pxm_to_nid(int pxm); 6extern int pxm_to_nid(int pxm);
7extern void numa_remove_cpu(int cpu); 7
8#ifdef CONFIG_NUMA
9extern int __cpuinit numa_cpu_node(int cpu);
10#else /* CONFIG_NUMA */
11static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
12#endif /* CONFIG_NUMA */
8 13
9#ifdef CONFIG_HIGHMEM 14#ifdef CONFIG_HIGHMEM
10extern void set_highmem_pages_init(void); 15extern void set_highmem_pages_init(void);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 0493be39607c..344eb1790b46 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -2,23 +2,16 @@
2#define _ASM_X86_NUMA_64_H 2#define _ASM_X86_NUMA_64_H
3 3
4#include <linux/nodemask.h> 4#include <linux/nodemask.h>
5#include <asm/apicdef.h>
6 5
7struct bootnode { 6struct bootnode {
8 u64 start; 7 u64 start;
9 u64 end; 8 u64 end;
10}; 9};
11 10
12extern int compute_hash_shift(struct bootnode *nodes, int numblks,
13 int *nodeids);
14
15#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) 11#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
16 12
17extern void numa_init_array(void);
18extern int numa_off; 13extern int numa_off;
19 14
20extern s16 apicid_to_node[MAX_LOCAL_APIC];
21
22extern unsigned long numa_free_all_bootmem(void); 15extern unsigned long numa_free_all_bootmem(void);
23extern void setup_node_bootmem(int nodeid, unsigned long start, 16extern void setup_node_bootmem(int nodeid, unsigned long start,
24 unsigned long end); 17 unsigned long end);
@@ -31,11 +24,11 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
31 */ 24 */
32#define NODE_MIN_SIZE (4*1024*1024) 25#define NODE_MIN_SIZE (4*1024*1024)
33 26
34extern void __init init_cpu_to_node(void); 27extern nodemask_t numa_nodes_parsed __initdata;
35extern void __cpuinit numa_set_node(int cpu, int node); 28
36extern void __cpuinit numa_clear_node(int cpu); 29extern int __cpuinit numa_cpu_node(int cpu);
37extern void __cpuinit numa_add_cpu(int cpu); 30extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
38extern void __cpuinit numa_remove_cpu(int cpu); 31extern void __init numa_set_distance(int from, int to, int distance);
39 32
40#ifdef CONFIG_NUMA_EMU 33#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) 34#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
@@ -43,11 +36,7 @@ extern void __cpuinit numa_remove_cpu(int cpu);
43void numa_emu_cmdline(char *); 36void numa_emu_cmdline(char *);
44#endif /* CONFIG_NUMA_EMU */ 37#endif /* CONFIG_NUMA_EMU */
45#else 38#else
46static inline void init_cpu_to_node(void) { } 39static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
47static inline void numa_set_node(int cpu, int node) { }
48static inline void numa_clear_node(int cpu) { }
49static inline void numa_add_cpu(int cpu, int node) { }
50static inline void numa_remove_cpu(int cpu) { }
51#endif 40#endif
52 41
53#endif /* _ASM_X86_NUMA_64_H */ 42#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index f482010350fb..5ca6801b75f3 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -20,7 +20,7 @@ extern struct olpc_platform_t olpc_platform_info;
20 20
21/* 21/*
22 * OLPC board IDs contain the major build number within the mask 0x0ff0, 22 * OLPC board IDs contain the major build number within the mask 0x0ff0,
23 * and the minor build number withing 0x000f. Pre-builds have a minor 23 * and the minor build number within 0x000f. Pre-builds have a minor
24 * number less than 8, and normal builds start at 8. For example, 0x0B10 24 * number less than 8, and normal builds start at 8. For example, 0x0B10
25 * is a PreB1, and 0x0C18 is a C1. 25 * is a PreB1, and 0x0C18 is a C1.
26 */ 26 */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 641988efe063..c5d3a5abbb9f 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -6,7 +6,7 @@
6 6
7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ 7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */
8 8
9#ifdef CONFIG_OLPC_OPENFIRMWARE 9#ifdef CONFIG_OLPC
10 10
11extern bool olpc_ofw_is_installed(void); 11extern bool olpc_ofw_is_installed(void);
12 12
@@ -26,19 +26,15 @@ extern void setup_olpc_ofw_pgd(void);
26/* check if OFW was detected during boot */ 26/* check if OFW was detected during boot */
27extern bool olpc_ofw_present(void); 27extern bool olpc_ofw_present(void);
28 28
29#else /* !CONFIG_OLPC_OPENFIRMWARE */ 29#else /* !CONFIG_OLPC */
30
31static inline bool olpc_ofw_is_installed(void) { return false; }
32static inline void olpc_ofw_detect(void) { } 30static inline void olpc_ofw_detect(void) { }
33static inline void setup_olpc_ofw_pgd(void) { } 31static inline void setup_olpc_ofw_pgd(void) { }
34static inline bool olpc_ofw_present(void) { return false; } 32#endif /* !CONFIG_OLPC */
35
36#endif /* !CONFIG_OLPC_OPENFIRMWARE */
37 33
38#ifdef CONFIG_OLPC_OPENFIRMWARE_DT 34#ifdef CONFIG_OF_PROMTREE
39extern void olpc_dt_build_devicetree(void); 35extern void olpc_dt_build_devicetree(void);
40#else 36#else
41static inline void olpc_dt_build_devicetree(void) { } 37static inline void olpc_dt_build_devicetree(void) { }
42#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */ 38#endif
43 39
44#endif /* _ASM_X86_OLPC_OFW_H */ 40#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 1df66211fd1b..bce688d54c12 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_PAGE_DEFS_H 2#define _ASM_X86_PAGE_DEFS_H
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5#include <linux/types.h>
5 6
6/* PAGE_SHIFT determines the page size */ 7/* PAGE_SHIFT determines the page size */
7#define PAGE_SHIFT 12 8#define PAGE_SHIFT 12
@@ -45,11 +46,15 @@ extern int devmem_is_allowed(unsigned long pagenr);
45extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
46extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
47 48
49static inline phys_addr_t get_max_mapped(void)
50{
51 return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
52}
53
48extern unsigned long init_memory_mapping(unsigned long start, 54extern unsigned long init_memory_mapping(unsigned long start,
49 unsigned long end); 55 unsigned long end);
50 56
51extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn, 57extern void initmem_init(void);
52 int acpi, int k8);
53extern void free_initmem(void); 58extern void free_initmem(void);
54 59
55#endif /* !__ASSEMBLY__ */ 60#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 7e172955ee57..d475b4398d8b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -45,7 +45,7 @@
45#include <linux/stringify.h> 45#include <linux/stringify.h>
46 46
47#ifdef CONFIG_SMP 47#ifdef CONFIG_SMP
48#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x 48#define __percpu_prefix "%%"__stringify(__percpu_seg)":"
49#define __my_cpu_offset percpu_read(this_cpu_off) 49#define __my_cpu_offset percpu_read(this_cpu_off)
50 50
51/* 51/*
@@ -62,9 +62,11 @@
62 (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ 62 (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
63}) 63})
64#else 64#else
65#define __percpu_arg(x) "%P" #x 65#define __percpu_prefix ""
66#endif 66#endif
67 67
68#define __percpu_arg(x) __percpu_prefix "%P" #x
69
68/* 70/*
69 * Initialized pointers to per-cpu variables needed for the boot 71 * Initialized pointers to per-cpu variables needed for the boot
70 * processor need to use these macros to get the proper address 72 * processor need to use these macros to get the proper address
@@ -451,6 +453,26 @@ do { \
451#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 453#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
452#endif /* !CONFIG_M386 */ 454#endif /* !CONFIG_M386 */
453 455
456#ifdef CONFIG_X86_CMPXCHG64
457#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \
458({ \
459 char __ret; \
460 typeof(o1) __o1 = o1; \
461 typeof(o1) __n1 = n1; \
462 typeof(o2) __o2 = o2; \
463 typeof(o2) __n2 = n2; \
464 typeof(o2) __dummy = n2; \
465 asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \
466 : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \
467 : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \
468 __ret; \
469})
470
471#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
472#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
473#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
474#endif /* CONFIG_X86_CMPXCHG64 */
475
454/* 476/*
455 * Per cpu atomic 64 bit operations are only available under 64 bit. 477 * Per cpu atomic 64 bit operations are only available under 64 bit.
456 * 32 bit must fall back to generic operations. 478 * 32 bit must fall back to generic operations.
@@ -480,6 +502,34 @@ do { \
480#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 502#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
481#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) 503#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
482#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 504#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
505
506/*
507 * Pretty complex macro to generate cmpxchg16 instruction. The instruction
508 * is not supported on early AMD64 processors so we must be able to emulate
509 * it in software. The address used in the cmpxchg16 instruction must be
510 * aligned to a 16 byte boundary.
511 */
512#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
513({ \
514 char __ret; \
515 typeof(o1) __o1 = o1; \
516 typeof(o1) __n1 = n1; \
517 typeof(o2) __o2 = o2; \
518 typeof(o2) __n2 = n2; \
519 typeof(o2) __dummy; \
520 alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \
521 "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \
522 X86_FEATURE_CX16, \
523 ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
524 "S" (&pcp1), "b"(__n1), "c"(__n2), \
525 "a"(__o1), "d"(__o2) : "memory"); \
526 __ret; \
527})
528
529#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
530#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
531#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
532
483#endif 533#endif
484 534
485/* This is not atomic against other CPUs -- CPU preemption needs to be off */ 535/* This is not atomic against other CPUs -- CPU preemption needs to be off */
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index e2f6a99f14ab..56fd9e3abbda 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Netburst Perfomance Events (P4, old Xeon) 2 * Netburst Performance Events (P4, old Xeon)
3 */ 3 */
4 4
5#ifndef PERF_EVENT_P4_H 5#ifndef PERF_EVENT_P4_H
@@ -9,7 +9,7 @@
9#include <linux/bitops.h> 9#include <linux/bitops.h>
10 10
11/* 11/*
12 * NetBurst has perfomance MSRs shared between 12 * NetBurst has performance MSRs shared between
13 * threads if HT is turned on, ie for both logical 13 * threads if HT is turned on, ie for both logical
14 * processors (mem: in turn in Atom with HT support 14 * processors (mem: in turn in Atom with HT support
15 * perf-MSRs are not shared and every thread has its 15 * perf-MSRs are not shared and every thread has its
@@ -22,6 +22,7 @@
22 22
23#define ARCH_P4_CNTRVAL_BITS (40) 23#define ARCH_P4_CNTRVAL_BITS (40)
24#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1) 24#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
25#define ARCH_P4_UNFLAGGED_BIT ((1ULL) << (ARCH_P4_CNTRVAL_BITS - 1))
25 26
26#define P4_ESCR_EVENT_MASK 0x7e000000U 27#define P4_ESCR_EVENT_MASK 0x7e000000U
27#define P4_ESCR_EVENT_SHIFT 25 28#define P4_ESCR_EVENT_SHIFT 25
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 94b979d1b58d..effff47a3c82 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -69,8 +69,6 @@ static inline void native_pmd_clear(pmd_t *pmd)
69 69
70static inline void pud_clear(pud_t *pudp) 70static inline void pud_clear(pud_t *pudp)
71{ 71{
72 unsigned long pgd;
73
74 set_pud(pudp, __pud(0)); 72 set_pud(pudp, __pud(0));
75 73
76 /* 74 /*
@@ -79,13 +77,10 @@ static inline void pud_clear(pud_t *pudp)
79 * section 8.1: in PAE mode we explicitly have to flush the 77 * section 8.1: in PAE mode we explicitly have to flush the
80 * TLB via cr3 if the top-level pgd is changed... 78 * TLB via cr3 if the top-level pgd is changed...
81 * 79 *
82 * Make sure the pud entry we're updating is within the 80 * Currently all places where pud_clear() is called either have
83 * current pgd to avoid unnecessary TLB flushes. 81 * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or
82 * pud_clear_bad()), so we don't need TLB flush here.
84 */ 83 */
85 pgd = read_cr3();
86 if (__pa(pudp) >= pgd && __pa(pudp) <
87 (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
88 write_cr3(pgd);
89} 84}
90 85
91#ifdef CONFIG_SMP 86#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 7a3e836eb2a9..a898a2b6e10c 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -7,7 +7,7 @@
7 */ 7 */
8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ 8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ 9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
10#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ 10#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ 11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ 12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ 13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 45636cefa186..4c25ab48257b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -94,10 +94,6 @@ struct cpuinfo_x86 {
94 int x86_cache_alignment; /* In bytes */ 94 int x86_cache_alignment; /* In bytes */
95 int x86_power; 95 int x86_power;
96 unsigned long loops_per_jiffy; 96 unsigned long loops_per_jiffy;
97#ifdef CONFIG_SMP
98 /* cpus sharing the last level cache: */
99 cpumask_var_t llc_shared_map;
100#endif
101 /* cpuid returned max cores value: */ 97 /* cpuid returned max cores value: */
102 u16 x86_max_cores; 98 u16 x86_max_cores;
103 u16 apicid; 99 u16 apicid;
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index b4ec95f07518..971e0b46446e 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -1 +1,69 @@
1/* dummy prom.h; here to make linux/of.h's #includes happy */ 1/*
2 * Definitions for Device tree / OpenFirmware handling on X86
3 *
4 * based on arch/powerpc/include/asm/prom.h which is
5 * Copyright (C) 1996-2005 Paul Mackerras.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#ifndef _ASM_X86_PROM_H
14#define _ASM_X86_PROM_H
15#ifndef __ASSEMBLY__
16
17#include <linux/of.h>
18#include <linux/types.h>
19#include <linux/pci.h>
20
21#include <asm/irq.h>
22#include <asm/atomic.h>
23#include <asm/setup.h>
24#include <asm/irq_controller.h>
25
26#ifdef CONFIG_OF
27extern int of_ioapic;
28extern u64 initial_dtb;
29extern void add_dtb(u64 data);
30extern void x86_add_irq_domains(void);
31void __cpuinit x86_of_pci_init(void);
32void x86_dtb_init(void);
33
34static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
35{
36 return pdev ? pdev->dev.of_node : NULL;
37}
38
39static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
40{
41 return pci_device_to_OF_node(bus->self);
42}
43
44#else
45static inline void add_dtb(u64 data) { }
46static inline void x86_add_irq_domains(void) { }
47static inline void x86_of_pci_init(void) { }
48static inline void x86_dtb_init(void) { }
49#define of_ioapic 0
50#endif
51
52extern char cmd_line[COMMAND_LINE_SIZE];
53
54#define pci_address_to_pio pci_address_to_pio
55unsigned long pci_address_to_pio(phys_addr_t addr);
56
57/**
58 * irq_dispose_mapping - Unmap an interrupt
59 * @virq: linux virq number of the interrupt to unmap
60 *
61 * FIXME: We really should implement proper virq handling like power,
62 * but that's going to be major surgery.
63 */
64static inline void irq_dispose_mapping(unsigned int virq) { }
65
66#define HAVE_ARCH_DEVTREE_FIXUPS
67
68#endif /* __ASSEMBLY__ */
69#endif
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 52b098a6eebb..7b0a55a88851 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -31,7 +31,7 @@
31#define R12 24 31#define R12 24
32#define RBP 32 32#define RBP 32
33#define RBX 40 33#define RBX 40
34/* arguments: interrupts/non tracing syscalls only save upto here*/ 34/* arguments: interrupts/non tracing syscalls only save up to here*/
35#define R11 48 35#define R11 48
36#define R10 56 36#define R10 56
37#define R9 64 37#define R9 64
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 78cd1ea94500..1babf8adecdf 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -73,7 +73,7 @@ struct pt_regs {
73 unsigned long r12; 73 unsigned long r12;
74 unsigned long rbp; 74 unsigned long rbp;
75 unsigned long rbx; 75 unsigned long rbx;
76/* arguments: non interrupts/non tracing syscalls only save upto here*/ 76/* arguments: non interrupts/non tracing syscalls only save up to here*/
77 unsigned long r11; 77 unsigned long r11;
78 unsigned long r10; 78 unsigned long r10;
79 unsigned long r9; 79 unsigned long r9;
@@ -103,7 +103,7 @@ struct pt_regs {
103 unsigned long r12; 103 unsigned long r12;
104 unsigned long bp; 104 unsigned long bp;
105 unsigned long bx; 105 unsigned long bx;
106/* arguments: non interrupts/non tracing syscalls only save upto here*/ 106/* arguments: non interrupts/non tracing syscalls only save up to here*/
107 unsigned long r11; 107 unsigned long r11;
108 unsigned long r10; 108 unsigned long r10;
109 unsigned long r9; 109 unsigned long r9;
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 562d4fd31ba8..3250e3d605d9 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -18,7 +18,10 @@ extern struct machine_ops machine_ops;
18 18
19void native_machine_crash_shutdown(struct pt_regs *regs); 19void native_machine_crash_shutdown(struct pt_regs *regs);
20void native_machine_shutdown(void); 20void native_machine_shutdown(void);
21void machine_real_restart(const unsigned char *code, int length); 21void machine_real_restart(unsigned int type);
22/* These must match dispatch_table in reboot_32.S */
23#define MRR_BIOS 0
24#define MRR_APM 1
22 25
23typedef void (*nmi_shootdown_cb)(int, struct die_args*); 26typedef void (*nmi_shootdown_cb)(int, struct die_args*);
24void nmi_shootdown_cpus(nmi_shootdown_cb callback); 27void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index d1e41b0f9b60..df4cd32b4cc6 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -37,26 +37,9 @@
37#endif 37#endif
38 38
39#ifdef __KERNEL__ 39#ifdef __KERNEL__
40
41#include <linux/list.h>
42#include <linux/spinlock.h>
43#include <linux/lockdep.h>
44#include <asm/asm.h> 40#include <asm/asm.h>
45 41
46struct rwsem_waiter;
47
48extern asmregparm struct rw_semaphore *
49 rwsem_down_read_failed(struct rw_semaphore *sem);
50extern asmregparm struct rw_semaphore *
51 rwsem_down_write_failed(struct rw_semaphore *sem);
52extern asmregparm struct rw_semaphore *
53 rwsem_wake(struct rw_semaphore *);
54extern asmregparm struct rw_semaphore *
55 rwsem_downgrade_wake(struct rw_semaphore *sem);
56
57/* 42/*
58 * the semaphore definition
59 *
60 * The bias values and the counter type limits the number of 43 * The bias values and the counter type limits the number of
61 * potential readers/writers to 32767 for 32 bits and 2147483647 44 * potential readers/writers to 32767 for 32 bits and 2147483647
62 * for 64 bits. 45 * for 64 bits.
@@ -74,43 +57,6 @@ extern asmregparm struct rw_semaphore *
74#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 57#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
75#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 58#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
76 59
77typedef signed long rwsem_count_t;
78
79struct rw_semaphore {
80 rwsem_count_t count;
81 spinlock_t wait_lock;
82 struct list_head wait_list;
83#ifdef CONFIG_DEBUG_LOCK_ALLOC
84 struct lockdep_map dep_map;
85#endif
86};
87
88#ifdef CONFIG_DEBUG_LOCK_ALLOC
89# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
90#else
91# define __RWSEM_DEP_MAP_INIT(lockname)
92#endif
93
94
95#define __RWSEM_INITIALIZER(name) \
96{ \
97 RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
98 LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
99}
100
101#define DECLARE_RWSEM(name) \
102 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
103
104extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
105 struct lock_class_key *key);
106
107#define init_rwsem(sem) \
108do { \
109 static struct lock_class_key __key; \
110 \
111 __init_rwsem((sem), #sem, &__key); \
112} while (0)
113
114/* 60/*
115 * lock for reading 61 * lock for reading
116 */ 62 */
@@ -133,7 +79,7 @@ static inline void __down_read(struct rw_semaphore *sem)
133 */ 79 */
134static inline int __down_read_trylock(struct rw_semaphore *sem) 80static inline int __down_read_trylock(struct rw_semaphore *sem)
135{ 81{
136 rwsem_count_t result, tmp; 82 long result, tmp;
137 asm volatile("# beginning __down_read_trylock\n\t" 83 asm volatile("# beginning __down_read_trylock\n\t"
138 " mov %0,%1\n\t" 84 " mov %0,%1\n\t"
139 "1:\n\t" 85 "1:\n\t"
@@ -155,7 +101,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
155 */ 101 */
156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 102static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
157{ 103{
158 rwsem_count_t tmp; 104 long tmp;
159 asm volatile("# beginning down_write\n\t" 105 asm volatile("# beginning down_write\n\t"
160 LOCK_PREFIX " xadd %1,(%2)\n\t" 106 LOCK_PREFIX " xadd %1,(%2)\n\t"
161 /* adds 0xffff0001, returns the old value */ 107 /* adds 0xffff0001, returns the old value */
@@ -180,9 +126,8 @@ static inline void __down_write(struct rw_semaphore *sem)
180 */ 126 */
181static inline int __down_write_trylock(struct rw_semaphore *sem) 127static inline int __down_write_trylock(struct rw_semaphore *sem)
182{ 128{
183 rwsem_count_t ret = cmpxchg(&sem->count, 129 long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
184 RWSEM_UNLOCKED_VALUE, 130 RWSEM_ACTIVE_WRITE_BIAS);
185 RWSEM_ACTIVE_WRITE_BIAS);
186 if (ret == RWSEM_UNLOCKED_VALUE) 131 if (ret == RWSEM_UNLOCKED_VALUE)
187 return 1; 132 return 1;
188 return 0; 133 return 0;
@@ -193,7 +138,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
193 */ 138 */
194static inline void __up_read(struct rw_semaphore *sem) 139static inline void __up_read(struct rw_semaphore *sem)
195{ 140{
196 rwsem_count_t tmp; 141 long tmp;
197 asm volatile("# beginning __up_read\n\t" 142 asm volatile("# beginning __up_read\n\t"
198 LOCK_PREFIX " xadd %1,(%2)\n\t" 143 LOCK_PREFIX " xadd %1,(%2)\n\t"
199 /* subtracts 1, returns the old value */ 144 /* subtracts 1, returns the old value */
@@ -211,7 +156,7 @@ static inline void __up_read(struct rw_semaphore *sem)
211 */ 156 */
212static inline void __up_write(struct rw_semaphore *sem) 157static inline void __up_write(struct rw_semaphore *sem)
213{ 158{
214 rwsem_count_t tmp; 159 long tmp;
215 asm volatile("# beginning __up_write\n\t" 160 asm volatile("# beginning __up_write\n\t"
216 LOCK_PREFIX " xadd %1,(%2)\n\t" 161 LOCK_PREFIX " xadd %1,(%2)\n\t"
217 /* subtracts 0xffff0001, returns the old value */ 162 /* subtracts 0xffff0001, returns the old value */
@@ -247,8 +192,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
247/* 192/*
248 * implement atomic add functionality 193 * implement atomic add functionality
249 */ 194 */
250static inline void rwsem_atomic_add(rwsem_count_t delta, 195static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem)
251 struct rw_semaphore *sem)
252{ 196{
253 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" 197 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
254 : "+m" (sem->count) 198 : "+m" (sem->count)
@@ -258,10 +202,9 @@ static inline void rwsem_atomic_add(rwsem_count_t delta,
258/* 202/*
259 * implement exchange and add functionality 203 * implement exchange and add functionality
260 */ 204 */
261static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, 205static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
262 struct rw_semaphore *sem)
263{ 206{
264 rwsem_count_t tmp = delta; 207 long tmp = delta;
265 208
266 asm volatile(LOCK_PREFIX "xadd %0,%1" 209 asm volatile(LOCK_PREFIX "xadd %0,%1"
267 : "+r" (tmp), "+m" (sem->count) 210 : "+r" (tmp), "+m" (sem->count)
@@ -270,10 +213,5 @@ static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
270 return tmp + delta; 213 return tmp + delta;
271} 214}
272 215
273static inline int rwsem_is_locked(struct rw_semaphore *sem)
274{
275 return (sem->count != 0);
276}
277
278#endif /* __KERNEL__ */ 216#endif /* __KERNEL__ */
279#endif /* _ASM_X86_RWSEM_H */ 217#endif /* _ASM_X86_RWSEM_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 231f1c1d6607..cd84f7208f76 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -1,14 +1,16 @@
1#ifndef _ASM_X86_SEGMENT_H 1#ifndef _ASM_X86_SEGMENT_H
2#define _ASM_X86_SEGMENT_H 2#define _ASM_X86_SEGMENT_H
3 3
4#include <linux/const.h>
5
4/* Constructor for a conventional segment GDT (or LDT) entry */ 6/* Constructor for a conventional segment GDT (or LDT) entry */
5/* This is a macro so it can be used in initializers */ 7/* This is a macro so it can be used in initializers */
6#define GDT_ENTRY(flags, base, limit) \ 8#define GDT_ENTRY(flags, base, limit) \
7 ((((base) & 0xff000000ULL) << (56-24)) | \ 9 ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \
8 (((flags) & 0x0000f0ffULL) << 40) | \ 10 (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \
9 (((limit) & 0x000f0000ULL) << (48-16)) | \ 11 (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \
10 (((base) & 0x00ffffffULL) << 16) | \ 12 (((base) & _AC(0x00ffffff,ULL)) << 16) | \
11 (((limit) & 0x0000ffffULL))) 13 (((limit) & _AC(0x0000ffff,ULL))))
12 14
13/* Simple and small GDT entries for booting only */ 15/* Simple and small GDT entries for booting only */
14 16
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 1f4695136776..73b11bc0ae6f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -17,12 +17,24 @@
17#endif 17#endif
18#include <asm/thread_info.h> 18#include <asm/thread_info.h>
19#include <asm/cpumask.h> 19#include <asm/cpumask.h>
20#include <asm/cpufeature.h>
20 21
21extern int smp_num_siblings; 22extern int smp_num_siblings;
22extern unsigned int num_processors; 23extern unsigned int num_processors;
23 24
25static inline bool cpu_has_ht_siblings(void)
26{
27 bool has_siblings = false;
28#ifdef CONFIG_SMP
29 has_siblings = cpu_has_ht && smp_num_siblings > 1;
30#endif
31 return has_siblings;
32}
33
24DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); 34DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
25DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); 35DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
36/* cpus sharing the last level cache: */
37DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
26DECLARE_PER_CPU(u16, cpu_llc_id); 38DECLARE_PER_CPU(u16, cpu_llc_id);
27DECLARE_PER_CPU(int, cpu_number); 39DECLARE_PER_CPU(int, cpu_number);
28 40
@@ -36,8 +48,16 @@ static inline struct cpumask *cpu_core_mask(int cpu)
36 return per_cpu(cpu_core_map, cpu); 48 return per_cpu(cpu_core_map, cpu);
37} 49}
38 50
51static inline struct cpumask *cpu_llc_shared_mask(int cpu)
52{
53 return per_cpu(cpu_llc_shared_map, cpu);
54}
55
39DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); 56DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
40DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 57DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
58#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
59DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
60#endif
41 61
42/* Static state in head.S used to set up a CPU */ 62/* Static state in head.S used to set up a CPU */
43extern unsigned long stack_start; /* Initial stack pointer address */ 63extern unsigned long stack_start; /* Initial stack pointer address */
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 6c22bf353f26..725b77831993 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -34,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
34 */ 34 */
35 CMOS_WRITE(0, 0xf); 35 CMOS_WRITE(0, 0xf);
36 36
37 *((volatile long *)phys_to_virt(apic->trampoline_phys_low)) = 0; 37 *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
38} 38}
39 39
40static inline void __init smpboot_setup_io_apic(void) 40static inline void __init smpboot_setup_io_apic(void)
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 52b5c7ed3608..d7e89c83645d 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -47,7 +47,7 @@ struct stacktrace_ops {
47}; 47};
48 48
49void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 49void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
50 unsigned long *stack, 50 unsigned long *stack, unsigned long bp,
51 const struct stacktrace_ops *ops, void *data); 51 const struct stacktrace_ops *ops, void *data);
52 52
53#ifdef CONFIG_X86_32 53#ifdef CONFIG_X86_32
@@ -86,11 +86,11 @@ stack_frame(struct task_struct *task, struct pt_regs *regs)
86 86
87extern void 87extern void
88show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 88show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
89 unsigned long *stack, char *log_lvl); 89 unsigned long *stack, unsigned long bp, char *log_lvl);
90 90
91extern void 91extern void
92show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 92show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
93 unsigned long *sp, char *log_lvl); 93 unsigned long *sp, unsigned long bp, char *log_lvl);
94 94
95extern unsigned int code_bytes; 95extern unsigned int code_bytes;
96 96
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 33ecc3ea8782..12569e691ce3 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -98,8 +98,6 @@ do { \
98 */ 98 */
99#define HAVE_DISABLE_HLT 99#define HAVE_DISABLE_HLT
100#else 100#else
101#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
102#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
103 101
104/* frame pointer must be last for get_wchan */ 102/* frame pointer must be last for get_wchan */
105#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" 103#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f0b6e5dbc5a0..1f2e61e28981 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -161,8 +161,14 @@ struct thread_info {
161 161
162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR 162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
163 163
164#define alloc_thread_info(tsk) \ 164#define alloc_thread_info_node(tsk, node) \
165 ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) 165({ \
166 struct page *page = alloc_pages_node(node, THREAD_FLAGS, \
167 THREAD_ORDER); \
168 struct thread_info *ret = page ? page_address(page) : NULL; \
169 \
170 ret; \
171})
166 172
167#ifdef CONFIG_X86_32 173#ifdef CONFIG_X86_32
168 174
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 21899cc31e52..910a7084f7f2 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -47,21 +47,6 @@
47 47
48#include <asm/mpspec.h> 48#include <asm/mpspec.h>
49 49
50#ifdef CONFIG_X86_32
51
52/* Mappings between logical cpu number and node number */
53extern int cpu_to_node_map[];
54
55/* Returns the number of the node containing CPU 'cpu' */
56static inline int __cpu_to_node(int cpu)
57{
58 return cpu_to_node_map[cpu];
59}
60#define early_cpu_to_node __cpu_to_node
61#define cpu_to_node __cpu_to_node
62
63#else /* CONFIG_X86_64 */
64
65/* Mappings between logical cpu number and node number */ 50/* Mappings between logical cpu number and node number */
66DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); 51DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
67 52
@@ -84,8 +69,6 @@ static inline int early_cpu_to_node(int cpu)
84 69
85#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 70#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
86 71
87#endif /* CONFIG_X86_64 */
88
89/* Mappings between node number and cpus on that node. */ 72/* Mappings between node number and cpus on that node. */
90extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 73extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
91 74
@@ -155,7 +138,7 @@ extern unsigned long node_remap_size[];
155 .balance_interval = 1, \ 138 .balance_interval = 1, \
156} 139}
157 140
158#ifdef CONFIG_X86_64_ACPI_NUMA 141#ifdef CONFIG_X86_64
159extern int __node_distance(int, int); 142extern int __node_distance(int, int);
160#define node_distance(a, b) __node_distance(a, b) 143#define node_distance(a, b) __node_distance(a, b)
161#endif 144#endif
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index f4500fb3b485..feca3118a73b 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -3,25 +3,36 @@
3 3
4#ifndef __ASSEMBLY__ 4#ifndef __ASSEMBLY__
5 5
6#ifdef CONFIG_X86_TRAMPOLINE 6#include <linux/types.h>
7#include <asm/io.h>
8
7/* 9/*
8 * Trampoline 80x86 program as an array. 10 * Trampoline 80x86 program as an array. These are in the init rodata
11 * segment, but that's okay, because we only care about the relative
12 * addresses of the symbols.
9 */ 13 */
10extern const unsigned char trampoline_data []; 14extern const unsigned char x86_trampoline_start [];
11extern const unsigned char trampoline_end []; 15extern const unsigned char x86_trampoline_end [];
12extern unsigned char *trampoline_base; 16extern unsigned char *x86_trampoline_base;
13 17
14extern unsigned long init_rsp; 18extern unsigned long init_rsp;
15extern unsigned long initial_code; 19extern unsigned long initial_code;
16extern unsigned long initial_gs; 20extern unsigned long initial_gs;
17 21
18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) 22extern void __init setup_trampolines(void);
23
24extern const unsigned char trampoline_data[];
25extern const unsigned char trampoline_status[];
26
27#define TRAMPOLINE_SYM(x) \
28 ((void *)(x86_trampoline_base + \
29 ((const unsigned char *)(x) - x86_trampoline_start)))
19 30
20extern unsigned long setup_trampoline(void); 31/* Address of the SMP trampoline */
21extern void __init reserve_trampoline_memory(void); 32static inline unsigned long trampoline_address(void)
22#else 33{
23static inline void reserve_trampoline_memory(void) {} 34 return virt_to_phys(TRAMPOLINE_SYM(trampoline_data));
24#endif /* CONFIG_X86_TRAMPOLINE */ 35}
25 36
26#endif /* __ASSEMBLY__ */ 37#endif /* __ASSEMBLY__ */
27 38
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 1ca132fc0d03..83e2efd181e2 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -35,7 +35,7 @@ static inline cycles_t get_cycles(void)
35static __always_inline cycles_t vget_cycles(void) 35static __always_inline cycles_t vget_cycles(void)
36{ 36{
37 /* 37 /*
38 * We only do VDSOs on TSC capable CPUs, so this shouldnt 38 * We only do VDSOs on TSC capable CPUs, so this shouldn't
39 * access boot_cpu_data (which is not VDSO-safe): 39 * access boot_cpu_data (which is not VDSO-safe):
40 */ 40 */
41#ifndef CONFIG_X86_TSC 41#ifndef CONFIG_X86_TSC
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
index df1da20f4534..8e8c23fef08c 100644
--- a/arch/x86/include/asm/types.h
+++ b/arch/x86/include/asm/types.h
@@ -1,22 +1,6 @@
1#ifndef _ASM_X86_TYPES_H 1#ifndef _ASM_X86_TYPES_H
2#define _ASM_X86_TYPES_H 2#define _ASM_X86_TYPES_H
3 3
4#define dma_addr_t dma_addr_t
5
6#include <asm-generic/types.h> 4#include <asm-generic/types.h>
7 5
8#ifdef __KERNEL__
9#ifndef __ASSEMBLY__
10
11typedef u64 dma64_addr_t;
12#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G)
13/* DMA addresses come in 32-bit and 64-bit flavours. */
14typedef u64 dma_addr_t;
15#else
16typedef u32 dma_addr_t;
17#endif
18
19#endif /* __ASSEMBLY__ */
20#endif /* __KERNEL__ */
21
22#endif /* _ASM_X86_TYPES_H */ 6#endif /* _ASM_X86_TYPES_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index b766a5e8ba0e..a755ef5e5977 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -346,10 +346,14 @@
346#define __NR_fanotify_init 338 346#define __NR_fanotify_init 338
347#define __NR_fanotify_mark 339 347#define __NR_fanotify_mark 339
348#define __NR_prlimit64 340 348#define __NR_prlimit64 340
349#define __NR_name_to_handle_at 341
350#define __NR_open_by_handle_at 342
351#define __NR_clock_adjtime 343
352#define __NR_syncfs 344
349 353
350#ifdef __KERNEL__ 354#ifdef __KERNEL__
351 355
352#define NR_syscalls 341 356#define NR_syscalls 345
353 357
354#define __ARCH_WANT_IPC_PARSE_VERSION 358#define __ARCH_WANT_IPC_PARSE_VERSION
355#define __ARCH_WANT_OLD_READDIR 359#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 363e9b8a715b..160fa76bd578 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -669,6 +669,14 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) 669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
670#define __NR_prlimit64 302 670#define __NR_prlimit64 302
671__SYSCALL(__NR_prlimit64, sys_prlimit64) 671__SYSCALL(__NR_prlimit64, sys_prlimit64)
672#define __NR_name_to_handle_at 303
673__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
674#define __NR_open_by_handle_at 304
675__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
676#define __NR_clock_adjtime 305
677__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
678#define __NR_syncfs 306
679__SYSCALL(__NR_syncfs, sys_syncfs)
672 680
673#ifndef __NO_STUBS 681#ifndef __NO_STUBS
674#define __ARCH_WANT_OLD_READDIR 682#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index ce1d54c8a433..3e094af443c3 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -176,7 +176,7 @@ struct bau_msg_payload {
176struct bau_msg_header { 176struct bau_msg_header {
177 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ 177 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
178 /* bits 5:0 */ 178 /* bits 5:0 */
179 unsigned int base_dest_nodeid:15; /* nasid (pnode<<1) of */ 179 unsigned int base_dest_nodeid:15; /* nasid of the */
180 /* bits 20:6 */ /* first bit in uvhub map */ 180 /* bits 20:6 */ /* first bit in uvhub map */
181 unsigned int command:8; /* message type */ 181 unsigned int command:8; /* message type */
182 /* bits 28:21 */ 182 /* bits 28:21 */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 64642ad019fb..643ebf2e2ad8 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -83,11 +83,13 @@ struct x86_init_paging {
83 * boot cpu 83 * boot cpu
84 * @tsc_pre_init: platform function called before TSC init 84 * @tsc_pre_init: platform function called before TSC init
85 * @timer_init: initialize the platform timer (default PIT/HPET) 85 * @timer_init: initialize the platform timer (default PIT/HPET)
86 * @wallclock_init: init the wallclock device
86 */ 87 */
87struct x86_init_timers { 88struct x86_init_timers {
88 void (*setup_percpu_clockev)(void); 89 void (*setup_percpu_clockev)(void);
89 void (*tsc_pre_init)(void); 90 void (*tsc_pre_init)(void);
90 void (*timer_init)(void); 91 void (*timer_init)(void);
92 void (*wallclock_init)(void);
91}; 93};
92 94
93/** 95/**
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a3c28ae4025b..8508bfe52296 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -287,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set)
287static inline int 287static inline int
288HYPERVISOR_sched_op(int cmd, void *arg) 288HYPERVISOR_sched_op(int cmd, void *arg)
289{ 289{
290 return _hypercall2(int, sched_op_new, cmd, arg); 290 return _hypercall2(int, sched_op, cmd, arg);
291} 291}
292 292
293static inline long 293static inline long
@@ -422,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value)
422#endif 422#endif
423 423
424static inline int 424static inline int
425HYPERVISOR_suspend(unsigned long srec) 425HYPERVISOR_suspend(unsigned long start_info_mfn)
426{ 426{
427 return _hypercall3(int, sched_op, SCHEDOP_shutdown, 427 struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
428 SHUTDOWN_suspend, srec); 428
429 /*
430 * For a PV guest the tools require that the start_info mfn be
431 * present in rdx/edx when the hypercall is made. Per the
432 * hypercall calling convention this is the third hypercall
433 * argument, which is start_info_mfn here.
434 */
435 return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
429} 436}
430 437
431static inline int 438static inline int
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 1c10c88ee4e1..5d4922ad4b9b 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -86,7 +86,7 @@ DEFINE_GUEST_HANDLE(void);
86 * The privilege level specifies which modes may enter a trap via a software 86 * The privilege level specifies which modes may enter a trap via a software
87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate 87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
88 * privilege levels as follows: 88 * privilege levels as follows:
89 * Level == 0: Noone may enter 89 * Level == 0: No one may enter
90 * Level == 1: Kernel may enter 90 * Level == 1: Kernel may enter
91 * Level == 2: Kernel may enter 91 * Level == 2: Kernel may enter
92 * Level == 3: Everyone may enter 92 * Level == 3: Everyone may enter
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index f25bdf238a33..c61934fbf22a 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -29,8 +29,10 @@ typedef struct xpaddr {
29 29
30/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ 30/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
31#define INVALID_P2M_ENTRY (~0UL) 31#define INVALID_P2M_ENTRY (~0UL)
32#define FOREIGN_FRAME_BIT (1UL<<31) 32#define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1))
33#define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2))
33#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) 34#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
35#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
34 36
35/* Maximum amount of memory we can handle in a domain in pages */ 37/* Maximum amount of memory we can handle in a domain in pages */
36#define MAX_DOMAIN_PAGES \ 38#define MAX_DOMAIN_PAGES \
@@ -41,12 +43,18 @@ extern unsigned int machine_to_phys_order;
41 43
42extern unsigned long get_phys_to_machine(unsigned long pfn); 44extern unsigned long get_phys_to_machine(unsigned long pfn);
43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 45extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
46extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
47extern unsigned long set_phys_range_identity(unsigned long pfn_s,
48 unsigned long pfn_e);
44 49
45extern int m2p_add_override(unsigned long mfn, struct page *page); 50extern int m2p_add_override(unsigned long mfn, struct page *page);
46extern int m2p_remove_override(struct page *page); 51extern int m2p_remove_override(struct page *page);
47extern struct page *m2p_find_override(unsigned long mfn); 52extern struct page *m2p_find_override(unsigned long mfn);
48extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); 53extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
49 54
55#ifdef CONFIG_XEN_DEBUG_FS
56extern int p2m_dump_show(struct seq_file *m, void *v);
57#endif
50static inline unsigned long pfn_to_mfn(unsigned long pfn) 58static inline unsigned long pfn_to_mfn(unsigned long pfn)
51{ 59{
52 unsigned long mfn; 60 unsigned long mfn;
@@ -57,7 +65,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
57 mfn = get_phys_to_machine(pfn); 65 mfn = get_phys_to_machine(pfn);
58 66
59 if (mfn != INVALID_P2M_ENTRY) 67 if (mfn != INVALID_P2M_ENTRY)
60 mfn &= ~FOREIGN_FRAME_BIT; 68 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
61 69
62 return mfn; 70 return mfn;
63} 71}
@@ -73,25 +81,44 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
73static inline unsigned long mfn_to_pfn(unsigned long mfn) 81static inline unsigned long mfn_to_pfn(unsigned long mfn)
74{ 82{
75 unsigned long pfn; 83 unsigned long pfn;
84 int ret = 0;
76 85
77 if (xen_feature(XENFEAT_auto_translated_physmap)) 86 if (xen_feature(XENFEAT_auto_translated_physmap))
78 return mfn; 87 return mfn;
79 88
89 if (unlikely((mfn >> machine_to_phys_order) != 0)) {
90 pfn = ~0;
91 goto try_override;
92 }
80 pfn = 0; 93 pfn = 0;
81 /* 94 /*
82 * The array access can fail (e.g., device space beyond end of RAM). 95 * The array access can fail (e.g., device space beyond end of RAM).
83 * In such cases it doesn't matter what we return (we return garbage), 96 * In such cases it doesn't matter what we return (we return garbage),
84 * but we must handle the fault without crashing! 97 * but we must handle the fault without crashing!
85 */ 98 */
86 __get_user(pfn, &machine_to_phys_mapping[mfn]); 99 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
87 100try_override:
88 /* 101 /* ret might be < 0 if there are no entries in the m2p for mfn */
89 * If this appears to be a foreign mfn (because the pfn 102 if (ret < 0)
90 * doesn't map back to the mfn), then check the local override 103 pfn = ~0;
91 * table to see if there's a better pfn to use. 104 else if (get_phys_to_machine(pfn) != mfn)
105 /*
106 * If this appears to be a foreign mfn (because the pfn
107 * doesn't map back to the mfn), then check the local override
108 * table to see if there's a better pfn to use.
109 *
110 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
111 */
112 pfn = m2p_find_override_pfn(mfn, ~0);
113
114 /*
115 * pfn is ~0 if there are no entries in the m2p for mfn or if the
116 * entry doesn't map back to the mfn and m2p_override doesn't have a
117 * valid entry for it.
92 */ 118 */
93 if (get_phys_to_machine(pfn) != mfn) 119 if (pfn == ~0 &&
94 pfn = m2p_find_override_pfn(mfn, pfn); 120 get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
121 pfn = mfn;
95 122
96 return pfn; 123 return pfn;
97} 124}
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 2329b3eaf8d3..aa8620989162 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -27,16 +27,16 @@ static inline void __init xen_setup_pirqs(void)
27 * its own functions. 27 * its own functions.
28 */ 28 */
29struct xen_pci_frontend_ops { 29struct xen_pci_frontend_ops {
30 int (*enable_msi)(struct pci_dev *dev, int **vectors); 30 int (*enable_msi)(struct pci_dev *dev, int vectors[]);
31 void (*disable_msi)(struct pci_dev *dev); 31 void (*disable_msi)(struct pci_dev *dev);
32 int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec); 32 int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec);
33 void (*disable_msix)(struct pci_dev *dev); 33 void (*disable_msix)(struct pci_dev *dev);
34}; 34};
35 35
36extern struct xen_pci_frontend_ops *xen_pci_frontend; 36extern struct xen_pci_frontend_ops *xen_pci_frontend;
37 37
38static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev, 38static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
39 int **vectors) 39 int vectors[])
40{ 40{
41 if (xen_pci_frontend && xen_pci_frontend->enable_msi) 41 if (xen_pci_frontend && xen_pci_frontend->enable_msi)
42 return xen_pci_frontend->enable_msi(dev, vectors); 42 return xen_pci_frontend->enable_msi(dev, vectors);
@@ -48,7 +48,7 @@ static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
48 xen_pci_frontend->disable_msi(dev); 48 xen_pci_frontend->disable_msi(dev);
49} 49}
50static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev, 50static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
51 int **vectors, int nvec) 51 int vectors[], int nvec)
52{ 52{
53 if (xen_pci_frontend && xen_pci_frontend->enable_msix) 53 if (xen_pci_frontend && xen_pci_frontend->enable_msix)
54 return xen_pci_frontend->enable_msix(dev, vectors, nvec); 54 return xen_pci_frontend->enable_msix(dev, vectors, nvec);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34244b2cd880..7338ef2218bc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -41,13 +41,13 @@ obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
43obj-y += bootflag.o e820.o 43obj-y += bootflag.o e820.o
44obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 44obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
46obj-y += tsc.o io_delay.o rtc.o 46obj-y += tsc.o io_delay.o rtc.o
47obj-y += pci-iommu_table.o 47obj-y += pci-iommu_table.o
48obj-y += resource.o 48obj-y += resource.o
49 49
50obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 50obj-y += trampoline.o trampoline_$(BITS).o
51obj-y += process.o 51obj-y += process.o
52obj-y += i387.o xsave.o 52obj-y += i387.o xsave.o
53obj-y += ptrace.o 53obj-y += ptrace.o
@@ -55,10 +55,12 @@ obj-$(CONFIG_X86_32) += tls.o
55obj-$(CONFIG_IA32_EMULATION) += tls.o 55obj-$(CONFIG_IA32_EMULATION) += tls.o
56obj-y += step.o 56obj-y += step.o
57obj-$(CONFIG_INTEL_TXT) += tboot.o 57obj-$(CONFIG_INTEL_TXT) += tboot.o
58obj-$(CONFIG_ISA_DMA_API) += i8237.o
58obj-$(CONFIG_STACKTRACE) += stacktrace.o 59obj-$(CONFIG_STACKTRACE) += stacktrace.o
59obj-y += cpu/ 60obj-y += cpu/
60obj-y += acpi/ 61obj-y += acpi/
61obj-y += reboot.o 62obj-y += reboot.o
63obj-$(CONFIG_X86_32) += reboot_32.o
62obj-$(CONFIG_MCA) += mca_32.o 64obj-$(CONFIG_MCA) += mca_32.o
63obj-$(CONFIG_X86_MSR) += msr.o 65obj-$(CONFIG_X86_MSR) += msr.o
64obj-$(CONFIG_X86_CPUID) += cpuid.o 66obj-$(CONFIG_X86_CPUID) += cpuid.o
@@ -66,10 +68,9 @@ obj-$(CONFIG_PCI) += early-quirks.o
66apm-y := apm_32.o 68apm-y := apm_32.o
67obj-$(CONFIG_APM) += apm.o 69obj-$(CONFIG_APM) += apm.o
68obj-$(CONFIG_SMP) += smp.o 70obj-$(CONFIG_SMP) += smp.o
69obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o 71obj-$(CONFIG_SMP) += smpboot.o
72obj-$(CONFIG_SMP) += tsc_sync.o
70obj-$(CONFIG_SMP) += setup_percpu.o 73obj-$(CONFIG_SMP) += setup_percpu.o
71obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
72obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
73obj-$(CONFIG_X86_MPPARSE) += mpparse.o 74obj-$(CONFIG_X86_MPPARSE) += mpparse.o
74obj-y += apic/ 75obj-y += apic/
75obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 76obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
@@ -109,6 +110,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o
109obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 110obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
110 111
111obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 112obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
113obj-$(CONFIG_OF) += devicetree.o
112 114
113### 115###
114# 64 bit specific files 116# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index b3a71137983a..9a966c579af5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -72,6 +72,7 @@ u8 acpi_sci_flags __initdata;
72int acpi_sci_override_gsi __initdata; 72int acpi_sci_override_gsi __initdata;
73int acpi_skip_timer_override __initdata; 73int acpi_skip_timer_override __initdata;
74int acpi_use_timer_override __initdata; 74int acpi_use_timer_override __initdata;
75int acpi_fix_pin2_polarity __initdata;
75 76
76#ifdef CONFIG_X86_LOCAL_APIC 77#ifdef CONFIG_X86_LOCAL_APIC
77static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; 78static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
@@ -415,10 +416,15 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
415 return 0; 416 return 0;
416 } 417 }
417 418
418 if (acpi_skip_timer_override && 419 if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
419 intsrc->source_irq == 0 && intsrc->global_irq == 2) { 420 if (acpi_skip_timer_override) {
420 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); 421 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
421 return 0; 422 return 0;
423 }
424 if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
425 intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
426 printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
427 }
422 } 428 }
423 429
424 mp_override_legacy_irq(intsrc->source_irq, 430 mp_override_legacy_irq(intsrc->source_irq,
@@ -589,14 +595,8 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
589 nid = acpi_get_node(handle); 595 nid = acpi_get_node(handle);
590 if (nid == -1 || !node_online(nid)) 596 if (nid == -1 || !node_online(nid))
591 return; 597 return;
592#ifdef CONFIG_X86_64 598 set_apicid_to_node(physid, nid);
593 apicid_to_node[physid] = nid;
594 numa_set_node(cpu, nid); 599 numa_set_node(cpu, nid);
595#else /* CONFIG_X86_32 */
596 apicid_2_node[physid] = nid;
597 cpu_to_node_map[cpu] = nid;
598#endif
599
600#endif 600#endif
601} 601}
602 602
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 28595d6df47c..ead21b663117 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -6,11 +6,17 @@
6#include <asm/page_types.h> 6#include <asm/page_types.h>
7#include <asm/pgtable_types.h> 7#include <asm/pgtable_types.h>
8#include <asm/processor-flags.h> 8#include <asm/processor-flags.h>
9#include "wakeup.h"
9 10
10 .code16 11 .code16
11 .section ".header", "a" 12 .section ".jump", "ax"
13 .globl _start
14_start:
15 cli
16 jmp wakeup_code
12 17
13/* This should match the structure in wakeup.h */ 18/* This should match the structure in wakeup.h */
19 .section ".header", "a"
14 .globl wakeup_header 20 .globl wakeup_header
15wakeup_header: 21wakeup_header:
16video_mode: .short 0 /* Video mode number */ 22video_mode: .short 0 /* Video mode number */
@@ -30,14 +36,11 @@ wakeup_jmp: .byte 0xea /* ljmpw */
30wakeup_jmp_off: .word 3f 36wakeup_jmp_off: .word 3f
31wakeup_jmp_seg: .word 0 37wakeup_jmp_seg: .word 0
32wakeup_gdt: .quad 0, 0, 0 38wakeup_gdt: .quad 0, 0, 0
33signature: .long 0x51ee1111 39signature: .long WAKEUP_HEADER_SIGNATURE
34 40
35 .text 41 .text
36 .globl _start
37 .code16 42 .code16
38wakeup_code: 43wakeup_code:
39_start:
40 cli
41 cld 44 cld
42 45
43 /* Apparently some dimwit BIOS programmers don't know how to 46 /* Apparently some dimwit BIOS programmers don't know how to
@@ -77,12 +80,12 @@ _start:
77 80
78 /* Check header signature... */ 81 /* Check header signature... */
79 movl signature, %eax 82 movl signature, %eax
80 cmpl $0x51ee1111, %eax 83 cmpl $WAKEUP_HEADER_SIGNATURE, %eax
81 jne bogus_real_magic 84 jne bogus_real_magic
82 85
83 /* Check we really have everything... */ 86 /* Check we really have everything... */
84 movl end_signature, %eax 87 movl end_signature, %eax
85 cmpl $0x65a22c82, %eax 88 cmpl $WAKEUP_END_SIGNATURE, %eax
86 jne bogus_real_magic 89 jne bogus_real_magic
87 90
88 /* Call the C code */ 91 /* Call the C code */
@@ -147,3 +150,7 @@ wakeup_heap:
147wakeup_stack: 150wakeup_stack:
148 .space 2048 151 .space 2048
149wakeup_stack_end: 152wakeup_stack_end:
153
154 .section ".signature","a"
155end_signature:
156 .long WAKEUP_END_SIGNATURE
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index 69d38d0b2b64..e1828c07e79c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -35,7 +35,8 @@ struct wakeup_header {
35extern struct wakeup_header wakeup_header; 35extern struct wakeup_header wakeup_header;
36#endif 36#endif
37 37
38#define HEADER_OFFSET 0x3f00 38#define WAKEUP_HEADER_OFFSET 8
39#define WAKEUP_SIZE 0x4000 39#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
40#define WAKEUP_END_SIGNATURE 0x65a22c82
40 41
41#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ 42#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 060fff8f5c5b..d4f8010a5b1b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -13,9 +13,19 @@ ENTRY(_start)
13SECTIONS 13SECTIONS
14{ 14{
15 . = 0; 15 . = 0;
16 .jump : {
17 *(.jump)
18 } = 0x90909090
19
20 . = WAKEUP_HEADER_OFFSET;
21 .header : {
22 *(.header)
23 }
24
25 . = ALIGN(16);
16 .text : { 26 .text : {
17 *(.text*) 27 *(.text*)
18 } 28 } = 0x90909090
19 29
20 . = ALIGN(16); 30 . = ALIGN(16);
21 .rodata : { 31 .rodata : {
@@ -33,11 +43,6 @@ SECTIONS
33 *(.data*) 43 *(.data*)
34 } 44 }
35 45
36 .signature : {
37 end_signature = .;
38 LONG(0x65a22c82)
39 }
40
41 . = ALIGN(16); 46 . = ALIGN(16);
42 .bss : { 47 .bss : {
43 __bss_start = .; 48 __bss_start = .;
@@ -45,20 +50,13 @@ SECTIONS
45 __bss_end = .; 50 __bss_end = .;
46 } 51 }
47 52
48 . = HEADER_OFFSET; 53 .signature : {
49 .header : { 54 *(.signature)
50 *(.header)
51 } 55 }
52 56
53 . = ALIGN(16);
54 _end = .; 57 _end = .;
55 58
56 /DISCARD/ : { 59 /DISCARD/ : {
57 *(.note*) 60 *(.note*)
58 } 61 }
59
60 /*
61 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
62 */
63 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
64} 62}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 68d1537b8c81..ff93bc1b09c3 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -18,37 +18,28 @@
18#include "realmode/wakeup.h" 18#include "realmode/wakeup.h"
19#include "sleep.h" 19#include "sleep.h"
20 20
21unsigned long acpi_wakeup_address;
22unsigned long acpi_realmode_flags; 21unsigned long acpi_realmode_flags;
23 22
24/* address in low memory of the wakeup routine. */
25static unsigned long acpi_realmode;
26
27#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) 23#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
28static char temp_stack[4096]; 24static char temp_stack[4096];
29#endif 25#endif
30 26
31/** 27/**
32 * acpi_save_state_mem - save kernel state 28 * acpi_suspend_lowlevel - save kernel state
33 * 29 *
34 * Create an identity mapped page table and copy the wakeup routine to 30 * Create an identity mapped page table and copy the wakeup routine to
35 * low memory. 31 * low memory.
36 *
37 * Note that this is too late to change acpi_wakeup_address.
38 */ 32 */
39int acpi_save_state_mem(void) 33int acpi_suspend_lowlevel(void)
40{ 34{
41 struct wakeup_header *header; 35 struct wakeup_header *header;
36 /* address in low memory of the wakeup routine. */
37 char *acpi_realmode;
42 38
43 if (!acpi_realmode) { 39 acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
44 printk(KERN_ERR "Could not allocate memory during boot, "
45 "S3 disabled\n");
46 return -ENOMEM;
47 }
48 memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
49 40
50 header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); 41 header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
51 if (header->signature != 0x51ee1111) { 42 if (header->signature != WAKEUP_HEADER_SIGNATURE) {
52 printk(KERN_ERR "wakeup header does not match\n"); 43 printk(KERN_ERR "wakeup header does not match\n");
53 return -EINVAL; 44 return -EINVAL;
54 } 45 }
@@ -68,9 +59,7 @@ int acpi_save_state_mem(void)
68 /* GDT[0]: GDT self-pointer */ 59 /* GDT[0]: GDT self-pointer */
69 header->wakeup_gdt[0] = 60 header->wakeup_gdt[0] =
70 (u64)(sizeof(header->wakeup_gdt) - 1) + 61 (u64)(sizeof(header->wakeup_gdt) - 1) +
71 ((u64)(acpi_wakeup_address + 62 ((u64)__pa(&header->wakeup_gdt) << 16);
72 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
73 << 16);
74 /* GDT[1]: big real mode-like code segment */ 63 /* GDT[1]: big real mode-like code segment */
75 header->wakeup_gdt[1] = 64 header->wakeup_gdt[1] =
76 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); 65 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
@@ -96,7 +85,7 @@ int acpi_save_state_mem(void)
96 header->pmode_cr3 = (u32)__pa(&initial_page_table); 85 header->pmode_cr3 = (u32)__pa(&initial_page_table);
97 saved_magic = 0x12345678; 86 saved_magic = 0x12345678;
98#else /* CONFIG_64BIT */ 87#else /* CONFIG_64BIT */
99 header->trampoline_segment = setup_trampoline() >> 4; 88 header->trampoline_segment = trampoline_address() >> 4;
100#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
101 stack_start = (unsigned long)temp_stack + sizeof(temp_stack); 90 stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
102 early_gdt_descr.address = 91 early_gdt_descr.address =
@@ -107,56 +96,10 @@ int acpi_save_state_mem(void)
107 saved_magic = 0x123456789abcdef0L; 96 saved_magic = 0x123456789abcdef0L;
108#endif /* CONFIG_64BIT */ 97#endif /* CONFIG_64BIT */
109 98
99 do_suspend_lowlevel();
110 return 0; 100 return 0;
111} 101}
112 102
113/*
114 * acpi_restore_state - undo effects of acpi_save_state_mem
115 */
116void acpi_restore_state_mem(void)
117{
118}
119
120
121/**
122 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
123 *
124 * We allocate a page from the first 1MB of memory for the wakeup
125 * routine for when we come back from a sleep state. The
126 * runtime allocator allows specification of <16MB pages, but not
127 * <1MB pages.
128 */
129void __init acpi_reserve_wakeup_memory(void)
130{
131 phys_addr_t mem;
132
133 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
134 printk(KERN_ERR
135 "ACPI: Wakeup code way too big, S3 disabled.\n");
136 return;
137 }
138
139 mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
140
141 if (mem == MEMBLOCK_ERROR) {
142 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
143 return;
144 }
145 acpi_realmode = (unsigned long) phys_to_virt(mem);
146 acpi_wakeup_address = mem;
147 memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
148}
149
150int __init acpi_configure_wakeup_memory(void)
151{
152 if (acpi_realmode)
153 set_memory_x(acpi_realmode, WAKEUP_SIZE >> PAGE_SHIFT);
154
155 return 0;
156}
157arch_initcall(acpi_configure_wakeup_memory);
158
159
160static int __init acpi_sleep_setup(char *str) 103static int __init acpi_sleep_setup(char *str)
161{ 104{
162 while ((str != NULL) && (*str != '\0')) { 105 while ((str != NULL) && (*str != '\0')) {
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index adbcbaa6f1df..416d4be13fef 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -4,13 +4,12 @@
4 4
5#include <asm/trampoline.h> 5#include <asm/trampoline.h>
6 6
7extern char wakeup_code_start, wakeup_code_end;
8
9extern unsigned long saved_video_mode; 7extern unsigned long saved_video_mode;
10extern long saved_magic; 8extern long saved_magic;
11 9
12extern int wakeup_pmode_return; 10extern int wakeup_pmode_return;
13extern char swsusp_pg_dir[PAGE_SIZE];
14 11
15extern unsigned long acpi_copy_wakeup_routine(unsigned long); 12extern unsigned long acpi_copy_wakeup_routine(unsigned long);
16extern void wakeup_long64(void); 13extern void wakeup_long64(void);
14
15extern void do_suspend_lowlevel(void);
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
index 6ff3b5730575..63b8ab524f2c 100644
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ b/arch/x86/kernel/acpi/wakeup_rm.S
@@ -2,9 +2,11 @@
2 * Wrapper script for the realmode binary as a transport object 2 * Wrapper script for the realmode binary as a transport object
3 * before copying to low memory. 3 * before copying to low memory.
4 */ 4 */
5 .section ".rodata","a" 5#include <asm/page_types.h>
6 .globl wakeup_code_start, wakeup_code_end 6
7wakeup_code_start: 7 .section ".x86_trampoline","a"
8 .balign PAGE_SIZE
9 .globl acpi_wakeup_code
10acpi_wakeup_code:
8 .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" 11 .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
9wakeup_code_end: 12 .size acpi_wakeup_code, .-acpi_wakeup_code
10 .size wakeup_code_start, .-wakeup_code_start
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7038b95d363f..4a234677e213 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -199,7 +199,7 @@ void *text_poke_early(void *addr, const void *opcode, size_t len);
199 199
200/* Replace instructions with better alternatives for this CPU type. 200/* Replace instructions with better alternatives for this CPU type.
201 This runs before SMP is initialized to avoid SMP problems with 201 This runs before SMP is initialized to avoid SMP problems with
202 self modifying code. This implies that assymetric systems where 202 self modifying code. This implies that asymmetric systems where
203 APs have less capabilities than the boot processor are not handled. 203 APs have less capabilities than the boot processor are not handled.
204 Tough. Make sure you disable such features by hand. */ 204 Tough. Make sure you disable such features by hand. */
205 205
@@ -620,7 +620,12 @@ static int __kprobes stop_machine_text_poke(void *data)
620 flush_icache_range((unsigned long)p->addr, 620 flush_icache_range((unsigned long)p->addr,
621 (unsigned long)p->addr + p->len); 621 (unsigned long)p->addr + p->len);
622 } 622 }
623 623 /*
624 * Intel Archiecture Software Developer's Manual section 7.1.3 specifies
625 * that a core serializing instruction such as "cpuid" should be
626 * executed on _each_ core before the new instruction is made visible.
627 */
628 sync_core();
624 return 0; 629 return 0;
625} 630}
626 631
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6e11c8134158..246d727b65b7 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -21,7 +21,7 @@
21#include <linux/acpi.h> 21#include <linux/acpi.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/sysdev.h> 24#include <linux/syscore_ops.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
@@ -1260,7 +1260,7 @@ static void disable_iommus(void)
1260 * disable suspend until real resume implemented 1260 * disable suspend until real resume implemented
1261 */ 1261 */
1262 1262
1263static int amd_iommu_resume(struct sys_device *dev) 1263static void amd_iommu_resume(void)
1264{ 1264{
1265 struct amd_iommu *iommu; 1265 struct amd_iommu *iommu;
1266 1266
@@ -1276,11 +1276,9 @@ static int amd_iommu_resume(struct sys_device *dev)
1276 */ 1276 */
1277 amd_iommu_flush_all_devices(); 1277 amd_iommu_flush_all_devices();
1278 amd_iommu_flush_all_domains(); 1278 amd_iommu_flush_all_domains();
1279
1280 return 0;
1281} 1279}
1282 1280
1283static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) 1281static int amd_iommu_suspend(void)
1284{ 1282{
1285 /* disable IOMMUs to go out of the way for BIOS */ 1283 /* disable IOMMUs to go out of the way for BIOS */
1286 disable_iommus(); 1284 disable_iommus();
@@ -1288,17 +1286,11 @@ static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
1288 return 0; 1286 return 0;
1289} 1287}
1290 1288
1291static struct sysdev_class amd_iommu_sysdev_class = { 1289static struct syscore_ops amd_iommu_syscore_ops = {
1292 .name = "amd_iommu",
1293 .suspend = amd_iommu_suspend, 1290 .suspend = amd_iommu_suspend,
1294 .resume = amd_iommu_resume, 1291 .resume = amd_iommu_resume,
1295}; 1292};
1296 1293
1297static struct sys_device device_amd_iommu = {
1298 .id = 0,
1299 .cls = &amd_iommu_sysdev_class,
1300};
1301
1302/* 1294/*
1303 * This is the core init function for AMD IOMMU hardware in the system. 1295 * This is the core init function for AMD IOMMU hardware in the system.
1304 * This function is called from the generic x86 DMA layer initialization 1296 * This function is called from the generic x86 DMA layer initialization
@@ -1415,14 +1407,6 @@ static int __init amd_iommu_init(void)
1415 goto free; 1407 goto free;
1416 } 1408 }
1417 1409
1418 ret = sysdev_class_register(&amd_iommu_sysdev_class);
1419 if (ret)
1420 goto free;
1421
1422 ret = sysdev_register(&device_amd_iommu);
1423 if (ret)
1424 goto free;
1425
1426 ret = amd_iommu_init_devices(); 1410 ret = amd_iommu_init_devices();
1427 if (ret) 1411 if (ret)
1428 goto free; 1412 goto free;
@@ -1441,6 +1425,8 @@ static int __init amd_iommu_init(void)
1441 1425
1442 amd_iommu_init_notifier(); 1426 amd_iommu_init_notifier();
1443 1427
1428 register_syscore_ops(&amd_iommu_syscore_ops);
1429
1444 if (iommu_pass_through) 1430 if (iommu_pass_through)
1445 goto out; 1431 goto out;
1446 1432
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 0a99f7198bc3..4c39baa8facc 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -12,14 +12,19 @@
12 12
13static u32 *flush_words; 13static u32 *flush_words;
14 14
15struct pci_device_id amd_nb_misc_ids[] = { 15const struct pci_device_id amd_nb_misc_ids[] = {
16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, 16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, 17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, 18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
19 {} 19 {}
20}; 20};
21EXPORT_SYMBOL(amd_nb_misc_ids); 21EXPORT_SYMBOL(amd_nb_misc_ids);
22 22
23static struct pci_device_id amd_nb_link_ids[] = {
24 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
25 {}
26};
27
23const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { 28const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
24 { 0x00, 0x18, 0x20 }, 29 { 0x00, 0x18, 0x20 },
25 { 0xff, 0x00, 0x20 }, 30 { 0xff, 0x00, 0x20 },
@@ -31,7 +36,7 @@ struct amd_northbridge_info amd_northbridges;
31EXPORT_SYMBOL(amd_northbridges); 36EXPORT_SYMBOL(amd_northbridges);
32 37
33static struct pci_dev *next_northbridge(struct pci_dev *dev, 38static struct pci_dev *next_northbridge(struct pci_dev *dev,
34 struct pci_device_id *ids) 39 const struct pci_device_id *ids)
35{ 40{
36 do { 41 do {
37 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); 42 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
@@ -43,9 +48,9 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev,
43 48
44int amd_cache_northbridges(void) 49int amd_cache_northbridges(void)
45{ 50{
46 int i = 0; 51 u16 i = 0;
47 struct amd_northbridge *nb; 52 struct amd_northbridge *nb;
48 struct pci_dev *misc; 53 struct pci_dev *misc, *link;
49 54
50 if (amd_nb_num()) 55 if (amd_nb_num())
51 return 0; 56 return 0;
@@ -64,10 +69,12 @@ int amd_cache_northbridges(void)
64 amd_northbridges.nb = nb; 69 amd_northbridges.nb = nb;
65 amd_northbridges.num = i; 70 amd_northbridges.num = i;
66 71
67 misc = NULL; 72 link = misc = NULL;
68 for (i = 0; i != amd_nb_num(); i++) { 73 for (i = 0; i != amd_nb_num(); i++) {
69 node_to_amd_nb(i)->misc = misc = 74 node_to_amd_nb(i)->misc = misc =
70 next_northbridge(misc, amd_nb_misc_ids); 75 next_northbridge(misc, amd_nb_misc_ids);
76 node_to_amd_nb(i)->link = link =
77 next_northbridge(link, amd_nb_link_ids);
71 } 78 }
72 79
73 /* some CPU families (e.g. family 0x11) do not support GART */ 80 /* some CPU families (e.g. family 0x11) do not support GART */
@@ -85,26 +92,95 @@ int amd_cache_northbridges(void)
85 boot_cpu_data.x86_mask >= 0x1)) 92 boot_cpu_data.x86_mask >= 0x1))
86 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; 93 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
87 94
95 if (boot_cpu_data.x86 == 0x15)
96 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
97
98 /* L3 cache partitioning is supported on family 0x15 */
99 if (boot_cpu_data.x86 == 0x15)
100 amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
101
88 return 0; 102 return 0;
89} 103}
90EXPORT_SYMBOL_GPL(amd_cache_northbridges); 104EXPORT_SYMBOL_GPL(amd_cache_northbridges);
91 105
92/* Ignores subdevice/subvendor but as far as I can figure out 106/*
93 they're useless anyways */ 107 * Ignores subdevice/subvendor but as far as I can figure out
94int __init early_is_amd_nb(u32 device) 108 * they're useless anyways
109 */
110bool __init early_is_amd_nb(u32 device)
95{ 111{
96 struct pci_device_id *id; 112 const struct pci_device_id *id;
97 u32 vendor = device & 0xffff; 113 u32 vendor = device & 0xffff;
114
98 device >>= 16; 115 device >>= 16;
99 for (id = amd_nb_misc_ids; id->vendor; id++) 116 for (id = amd_nb_misc_ids; id->vendor; id++)
100 if (vendor == id->vendor && device == id->device) 117 if (vendor == id->vendor && device == id->device)
101 return 1; 118 return true;
119 return false;
120}
121
122int amd_get_subcaches(int cpu)
123{
124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
125 unsigned int mask;
126 int cuid = 0;
127
128 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
129 return 0;
130
131 pci_read_config_dword(link, 0x1d4, &mask);
132
133#ifdef CONFIG_SMP
134 cuid = cpu_data(cpu).compute_unit_id;
135#endif
136 return (mask >> (4 * cuid)) & 0xf;
137}
138
139int amd_set_subcaches(int cpu, int mask)
140{
141 static unsigned int reset, ban;
142 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
143 unsigned int reg;
144 int cuid = 0;
145
146 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
147 return -EINVAL;
148
149 /* if necessary, collect reset state of L3 partitioning and BAN mode */
150 if (reset == 0) {
151 pci_read_config_dword(nb->link, 0x1d4, &reset);
152 pci_read_config_dword(nb->misc, 0x1b8, &ban);
153 ban &= 0x180000;
154 }
155
156 /* deactivate BAN mode if any subcaches are to be disabled */
157 if (mask != 0xf) {
158 pci_read_config_dword(nb->misc, 0x1b8, &reg);
159 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
160 }
161
162#ifdef CONFIG_SMP
163 cuid = cpu_data(cpu).compute_unit_id;
164#endif
165 mask <<= 4 * cuid;
166 mask |= (0xf ^ (1 << cuid)) << 26;
167
168 pci_write_config_dword(nb->link, 0x1d4, mask);
169
170 /* reset BAN mode if L3 partitioning returned to reset state */
171 pci_read_config_dword(nb->link, 0x1d4, &reg);
172 if (reg == reset) {
173 pci_read_config_dword(nb->misc, 0x1b8, &reg);
174 reg &= ~0x180000;
175 pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
176 }
177
102 return 0; 178 return 0;
103} 179}
104 180
105int amd_cache_gart(void) 181static int amd_cache_gart(void)
106{ 182{
107 int i; 183 u16 i;
108 184
109 if (!amd_nb_has_feature(AMD_NB_GART)) 185 if (!amd_nb_has_feature(AMD_NB_GART))
110 return 0; 186 return 0;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 29ebf5a3b192..289e92862fd9 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -283,7 +283,7 @@ static int __init apbt_clockevent_register(void)
283 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); 283 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
284 284
285 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { 285 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
286 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; 286 adev->evt.rating = APBT_CLOCKEVENT_RATING - 100;
287 global_clock_event = &adev->evt; 287 global_clock_event = &adev->evt;
288 printk(KERN_DEBUG "%s clockevent registered as global\n", 288 printk(KERN_DEBUG "%s clockevent registered as global\n",
289 global_clock_event->name); 289 global_clock_event->name);
@@ -315,7 +315,7 @@ static void apbt_setup_irq(struct apbt_dev *adev)
315 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); 315 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
316 irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); 316 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
317 /* APB timer irqs are set up as mp_irqs, timer is edge type */ 317 /* APB timer irqs are set up as mp_irqs, timer is edge type */
318 __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); 318 __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
319 319
320 if (system_state == SYSTEM_BOOTING) { 320 if (system_state == SYSTEM_BOOTING) {
321 if (request_irq(adev->irq, apbt_interrupt_handler, 321 if (request_irq(adev->irq, apbt_interrupt_handler,
@@ -507,64 +507,12 @@ static int apbt_next_event(unsigned long delta,
507 return 0; 507 return 0;
508} 508}
509 509
510/*
511 * APB timer clock is not in sync with pclk on Langwell, which translates to
512 * unreliable read value caused by sampling error. the error does not add up
513 * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
514 * would go backwards. the following code is trying to prevent time traveling
515 * backwards. little bit paranoid.
516 */
517static cycle_t apbt_read_clocksource(struct clocksource *cs) 510static cycle_t apbt_read_clocksource(struct clocksource *cs)
518{ 511{
519 unsigned long t0, t1, t2; 512 unsigned long current_count;
520 static unsigned long last_read; 513
521 514 current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
522bad_count: 515 return (cycle_t)~current_count;
523 t1 = apbt_readl(phy_cs_timer_id,
524 APBTMR_N_CURRENT_VALUE);
525 t2 = apbt_readl(phy_cs_timer_id,
526 APBTMR_N_CURRENT_VALUE);
527 if (unlikely(t1 < t2)) {
528 pr_debug("APBT: read current count error %lx:%lx:%lx\n",
529 t1, t2, t2 - t1);
530 goto bad_count;
531 }
532 /*
533 * check against cached last read, makes sure time does not go back.
534 * it could be a normal rollover but we will do tripple check anyway
535 */
536 if (unlikely(t2 > last_read)) {
537 /* check if we have a normal rollover */
538 unsigned long raw_intr_status =
539 apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
540 /*
541 * cs timer interrupt is masked but raw intr bit is set if
542 * rollover occurs. then we read EOI reg to clear it.
543 */
544 if (raw_intr_status & (1 << phy_cs_timer_id)) {
545 apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
546 goto out;
547 }
548 pr_debug("APB CS going back %lx:%lx:%lx ",
549 t2, last_read, t2 - last_read);
550bad_count_x3:
551 pr_debug("triple check enforced\n");
552 t0 = apbt_readl(phy_cs_timer_id,
553 APBTMR_N_CURRENT_VALUE);
554 udelay(1);
555 t1 = apbt_readl(phy_cs_timer_id,
556 APBTMR_N_CURRENT_VALUE);
557 udelay(1);
558 t2 = apbt_readl(phy_cs_timer_id,
559 APBTMR_N_CURRENT_VALUE);
560 if ((t2 > t1) || (t1 > t0)) {
561 printk(KERN_ERR "Error: APB CS tripple check failed\n");
562 goto bad_count_x3;
563 }
564 }
565out:
566 last_read = t2;
567 return (cycle_t)~t2;
568} 516}
569 517
570static int apbt_clocksource_register(void) 518static int apbt_clocksource_register(void)
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 5955a7800a96..73fb469908c6 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -13,7 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/bootmem.h> 16#include <linux/memblock.h>
17#include <linux/mmzone.h> 17#include <linux/mmzone.h>
18#include <linux/pci_ids.h> 18#include <linux/pci_ids.h>
19#include <linux/pci.h> 19#include <linux/pci.h>
@@ -57,7 +57,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
57static u32 __init allocate_aperture(void) 57static u32 __init allocate_aperture(void)
58{ 58{
59 u32 aper_size; 59 u32 aper_size;
60 void *p; 60 unsigned long addr;
61 61
62 /* aper_size should <= 1G */ 62 /* aper_size should <= 1G */
63 if (fallback_aper_order > 5) 63 if (fallback_aper_order > 5)
@@ -73,7 +73,7 @@ static u32 __init allocate_aperture(void)
73 /* 73 /*
74 * using 512M as goal, in case kexec will load kernel_big 74 * using 512M as goal, in case kexec will load kernel_big
75 * that will do the on position decompress, and could overlap with 75 * that will do the on position decompress, and could overlap with
76 * that positon with gart that is used. 76 * that position with gart that is used.
77 * sequende: 77 * sequende:
78 * kernel_small 78 * kernel_small
79 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) 79 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
@@ -83,27 +83,26 @@ static u32 __init allocate_aperture(void)
83 * so don't use 512M below as gart iommu, leave the space for kernel 83 * so don't use 512M below as gart iommu, leave the space for kernel
84 * code for safe 84 * code for safe
85 */ 85 */
86 p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); 86 addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
87 if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
88 printk(KERN_ERR
89 "Cannot allocate aperture memory hole (%lx,%uK)\n",
90 addr, aper_size>>10);
91 return 0;
92 }
93 memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
87 /* 94 /*
88 * Kmemleak should not scan this block as it may not be mapped via the 95 * Kmemleak should not scan this block as it may not be mapped via the
89 * kernel direct mapping. 96 * kernel direct mapping.
90 */ 97 */
91 kmemleak_ignore(p); 98 kmemleak_ignore(phys_to_virt(addr));
92 if (!p || __pa(p)+aper_size > 0xffffffff) {
93 printk(KERN_ERR
94 "Cannot allocate aperture memory hole (%p,%uK)\n",
95 p, aper_size>>10);
96 if (p)
97 free_bootmem(__pa(p), aper_size);
98 return 0;
99 }
100 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", 99 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
101 aper_size >> 10, __pa(p)); 100 aper_size >> 10, addr);
102 insert_aperture_resource((u32)__pa(p), aper_size); 101 insert_aperture_resource((u32)addr, aper_size);
103 register_nosave_region((u32)__pa(p) >> PAGE_SHIFT, 102 register_nosave_region(addr >> PAGE_SHIFT,
104 (u32)__pa(p+aper_size) >> PAGE_SHIFT); 103 (addr+aper_size) >> PAGE_SHIFT);
105 104
106 return (u32)__pa(p); 105 return (u32)addr;
107} 106}
108 107
109 108
@@ -500,7 +499,7 @@ out:
500 * Don't enable translation yet but enable GART IO and CPU 499 * Don't enable translation yet but enable GART IO and CPU
501 * accesses and set DISTLBWALKPRB since GART table memory is UC. 500 * accesses and set DISTLBWALKPRB since GART table memory is UC.
502 */ 501 */
503 u32 ctl = DISTLBWALKPRB | aper_order << 1; 502 u32 ctl = aper_order << 1;
504 503
505 bus = amd_nb_bus_dev_ranges[i].bus; 504 bus = amd_nb_bus_dev_ranges[i].bus;
506 dev_base = amd_nb_bus_dev_ranges[i].dev_base; 505 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 76b96d74978a..fabf01eff771 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -24,7 +24,7 @@
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/ioport.h> 25#include <linux/ioport.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/sysdev.h> 27#include <linux/syscore_ops.h>
28#include <linux/delay.h> 28#include <linux/delay.h>
29#include <linux/timex.h> 29#include <linux/timex.h>
30#include <linux/dmar.h> 30#include <linux/dmar.h>
@@ -43,6 +43,7 @@
43#include <asm/i8259.h> 43#include <asm/i8259.h>
44#include <asm/proto.h> 44#include <asm/proto.h>
45#include <asm/apic.h> 45#include <asm/apic.h>
46#include <asm/io_apic.h>
46#include <asm/desc.h> 47#include <asm/desc.h>
47#include <asm/hpet.h> 48#include <asm/hpet.h>
48#include <asm/idle.h> 49#include <asm/idle.h>
@@ -78,12 +79,21 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
78EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); 79EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
79 80
80#ifdef CONFIG_X86_32 81#ifdef CONFIG_X86_32
82
83/*
84 * On x86_32, the mapping between cpu and logical apicid may vary
85 * depending on apic in use. The following early percpu variable is
86 * used for the mapping. This is where the behaviors of x86_64 and 32
87 * actually diverge. Let's keep it ugly for now.
88 */
89DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
90
81/* 91/*
82 * Knob to control our willingness to enable the local APIC. 92 * Knob to control our willingness to enable the local APIC.
83 * 93 *
84 * +1=force-enable 94 * +1=force-enable
85 */ 95 */
86static int force_enable_local_apic; 96static int force_enable_local_apic __initdata;
87/* 97/*
88 * APIC command line parameters 98 * APIC command line parameters
89 */ 99 */
@@ -153,7 +163,7 @@ early_param("nox2apic", setup_nox2apic);
153unsigned long mp_lapic_addr; 163unsigned long mp_lapic_addr;
154int disable_apic; 164int disable_apic;
155/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 165/* Disable local APIC timer from the kernel commandline or via dmi quirk */
156static int disable_apic_timer __cpuinitdata; 166static int disable_apic_timer __initdata;
157/* Local APIC timer works in C2 */ 167/* Local APIC timer works in C2 */
158int local_apic_timer_c2_ok; 168int local_apic_timer_c2_ok;
159EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 169EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -177,29 +187,8 @@ static struct resource lapic_resource = {
177 187
178static unsigned int calibration_result; 188static unsigned int calibration_result;
179 189
180static int lapic_next_event(unsigned long delta,
181 struct clock_event_device *evt);
182static void lapic_timer_setup(enum clock_event_mode mode,
183 struct clock_event_device *evt);
184static void lapic_timer_broadcast(const struct cpumask *mask);
185static void apic_pm_activate(void); 190static void apic_pm_activate(void);
186 191
187/*
188 * The local apic timer can be used for any function which is CPU local.
189 */
190static struct clock_event_device lapic_clockevent = {
191 .name = "lapic",
192 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
193 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
194 .shift = 32,
195 .set_mode = lapic_timer_setup,
196 .set_next_event = lapic_next_event,
197 .broadcast = lapic_timer_broadcast,
198 .rating = 100,
199 .irq = -1,
200};
201static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
202
203static unsigned long apic_phys; 192static unsigned long apic_phys;
204 193
205/* 194/*
@@ -238,7 +227,7 @@ static int modern_apic(void)
238 * right after this call apic become NOOP driven 227 * right after this call apic become NOOP driven
239 * so apic->write/read doesn't do anything 228 * so apic->write/read doesn't do anything
240 */ 229 */
241void apic_disable(void) 230static void __init apic_disable(void)
242{ 231{
243 pr_info("APIC: switched to apic NOOP\n"); 232 pr_info("APIC: switched to apic NOOP\n");
244 apic = &apic_noop; 233 apic = &apic_noop;
@@ -282,23 +271,6 @@ u64 native_apic_icr_read(void)
282 return icr1 | ((u64)icr2 << 32); 271 return icr1 | ((u64)icr2 << 32);
283} 272}
284 273
285/**
286 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
287 */
288void __cpuinit enable_NMI_through_LVT0(void)
289{
290 unsigned int v;
291
292 /* unmask and set to NMI */
293 v = APIC_DM_NMI;
294
295 /* Level triggered for 82489DX (32bit mode) */
296 if (!lapic_is_integrated())
297 v |= APIC_LVT_LEVEL_TRIGGER;
298
299 apic_write(APIC_LVT0, v);
300}
301
302#ifdef CONFIG_X86_32 274#ifdef CONFIG_X86_32
303/** 275/**
304 * get_physical_broadcast - Get number of physical broadcast IDs 276 * get_physical_broadcast - Get number of physical broadcast IDs
@@ -508,6 +480,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
508#endif 480#endif
509} 481}
510 482
483
484/*
485 * The local apic timer can be used for any function which is CPU local.
486 */
487static struct clock_event_device lapic_clockevent = {
488 .name = "lapic",
489 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
490 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
491 .shift = 32,
492 .set_mode = lapic_timer_setup,
493 .set_next_event = lapic_next_event,
494 .broadcast = lapic_timer_broadcast,
495 .rating = 100,
496 .irq = -1,
497};
498static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
499
511/* 500/*
512 * Setup the local APIC timer for this CPU. Copy the initialized values 501 * Setup the local APIC timer for this CPU. Copy the initialized values
513 * of the boot CPU and register the clock event in the framework. 502 * of the boot CPU and register the clock event in the framework.
@@ -1209,7 +1198,7 @@ void __cpuinit setup_local_APIC(void)
1209 rdtscll(tsc); 1198 rdtscll(tsc);
1210 1199
1211 if (disable_apic) { 1200 if (disable_apic) {
1212 arch_disable_smp_support(); 1201 disable_ioapic_support();
1213 return; 1202 return;
1214 } 1203 }
1215 1204
@@ -1237,6 +1226,19 @@ void __cpuinit setup_local_APIC(void)
1237 */ 1226 */
1238 apic->init_apic_ldr(); 1227 apic->init_apic_ldr();
1239 1228
1229#ifdef CONFIG_X86_32
1230 /*
1231 * APIC LDR is initialized. If logical_apicid mapping was
1232 * initialized during get_smp_config(), make sure it matches the
1233 * actual value.
1234 */
1235 i = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
1236 WARN_ON(i != BAD_APICID && i != logical_smp_processor_id());
1237 /* always use the value from LDR */
1238 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1239 logical_smp_processor_id();
1240#endif
1241
1240 /* 1242 /*
1241 * Set Task Priority to 'accept all'. We never change this 1243 * Set Task Priority to 'accept all'. We never change this
1242 * later on. 1244 * later on.
@@ -1448,7 +1450,7 @@ int __init enable_IR(void)
1448void __init enable_IR_x2apic(void) 1450void __init enable_IR_x2apic(void)
1449{ 1451{
1450 unsigned long flags; 1452 unsigned long flags;
1451 struct IO_APIC_route_entry **ioapic_entries = NULL; 1453 struct IO_APIC_route_entry **ioapic_entries;
1452 int ret, x2apic_enabled = 0; 1454 int ret, x2apic_enabled = 0;
1453 int dmar_table_init_ret; 1455 int dmar_table_init_ret;
1454 1456
@@ -1537,7 +1539,7 @@ static int __init detect_init_APIC(void)
1537} 1539}
1538#else 1540#else
1539 1541
1540static int apic_verify(void) 1542static int __init apic_verify(void)
1541{ 1543{
1542 u32 features, h, l; 1544 u32 features, h, l;
1543 1545
@@ -1562,7 +1564,7 @@ static int apic_verify(void)
1562 return 0; 1564 return 0;
1563} 1565}
1564 1566
1565int apic_force_enable(void) 1567int __init apic_force_enable(unsigned long addr)
1566{ 1568{
1567 u32 h, l; 1569 u32 h, l;
1568 1570
@@ -1578,7 +1580,7 @@ int apic_force_enable(void)
1578 if (!(l & MSR_IA32_APICBASE_ENABLE)) { 1580 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1579 pr_info("Local APIC disabled by BIOS -- reenabling.\n"); 1581 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1580 l &= ~MSR_IA32_APICBASE_BASE; 1582 l &= ~MSR_IA32_APICBASE_BASE;
1581 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; 1583 l |= MSR_IA32_APICBASE_ENABLE | addr;
1582 wrmsr(MSR_IA32_APICBASE, l, h); 1584 wrmsr(MSR_IA32_APICBASE, l, h);
1583 enabled_via_apicbase = 1; 1585 enabled_via_apicbase = 1;
1584 } 1586 }
@@ -1619,7 +1621,7 @@ static int __init detect_init_APIC(void)
1619 "you can enable it with \"lapic\"\n"); 1621 "you can enable it with \"lapic\"\n");
1620 return -1; 1622 return -1;
1621 } 1623 }
1622 if (apic_force_enable()) 1624 if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
1623 return -1; 1625 return -1;
1624 } else { 1626 } else {
1625 if (apic_verify()) 1627 if (apic_verify())
@@ -1930,17 +1932,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1930{ 1932{
1931 int cpu; 1933 int cpu;
1932 1934
1933 /*
1934 * Validate version
1935 */
1936 if (version == 0x0) {
1937 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
1938 "fixing up to 0x10. (tell your hw vendor)\n",
1939 version);
1940 version = 0x10;
1941 }
1942 apic_version[apicid] = version;
1943
1944 if (num_processors >= nr_cpu_ids) { 1935 if (num_processors >= nr_cpu_ids) {
1945 int max = nr_cpu_ids; 1936 int max = nr_cpu_ids;
1946 int thiscpu = max + disabled_cpus; 1937 int thiscpu = max + disabled_cpus;
@@ -1954,22 +1945,34 @@ void __cpuinit generic_processor_info(int apicid, int version)
1954 } 1945 }
1955 1946
1956 num_processors++; 1947 num_processors++;
1957 cpu = cpumask_next_zero(-1, cpu_present_mask);
1958
1959 if (version != apic_version[boot_cpu_physical_apicid])
1960 WARN_ONCE(1,
1961 "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
1962 apic_version[boot_cpu_physical_apicid], cpu, version);
1963
1964 physid_set(apicid, phys_cpu_present_map);
1965 if (apicid == boot_cpu_physical_apicid) { 1948 if (apicid == boot_cpu_physical_apicid) {
1966 /* 1949 /*
1967 * x86_bios_cpu_apicid is required to have processors listed 1950 * x86_bios_cpu_apicid is required to have processors listed
1968 * in same order as logical cpu numbers. Hence the first 1951 * in same order as logical cpu numbers. Hence the first
1969 * entry is BSP, and so on. 1952 * entry is BSP, and so on.
1953 * boot_cpu_init() already hold bit 0 in cpu_present_mask
1954 * for BSP.
1970 */ 1955 */
1971 cpu = 0; 1956 cpu = 0;
1957 } else
1958 cpu = cpumask_next_zero(-1, cpu_present_mask);
1959
1960 /*
1961 * Validate version
1962 */
1963 if (version == 0x0) {
1964 pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
1965 cpu, apicid);
1966 version = 0x10;
1967 }
1968 apic_version[apicid] = version;
1969
1970 if (version != apic_version[boot_cpu_physical_apicid]) {
1971 pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
1972 apic_version[boot_cpu_physical_apicid], cpu, version);
1972 } 1973 }
1974
1975 physid_set(apicid, phys_cpu_present_map);
1973 if (apicid > max_physical_apicid) 1976 if (apicid > max_physical_apicid)
1974 max_physical_apicid = apicid; 1977 max_physical_apicid = apicid;
1975 1978
@@ -1977,7 +1980,10 @@ void __cpuinit generic_processor_info(int apicid, int version)
1977 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1980 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1978 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1981 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1979#endif 1982#endif
1980 1983#ifdef CONFIG_X86_32
1984 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1985 apic->x86_32_early_logical_apicid(cpu);
1986#endif
1981 set_cpu_possible(cpu, true); 1987 set_cpu_possible(cpu, true);
1982 set_cpu_present(cpu, true); 1988 set_cpu_present(cpu, true);
1983} 1989}
@@ -1998,10 +2004,14 @@ void default_init_apic_ldr(void)
1998} 2004}
1999 2005
2000#ifdef CONFIG_X86_32 2006#ifdef CONFIG_X86_32
2001int default_apicid_to_node(int logical_apicid) 2007int default_x86_32_numa_cpu_node(int cpu)
2002{ 2008{
2003#ifdef CONFIG_SMP 2009#ifdef CONFIG_NUMA
2004 return apicid_2_node[hard_smp_processor_id()]; 2010 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
2011
2012 if (apicid != BAD_APICID)
2013 return __apicid_to_node[apicid];
2014 return NUMA_NO_NODE;
2005#else 2015#else
2006 return 0; 2016 return 0;
2007#endif 2017#endif
@@ -2036,7 +2046,7 @@ static struct {
2036 unsigned int apic_thmr; 2046 unsigned int apic_thmr;
2037} apic_pm_state; 2047} apic_pm_state;
2038 2048
2039static int lapic_suspend(struct sys_device *dev, pm_message_t state) 2049static int lapic_suspend(void)
2040{ 2050{
2041 unsigned long flags; 2051 unsigned long flags;
2042 int maxlvt; 2052 int maxlvt;
@@ -2074,23 +2084,21 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
2074 return 0; 2084 return 0;
2075} 2085}
2076 2086
2077static int lapic_resume(struct sys_device *dev) 2087static void lapic_resume(void)
2078{ 2088{
2079 unsigned int l, h; 2089 unsigned int l, h;
2080 unsigned long flags; 2090 unsigned long flags;
2081 int maxlvt; 2091 int maxlvt, ret;
2082 int ret = 0;
2083 struct IO_APIC_route_entry **ioapic_entries = NULL; 2092 struct IO_APIC_route_entry **ioapic_entries = NULL;
2084 2093
2085 if (!apic_pm_state.active) 2094 if (!apic_pm_state.active)
2086 return 0; 2095 return;
2087 2096
2088 local_irq_save(flags); 2097 local_irq_save(flags);
2089 if (intr_remapping_enabled) { 2098 if (intr_remapping_enabled) {
2090 ioapic_entries = alloc_ioapic_entries(); 2099 ioapic_entries = alloc_ioapic_entries();
2091 if (!ioapic_entries) { 2100 if (!ioapic_entries) {
2092 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2101 WARN(1, "Alloc ioapic_entries in lapic resume failed.");
2093 ret = -ENOMEM;
2094 goto restore; 2102 goto restore;
2095 } 2103 }
2096 2104
@@ -2152,8 +2160,6 @@ static int lapic_resume(struct sys_device *dev)
2152 } 2160 }
2153restore: 2161restore:
2154 local_irq_restore(flags); 2162 local_irq_restore(flags);
2155
2156 return ret;
2157} 2163}
2158 2164
2159/* 2165/*
@@ -2161,17 +2167,11 @@ restore:
2161 * are needed on every CPU up until machine_halt/restart/poweroff. 2167 * are needed on every CPU up until machine_halt/restart/poweroff.
2162 */ 2168 */
2163 2169
2164static struct sysdev_class lapic_sysclass = { 2170static struct syscore_ops lapic_syscore_ops = {
2165 .name = "lapic",
2166 .resume = lapic_resume, 2171 .resume = lapic_resume,
2167 .suspend = lapic_suspend, 2172 .suspend = lapic_suspend,
2168}; 2173};
2169 2174
2170static struct sys_device device_lapic = {
2171 .id = 0,
2172 .cls = &lapic_sysclass,
2173};
2174
2175static void __cpuinit apic_pm_activate(void) 2175static void __cpuinit apic_pm_activate(void)
2176{ 2176{
2177 apic_pm_state.active = 1; 2177 apic_pm_state.active = 1;
@@ -2179,16 +2179,11 @@ static void __cpuinit apic_pm_activate(void)
2179 2179
2180static int __init init_lapic_sysfs(void) 2180static int __init init_lapic_sysfs(void)
2181{ 2181{
2182 int error;
2183
2184 if (!cpu_has_apic)
2185 return 0;
2186 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ 2182 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
2183 if (cpu_has_apic)
2184 register_syscore_ops(&lapic_syscore_ops);
2187 2185
2188 error = sysdev_class_register(&lapic_sysclass); 2186 return 0;
2189 if (!error)
2190 error = sysdev_register(&device_lapic);
2191 return error;
2192} 2187}
2193 2188
2194/* local apic needs to resume before other devices access its registers. */ 2189/* local apic needs to resume before other devices access its registers. */
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 09d3b17ce0c2..5652d31fe108 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -185,8 +185,6 @@ struct apic apic_flat = {
185 .ioapic_phys_id_map = NULL, 185 .ioapic_phys_id_map = NULL,
186 .setup_apic_routing = NULL, 186 .setup_apic_routing = NULL,
187 .multi_timer_check = NULL, 187 .multi_timer_check = NULL,
188 .apicid_to_node = NULL,
189 .cpu_to_logical_apicid = NULL,
190 .cpu_present_to_apicid = default_cpu_present_to_apicid, 188 .cpu_present_to_apicid = default_cpu_present_to_apicid,
191 .apicid_to_cpu_present = NULL, 189 .apicid_to_cpu_present = NULL,
192 .setup_portio_remap = NULL, 190 .setup_portio_remap = NULL,
@@ -337,8 +335,6 @@ struct apic apic_physflat = {
337 .ioapic_phys_id_map = NULL, 335 .ioapic_phys_id_map = NULL,
338 .setup_apic_routing = NULL, 336 .setup_apic_routing = NULL,
339 .multi_timer_check = NULL, 337 .multi_timer_check = NULL,
340 .apicid_to_node = NULL,
341 .cpu_to_logical_apicid = NULL,
342 .cpu_present_to_apicid = default_cpu_present_to_apicid, 338 .cpu_present_to_apicid = default_cpu_present_to_apicid,
343 .apicid_to_cpu_present = NULL, 339 .apicid_to_cpu_present = NULL,
344 .setup_portio_remap = NULL, 340 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e31b9ffe25f5..f1baa2dc087a 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void)
54 return 0; 54 return 0;
55} 55}
56 56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb) 57static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{ 58{
64 return 0; 59 return 0;
@@ -113,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
113 cpumask_set_cpu(cpu, retmask); 108 cpumask_set_cpu(cpu, retmask);
114} 109}
115 110
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg) 111static u32 noop_apic_read(u32 reg)
123{ 112{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic)); 113 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
@@ -130,6 +119,14 @@ static void noop_apic_write(u32 reg, u32 v)
130 WARN_ON_ONCE(cpu_has_apic && !disable_apic); 119 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
131} 120}
132 121
122#ifdef CONFIG_X86_32
123static int noop_x86_32_numa_cpu_node(int cpu)
124{
125 /* we're always on node 0 */
126 return 0;
127}
128#endif
129
133struct apic apic_noop = { 130struct apic apic_noop = {
134 .name = "noop", 131 .name = "noop",
135 .probe = noop_probe, 132 .probe = noop_probe,
@@ -153,9 +150,7 @@ struct apic apic_noop = {
153 .ioapic_phys_id_map = default_ioapic_phys_id_map, 150 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL, 151 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL, 152 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157 153
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid, 154 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid, 155 .apicid_to_cpu_present = physid_set_mask_of_physid,
161 156
@@ -197,4 +192,9 @@ struct apic apic_noop = {
197 .icr_write = noop_apic_icr_write, 192 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle, 193 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, 194 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
195
196#ifdef CONFIG_X86_32
197 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
198 .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node,
199#endif
200}; 200};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index cb804c5091b9..541a2e431659 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit)
45 return 1; 45 return 1;
46} 46}
47 47
48static int bigsmp_early_logical_apicid(int cpu)
49{
50 /* on bigsmp, logical apicid is the same as physical */
51 return early_per_cpu(x86_cpu_to_apicid, cpu);
52}
53
48static inline unsigned long calculate_ldr(int cpu) 54static inline unsigned long calculate_ldr(int cpu)
49{ 55{
50 unsigned long val, id; 56 unsigned long val, id;
@@ -80,11 +86,6 @@ static void bigsmp_setup_apic_routing(void)
80 nr_ioapics); 86 nr_ioapics);
81} 87}
82 88
83static int bigsmp_apicid_to_node(int logical_apicid)
84{
85 return apicid_2_node[hard_smp_processor_id()];
86}
87
88static int bigsmp_cpu_present_to_apicid(int mps_cpu) 89static int bigsmp_cpu_present_to_apicid(int mps_cpu)
89{ 90{
90 if (mps_cpu < nr_cpu_ids) 91 if (mps_cpu < nr_cpu_ids)
@@ -93,14 +94,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 94 return BAD_APICID;
94} 95}
95 96
96/* Mapping from cpu number to logical apicid */
97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
98{
99 if (cpu >= nr_cpu_ids)
100 return BAD_APICID;
101 return cpu_physical_id(cpu);
102}
103
104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 97static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
105{ 98{
106 /* For clustered we don't have a good way to do this yet - hack */ 99 /* For clustered we don't have a good way to do this yet - hack */
@@ -115,7 +108,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
115/* As we are using single CPU as destination, pick only one CPU here */ 108/* As we are using single CPU as destination, pick only one CPU here */
116static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) 109static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
117{ 110{
118 return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); 111 int cpu = cpumask_first(cpumask);
112
113 if (cpu < nr_cpu_ids)
114 return cpu_physical_id(cpu);
115 return BAD_APICID;
119} 116}
120 117
121static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 118static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
@@ -129,9 +126,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
129 */ 126 */
130 for_each_cpu_and(cpu, cpumask, andmask) { 127 for_each_cpu_and(cpu, cpumask, andmask) {
131 if (cpumask_test_cpu(cpu, cpu_online_mask)) 128 if (cpumask_test_cpu(cpu, cpu_online_mask))
132 break; 129 return cpu_physical_id(cpu);
133 } 130 }
134 return bigsmp_cpu_to_logical_apicid(cpu); 131 return BAD_APICID;
135} 132}
136 133
137static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 134static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -219,8 +216,6 @@ struct apic apic_bigsmp = {
219 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, 216 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
220 .setup_apic_routing = bigsmp_setup_apic_routing, 217 .setup_apic_routing = bigsmp_setup_apic_routing,
221 .multi_timer_check = NULL, 218 .multi_timer_check = NULL,
222 .apicid_to_node = bigsmp_apicid_to_node,
223 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
224 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 219 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
225 .apicid_to_cpu_present = physid_set_mask_of_physid, 220 .apicid_to_cpu_present = physid_set_mask_of_physid,
226 .setup_portio_remap = NULL, 221 .setup_portio_remap = NULL,
@@ -256,4 +251,7 @@ struct apic apic_bigsmp = {
256 .icr_write = native_apic_icr_write, 251 .icr_write = native_apic_icr_write,
257 .wait_icr_idle = native_apic_wait_icr_idle, 252 .wait_icr_idle = native_apic_wait_icr_idle,
258 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
254
255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
256 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
259}; 257};
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 8593582d8022..3e9de4854c5b 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit)
460 return physid_isset(bit, phys_cpu_present_map); 460 return physid_isset(bit, phys_cpu_present_map);
461} 461}
462 462
463static int es7000_early_logical_apicid(int cpu)
464{
465 /* on es7000, logical apicid is the same as physical */
466 return early_per_cpu(x86_bios_cpu_apicid, cpu);
467}
468
463static unsigned long calculate_ldr(int cpu) 469static unsigned long calculate_ldr(int cpu)
464{ 470{
465 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); 471 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
@@ -504,12 +510,11 @@ static void es7000_setup_apic_routing(void)
504 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
505} 511}
506 512
507static int es7000_apicid_to_node(int logical_apicid) 513static int es7000_numa_cpu_node(int cpu)
508{ 514{
509 return 0; 515 return 0;
510} 516}
511 517
512
513static int es7000_cpu_present_to_apicid(int mps_cpu) 518static int es7000_cpu_present_to_apicid(int mps_cpu)
514{ 519{
515 if (!mps_cpu) 520 if (!mps_cpu)
@@ -528,18 +533,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
528 ++cpu_id; 533 ++cpu_id;
529} 534}
530 535
531/* Mapping from cpu number to logical apicid */
532static int es7000_cpu_to_logical_apicid(int cpu)
533{
534#ifdef CONFIG_SMP
535 if (cpu >= nr_cpu_ids)
536 return BAD_APICID;
537 return cpu_2_logical_apicid[cpu];
538#else
539 return logical_smp_processor_id();
540#endif
541}
542
543static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 536static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
544{ 537{
545 /* For clustered we don't have a good way to do this yet - hack */ 538 /* For clustered we don't have a good way to do this yet - hack */
@@ -561,7 +554,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
561 * The cpus in the mask must all be on the apic cluster. 554 * The cpus in the mask must all be on the apic cluster.
562 */ 555 */
563 for_each_cpu(cpu, cpumask) { 556 for_each_cpu(cpu, cpumask) {
564 int new_apicid = es7000_cpu_to_logical_apicid(cpu); 557 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
565 558
566 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 559 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
567 WARN(1, "Not a valid mask!"); 560 WARN(1, "Not a valid mask!");
@@ -578,7 +571,7 @@ static unsigned int
578es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, 571es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
579 const struct cpumask *andmask) 572 const struct cpumask *andmask)
580{ 573{
581 int apicid = es7000_cpu_to_logical_apicid(0); 574 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
582 cpumask_var_t cpumask; 575 cpumask_var_t cpumask;
583 576
584 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 577 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -655,8 +648,6 @@ struct apic __refdata apic_es7000_cluster = {
655 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 648 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
656 .setup_apic_routing = es7000_setup_apic_routing, 649 .setup_apic_routing = es7000_setup_apic_routing,
657 .multi_timer_check = NULL, 650 .multi_timer_check = NULL,
658 .apicid_to_node = es7000_apicid_to_node,
659 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
660 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 651 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
661 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 652 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
662 .setup_portio_remap = NULL, 653 .setup_portio_remap = NULL,
@@ -695,6 +686,9 @@ struct apic __refdata apic_es7000_cluster = {
695 .icr_write = native_apic_icr_write, 686 .icr_write = native_apic_icr_write,
696 .wait_icr_idle = native_apic_wait_icr_idle, 687 .wait_icr_idle = native_apic_wait_icr_idle,
697 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 688 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
689
690 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
691 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
698}; 692};
699 693
700struct apic __refdata apic_es7000 = { 694struct apic __refdata apic_es7000 = {
@@ -720,8 +714,6 @@ struct apic __refdata apic_es7000 = {
720 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 714 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
721 .setup_apic_routing = es7000_setup_apic_routing, 715 .setup_apic_routing = es7000_setup_apic_routing,
722 .multi_timer_check = NULL, 716 .multi_timer_check = NULL,
723 .apicid_to_node = es7000_apicid_to_node,
724 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
725 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 717 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
726 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 718 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
727 .setup_portio_remap = NULL, 719 .setup_portio_remap = NULL,
@@ -758,4 +750,7 @@ struct apic __refdata apic_es7000 = {
758 .icr_write = native_apic_icr_write, 750 .icr_write = native_apic_icr_write,
759 .wait_icr_idle = native_apic_wait_icr_idle, 751 .wait_icr_idle = native_apic_wait_icr_idle,
760 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 752 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
753
754 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
755 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
761}; 756};
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 79fd43ca6f96..5260fe91bcb6 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -16,6 +16,7 @@
16#include <linux/kprobes.h> 16#include <linux/kprobes.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/delay.h>
19 20
20#ifdef CONFIG_HARDLOCKUP_DETECTOR 21#ifdef CONFIG_HARDLOCKUP_DETECTOR
21u64 hw_nmi_get_sample_period(void) 22u64 hw_nmi_get_sample_period(void)
@@ -83,7 +84,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
83 arch_spin_lock(&lock); 84 arch_spin_lock(&lock);
84 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 85 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
85 show_regs(regs); 86 show_regs(regs);
86 dump_stack();
87 arch_spin_unlock(&lock); 87 arch_spin_unlock(&lock);
88 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); 88 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
89 return NOTIFY_STOP; 89 return NOTIFY_STOP;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ca9e2a3545a9..45fd33d1fd3a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -30,7 +30,7 @@
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/acpi.h> 31#include <linux/acpi.h>
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/sysdev.h> 33#include <linux/syscore_ops.h>
34#include <linux/msi.h> 34#include <linux/msi.h>
35#include <linux/htirq.h> 35#include <linux/htirq.h>
36#include <linux/freezer.h> 36#include <linux/freezer.h>
@@ -108,7 +108,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
108 108
109int skip_ioapic_setup; 109int skip_ioapic_setup;
110 110
111void arch_disable_smp_support(void) 111/**
112 * disable_ioapic_support() - disables ioapic support at runtime
113 */
114void disable_ioapic_support(void)
112{ 115{
113#ifdef CONFIG_PCI 116#ifdef CONFIG_PCI
114 noioapicquirk = 1; 117 noioapicquirk = 1;
@@ -120,11 +123,14 @@ void arch_disable_smp_support(void)
120static int __init parse_noapic(char *str) 123static int __init parse_noapic(char *str)
121{ 124{
122 /* disable IO-APIC */ 125 /* disable IO-APIC */
123 arch_disable_smp_support(); 126 disable_ioapic_support();
124 return 0; 127 return 0;
125} 128}
126early_param("noapic", parse_noapic); 129early_param("noapic", parse_noapic);
127 130
131static int io_apic_setup_irq_pin(unsigned int irq, int node,
132 struct io_apic_irq_attr *attr);
133
128/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ 134/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
129void mp_save_irq(struct mpc_intsrc *m) 135void mp_save_irq(struct mpc_intsrc *m)
130{ 136{
@@ -181,7 +187,7 @@ int __init arch_early_irq_init(void)
181 irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); 187 irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
182 188
183 for (i = 0; i < count; i++) { 189 for (i = 0; i < count; i++) {
184 set_irq_chip_data(i, &cfg[i]); 190 irq_set_chip_data(i, &cfg[i]);
185 zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); 191 zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
186 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); 192 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
187 /* 193 /*
@@ -200,7 +206,7 @@ int __init arch_early_irq_init(void)
200#ifdef CONFIG_SPARSE_IRQ 206#ifdef CONFIG_SPARSE_IRQ
201static struct irq_cfg *irq_cfg(unsigned int irq) 207static struct irq_cfg *irq_cfg(unsigned int irq)
202{ 208{
203 return get_irq_chip_data(irq); 209 return irq_get_chip_data(irq);
204} 210}
205 211
206static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) 212static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
@@ -226,7 +232,7 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
226{ 232{
227 if (!cfg) 233 if (!cfg)
228 return; 234 return;
229 set_irq_chip_data(at, NULL); 235 irq_set_chip_data(at, NULL);
230 free_cpumask_var(cfg->domain); 236 free_cpumask_var(cfg->domain);
231 free_cpumask_var(cfg->old_domain); 237 free_cpumask_var(cfg->old_domain);
232 kfree(cfg); 238 kfree(cfg);
@@ -256,14 +262,14 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
256 if (res < 0) { 262 if (res < 0) {
257 if (res != -EEXIST) 263 if (res != -EEXIST)
258 return NULL; 264 return NULL;
259 cfg = get_irq_chip_data(at); 265 cfg = irq_get_chip_data(at);
260 if (cfg) 266 if (cfg)
261 return cfg; 267 return cfg;
262 } 268 }
263 269
264 cfg = alloc_irq_cfg(at, node); 270 cfg = alloc_irq_cfg(at, node);
265 if (cfg) 271 if (cfg)
266 set_irq_chip_data(at, cfg); 272 irq_set_chip_data(at, cfg);
267 else 273 else
268 irq_free_desc(at); 274 irq_free_desc(at);
269 return cfg; 275 return cfg;
@@ -818,7 +824,7 @@ static int EISA_ELCR(unsigned int irq)
818#define default_MCA_trigger(idx) (1) 824#define default_MCA_trigger(idx) (1)
819#define default_MCA_polarity(idx) default_ISA_polarity(idx) 825#define default_MCA_polarity(idx) default_ISA_polarity(idx)
820 826
821static int MPBIOS_polarity(int idx) 827static int irq_polarity(int idx)
822{ 828{
823 int bus = mp_irqs[idx].srcbus; 829 int bus = mp_irqs[idx].srcbus;
824 int polarity; 830 int polarity;
@@ -860,7 +866,7 @@ static int MPBIOS_polarity(int idx)
860 return polarity; 866 return polarity;
861} 867}
862 868
863static int MPBIOS_trigger(int idx) 869static int irq_trigger(int idx)
864{ 870{
865 int bus = mp_irqs[idx].srcbus; 871 int bus = mp_irqs[idx].srcbus;
866 int trigger; 872 int trigger;
@@ -932,16 +938,6 @@ static int MPBIOS_trigger(int idx)
932 return trigger; 938 return trigger;
933} 939}
934 940
935static inline int irq_polarity(int idx)
936{
937 return MPBIOS_polarity(idx);
938}
939
940static inline int irq_trigger(int idx)
941{
942 return MPBIOS_trigger(idx);
943}
944
945static int pin_2_irq(int idx, int apic, int pin) 941static int pin_2_irq(int idx, int apic, int pin)
946{ 942{
947 int irq; 943 int irq;
@@ -1189,7 +1185,7 @@ void __setup_vector_irq(int cpu)
1189 raw_spin_lock(&vector_lock); 1185 raw_spin_lock(&vector_lock);
1190 /* Mark the inuse vectors */ 1186 /* Mark the inuse vectors */
1191 for_each_active_irq(irq) { 1187 for_each_active_irq(irq) {
1192 cfg = get_irq_chip_data(irq); 1188 cfg = irq_get_chip_data(irq);
1193 if (!cfg) 1189 if (!cfg)
1194 continue; 1190 continue;
1195 /* 1191 /*
@@ -1220,10 +1216,6 @@ void __setup_vector_irq(int cpu)
1220static struct irq_chip ioapic_chip; 1216static struct irq_chip ioapic_chip;
1221static struct irq_chip ir_ioapic_chip; 1217static struct irq_chip ir_ioapic_chip;
1222 1218
1223#define IOAPIC_AUTO -1
1224#define IOAPIC_EDGE 0
1225#define IOAPIC_LEVEL 1
1226
1227#ifdef CONFIG_X86_32 1219#ifdef CONFIG_X86_32
1228static inline int IO_APIC_irq_trigger(int irq) 1220static inline int IO_APIC_irq_trigger(int irq)
1229{ 1221{
@@ -1248,35 +1240,31 @@ static inline int IO_APIC_irq_trigger(int irq)
1248} 1240}
1249#endif 1241#endif
1250 1242
1251static void ioapic_register_intr(unsigned int irq, unsigned long trigger) 1243static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
1244 unsigned long trigger)
1252{ 1245{
1246 struct irq_chip *chip = &ioapic_chip;
1247 irq_flow_handler_t hdl;
1248 bool fasteoi;
1253 1249
1254 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1250 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1255 trigger == IOAPIC_LEVEL) 1251 trigger == IOAPIC_LEVEL) {
1256 irq_set_status_flags(irq, IRQ_LEVEL); 1252 irq_set_status_flags(irq, IRQ_LEVEL);
1257 else 1253 fasteoi = true;
1254 } else {
1258 irq_clear_status_flags(irq, IRQ_LEVEL); 1255 irq_clear_status_flags(irq, IRQ_LEVEL);
1256 fasteoi = false;
1257 }
1259 1258
1260 if (irq_remapped(get_irq_chip_data(irq))) { 1259 if (irq_remapped(cfg)) {
1261 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 1260 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
1262 if (trigger) 1261 chip = &ir_ioapic_chip;
1263 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, 1262 fasteoi = trigger != 0;
1264 handle_fasteoi_irq,
1265 "fasteoi");
1266 else
1267 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1268 handle_edge_irq, "edge");
1269 return;
1270 } 1263 }
1271 1264
1272 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1265 hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
1273 trigger == IOAPIC_LEVEL) 1266 irq_set_chip_and_handler_name(irq, chip, hdl,
1274 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1267 fasteoi ? "fasteoi" : "edge");
1275 handle_fasteoi_irq,
1276 "fasteoi");
1277 else
1278 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1279 handle_edge_irq, "edge");
1280} 1268}
1281 1269
1282static int setup_ioapic_entry(int apic_id, int irq, 1270static int setup_ioapic_entry(int apic_id, int irq,
@@ -1374,7 +1362,7 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1374 return; 1362 return;
1375 } 1363 }
1376 1364
1377 ioapic_register_intr(irq, trigger); 1365 ioapic_register_intr(irq, cfg, trigger);
1378 if (irq < legacy_pic->nr_legacy_irqs) 1366 if (irq < legacy_pic->nr_legacy_irqs)
1379 legacy_pic->mask(irq); 1367 legacy_pic->mask(irq);
1380 1368
@@ -1385,33 +1373,26 @@ static struct {
1385 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); 1373 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1386} mp_ioapic_routing[MAX_IO_APICS]; 1374} mp_ioapic_routing[MAX_IO_APICS];
1387 1375
1388static void __init setup_IO_APIC_irqs(void) 1376static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
1389{ 1377{
1390 int apic_id, pin, idx, irq, notcon = 0; 1378 if (idx != -1)
1391 int node = cpu_to_node(0); 1379 return false;
1392 struct irq_cfg *cfg;
1393 1380
1394 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1381 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
1382 mp_ioapics[apic_id].apicid, pin);
1383 return true;
1384}
1385
1386static void __init __io_apic_setup_irqs(unsigned int apic_id)
1387{
1388 int idx, node = cpu_to_node(0);
1389 struct io_apic_irq_attr attr;
1390 unsigned int pin, irq;
1395 1391
1396 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1397 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1392 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1398 idx = find_irq_entry(apic_id, pin, mp_INT); 1393 idx = find_irq_entry(apic_id, pin, mp_INT);
1399 if (idx == -1) { 1394 if (io_apic_pin_not_connected(idx, apic_id, pin))
1400 if (!notcon) {
1401 notcon = 1;
1402 apic_printk(APIC_VERBOSE,
1403 KERN_DEBUG " %d-%d",
1404 mp_ioapics[apic_id].apicid, pin);
1405 } else
1406 apic_printk(APIC_VERBOSE, " %d-%d",
1407 mp_ioapics[apic_id].apicid, pin);
1408 continue; 1395 continue;
1409 }
1410 if (notcon) {
1411 apic_printk(APIC_VERBOSE,
1412 " (apicid-pin) not connected\n");
1413 notcon = 0;
1414 }
1415 1396
1416 irq = pin_2_irq(idx, apic_id, pin); 1397 irq = pin_2_irq(idx, apic_id, pin);
1417 1398
@@ -1423,25 +1404,24 @@ static void __init setup_IO_APIC_irqs(void)
1423 * installed and if it returns 1: 1404 * installed and if it returns 1:
1424 */ 1405 */
1425 if (apic->multi_timer_check && 1406 if (apic->multi_timer_check &&
1426 apic->multi_timer_check(apic_id, irq)) 1407 apic->multi_timer_check(apic_id, irq))
1427 continue; 1408 continue;
1428 1409
1429 cfg = alloc_irq_and_cfg_at(irq, node); 1410 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1430 if (!cfg) 1411 irq_polarity(idx));
1431 continue;
1432 1412
1433 add_pin_to_irq_node(cfg, node, apic_id, pin); 1413 io_apic_setup_irq_pin(irq, node, &attr);
1434 /*
1435 * don't mark it in pin_programmed, so later acpi could
1436 * set it correctly when irq < 16
1437 */
1438 setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx),
1439 irq_polarity(idx));
1440 } 1414 }
1415}
1441 1416
1442 if (notcon) 1417static void __init setup_IO_APIC_irqs(void)
1443 apic_printk(APIC_VERBOSE, 1418{
1444 " (apicid-pin) not connected\n"); 1419 unsigned int apic_id;
1420
1421 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1422
1423 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1424 __io_apic_setup_irqs(apic_id);
1445} 1425}
1446 1426
1447/* 1427/*
@@ -1452,7 +1432,7 @@ static void __init setup_IO_APIC_irqs(void)
1452void setup_IO_APIC_irq_extra(u32 gsi) 1432void setup_IO_APIC_irq_extra(u32 gsi)
1453{ 1433{
1454 int apic_id = 0, pin, idx, irq, node = cpu_to_node(0); 1434 int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
1455 struct irq_cfg *cfg; 1435 struct io_apic_irq_attr attr;
1456 1436
1457 /* 1437 /*
1458 * Convert 'gsi' to 'ioapic.pin'. 1438 * Convert 'gsi' to 'ioapic.pin'.
@@ -1472,21 +1452,10 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1472 if (apic_id == 0 || irq < NR_IRQS_LEGACY) 1452 if (apic_id == 0 || irq < NR_IRQS_LEGACY)
1473 return; 1453 return;
1474 1454
1475 cfg = alloc_irq_and_cfg_at(irq, node); 1455 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1476 if (!cfg) 1456 irq_polarity(idx));
1477 return;
1478
1479 add_pin_to_irq_node(cfg, node, apic_id, pin);
1480
1481 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
1482 pr_debug("Pin %d-%d already programmed\n",
1483 mp_ioapics[apic_id].apicid, pin);
1484 return;
1485 }
1486 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1487 1457
1488 setup_ioapic_irq(apic_id, pin, irq, cfg, 1458 io_apic_setup_irq_pin_once(irq, node, &attr);
1489 irq_trigger(idx), irq_polarity(idx));
1490} 1459}
1491 1460
1492/* 1461/*
@@ -1518,7 +1487,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1518 * The timer IRQ doesn't have to know that behind the 1487 * The timer IRQ doesn't have to know that behind the
1519 * scene we may have a 8259A-master in AEOI mode ... 1488 * scene we may have a 8259A-master in AEOI mode ...
1520 */ 1489 */
1521 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 1490 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
1491 "edge");
1522 1492
1523 /* 1493 /*
1524 * Add it to the IO-APIC irq-routing table: 1494 * Add it to the IO-APIC irq-routing table:
@@ -1625,7 +1595,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1625 for_each_active_irq(irq) { 1595 for_each_active_irq(irq) {
1626 struct irq_pin_list *entry; 1596 struct irq_pin_list *entry;
1627 1597
1628 cfg = get_irq_chip_data(irq); 1598 cfg = irq_get_chip_data(irq);
1629 if (!cfg) 1599 if (!cfg)
1630 continue; 1600 continue;
1631 entry = cfg->irq_2_pin; 1601 entry = cfg->irq_2_pin;
@@ -1916,7 +1886,7 @@ void disable_IO_APIC(void)
1916 * 1886 *
1917 * With interrupt-remapping, for now we will use virtual wire A mode, 1887 * With interrupt-remapping, for now we will use virtual wire A mode,
1918 * as virtual wire B is little complex (need to configure both 1888 * as virtual wire B is little complex (need to configure both
1919 * IOAPIC RTE aswell as interrupt-remapping table entry). 1889 * IOAPIC RTE as well as interrupt-remapping table entry).
1920 * As this gets called during crash dump, keep this simple for now. 1890 * As this gets called during crash dump, keep this simple for now.
1921 */ 1891 */
1922 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { 1892 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
@@ -2391,7 +2361,7 @@ static void irq_complete_move(struct irq_cfg *cfg)
2391 2361
2392void irq_force_complete_move(int irq) 2362void irq_force_complete_move(int irq)
2393{ 2363{
2394 struct irq_cfg *cfg = get_irq_chip_data(irq); 2364 struct irq_cfg *cfg = irq_get_chip_data(irq);
2395 2365
2396 if (!cfg) 2366 if (!cfg)
2397 return; 2367 return;
@@ -2405,7 +2375,7 @@ static inline void irq_complete_move(struct irq_cfg *cfg) { }
2405static void ack_apic_edge(struct irq_data *data) 2375static void ack_apic_edge(struct irq_data *data)
2406{ 2376{
2407 irq_complete_move(data->chip_data); 2377 irq_complete_move(data->chip_data);
2408 move_native_irq(data->irq); 2378 irq_move_irq(data);
2409 ack_APIC_irq(); 2379 ack_APIC_irq();
2410} 2380}
2411 2381
@@ -2462,7 +2432,7 @@ static void ack_apic_level(struct irq_data *data)
2462 irq_complete_move(cfg); 2432 irq_complete_move(cfg);
2463#ifdef CONFIG_GENERIC_PENDING_IRQ 2433#ifdef CONFIG_GENERIC_PENDING_IRQ
2464 /* If we are moving the irq we need to mask it */ 2434 /* If we are moving the irq we need to mask it */
2465 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { 2435 if (unlikely(irqd_is_setaffinity_pending(data))) {
2466 do_unmask_irq = 1; 2436 do_unmask_irq = 1;
2467 mask_ioapic(cfg); 2437 mask_ioapic(cfg);
2468 } 2438 }
@@ -2551,7 +2521,7 @@ static void ack_apic_level(struct irq_data *data)
2551 * and you can go talk to the chipset vendor about it. 2521 * and you can go talk to the chipset vendor about it.
2552 */ 2522 */
2553 if (!io_apic_level_ack_pending(cfg)) 2523 if (!io_apic_level_ack_pending(cfg))
2554 move_masked_irq(irq); 2524 irq_move_masked_irq(data);
2555 unmask_ioapic(cfg); 2525 unmask_ioapic(cfg);
2556 } 2526 }
2557} 2527}
@@ -2614,7 +2584,7 @@ static inline void init_IO_APIC_traps(void)
2614 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2584 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2615 */ 2585 */
2616 for_each_active_irq(irq) { 2586 for_each_active_irq(irq) {
2617 cfg = get_irq_chip_data(irq); 2587 cfg = irq_get_chip_data(irq);
2618 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { 2588 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2619 /* 2589 /*
2620 * Hmm.. We don't have an entry for this, 2590 * Hmm.. We don't have an entry for this,
@@ -2625,7 +2595,7 @@ static inline void init_IO_APIC_traps(void)
2625 legacy_pic->make_irq(irq); 2595 legacy_pic->make_irq(irq);
2626 else 2596 else
2627 /* Strange. Oh, well.. */ 2597 /* Strange. Oh, well.. */
2628 set_irq_chip(irq, &no_irq_chip); 2598 irq_set_chip(irq, &no_irq_chip);
2629 } 2599 }
2630 } 2600 }
2631} 2601}
@@ -2665,7 +2635,7 @@ static struct irq_chip lapic_chip __read_mostly = {
2665static void lapic_register_intr(int irq) 2635static void lapic_register_intr(int irq)
2666{ 2636{
2667 irq_clear_status_flags(irq, IRQ_LEVEL); 2637 irq_clear_status_flags(irq, IRQ_LEVEL);
2668 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2638 irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2669 "edge"); 2639 "edge");
2670} 2640}
2671 2641
@@ -2749,7 +2719,7 @@ int timer_through_8259 __initdata;
2749 */ 2719 */
2750static inline void __init check_timer(void) 2720static inline void __init check_timer(void)
2751{ 2721{
2752 struct irq_cfg *cfg = get_irq_chip_data(0); 2722 struct irq_cfg *cfg = irq_get_chip_data(0);
2753 int node = cpu_to_node(0); 2723 int node = cpu_to_node(0);
2754 int apic1, pin1, apic2, pin2; 2724 int apic1, pin1, apic2, pin2;
2755 unsigned long flags; 2725 unsigned long flags;
@@ -2935,7 +2905,7 @@ void __init setup_IO_APIC(void)
2935} 2905}
2936 2906
2937/* 2907/*
2938 * Called after all the initialization is done. If we didnt find any 2908 * Called after all the initialization is done. If we didn't find any
2939 * APIC bugs then we can allow the modify fast path 2909 * APIC bugs then we can allow the modify fast path
2940 */ 2910 */
2941 2911
@@ -2948,89 +2918,84 @@ static int __init io_apic_bug_finalize(void)
2948 2918
2949late_initcall(io_apic_bug_finalize); 2919late_initcall(io_apic_bug_finalize);
2950 2920
2951struct sysfs_ioapic_data { 2921static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS];
2952 struct sys_device dev;
2953 struct IO_APIC_route_entry entry[0];
2954};
2955static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
2956 2922
2957static int ioapic_suspend(struct sys_device *dev, pm_message_t state) 2923static void suspend_ioapic(int ioapic_id)
2958{ 2924{
2959 struct IO_APIC_route_entry *entry; 2925 struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
2960 struct sysfs_ioapic_data *data;
2961 int i; 2926 int i;
2962 2927
2963 data = container_of(dev, struct sysfs_ioapic_data, dev); 2928 if (!saved_data)
2964 entry = data->entry; 2929 return;
2965 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) 2930
2966 *entry = ioapic_read_entry(dev->id, i); 2931 for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
2932 saved_data[i] = ioapic_read_entry(ioapic_id, i);
2933}
2934
2935static int ioapic_suspend(void)
2936{
2937 int ioapic_id;
2938
2939 for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++)
2940 suspend_ioapic(ioapic_id);
2967 2941
2968 return 0; 2942 return 0;
2969} 2943}
2970 2944
2971static int ioapic_resume(struct sys_device *dev) 2945static void resume_ioapic(int ioapic_id)
2972{ 2946{
2973 struct IO_APIC_route_entry *entry; 2947 struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
2974 struct sysfs_ioapic_data *data;
2975 unsigned long flags; 2948 unsigned long flags;
2976 union IO_APIC_reg_00 reg_00; 2949 union IO_APIC_reg_00 reg_00;
2977 int i; 2950 int i;
2978 2951
2979 data = container_of(dev, struct sysfs_ioapic_data, dev); 2952 if (!saved_data)
2980 entry = data->entry; 2953 return;
2981 2954
2982 raw_spin_lock_irqsave(&ioapic_lock, flags); 2955 raw_spin_lock_irqsave(&ioapic_lock, flags);
2983 reg_00.raw = io_apic_read(dev->id, 0); 2956 reg_00.raw = io_apic_read(ioapic_id, 0);
2984 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { 2957 if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) {
2985 reg_00.bits.ID = mp_ioapics[dev->id].apicid; 2958 reg_00.bits.ID = mp_ioapics[ioapic_id].apicid;
2986 io_apic_write(dev->id, 0, reg_00.raw); 2959 io_apic_write(ioapic_id, 0, reg_00.raw);
2987 } 2960 }
2988 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2961 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2989 for (i = 0; i < nr_ioapic_registers[dev->id]; i++) 2962 for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
2990 ioapic_write_entry(dev->id, i, entry[i]); 2963 ioapic_write_entry(ioapic_id, i, saved_data[i]);
2964}
2991 2965
2992 return 0; 2966static void ioapic_resume(void)
2967{
2968 int ioapic_id;
2969
2970 for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
2971 resume_ioapic(ioapic_id);
2993} 2972}
2994 2973
2995static struct sysdev_class ioapic_sysdev_class = { 2974static struct syscore_ops ioapic_syscore_ops = {
2996 .name = "ioapic",
2997 .suspend = ioapic_suspend, 2975 .suspend = ioapic_suspend,
2998 .resume = ioapic_resume, 2976 .resume = ioapic_resume,
2999}; 2977};
3000 2978
3001static int __init ioapic_init_sysfs(void) 2979static int __init ioapic_init_ops(void)
3002{ 2980{
3003 struct sys_device * dev; 2981 int i;
3004 int i, size, error;
3005 2982
3006 error = sysdev_class_register(&ioapic_sysdev_class); 2983 for (i = 0; i < nr_ioapics; i++) {
3007 if (error) 2984 unsigned int size;
3008 return error;
3009 2985
3010 for (i = 0; i < nr_ioapics; i++ ) { 2986 size = nr_ioapic_registers[i]
3011 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
3012 * sizeof(struct IO_APIC_route_entry); 2987 * sizeof(struct IO_APIC_route_entry);
3013 mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); 2988 ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL);
3014 if (!mp_ioapic_data[i]) { 2989 if (!ioapic_saved_data[i])
3015 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); 2990 pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
3016 continue;
3017 }
3018 dev = &mp_ioapic_data[i]->dev;
3019 dev->id = i;
3020 dev->cls = &ioapic_sysdev_class;
3021 error = sysdev_register(dev);
3022 if (error) {
3023 kfree(mp_ioapic_data[i]);
3024 mp_ioapic_data[i] = NULL;
3025 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
3026 continue;
3027 }
3028 } 2991 }
3029 2992
2993 register_syscore_ops(&ioapic_syscore_ops);
2994
3030 return 0; 2995 return 0;
3031} 2996}
3032 2997
3033device_initcall(ioapic_init_sysfs); 2998device_initcall(ioapic_init_ops);
3034 2999
3035/* 3000/*
3036 * Dynamic irq allocate and deallocation 3001 * Dynamic irq allocate and deallocation
@@ -3060,7 +3025,7 @@ unsigned int create_irq_nr(unsigned int from, int node)
3060 raw_spin_unlock_irqrestore(&vector_lock, flags); 3025 raw_spin_unlock_irqrestore(&vector_lock, flags);
3061 3026
3062 if (ret) { 3027 if (ret) {
3063 set_irq_chip_data(irq, cfg); 3028 irq_set_chip_data(irq, cfg);
3064 irq_clear_status_flags(irq, IRQ_NOREQUEST); 3029 irq_clear_status_flags(irq, IRQ_NOREQUEST);
3065 } else { 3030 } else {
3066 free_irq_at(irq, cfg); 3031 free_irq_at(irq, cfg);
@@ -3085,7 +3050,7 @@ int create_irq(void)
3085 3050
3086void destroy_irq(unsigned int irq) 3051void destroy_irq(unsigned int irq)
3087{ 3052{
3088 struct irq_cfg *cfg = get_irq_chip_data(irq); 3053 struct irq_cfg *cfg = irq_get_chip_data(irq);
3089 unsigned long flags; 3054 unsigned long flags;
3090 3055
3091 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); 3056 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
@@ -3119,7 +3084,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3119 3084
3120 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3085 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3121 3086
3122 if (irq_remapped(get_irq_chip_data(irq))) { 3087 if (irq_remapped(cfg)) {
3123 struct irte irte; 3088 struct irte irte;
3124 int ir_index; 3089 int ir_index;
3125 u16 sub_handle; 3090 u16 sub_handle;
@@ -3291,6 +3256,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3291 3256
3292static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3257static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3293{ 3258{
3259 struct irq_chip *chip = &msi_chip;
3294 struct msi_msg msg; 3260 struct msi_msg msg;
3295 int ret; 3261 int ret;
3296 3262
@@ -3298,14 +3264,15 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3298 if (ret < 0) 3264 if (ret < 0)
3299 return ret; 3265 return ret;
3300 3266
3301 set_irq_msi(irq, msidesc); 3267 irq_set_msi_desc(irq, msidesc);
3302 write_msi_msg(irq, &msg); 3268 write_msi_msg(irq, &msg);
3303 3269
3304 if (irq_remapped(get_irq_chip_data(irq))) { 3270 if (irq_remapped(irq_get_chip_data(irq))) {
3305 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 3271 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3306 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); 3272 chip = &msi_ir_chip;
3307 } else 3273 }
3308 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3274
3275 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3309 3276
3310 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); 3277 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
3311 3278
@@ -3423,8 +3390,8 @@ int arch_setup_dmar_msi(unsigned int irq)
3423 if (ret < 0) 3390 if (ret < 0)
3424 return ret; 3391 return ret;
3425 dmar_msi_write(irq, &msg); 3392 dmar_msi_write(irq, &msg);
3426 set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, 3393 irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
3427 "edge"); 3394 "edge");
3428 return 0; 3395 return 0;
3429} 3396}
3430#endif 3397#endif
@@ -3482,6 +3449,7 @@ static struct irq_chip hpet_msi_type = {
3482 3449
3483int arch_setup_hpet_msi(unsigned int irq, unsigned int id) 3450int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3484{ 3451{
3452 struct irq_chip *chip = &hpet_msi_type;
3485 struct msi_msg msg; 3453 struct msi_msg msg;
3486 int ret; 3454 int ret;
3487 3455
@@ -3501,15 +3469,12 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3501 if (ret < 0) 3469 if (ret < 0)
3502 return ret; 3470 return ret;
3503 3471
3504 hpet_msi_write(get_irq_data(irq), &msg); 3472 hpet_msi_write(irq_get_handler_data(irq), &msg);
3505 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 3473 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3506 if (irq_remapped(get_irq_chip_data(irq))) 3474 if (irq_remapped(irq_get_chip_data(irq)))
3507 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, 3475 chip = &ir_hpet_msi_type;
3508 handle_edge_irq, "edge");
3509 else
3510 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3511 handle_edge_irq, "edge");
3512 3476
3477 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3513 return 0; 3478 return 0;
3514} 3479}
3515#endif 3480#endif
@@ -3596,7 +3561,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3596 3561
3597 write_ht_irq_msg(irq, &msg); 3562 write_ht_irq_msg(irq, &msg);
3598 3563
3599 set_irq_chip_and_handler_name(irq, &ht_irq_chip, 3564 irq_set_chip_and_handler_name(irq, &ht_irq_chip,
3600 handle_edge_irq, "edge"); 3565 handle_edge_irq, "edge");
3601 3566
3602 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); 3567 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
@@ -3605,7 +3570,40 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3605} 3570}
3606#endif /* CONFIG_HT_IRQ */ 3571#endif /* CONFIG_HT_IRQ */
3607 3572
3608int __init io_apic_get_redir_entries (int ioapic) 3573static int
3574io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
3575{
3576 struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
3577 int ret;
3578
3579 if (!cfg)
3580 return -EINVAL;
3581 ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
3582 if (!ret)
3583 setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg,
3584 attr->trigger, attr->polarity);
3585 return ret;
3586}
3587
3588int io_apic_setup_irq_pin_once(unsigned int irq, int node,
3589 struct io_apic_irq_attr *attr)
3590{
3591 unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
3592 int ret;
3593
3594 /* Avoid redundant programming */
3595 if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) {
3596 pr_debug("Pin %d-%d already programmed\n",
3597 mp_ioapics[id].apicid, pin);
3598 return 0;
3599 }
3600 ret = io_apic_setup_irq_pin(irq, node, attr);
3601 if (!ret)
3602 set_bit(pin, mp_ioapic_routing[id].pin_programmed);
3603 return ret;
3604}
3605
3606static int __init io_apic_get_redir_entries(int ioapic)
3609{ 3607{
3610 union IO_APIC_reg_01 reg_01; 3608 union IO_APIC_reg_01 reg_01;
3611 unsigned long flags; 3609 unsigned long flags;
@@ -3659,96 +3657,24 @@ int __init arch_probe_nr_irqs(void)
3659} 3657}
3660#endif 3658#endif
3661 3659
3662static int __io_apic_set_pci_routing(struct device *dev, int irq, 3660int io_apic_set_pci_routing(struct device *dev, int irq,
3663 struct io_apic_irq_attr *irq_attr) 3661 struct io_apic_irq_attr *irq_attr)
3664{ 3662{
3665 struct irq_cfg *cfg;
3666 int node; 3663 int node;
3667 int ioapic, pin;
3668 int trigger, polarity;
3669 3664
3670 ioapic = irq_attr->ioapic;
3671 if (!IO_APIC_IRQ(irq)) { 3665 if (!IO_APIC_IRQ(irq)) {
3672 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3666 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3673 ioapic); 3667 irq_attr->ioapic);
3674 return -EINVAL; 3668 return -EINVAL;
3675 } 3669 }
3676 3670
3677 if (dev) 3671 node = dev ? dev_to_node(dev) : cpu_to_node(0);
3678 node = dev_to_node(dev);
3679 else
3680 node = cpu_to_node(0);
3681
3682 cfg = alloc_irq_and_cfg_at(irq, node);
3683 if (!cfg)
3684 return 0;
3685
3686 pin = irq_attr->ioapic_pin;
3687 trigger = irq_attr->trigger;
3688 polarity = irq_attr->polarity;
3689
3690 /*
3691 * IRQs < 16 are already in the irq_2_pin[] map
3692 */
3693 if (irq >= legacy_pic->nr_legacy_irqs) {
3694 if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) {
3695 printk(KERN_INFO "can not add pin %d for irq %d\n",
3696 pin, irq);
3697 return 0;
3698 }
3699 }
3700
3701 setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity);
3702 3672
3703 return 0; 3673 return io_apic_setup_irq_pin_once(irq, node, irq_attr);
3704} 3674}
3705 3675
3706int io_apic_set_pci_routing(struct device *dev, int irq,
3707 struct io_apic_irq_attr *irq_attr)
3708{
3709 int ioapic, pin;
3710 /*
3711 * Avoid pin reprogramming. PRTs typically include entries
3712 * with redundant pin->gsi mappings (but unique PCI devices);
3713 * we only program the IOAPIC on the first.
3714 */
3715 ioapic = irq_attr->ioapic;
3716 pin = irq_attr->ioapic_pin;
3717 if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3718 pr_debug("Pin %d-%d already programmed\n",
3719 mp_ioapics[ioapic].apicid, pin);
3720 return 0;
3721 }
3722 set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
3723
3724 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3725}
3726
3727u8 __init io_apic_unique_id(u8 id)
3728{
3729#ifdef CONFIG_X86_32 3676#ifdef CONFIG_X86_32
3730 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 3677static int __init io_apic_get_unique_id(int ioapic, int apic_id)
3731 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3732 return io_apic_get_unique_id(nr_ioapics, id);
3733 else
3734 return id;
3735#else
3736 int i;
3737 DECLARE_BITMAP(used, 256);
3738
3739 bitmap_zero(used, 256);
3740 for (i = 0; i < nr_ioapics; i++) {
3741 struct mpc_ioapic *ia = &mp_ioapics[i];
3742 __set_bit(ia->apicid, used);
3743 }
3744 if (!test_bit(id, used))
3745 return id;
3746 return find_first_zero_bit(used, 256);
3747#endif
3748}
3749
3750#ifdef CONFIG_X86_32
3751int __init io_apic_get_unique_id(int ioapic, int apic_id)
3752{ 3678{
3753 union IO_APIC_reg_00 reg_00; 3679 union IO_APIC_reg_00 reg_00;
3754 static physid_mask_t apic_id_map = PHYSID_MASK_NONE; 3680 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -3821,9 +3747,33 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3821 3747
3822 return apic_id; 3748 return apic_id;
3823} 3749}
3750
3751static u8 __init io_apic_unique_id(u8 id)
3752{
3753 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3754 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3755 return io_apic_get_unique_id(nr_ioapics, id);
3756 else
3757 return id;
3758}
3759#else
3760static u8 __init io_apic_unique_id(u8 id)
3761{
3762 int i;
3763 DECLARE_BITMAP(used, 256);
3764
3765 bitmap_zero(used, 256);
3766 for (i = 0; i < nr_ioapics; i++) {
3767 struct mpc_ioapic *ia = &mp_ioapics[i];
3768 __set_bit(ia->apicid, used);
3769 }
3770 if (!test_bit(id, used))
3771 return id;
3772 return find_first_zero_bit(used, 256);
3773}
3824#endif 3774#endif
3825 3775
3826int __init io_apic_get_version(int ioapic) 3776static int __init io_apic_get_version(int ioapic)
3827{ 3777{
3828 union IO_APIC_reg_01 reg_01; 3778 union IO_APIC_reg_01 reg_01;
3829 unsigned long flags; 3779 unsigned long flags;
@@ -3868,8 +3818,8 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
3868void __init setup_ioapic_dest(void) 3818void __init setup_ioapic_dest(void)
3869{ 3819{
3870 int pin, ioapic, irq, irq_entry; 3820 int pin, ioapic, irq, irq_entry;
3871 struct irq_desc *desc;
3872 const struct cpumask *mask; 3821 const struct cpumask *mask;
3822 struct irq_data *idata;
3873 3823
3874 if (skip_ioapic_setup == 1) 3824 if (skip_ioapic_setup == 1)
3875 return; 3825 return;
@@ -3884,21 +3834,20 @@ void __init setup_ioapic_dest(void)
3884 if ((ioapic > 0) && (irq > 16)) 3834 if ((ioapic > 0) && (irq > 16))
3885 continue; 3835 continue;
3886 3836
3887 desc = irq_to_desc(irq); 3837 idata = irq_get_irq_data(irq);
3888 3838
3889 /* 3839 /*
3890 * Honour affinities which have been set in early boot 3840 * Honour affinities which have been set in early boot
3891 */ 3841 */
3892 if (desc->status & 3842 if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
3893 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 3843 mask = idata->affinity;
3894 mask = desc->irq_data.affinity;
3895 else 3844 else
3896 mask = apic->target_cpus(); 3845 mask = apic->target_cpus();
3897 3846
3898 if (intr_remapping_enabled) 3847 if (intr_remapping_enabled)
3899 ir_ioapic_set_affinity(&desc->irq_data, mask, false); 3848 ir_ioapic_set_affinity(idata, mask, false);
3900 else 3849 else
3901 ioapic_set_affinity(&desc->irq_data, mask, false); 3850 ioapic_set_affinity(idata, mask, false);
3902 } 3851 }
3903 3852
3904} 3853}
@@ -4026,10 +3975,10 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
4026 return gsi - mp_gsi_routing[ioapic].gsi_base; 3975 return gsi - mp_gsi_routing[ioapic].gsi_base;
4027} 3976}
4028 3977
4029static int bad_ioapic(unsigned long address) 3978static __init int bad_ioapic(unsigned long address)
4030{ 3979{
4031 if (nr_ioapics >= MAX_IO_APICS) { 3980 if (nr_ioapics >= MAX_IO_APICS) {
4032 printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " 3981 printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
4033 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); 3982 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
4034 return 1; 3983 return 1;
4035 } 3984 }
@@ -4086,20 +4035,16 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4086/* Enable IOAPIC early just for system timer */ 4035/* Enable IOAPIC early just for system timer */
4087void __init pre_init_apic_IRQ0(void) 4036void __init pre_init_apic_IRQ0(void)
4088{ 4037{
4089 struct irq_cfg *cfg; 4038 struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
4090 4039
4091 printk(KERN_INFO "Early APIC setup for system timer0\n"); 4040 printk(KERN_INFO "Early APIC setup for system timer0\n");
4092#ifndef CONFIG_SMP 4041#ifndef CONFIG_SMP
4093 physid_set_mask_of_physid(boot_cpu_physical_apicid, 4042 physid_set_mask_of_physid(boot_cpu_physical_apicid,
4094 &phys_cpu_present_map); 4043 &phys_cpu_present_map);
4095#endif 4044#endif
4096 /* Make sure the irq descriptor is set up */
4097 cfg = alloc_irq_and_cfg_at(0, 0);
4098
4099 setup_local_APIC(); 4045 setup_local_APIC();
4100 4046
4101 add_pin_to_irq_node(cfg, 0, 0, 0); 4047 io_apic_setup_irq_pin(0, 0, &attr);
4102 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 4048 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
4103 4049 "edge");
4104 setup_ioapic_irq(0, 0, 0, cfg, 0, 0);
4105} 4050}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 08385e090a6f..cce91bf26676 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
56 local_irq_restore(flags); 56 local_irq_restore(flags);
57} 57}
58 58
59#ifdef CONFIG_X86_32
60
59void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, 61void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
60 int vector) 62 int vector)
61{ 63{
@@ -71,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
71 local_irq_save(flags); 73 local_irq_save(flags);
72 for_each_cpu(query_cpu, mask) 74 for_each_cpu(query_cpu, mask)
73 __default_send_IPI_dest_field( 75 __default_send_IPI_dest_field(
74 apic->cpu_to_logical_apicid(query_cpu), vector, 76 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
75 apic->dest_logical); 77 vector, apic->dest_logical);
76 local_irq_restore(flags); 78 local_irq_restore(flags);
77} 79}
78 80
@@ -90,14 +92,12 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
90 if (query_cpu == this_cpu) 92 if (query_cpu == this_cpu)
91 continue; 93 continue;
92 __default_send_IPI_dest_field( 94 __default_send_IPI_dest_field(
93 apic->cpu_to_logical_apicid(query_cpu), vector, 95 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
94 apic->dest_logical); 96 vector, apic->dest_logical);
95 } 97 }
96 local_irq_restore(flags); 98 local_irq_restore(flags);
97} 99}
98 100
99#ifdef CONFIG_X86_32
100
101/* 101/*
102 * This is only used on smaller machines. 102 * This is only used on smaller machines.
103 */ 103 */
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 960f26ab5c9f..6273eee5134b 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -373,13 +373,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask
373 return physids_promote(0xFUL, retmap); 373 return physids_promote(0xFUL, retmap);
374} 374}
375 375
376static inline int numaq_cpu_to_logical_apicid(int cpu)
377{
378 if (cpu >= nr_cpu_ids)
379 return BAD_APICID;
380 return cpu_2_logical_apicid[cpu];
381}
382
383/* 376/*
384 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent 377 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
385 * cpu to APIC ID relation to properly interact with the intelligent 378 * cpu to APIC ID relation to properly interact with the intelligent
@@ -398,6 +391,15 @@ static inline int numaq_apicid_to_node(int logical_apicid)
398 return logical_apicid >> 4; 391 return logical_apicid >> 4;
399} 392}
400 393
394static int numaq_numa_cpu_node(int cpu)
395{
396 int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
397
398 if (logical_apicid != BAD_APICID)
399 return numaq_apicid_to_node(logical_apicid);
400 return NUMA_NO_NODE;
401}
402
401static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) 403static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
402{ 404{
403 int node = numaq_apicid_to_node(logical_apicid); 405 int node = numaq_apicid_to_node(logical_apicid);
@@ -508,8 +510,6 @@ struct apic __refdata apic_numaq = {
508 .ioapic_phys_id_map = numaq_ioapic_phys_id_map, 510 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
509 .setup_apic_routing = numaq_setup_apic_routing, 511 .setup_apic_routing = numaq_setup_apic_routing,
510 .multi_timer_check = numaq_multi_timer_check, 512 .multi_timer_check = numaq_multi_timer_check,
511 .apicid_to_node = numaq_apicid_to_node,
512 .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
513 .cpu_present_to_apicid = numaq_cpu_present_to_apicid, 513 .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
514 .apicid_to_cpu_present = numaq_apicid_to_cpu_present, 514 .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
515 .setup_portio_remap = numaq_setup_portio_remap, 515 .setup_portio_remap = numaq_setup_portio_remap,
@@ -547,4 +547,7 @@ struct apic __refdata apic_numaq = {
547 .icr_write = native_apic_icr_write, 547 .icr_write = native_apic_icr_write,
548 .wait_icr_idle = native_apic_wait_icr_idle, 548 .wait_icr_idle = native_apic_wait_icr_idle,
549 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 549 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
550
551 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
552 .x86_32_numa_cpu_node = numaq_numa_cpu_node,
550}; 553};
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 99d2fe016084..fc84c7b61108 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -77,6 +77,11 @@ void __init default_setup_apic_routing(void)
77 apic->setup_apic_routing(); 77 apic->setup_apic_routing();
78} 78}
79 79
80static int default_x86_32_early_logical_apicid(int cpu)
81{
82 return 1 << cpu;
83}
84
80static void setup_apic_flat_routing(void) 85static void setup_apic_flat_routing(void)
81{ 86{
82#ifdef CONFIG_X86_IO_APIC 87#ifdef CONFIG_X86_IO_APIC
@@ -130,8 +135,6 @@ struct apic apic_default = {
130 .ioapic_phys_id_map = default_ioapic_phys_id_map, 135 .ioapic_phys_id_map = default_ioapic_phys_id_map,
131 .setup_apic_routing = setup_apic_flat_routing, 136 .setup_apic_routing = setup_apic_flat_routing,
132 .multi_timer_check = NULL, 137 .multi_timer_check = NULL,
133 .apicid_to_node = default_apicid_to_node,
134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
135 .cpu_present_to_apicid = default_cpu_present_to_apicid, 138 .cpu_present_to_apicid = default_cpu_present_to_apicid,
136 .apicid_to_cpu_present = physid_set_mask_of_physid, 139 .apicid_to_cpu_present = physid_set_mask_of_physid,
137 .setup_portio_remap = NULL, 140 .setup_portio_remap = NULL,
@@ -167,6 +170,9 @@ struct apic apic_default = {
167 .icr_write = native_apic_icr_write, 170 .icr_write = native_apic_icr_write,
168 .wait_icr_idle = native_apic_wait_icr_idle, 171 .wait_icr_idle = native_apic_wait_icr_idle,
169 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 172 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
173
174 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
175 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
170}; 176};
171 177
172extern struct apic apic_numaq; 178extern struct apic apic_numaq;
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9b419263d90d..e4b8059b414a 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit)
194 return 1; 194 return 1;
195} 195}
196 196
197static void summit_init_apic_ldr(void) 197static int summit_early_logical_apicid(int cpu)
198{ 198{
199 unsigned long val, id;
200 int count = 0; 199 int count = 0;
201 u8 my_id = (u8)hard_smp_processor_id(); 200 u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
202 u8 my_cluster = APIC_CLUSTER(my_id); 201 u8 my_cluster = APIC_CLUSTER(my_id);
203#ifdef CONFIG_SMP 202#ifdef CONFIG_SMP
204 u8 lid; 203 u8 lid;
@@ -206,7 +205,7 @@ static void summit_init_apic_ldr(void)
206 205
207 /* Create logical APIC IDs by counting CPUs already in cluster. */ 206 /* Create logical APIC IDs by counting CPUs already in cluster. */
208 for (count = 0, i = nr_cpu_ids; --i >= 0; ) { 207 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
209 lid = cpu_2_logical_apicid[i]; 208 lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
210 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) 209 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
211 ++count; 210 ++count;
212 } 211 }
@@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void)
214 /* We only have a 4 wide bitmap in cluster mode. If a deranged 213 /* We only have a 4 wide bitmap in cluster mode. If a deranged
215 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ 214 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
216 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); 215 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
217 id = my_cluster | (1UL << count); 216 return my_cluster | (1UL << count);
217}
218
219static void summit_init_apic_ldr(void)
220{
221 int cpu = smp_processor_id();
222 unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
223 unsigned long val;
224
218 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); 225 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
219 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; 226 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
220 val |= SET_APIC_LOGICAL_ID(id); 227 val |= SET_APIC_LOGICAL_ID(id);
@@ -232,27 +239,6 @@ static void summit_setup_apic_routing(void)
232 nr_ioapics); 239 nr_ioapics);
233} 240}
234 241
235static int summit_apicid_to_node(int logical_apicid)
236{
237#ifdef CONFIG_SMP
238 return apicid_2_node[hard_smp_processor_id()];
239#else
240 return 0;
241#endif
242}
243
244/* Mapping from cpu number to logical apicid */
245static inline int summit_cpu_to_logical_apicid(int cpu)
246{
247#ifdef CONFIG_SMP
248 if (cpu >= nr_cpu_ids)
249 return BAD_APICID;
250 return cpu_2_logical_apicid[cpu];
251#else
252 return logical_smp_processor_id();
253#endif
254}
255
256static int summit_cpu_present_to_apicid(int mps_cpu) 242static int summit_cpu_present_to_apicid(int mps_cpu)
257{ 243{
258 if (mps_cpu < nr_cpu_ids) 244 if (mps_cpu < nr_cpu_ids)
@@ -286,7 +272,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
286 * The cpus in the mask must all be on the apic cluster. 272 * The cpus in the mask must all be on the apic cluster.
287 */ 273 */
288 for_each_cpu(cpu, cpumask) { 274 for_each_cpu(cpu, cpumask) {
289 int new_apicid = summit_cpu_to_logical_apicid(cpu); 275 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
290 276
291 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 277 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
292 printk("%s: Not a valid mask!\n", __func__); 278 printk("%s: Not a valid mask!\n", __func__);
@@ -301,7 +287,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
301static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, 287static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
302 const struct cpumask *andmask) 288 const struct cpumask *andmask)
303{ 289{
304 int apicid = summit_cpu_to_logical_apicid(0); 290 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
305 cpumask_var_t cpumask; 291 cpumask_var_t cpumask;
306 292
307 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 293 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -528,8 +514,6 @@ struct apic apic_summit = {
528 .ioapic_phys_id_map = summit_ioapic_phys_id_map, 514 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
529 .setup_apic_routing = summit_setup_apic_routing, 515 .setup_apic_routing = summit_setup_apic_routing,
530 .multi_timer_check = NULL, 516 .multi_timer_check = NULL,
531 .apicid_to_node = summit_apicid_to_node,
532 .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
533 .cpu_present_to_apicid = summit_cpu_present_to_apicid, 517 .cpu_present_to_apicid = summit_cpu_present_to_apicid,
534 .apicid_to_cpu_present = summit_apicid_to_cpu_present, 518 .apicid_to_cpu_present = summit_apicid_to_cpu_present,
535 .setup_portio_remap = NULL, 519 .setup_portio_remap = NULL,
@@ -565,4 +549,7 @@ struct apic apic_summit = {
565 .icr_write = native_apic_icr_write, 549 .icr_write = native_apic_icr_write,
566 .wait_icr_idle = native_apic_wait_icr_idle, 550 .wait_icr_idle = native_apic_wait_icr_idle,
567 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
552
553 .x86_32_early_logical_apicid = summit_early_logical_apicid,
554 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
568}; 555};
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cf69c59f4910..90949bbd566d 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -206,8 +206,6 @@ struct apic apic_x2apic_cluster = {
206 .ioapic_phys_id_map = NULL, 206 .ioapic_phys_id_map = NULL,
207 .setup_apic_routing = NULL, 207 .setup_apic_routing = NULL,
208 .multi_timer_check = NULL, 208 .multi_timer_check = NULL,
209 .apicid_to_node = NULL,
210 .cpu_to_logical_apicid = NULL,
211 .cpu_present_to_apicid = default_cpu_present_to_apicid, 209 .cpu_present_to_apicid = default_cpu_present_to_apicid,
212 .apicid_to_cpu_present = NULL, 210 .apicid_to_cpu_present = NULL,
213 .setup_portio_remap = NULL, 211 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 8972f38c5ced..c7e6d6645bf4 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -195,8 +195,6 @@ struct apic apic_x2apic_phys = {
195 .ioapic_phys_id_map = NULL, 195 .ioapic_phys_id_map = NULL,
196 .setup_apic_routing = NULL, 196 .setup_apic_routing = NULL,
197 .multi_timer_check = NULL, 197 .multi_timer_check = NULL,
198 .apicid_to_node = NULL,
199 .cpu_to_logical_apicid = NULL,
200 .cpu_present_to_apicid = default_cpu_present_to_apicid, 198 .cpu_present_to_apicid = default_cpu_present_to_apicid,
201 .apicid_to_cpu_present = NULL, 199 .apicid_to_cpu_present = NULL,
202 .setup_portio_remap = NULL, 200 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index bd16b58b8850..33b10a0fc095 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -23,6 +23,8 @@
23#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/pci.h> 24#include <linux/pci.h>
25#include <linux/kdebug.h> 25#include <linux/kdebug.h>
26#include <linux/delay.h>
27#include <linux/crash_dump.h>
26 28
27#include <asm/uv/uv_mmrs.h> 29#include <asm/uv/uv_mmrs.h>
28#include <asm/uv/uv_hub.h> 30#include <asm/uv/uv_hub.h>
@@ -34,6 +36,7 @@
34#include <asm/ipi.h> 36#include <asm/ipi.h>
35#include <asm/smp.h> 37#include <asm/smp.h>
36#include <asm/x86_init.h> 38#include <asm/x86_init.h>
39#include <asm/emergency-restart.h>
37 40
38DEFINE_PER_CPU(int, x2apic_extra_bits); 41DEFINE_PER_CPU(int, x2apic_extra_bits);
39 42
@@ -338,8 +341,6 @@ struct apic __refdata apic_x2apic_uv_x = {
338 .ioapic_phys_id_map = NULL, 341 .ioapic_phys_id_map = NULL,
339 .setup_apic_routing = NULL, 342 .setup_apic_routing = NULL,
340 .multi_timer_check = NULL, 343 .multi_timer_check = NULL,
341 .apicid_to_node = NULL,
342 .cpu_to_logical_apicid = NULL,
343 .cpu_present_to_apicid = default_cpu_present_to_apicid, 344 .cpu_present_to_apicid = default_cpu_present_to_apicid,
344 .apicid_to_cpu_present = NULL, 345 .apicid_to_cpu_present = NULL,
345 .setup_portio_remap = NULL, 346 .setup_portio_remap = NULL,
@@ -812,4 +813,11 @@ void __init uv_system_init(void)
812 813
813 /* register Legacy VGA I/O redirection handler */ 814 /* register Legacy VGA I/O redirection handler */
814 pci_register_set_vga_state(uv_set_vga_state); 815 pci_register_set_vga_state(uv_set_vga_state);
816
817 /*
818 * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
819 * EFI is not enabled in the kdump kernel.
820 */
821 if (is_kdump_kernel())
822 reboot_type = BOOT_ACPI;
815} 823}
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 0e4f24c2a746..adee12e0da1f 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -66,7 +66,7 @@
66 * 1.5: Fix segment register reloading (in case of bad segments saved 66 * 1.5: Fix segment register reloading (in case of bad segments saved
67 * across BIOS call). 67 * across BIOS call).
68 * Stephen Rothwell 68 * Stephen Rothwell
69 * 1.6: Cope with complier/assembler differences. 69 * 1.6: Cope with compiler/assembler differences.
70 * Only try to turn off the first display device. 70 * Only try to turn off the first display device.
71 * Fix OOPS at power off with no APM BIOS by Jan Echternach 71 * Fix OOPS at power off with no APM BIOS by Jan Echternach
72 * <echter@informatik.uni-rostock.de> 72 * <echter@informatik.uni-rostock.de>
@@ -227,6 +227,8 @@
227#include <linux/suspend.h> 227#include <linux/suspend.h>
228#include <linux/kthread.h> 228#include <linux/kthread.h>
229#include <linux/jiffies.h> 229#include <linux/jiffies.h>
230#include <linux/acpi.h>
231#include <linux/syscore_ops.h>
230 232
231#include <asm/system.h> 233#include <asm/system.h>
232#include <asm/uaccess.h> 234#include <asm/uaccess.h>
@@ -975,20 +977,10 @@ recalc:
975 977
976static void apm_power_off(void) 978static void apm_power_off(void)
977{ 979{
978 unsigned char po_bios_call[] = {
979 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
980 0x8e, 0xd0, /* movw ax,ss */
981 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
982 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
983 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
984 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
985 0xcd, 0x15 /* int $0x15 */
986 };
987
988 /* Some bioses don't like being called from CPU != 0 */ 980 /* Some bioses don't like being called from CPU != 0 */
989 if (apm_info.realmode_power_off) { 981 if (apm_info.realmode_power_off) {
990 set_cpus_allowed_ptr(current, cpumask_of(0)); 982 set_cpus_allowed_ptr(current, cpumask_of(0));
991 machine_real_restart(po_bios_call, sizeof(po_bios_call)); 983 machine_real_restart(MRR_APM);
992 } else { 984 } else {
993 (void)set_system_power_state(APM_STATE_OFF); 985 (void)set_system_power_state(APM_STATE_OFF);
994 } 986 }
@@ -1247,6 +1239,7 @@ static int suspend(int vetoable)
1247 1239
1248 local_irq_disable(); 1240 local_irq_disable();
1249 sysdev_suspend(PMSG_SUSPEND); 1241 sysdev_suspend(PMSG_SUSPEND);
1242 syscore_suspend();
1250 1243
1251 local_irq_enable(); 1244 local_irq_enable();
1252 1245
@@ -1264,6 +1257,7 @@ static int suspend(int vetoable)
1264 apm_error("suspend", err); 1257 apm_error("suspend", err);
1265 err = (err == APM_SUCCESS) ? 0 : -EIO; 1258 err = (err == APM_SUCCESS) ? 0 : -EIO;
1266 1259
1260 syscore_resume();
1267 sysdev_resume(); 1261 sysdev_resume();
1268 local_irq_enable(); 1262 local_irq_enable();
1269 1263
@@ -1289,6 +1283,7 @@ static void standby(void)
1289 1283
1290 local_irq_disable(); 1284 local_irq_disable();
1291 sysdev_suspend(PMSG_SUSPEND); 1285 sysdev_suspend(PMSG_SUSPEND);
1286 syscore_suspend();
1292 local_irq_enable(); 1287 local_irq_enable();
1293 1288
1294 err = set_system_power_state(APM_STATE_STANDBY); 1289 err = set_system_power_state(APM_STATE_STANDBY);
@@ -1296,6 +1291,7 @@ static void standby(void)
1296 apm_error("standby", err); 1291 apm_error("standby", err);
1297 1292
1298 local_irq_disable(); 1293 local_irq_disable();
1294 syscore_resume();
1299 sysdev_resume(); 1295 sysdev_resume();
1300 local_irq_enable(); 1296 local_irq_enable();
1301 1297
@@ -2331,12 +2327,11 @@ static int __init apm_init(void)
2331 apm_info.disabled = 1; 2327 apm_info.disabled = 1;
2332 return -ENODEV; 2328 return -ENODEV;
2333 } 2329 }
2334 if (pm_flags & PM_ACPI) { 2330 if (!acpi_disabled) {
2335 printk(KERN_NOTICE "apm: overridden by ACPI.\n"); 2331 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2336 apm_info.disabled = 1; 2332 apm_info.disabled = 1;
2337 return -ENODEV; 2333 return -ENODEV;
2338 } 2334 }
2339 pm_flags |= PM_APM;
2340 2335
2341 /* 2336 /*
2342 * Set up the long jump entry point to the APM BIOS, which is called 2337 * Set up the long jump entry point to the APM BIOS, which is called
@@ -2428,7 +2423,6 @@ static void __exit apm_exit(void)
2428 kthread_stop(kapmd_task); 2423 kthread_stop(kapmd_task);
2429 kapmd_task = NULL; 2424 kapmd_task = NULL;
2430 } 2425 }
2431 pm_flags &= ~PM_APM;
2432} 2426}
2433 2427
2434module_init(apm_init); 2428module_init(apm_init);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cfa82c899f47..4f13fafc5264 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -1,5 +1,70 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/hardirq.h>
12#include <linux/suspend.h>
13#include <linux/kbuild.h>
14#include <asm/processor.h>
15#include <asm/thread_info.h>
16#include <asm/sigframe.h>
17#include <asm/bootparam.h>
18#include <asm/suspend.h>
19
20#ifdef CONFIG_XEN
21#include <xen/interface/xen.h>
22#endif
23
1#ifdef CONFIG_X86_32 24#ifdef CONFIG_X86_32
2# include "asm-offsets_32.c" 25# include "asm-offsets_32.c"
3#else 26#else
4# include "asm-offsets_64.c" 27# include "asm-offsets_64.c"
5#endif 28#endif
29
30void common(void) {
31 BLANK();
32 OFFSET(TI_flags, thread_info, flags);
33 OFFSET(TI_status, thread_info, status);
34 OFFSET(TI_addr_limit, thread_info, addr_limit);
35 OFFSET(TI_preempt_count, thread_info, preempt_count);
36
37 BLANK();
38 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
39
40 BLANK();
41 OFFSET(pbe_address, pbe, address);
42 OFFSET(pbe_orig_address, pbe, orig_address);
43 OFFSET(pbe_next, pbe, next);
44
45#ifdef CONFIG_PARAVIRT
46 BLANK();
47 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
48 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
49 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
50 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
51 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
52 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
53 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
54 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
55 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
56#endif
57
58#ifdef CONFIG_XEN
59 BLANK();
60 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
61 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
62#endif
63
64 BLANK();
65 OFFSET(BP_scratch, boot_params, scratch);
66 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
68 OFFSET(BP_version, boot_params, hdr.version);
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a4088dda37a..c29d631af6fc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,26 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/signal.h>
10#include <linux/personality.h>
11#include <linux/suspend.h>
12#include <linux/kbuild.h>
13#include <asm/ucontext.h> 1#include <asm/ucontext.h>
14#include <asm/sigframe.h>
15#include <asm/pgtable.h>
16#include <asm/fixmap.h>
17#include <asm/processor.h>
18#include <asm/thread_info.h>
19#include <asm/bootparam.h>
20#include <asm/elf.h>
21#include <asm/suspend.h>
22
23#include <xen/interface/xen.h>
24 2
25#include <linux/lguest.h> 3#include <linux/lguest.h>
26#include "../../../drivers/lguest/lg.h" 4#include "../../../drivers/lguest/lg.h"
@@ -51,21 +29,10 @@ void foo(void)
51 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); 29 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
52 BLANK(); 30 BLANK();
53 31
54 OFFSET(TI_task, thread_info, task);
55 OFFSET(TI_exec_domain, thread_info, exec_domain);
56 OFFSET(TI_flags, thread_info, flags);
57 OFFSET(TI_status, thread_info, status);
58 OFFSET(TI_preempt_count, thread_info, preempt_count);
59 OFFSET(TI_addr_limit, thread_info, addr_limit);
60 OFFSET(TI_restart_block, thread_info, restart_block);
61 OFFSET(TI_sysenter_return, thread_info, sysenter_return); 32 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
62 OFFSET(TI_cpu, thread_info, cpu); 33 OFFSET(TI_cpu, thread_info, cpu);
63 BLANK(); 34 BLANK();
64 35
65 OFFSET(GDS_size, desc_ptr, size);
66 OFFSET(GDS_address, desc_ptr, address);
67 BLANK();
68
69 OFFSET(PT_EBX, pt_regs, bx); 36 OFFSET(PT_EBX, pt_regs, bx);
70 OFFSET(PT_ECX, pt_regs, cx); 37 OFFSET(PT_ECX, pt_regs, cx);
71 OFFSET(PT_EDX, pt_regs, dx); 38 OFFSET(PT_EDX, pt_regs, dx);
@@ -85,42 +52,13 @@ void foo(void)
85 OFFSET(PT_OLDSS, pt_regs, ss); 52 OFFSET(PT_OLDSS, pt_regs, ss);
86 BLANK(); 53 BLANK();
87 54
88 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
89 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 55 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
90 BLANK(); 56 BLANK();
91 57
92 OFFSET(pbe_address, pbe, address);
93 OFFSET(pbe_orig_address, pbe, orig_address);
94 OFFSET(pbe_next, pbe, next);
95
96 /* Offset from the sysenter stack to tss.sp0 */ 58 /* Offset from the sysenter stack to tss.sp0 */
97 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 59 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
98 sizeof(struct tss_struct)); 60 sizeof(struct tss_struct));
99 61
100 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
101 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
102 DEFINE(THREAD_SIZE_asm, THREAD_SIZE);
103
104 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
105
106#ifdef CONFIG_PARAVIRT
107 BLANK();
108 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
109 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
110 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
111 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
112 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
113 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
114 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
115 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
116#endif
117
118#ifdef CONFIG_XEN
119 BLANK();
120 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
121 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
122#endif
123
124#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 62#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
125 BLANK(); 63 BLANK();
126 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
@@ -139,11 +77,4 @@ void foo(void)
139 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 77 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
140 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 78 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
141#endif 79#endif
142
143 BLANK();
144 OFFSET(BP_scratch, boot_params, scratch);
145 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
146 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
147 OFFSET(BP_version, boot_params, hdr.version);
148 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
149} 80}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4a6aeedcd965..e72a1194af22 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,27 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/errno.h>
12#include <linux/hardirq.h>
13#include <linux/suspend.h>
14#include <linux/kbuild.h>
15#include <asm/processor.h>
16#include <asm/segment.h>
17#include <asm/thread_info.h>
18#include <asm/ia32.h> 1#include <asm/ia32.h>
19#include <asm/bootparam.h>
20#include <asm/suspend.h>
21
22#include <xen/interface/xen.h>
23
24#include <asm/sigframe.h>
25 2
26#define __NO_STUBS 1 3#define __NO_STUBS 1
27#undef __SYSCALL 4#undef __SYSCALL
@@ -33,41 +10,19 @@ static char syscalls[] = {
33 10
34int main(void) 11int main(void)
35{ 12{
36#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
37 ENTRY(state);
38 ENTRY(flags);
39 ENTRY(pid);
40 BLANK();
41#undef ENTRY
42#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
43 ENTRY(flags);
44 ENTRY(addr_limit);
45 ENTRY(preempt_count);
46 ENTRY(status);
47#ifdef CONFIG_IA32_EMULATION
48 ENTRY(sysenter_return);
49#endif
50 BLANK();
51#undef ENTRY
52#ifdef CONFIG_PARAVIRT 13#ifdef CONFIG_PARAVIRT
53 BLANK();
54 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
55 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
56 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
57 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
58 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
59 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); 14 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
60 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
61 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); 15 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
62 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); 16 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
63 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
64 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); 17 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
65 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); 18 BLANK();
66#endif 19#endif
67 20
68
69#ifdef CONFIG_IA32_EMULATION 21#ifdef CONFIG_IA32_EMULATION
70#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) 22 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
23 BLANK();
24
25#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
71 ENTRY(ax); 26 ENTRY(ax);
72 ENTRY(bx); 27 ENTRY(bx);
73 ENTRY(cx); 28 ENTRY(cx);
@@ -79,15 +34,12 @@ int main(void)
79 ENTRY(ip); 34 ENTRY(ip);
80 BLANK(); 35 BLANK();
81#undef ENTRY 36#undef ENTRY
82 DEFINE(IA32_RT_SIGFRAME_sigcontext, 37
83 offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); 38 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
84 BLANK(); 39 BLANK();
85#endif 40#endif
86 DEFINE(pbe_address, offsetof(struct pbe, address)); 41
87 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); 42#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
88 DEFINE(pbe_next, offsetof(struct pbe, next));
89 BLANK();
90#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
91 ENTRY(bx); 43 ENTRY(bx);
92 ENTRY(bx); 44 ENTRY(bx);
93 ENTRY(cx); 45 ENTRY(cx);
@@ -107,7 +59,8 @@ int main(void)
107 ENTRY(flags); 59 ENTRY(flags);
108 BLANK(); 60 BLANK();
109#undef ENTRY 61#undef ENTRY
110#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) 62
63#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
111 ENTRY(cr0); 64 ENTRY(cr0);
112 ENTRY(cr2); 65 ENTRY(cr2);
113 ENTRY(cr3); 66 ENTRY(cr3);
@@ -115,26 +68,11 @@ int main(void)
115 ENTRY(cr8); 68 ENTRY(cr8);
116 BLANK(); 69 BLANK();
117#undef ENTRY 70#undef ENTRY
118 DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
119 BLANK();
120 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
121 BLANK();
122 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
123 71
72 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
124 BLANK(); 73 BLANK();
125 OFFSET(BP_scratch, boot_params, scratch);
126 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
127 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
128 OFFSET(BP_version, boot_params, hdr.version);
129 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
130 74
131 BLANK(); 75 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
132 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 76
133#ifdef CONFIG_XEN
134 BLANK();
135 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
136 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
137#undef ENTRY
138#endif
139 return 0; 77 return 0;
140} 78}
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 13a389179514..452932d34730 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -106,8 +106,8 @@ void __init setup_bios_corruption_check(void)
106 addr += size; 106 addr += size;
107 } 107 }
108 108
109 printk(KERN_INFO "Scanning %d areas for low memory corruption\n", 109 if (num_scan_areas)
110 num_scan_areas); 110 printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas);
111} 111}
112 112
113 113
@@ -143,12 +143,12 @@ static void check_corruption(struct work_struct *dummy)
143{ 143{
144 check_for_bios_corruption(); 144 check_for_bios_corruption();
145 schedule_delayed_work(&bios_check_work, 145 schedule_delayed_work(&bios_check_work,
146 round_jiffies_relative(corruption_check_period*HZ)); 146 round_jiffies_relative(corruption_check_period*HZ));
147} 147}
148 148
149static int start_periodic_check_for_corruption(void) 149static int start_periodic_check_for_corruption(void)
150{ 150{
151 if (!memory_corruption_check || corruption_check_period == 0) 151 if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0)
152 return 0; 152 return 0;
153 153
154 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", 154 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7c7bedb83c5a..bb9eb29a52dd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -233,18 +233,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
233} 233}
234#endif 234#endif
235 235
236#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 236#ifdef CONFIG_NUMA
237/*
238 * To workaround broken NUMA config. Read the comment in
239 * srat_detect_node().
240 */
237static int __cpuinit nearby_node(int apicid) 241static int __cpuinit nearby_node(int apicid)
238{ 242{
239 int i, node; 243 int i, node;
240 244
241 for (i = apicid - 1; i >= 0; i--) { 245 for (i = apicid - 1; i >= 0; i--) {
242 node = apicid_to_node[i]; 246 node = __apicid_to_node[i];
243 if (node != NUMA_NO_NODE && node_online(node)) 247 if (node != NUMA_NO_NODE && node_online(node))
244 return node; 248 return node;
245 } 249 }
246 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { 250 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
247 node = apicid_to_node[i]; 251 node = __apicid_to_node[i];
248 if (node != NUMA_NO_NODE && node_online(node)) 252 if (node != NUMA_NO_NODE && node_online(node))
249 return node; 253 return node;
250 } 254 }
@@ -261,7 +265,7 @@ static int __cpuinit nearby_node(int apicid)
261#ifdef CONFIG_X86_HT 265#ifdef CONFIG_X86_HT
262static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) 266static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
263{ 267{
264 u32 nodes; 268 u32 nodes, cores_per_cu = 1;
265 u8 node_id; 269 u8 node_id;
266 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
267 271
@@ -276,6 +280,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
276 /* get compute unit information */ 280 /* get compute unit information */
277 smp_num_siblings = ((ebx >> 8) & 3) + 1; 281 smp_num_siblings = ((ebx >> 8) & 3) + 1;
278 c->compute_unit_id = ebx & 0xff; 282 c->compute_unit_id = ebx & 0xff;
283 cores_per_cu += ((ebx >> 8) & 3);
279 } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { 284 } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
280 u64 value; 285 u64 value;
281 286
@@ -288,15 +293,18 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
288 /* fixup multi-node processor information */ 293 /* fixup multi-node processor information */
289 if (nodes > 1) { 294 if (nodes > 1) {
290 u32 cores_per_node; 295 u32 cores_per_node;
296 u32 cus_per_node;
291 297
292 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 298 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
293 cores_per_node = c->x86_max_cores / nodes; 299 cores_per_node = c->x86_max_cores / nodes;
300 cus_per_node = cores_per_node / cores_per_cu;
294 301
295 /* store NodeID, use llc_shared_map to store sibling info */ 302 /* store NodeID, use llc_shared_map to store sibling info */
296 per_cpu(cpu_llc_id, cpu) = node_id; 303 per_cpu(cpu_llc_id, cpu) = node_id;
297 304
298 /* core id to be in range from 0 to (cores_per_node - 1) */ 305 /* core id has to be in the [0 .. cores_per_node - 1] range */
299 c->cpu_core_id = c->cpu_core_id % cores_per_node; 306 c->cpu_core_id %= cores_per_node;
307 c->compute_unit_id %= cus_per_node;
300 } 308 }
301} 309}
302#endif 310#endif
@@ -334,31 +342,40 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id);
334 342
335static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 343static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
336{ 344{
337#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 345#ifdef CONFIG_NUMA
338 int cpu = smp_processor_id(); 346 int cpu = smp_processor_id();
339 int node; 347 int node;
340 unsigned apicid = c->apicid; 348 unsigned apicid = c->apicid;
341 349
342 node = per_cpu(cpu_llc_id, cpu); 350 node = numa_cpu_node(cpu);
351 if (node == NUMA_NO_NODE)
352 node = per_cpu(cpu_llc_id, cpu);
343 353
344 if (apicid_to_node[apicid] != NUMA_NO_NODE)
345 node = apicid_to_node[apicid];
346 if (!node_online(node)) { 354 if (!node_online(node)) {
347 /* Two possibilities here: 355 /*
348 - The CPU is missing memory and no node was created. 356 * Two possibilities here:
349 In that case try picking one from a nearby CPU 357 *
350 - The APIC IDs differ from the HyperTransport node IDs 358 * - The CPU is missing memory and no node was created. In
351 which the K8 northbridge parsing fills in. 359 * that case try picking one from a nearby CPU.
352 Assume they are all increased by a constant offset, 360 *
353 but in the same order as the HT nodeids. 361 * - The APIC IDs differ from the HyperTransport node IDs
354 If that doesn't result in a usable node fall back to the 362 * which the K8 northbridge parsing fills in. Assume
355 path for the previous case. */ 363 * they are all increased by a constant offset, but in
356 364 * the same order as the HT nodeids. If that doesn't
365 * result in a usable node fall back to the path for the
366 * previous case.
367 *
368 * This workaround operates directly on the mapping between
369 * APIC ID and NUMA node, assuming certain relationship
370 * between APIC ID, HT node ID and NUMA topology. As going
371 * through CPU mapping may alter the outcome, directly
372 * access __apicid_to_node[].
373 */
357 int ht_nodeid = c->initial_apicid; 374 int ht_nodeid = c->initial_apicid;
358 375
359 if (ht_nodeid >= 0 && 376 if (ht_nodeid >= 0 &&
360 apicid_to_node[ht_nodeid] != NUMA_NO_NODE) 377 __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
361 node = apicid_to_node[ht_nodeid]; 378 node = __apicid_to_node[ht_nodeid];
362 /* Pick a nearby node */ 379 /* Pick a nearby node */
363 if (!node_online(node)) 380 if (!node_online(node))
364 node = nearby_node(apicid); 381 node = nearby_node(apicid);
@@ -594,6 +611,29 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
594 } 611 }
595 } 612 }
596#endif 613#endif
614
615 /* As a rule processors have APIC timer running in deep C states */
616 if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400))
617 set_cpu_cap(c, X86_FEATURE_ARAT);
618
619 /*
620 * Disable GART TLB Walk Errors on Fam10h. We do this here
621 * because this is always needed when GART is enabled, even in a
622 * kernel which has no MCE support built in.
623 */
624 if (c->x86 == 0x10) {
625 /*
626 * BIOS should disable GartTlbWlk Errors themself. If
627 * it doesn't do it here as suggested by the BKDG.
628 *
629 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
630 */
631 u64 mask;
632
633 rdmsrl(MSR_AMD64_MCx_MASK(4), mask);
634 mask |= (1 << 10);
635 wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
636 }
597} 637}
598 638
599#ifdef CONFIG_X86_32 639#ifdef CONFIG_X86_32
@@ -658,7 +698,7 @@ cpu_dev_register(amd_cpu_dev);
658 */ 698 */
659 699
660const int amd_erratum_400[] = 700const int amd_erratum_400[] =
661 AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), 701 AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0x0f, 0x4, 0x2, 0xff, 0xf),
662 AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); 702 AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
663EXPORT_SYMBOL_GPL(amd_erratum_400); 703EXPORT_SYMBOL_GPL(amd_erratum_400);
664 704
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1d59834396bd..e2ced0074a45 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -675,7 +675,7 @@ void __init early_cpu_init(void)
675 const struct cpu_dev *const *cdev; 675 const struct cpu_dev *const *cdev;
676 int count = 0; 676 int count = 0;
677 677
678#ifdef PROCESSOR_SELECT 678#ifdef CONFIG_PROCESSOR_SELECT
679 printk(KERN_INFO "KERNEL supported cpus:\n"); 679 printk(KERN_INFO "KERNEL supported cpus:\n");
680#endif 680#endif
681 681
@@ -687,7 +687,7 @@ void __init early_cpu_init(void)
687 cpu_devs[count] = cpudev; 687 cpu_devs[count] = cpudev;
688 count++; 688 count++;
689 689
690#ifdef PROCESSOR_SELECT 690#ifdef CONFIG_PROCESSOR_SELECT
691 { 691 {
692 unsigned int j; 692 unsigned int j;
693 693
@@ -869,7 +869,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
869 869
870 select_idle_routine(c); 870 select_idle_routine(c);
871 871
872#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 872#ifdef CONFIG_NUMA
873 numa_add_cpu(smp_processor_id()); 873 numa_add_cpu(smp_processor_id());
874#endif 874#endif
875} 875}
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 03162dac6271..cf48cdd6907d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -444,7 +444,7 @@ static int __cpuinit longhaul_get_ranges(void)
444 return -EINVAL; 444 return -EINVAL;
445 } 445 }
446 /* Get max multiplier - as we always did. 446 /* Get max multiplier - as we always did.
447 * Longhaul MSR is usefull only when voltage scaling is enabled. 447 * Longhaul MSR is useful only when voltage scaling is enabled.
448 * C3 is booting at max anyway. */ 448 * C3 is booting at max anyway. */
449 maxmult = mult; 449 maxmult = mult;
450 /* Get min multiplier */ 450 /* Get min multiplier */
@@ -1011,7 +1011,7 @@ static void __exit longhaul_exit(void)
1011 * trigger frequency transition in some cases. */ 1011 * trigger frequency transition in some cases. */
1012module_param(disable_acpi_c3, int, 0644); 1012module_param(disable_acpi_c3, int, 0644);
1013MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); 1013MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
1014/* Change CPU voltage with frequency. Very usefull to save 1014/* Change CPU voltage with frequency. Very useful to save
1015 * power, but most VIA C3 processors aren't supporting it. */ 1015 * power, but most VIA C3 processors aren't supporting it. */
1016module_param(scale_voltage, int, 0644); 1016module_param(scale_voltage, int, 0644);
1017MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); 1017MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index bd1cac747f67..52c93648e492 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -158,9 +158,9 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
158{ 158{
159 if (c->x86 == 0x06) { 159 if (c->x86 == 0x06) {
160 if (cpu_has(c, X86_FEATURE_EST)) 160 if (cpu_has(c, X86_FEATURE_EST))
161 printk(KERN_WARNING PFX "Warning: EST-capable CPU " 161 printk_once(KERN_WARNING PFX "Warning: EST-capable "
162 "detected. The acpi-cpufreq module offers " 162 "CPU detected. The acpi-cpufreq module offers "
163 "voltage scaling in addition of frequency " 163 "voltage scaling in addition to frequency "
164 "scaling. You should use that instead of " 164 "scaling. You should use that instead of "
165 "p4-clockmod, if possible.\n"); 165 "p4-clockmod, if possible.\n");
166 switch (c->x86_model) { 166 switch (c->x86_model) {
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index 4f6f679f2799..755a31e0f5b0 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -195,7 +195,7 @@ static unsigned int pcc_get_freq(unsigned int cpu)
195cmd_incomplete: 195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status); 196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock); 197 spin_unlock(&pcc_lock);
198 return -EINVAL; 198 return 0;
199} 199}
200 200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy, 201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
@@ -315,8 +315,6 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
315 315
316 input.count = 4; 316 input.count = 4;
317 input.pointer = in_params; 317 input.pointer = in_params;
318 input.count = 4;
319 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER; 318 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16; 319 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID; 320 in_params[0].buffer.pointer = OSC_UUID;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 35c7e65e59be..2368e38327b3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -630,8 +630,7 @@ static void print_basics(struct powernow_k8_data *data)
630 data->powernow_table[j].frequency/1000); 630 data->powernow_table[j].frequency/1000);
631 } else { 631 } else {
632 printk(KERN_INFO PFX 632 printk(KERN_INFO PFX
633 " %d : fid 0x%x (%d MHz), vid 0x%x\n", 633 "fid 0x%x (%d MHz), vid 0x%x\n",
634 j,
635 data->powernow_table[j].index & 0xff, 634 data->powernow_table[j].index & 0xff,
636 data->powernow_table[j].frequency/1000, 635 data->powernow_table[j].frequency/1000,
637 data->powernow_table[j].index >> 8); 636 data->powernow_table[j].index >> 8);
@@ -1276,7 +1275,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1276 1275
1277 if (powernow_k8_cpu_init_acpi(data)) { 1276 if (powernow_k8_cpu_init_acpi(data)) {
1278 /* 1277 /*
1279 * Use the PSB BIOS structure. This is only availabe on 1278 * Use the PSB BIOS structure. This is only available on
1280 * an UP version, and is deprecated by AMD. 1279 * an UP version, and is deprecated by AMD.
1281 */ 1280 */
1282 if (num_online_cpus() != 1) { 1281 if (num_online_cpus() != 1) {
@@ -1537,6 +1536,7 @@ static struct notifier_block cpb_nb = {
1537static int __cpuinit powernowk8_init(void) 1536static int __cpuinit powernowk8_init(void)
1538{ 1537{
1539 unsigned int i, supported_cpus = 0, cpu; 1538 unsigned int i, supported_cpus = 0, cpu;
1539 int rv;
1540 1540
1541 for_each_online_cpu(i) { 1541 for_each_online_cpu(i) {
1542 int rc; 1542 int rc;
@@ -1555,14 +1555,14 @@ static int __cpuinit powernowk8_init(void)
1555 1555
1556 cpb_capable = true; 1556 cpb_capable = true;
1557 1557
1558 register_cpu_notifier(&cpb_nb);
1559
1560 msrs = msrs_alloc(); 1558 msrs = msrs_alloc();
1561 if (!msrs) { 1559 if (!msrs) {
1562 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); 1560 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1563 return -ENOMEM; 1561 return -ENOMEM;
1564 } 1562 }
1565 1563
1564 register_cpu_notifier(&cpb_nb);
1565
1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); 1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1567 1567
1568 for_each_cpu(cpu, cpu_online_mask) { 1568 for_each_cpu(cpu, cpu_online_mask) {
@@ -1574,7 +1574,13 @@ static int __cpuinit powernowk8_init(void)
1574 (cpb_enabled ? "on" : "off")); 1574 (cpb_enabled ? "on" : "off"));
1575 } 1575 }
1576 1576
1577 return cpufreq_register_driver(&cpufreq_amd64_driver); 1577 rv = cpufreq_register_driver(&cpufreq_amd64_driver);
1578 if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) {
1579 unregister_cpu_notifier(&cpb_nb);
1580 msrs_free(msrs);
1581 msrs = NULL;
1582 }
1583 return rv;
1578} 1584}
1579 1585
1580/* driver entry point for term */ 1586/* driver entry point for term */
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 8abd869baabf..91bc25b67bc1 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -292,7 +292,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
292 292
293 result = speedstep_smi_ownership(); 293 result = speedstep_smi_ownership();
294 if (result) { 294 if (result) {
295 dprintk("fails in aquiring ownership of a SMI interface.\n"); 295 dprintk("fails in acquiring ownership of a SMI interface.\n");
296 return -EINVAL; 296 return -EINVAL;
297 } 297 }
298 298
@@ -360,7 +360,7 @@ static int speedstep_resume(struct cpufreq_policy *policy)
360 int result = speedstep_smi_ownership(); 360 int result = speedstep_smi_ownership();
361 361
362 if (result) 362 if (result)
363 dprintk("fails in re-aquiring ownership of a SMI interface.\n"); 363 dprintk("fails in re-acquiring ownership of a SMI interface.\n");
364 364
365 return result; 365 return result;
366} 366}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index d16c2c53d6bf..df86bc8c859d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -276,14 +276,13 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
276 276
277static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 277static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
278{ 278{
279#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 279#ifdef CONFIG_NUMA
280 unsigned node; 280 unsigned node;
281 int cpu = smp_processor_id(); 281 int cpu = smp_processor_id();
282 int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
283 282
284 /* Don't do the funky fallback heuristics the AMD version employs 283 /* Don't do the funky fallback heuristics the AMD version employs
285 for now. */ 284 for now. */
286 node = apicid_to_node[apicid]; 285 node = numa_cpu_node(cpu);
287 if (node == NUMA_NO_NODE || !node_online(node)) { 286 if (node == NUMA_NO_NODE || !node_online(node)) {
288 /* reuse the value from init_cpu_to_node() */ 287 /* reuse the value from init_cpu_to_node() */
289 node = cpu_to_node(cpu); 288 node = cpu_to_node(cpu);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index ec2c19a7b8ef..1ce1af2899df 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -304,8 +304,9 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
304 304
305struct _cache_attr { 305struct _cache_attr {
306 struct attribute attr; 306 struct attribute attr;
307 ssize_t (*show)(struct _cpuid4_info *, char *); 307 ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
308 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); 308 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
309 unsigned int);
309}; 310};
310 311
311#ifdef CONFIG_AMD_NB 312#ifdef CONFIG_AMD_NB
@@ -400,7 +401,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
400 401
401#define SHOW_CACHE_DISABLE(slot) \ 402#define SHOW_CACHE_DISABLE(slot) \
402static ssize_t \ 403static ssize_t \
403show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ 404show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
405 unsigned int cpu) \
404{ \ 406{ \
405 return show_cache_disable(this_leaf, buf, slot); \ 407 return show_cache_disable(this_leaf, buf, slot); \
406} 408}
@@ -512,7 +514,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
512#define STORE_CACHE_DISABLE(slot) \ 514#define STORE_CACHE_DISABLE(slot) \
513static ssize_t \ 515static ssize_t \
514store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ 516store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
515 const char *buf, size_t count) \ 517 const char *buf, size_t count, \
518 unsigned int cpu) \
516{ \ 519{ \
517 return store_cache_disable(this_leaf, buf, count, slot); \ 520 return store_cache_disable(this_leaf, buf, count, slot); \
518} 521}
@@ -524,6 +527,39 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
524static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 527static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
525 show_cache_disable_1, store_cache_disable_1); 528 show_cache_disable_1, store_cache_disable_1);
526 529
530static ssize_t
531show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
532{
533 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
534 return -EINVAL;
535
536 return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
537}
538
539static ssize_t
540store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
541 unsigned int cpu)
542{
543 unsigned long val;
544
545 if (!capable(CAP_SYS_ADMIN))
546 return -EPERM;
547
548 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
549 return -EINVAL;
550
551 if (strict_strtoul(buf, 16, &val) < 0)
552 return -EINVAL;
553
554 if (amd_set_subcaches(cpu, val))
555 return -EINVAL;
556
557 return count;
558}
559
560static struct _cache_attr subcaches =
561 __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
562
527#else /* CONFIG_AMD_NB */ 563#else /* CONFIG_AMD_NB */
528#define amd_init_l3_cache(x, y) 564#define amd_init_l3_cache(x, y)
529#endif /* CONFIG_AMD_NB */ 565#endif /* CONFIG_AMD_NB */
@@ -532,9 +568,9 @@ static int
532__cpuinit cpuid4_cache_lookup_regs(int index, 568__cpuinit cpuid4_cache_lookup_regs(int index,
533 struct _cpuid4_info_regs *this_leaf) 569 struct _cpuid4_info_regs *this_leaf)
534{ 570{
535 union _cpuid4_leaf_eax eax; 571 union _cpuid4_leaf_eax eax;
536 union _cpuid4_leaf_ebx ebx; 572 union _cpuid4_leaf_ebx ebx;
537 union _cpuid4_leaf_ecx ecx; 573 union _cpuid4_leaf_ecx ecx;
538 unsigned edx; 574 unsigned edx;
539 575
540 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 576 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
@@ -732,11 +768,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
732 struct cpuinfo_x86 *c = &cpu_data(cpu); 768 struct cpuinfo_x86 *c = &cpu_data(cpu);
733 769
734 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 770 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
735 for_each_cpu(i, c->llc_shared_map) { 771 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
736 if (!per_cpu(ici_cpuid4_info, i)) 772 if (!per_cpu(ici_cpuid4_info, i))
737 continue; 773 continue;
738 this_leaf = CPUID4_INFO_IDX(i, index); 774 this_leaf = CPUID4_INFO_IDX(i, index);
739 for_each_cpu(sibling, c->llc_shared_map) { 775 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
740 if (!cpu_online(sibling)) 776 if (!cpu_online(sibling))
741 continue; 777 continue;
742 set_bit(sibling, this_leaf->shared_cpu_map); 778 set_bit(sibling, this_leaf->shared_cpu_map);
@@ -870,8 +906,8 @@ static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
870#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) 906#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
871 907
872#define show_one_plus(file_name, object, val) \ 908#define show_one_plus(file_name, object, val) \
873static ssize_t show_##file_name \ 909static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
874 (struct _cpuid4_info *this_leaf, char *buf) \ 910 unsigned int cpu) \
875{ \ 911{ \
876 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ 912 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
877} 913}
@@ -882,7 +918,8 @@ show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
882show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); 918show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
883show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); 919show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
884 920
885static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) 921static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
922 unsigned int cpu)
886{ 923{
887 return sprintf(buf, "%luK\n", this_leaf->size / 1024); 924 return sprintf(buf, "%luK\n", this_leaf->size / 1024);
888} 925}
@@ -906,17 +943,20 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
906 return n; 943 return n;
907} 944}
908 945
909static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) 946static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
947 unsigned int cpu)
910{ 948{
911 return show_shared_cpu_map_func(leaf, 0, buf); 949 return show_shared_cpu_map_func(leaf, 0, buf);
912} 950}
913 951
914static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) 952static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
953 unsigned int cpu)
915{ 954{
916 return show_shared_cpu_map_func(leaf, 1, buf); 955 return show_shared_cpu_map_func(leaf, 1, buf);
917} 956}
918 957
919static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) 958static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
959 unsigned int cpu)
920{ 960{
921 switch (this_leaf->eax.split.type) { 961 switch (this_leaf->eax.split.type) {
922 case CACHE_TYPE_DATA: 962 case CACHE_TYPE_DATA:
@@ -974,6 +1014,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void)
974 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) 1014 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
975 n += 2; 1015 n += 2;
976 1016
1017 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1018 n += 1;
1019
977 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); 1020 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
978 if (attrs == NULL) 1021 if (attrs == NULL)
979 return attrs = default_attrs; 1022 return attrs = default_attrs;
@@ -986,6 +1029,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void)
986 attrs[n++] = &cache_disable_1.attr; 1029 attrs[n++] = &cache_disable_1.attr;
987 } 1030 }
988 1031
1032 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1033 attrs[n++] = &subcaches.attr;
1034
989 return attrs; 1035 return attrs;
990} 1036}
991#endif 1037#endif
@@ -998,7 +1044,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
998 1044
999 ret = fattr->show ? 1045 ret = fattr->show ?
1000 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1046 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1001 buf) : 1047 buf, this_leaf->cpu) :
1002 0; 1048 0;
1003 return ret; 1049 return ret;
1004} 1050}
@@ -1012,7 +1058,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
1012 1058
1013 ret = fattr->store ? 1059 ret = fattr->store ?
1014 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1060 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1015 buf, count) : 1061 buf, count, this_leaf->cpu) :
1016 0; 1062 0;
1017 return ret; 1063 return ret;
1018} 1064}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 8209472b27a5..83930deec3c6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m)
106ssize_t apei_read_mce(struct mce *m, u64 *record_id) 106ssize_t apei_read_mce(struct mce *m, u64 *record_id)
107{ 107{
108 struct cper_mce_record rcd; 108 struct cper_mce_record rcd;
109 ssize_t len; 109 int rc, pos;
110 110
111 len = erst_read_next(&rcd.hdr, sizeof(rcd)); 111 rc = erst_get_record_id_begin(&pos);
112 if (len <= 0) 112 if (rc)
113 return len; 113 return rc;
114 /* Can not skip other records in storage via ERST unless clear them */ 114retry:
115 else if (len != sizeof(rcd) || 115 rc = erst_get_record_id_next(&pos, record_id);
116 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { 116 if (rc)
117 if (printk_ratelimit()) 117 goto out;
118 pr_warning( 118 /* no more record */
119 "MCE-APEI: Can not skip the unknown record in ERST"); 119 if (*record_id == APEI_ERST_INVALID_RECORD_ID)
120 return -EIO; 120 goto out;
121 } 121 rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
122 122 /* someone else has cleared the record, try next one */
123 if (rc == -ENOENT)
124 goto retry;
125 else if (rc < 0)
126 goto out;
127 /* try to skip other type records in storage */
128 else if (rc != sizeof(rcd) ||
129 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
130 goto retry;
123 memcpy(m, &rcd.mce, sizeof(*m)); 131 memcpy(m, &rcd.mce, sizeof(*m));
124 *record_id = rcd.hdr.record_id; 132 rc = sizeof(*m);
133out:
134 erst_get_record_id_end();
125 135
126 return sizeof(*m); 136 return rc;
127} 137}
128 138
129/* Check whether there is record in ERST */ 139/* Check whether there is record in ERST */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index a77971979564..0ed633c5048b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -32,7 +32,7 @@ static void inject_mce(struct mce *m)
32{ 32{
33 struct mce *i = &per_cpu(injectm, m->extcpu); 33 struct mce *i = &per_cpu(injectm, m->extcpu);
34 34
35 /* Make sure noone reads partially written injectm */ 35 /* Make sure no one reads partially written injectm */
36 i->finished = 0; 36 i->finished = 0;
37 mb(); 37 mb();
38 m->finished = 0; 38 m->finished = 0;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d916183b7f9c..3385ea26f684 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/sysdev.h> 23#include <linux/sysdev.h>
24#include <linux/syscore_ops.h>
24#include <linux/delay.h> 25#include <linux/delay.h>
25#include <linux/ctype.h> 26#include <linux/ctype.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
@@ -881,7 +882,7 @@ reset:
881 * Check if the address reported by the CPU is in a format we can parse. 882 * Check if the address reported by the CPU is in a format we can parse.
882 * It would be possible to add code for most other cases, but all would 883 * It would be possible to add code for most other cases, but all would
883 * be somewhat complicated (e.g. segment offset would require an instruction 884 * be somewhat complicated (e.g. segment offset would require an instruction
884 * parser). So only support physical addresses upto page granuality for now. 885 * parser). So only support physical addresses up to page granuality for now.
885 */ 886 */
886static int mce_usable_address(struct mce *m) 887static int mce_usable_address(struct mce *m)
887{ 888{
@@ -1625,7 +1626,7 @@ out:
1625static unsigned int mce_poll(struct file *file, poll_table *wait) 1626static unsigned int mce_poll(struct file *file, poll_table *wait)
1626{ 1627{
1627 poll_wait(file, &mce_wait, wait); 1628 poll_wait(file, &mce_wait, wait);
1628 if (rcu_dereference_check_mce(mcelog.next)) 1629 if (rcu_access_index(mcelog.next))
1629 return POLLIN | POLLRDNORM; 1630 return POLLIN | POLLRDNORM;
1630 if (!mce_apei_read_done && apei_check_mce()) 1631 if (!mce_apei_read_done && apei_check_mce())
1631 return POLLIN | POLLRDNORM; 1632 return POLLIN | POLLRDNORM;
@@ -1749,14 +1750,14 @@ static int mce_disable_error_reporting(void)
1749 return 0; 1750 return 0;
1750} 1751}
1751 1752
1752static int mce_suspend(struct sys_device *dev, pm_message_t state) 1753static int mce_suspend(void)
1753{ 1754{
1754 return mce_disable_error_reporting(); 1755 return mce_disable_error_reporting();
1755} 1756}
1756 1757
1757static int mce_shutdown(struct sys_device *dev) 1758static void mce_shutdown(void)
1758{ 1759{
1759 return mce_disable_error_reporting(); 1760 mce_disable_error_reporting();
1760} 1761}
1761 1762
1762/* 1763/*
@@ -1764,14 +1765,18 @@ static int mce_shutdown(struct sys_device *dev)
1764 * Only one CPU is active at this time, the others get re-added later using 1765 * Only one CPU is active at this time, the others get re-added later using
1765 * CPU hotplug: 1766 * CPU hotplug:
1766 */ 1767 */
1767static int mce_resume(struct sys_device *dev) 1768static void mce_resume(void)
1768{ 1769{
1769 __mcheck_cpu_init_generic(); 1770 __mcheck_cpu_init_generic();
1770 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1771 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1771
1772 return 0;
1773} 1772}
1774 1773
1774static struct syscore_ops mce_syscore_ops = {
1775 .suspend = mce_suspend,
1776 .shutdown = mce_shutdown,
1777 .resume = mce_resume,
1778};
1779
1775static void mce_cpu_restart(void *data) 1780static void mce_cpu_restart(void *data)
1776{ 1781{
1777 del_timer_sync(&__get_cpu_var(mce_timer)); 1782 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1808,9 +1813,6 @@ static void mce_enable_ce(void *all)
1808} 1813}
1809 1814
1810static struct sysdev_class mce_sysclass = { 1815static struct sysdev_class mce_sysclass = {
1811 .suspend = mce_suspend,
1812 .shutdown = mce_shutdown,
1813 .resume = mce_resume,
1814 .name = "machinecheck", 1816 .name = "machinecheck",
1815}; 1817};
1816 1818
@@ -2139,6 +2141,7 @@ static __init int mcheck_init_device(void)
2139 return err; 2141 return err;
2140 } 2142 }
2141 2143
2144 register_syscore_ops(&mce_syscore_ops);
2142 register_hotcpu_notifier(&mce_cpu_notifier); 2145 register_hotcpu_notifier(&mce_cpu_notifier);
2143 misc_register(&mce_log_device); 2146 misc_register(&mce_log_device);
2144 2147
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 5bf2fac52aca..167f97b5596e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -527,15 +527,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 int i, err = 0; 527 int i, err = 0;
528 struct threshold_bank *b = NULL; 528 struct threshold_bank *b = NULL;
529 char name[32]; 529 char name[32];
530#ifdef CONFIG_SMP
531 struct cpuinfo_x86 *c = &cpu_data(cpu);
532#endif
533 530
534 sprintf(name, "threshold_bank%i", bank); 531 sprintf(name, "threshold_bank%i", bank);
535 532
536#ifdef CONFIG_SMP 533#ifdef CONFIG_SMP
537 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 534 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
538 i = cpumask_first(c->llc_shared_map); 535 i = cpumask_first(cpu_llc_shared_mask(cpu));
539 536
540 /* first core not up yet */ 537 /* first core not up yet */
541 if (cpu_data(i).cpu_core_id) 538 if (cpu_data(i).cpu_core_id)
@@ -555,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
555 if (err) 552 if (err)
556 goto out; 553 goto out;
557 554
558 cpumask_copy(b->cpus, c->llc_shared_map); 555 cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
559 per_cpu(threshold_banks, cpu)[bank] = b; 556 per_cpu(threshold_banks, cpu)[bank] = b;
560 557
561 goto out; 558 goto out;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 9f27228ceffd..a71efcdbb092 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong 2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
3 * because MTRRs can span upto 40 bits (36bits on most modern x86) 3 * because MTRRs can span up to 40 bits (36bits on most modern x86)
4 */ 4 */
5#define DEBUG 5#define DEBUG
6 6
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index bebabec5b448..929739a653d1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -45,6 +45,7 @@
45#include <linux/cpu.h> 45#include <linux/cpu.h>
46#include <linux/pci.h> 46#include <linux/pci.h>
47#include <linux/smp.h> 47#include <linux/smp.h>
48#include <linux/syscore_ops.h>
48 49
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/e820.h> 51#include <asm/e820.h>
@@ -292,14 +293,24 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
292 293
293 /* 294 /*
294 * HACK! 295 * HACK!
295 * We use this same function to initialize the mtrrs on boot. 296 *
296 * The state of the boot cpu's mtrrs has been saved, and we want 297 * We use this same function to initialize the mtrrs during boot,
297 * to replicate across all the APs. 298 * resume, runtime cpu online and on an explicit request to set a
298 * If we're doing that @reg is set to something special... 299 * specific MTRR.
300 *
301 * During boot or suspend, the state of the boot cpu's mtrrs has been
302 * saved, and we want to replicate that across all the cpus that come
303 * online (either at the end of boot or resume or during a runtime cpu
304 * online). If we're doing that, @reg is set to something special and on
305 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
306 * is unnecessary if at this point we are still on the cpu that started
307 * the boot/resume sequence. But there is no guarantee that we are still
308 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
309 * sure that we are in sync with everyone else.
299 */ 310 */
300 if (reg != ~0U) 311 if (reg != ~0U)
301 mtrr_if->set(reg, base, size, type); 312 mtrr_if->set(reg, base, size, type);
302 else if (!mtrr_aps_delayed_init) 313 else
303 mtrr_if->set_all(); 314 mtrr_if->set_all();
304 315
305 /* Wait for the others */ 316 /* Wait for the others */
@@ -630,7 +641,7 @@ struct mtrr_value {
630 641
631static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; 642static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
632 643
633static int mtrr_save(struct sys_device *sysdev, pm_message_t state) 644static int mtrr_save(void)
634{ 645{
635 int i; 646 int i;
636 647
@@ -642,7 +653,7 @@ static int mtrr_save(struct sys_device *sysdev, pm_message_t state)
642 return 0; 653 return 0;
643} 654}
644 655
645static int mtrr_restore(struct sys_device *sysdev) 656static void mtrr_restore(void)
646{ 657{
647 int i; 658 int i;
648 659
@@ -653,12 +664,11 @@ static int mtrr_restore(struct sys_device *sysdev)
653 mtrr_value[i].ltype); 664 mtrr_value[i].ltype);
654 } 665 }
655 } 666 }
656 return 0;
657} 667}
658 668
659 669
660 670
661static struct sysdev_driver mtrr_sysdev_driver = { 671static struct syscore_ops mtrr_syscore_ops = {
662 .suspend = mtrr_save, 672 .suspend = mtrr_save,
663 .resume = mtrr_restore, 673 .resume = mtrr_restore,
664}; 674};
@@ -839,7 +849,7 @@ static int __init mtrr_init_finialize(void)
839 * TBD: is there any system with such CPU which supports 849 * TBD: is there any system with such CPU which supports
840 * suspend/resume? If no, we should remove the code. 850 * suspend/resume? If no, we should remove the code.
841 */ 851 */
842 sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); 852 register_syscore_ops(&mtrr_syscore_ops);
843 853
844 return 0; 854 return 0;
845} 855}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9d977a2ea693..e638689279d3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -30,6 +30,7 @@
30#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33#include <asm/smp.h>
33 34
34#if 0 35#if 0
35#undef wrmsrl 36#undef wrmsrl
@@ -93,6 +94,8 @@ struct amd_nb {
93 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 94 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
94}; 95};
95 96
97struct intel_percore;
98
96#define MAX_LBR_ENTRIES 16 99#define MAX_LBR_ENTRIES 16
97 100
98struct cpu_hw_events { 101struct cpu_hw_events {
@@ -128,6 +131,13 @@ struct cpu_hw_events {
128 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 131 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
129 132
130 /* 133 /*
134 * Intel percore register state.
135 * Coordinate shared resources between HT threads.
136 */
137 int percore_used; /* Used by this CPU? */
138 struct intel_percore *per_core;
139
140 /*
131 * AMD specific bits 141 * AMD specific bits
132 */ 142 */
133 struct amd_nb *amd_nb; 143 struct amd_nb *amd_nb;
@@ -166,7 +176,7 @@ struct cpu_hw_events {
166/* 176/*
167 * Constraint on the Event code + UMask 177 * Constraint on the Event code + UMask
168 */ 178 */
169#define PEBS_EVENT_CONSTRAINT(c, n) \ 179#define INTEL_UEVENT_CONSTRAINT(c, n) \
170 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) 180 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
171 181
172#define EVENT_CONSTRAINT_END \ 182#define EVENT_CONSTRAINT_END \
@@ -175,6 +185,28 @@ struct cpu_hw_events {
175#define for_each_event_constraint(e, c) \ 185#define for_each_event_constraint(e, c) \
176 for ((e) = (c); (e)->weight; (e)++) 186 for ((e) = (c); (e)->weight; (e)++)
177 187
188/*
189 * Extra registers for specific events.
190 * Some events need large masks and require external MSRs.
191 * Define a mapping to these extra registers.
192 */
193struct extra_reg {
194 unsigned int event;
195 unsigned int msr;
196 u64 config_mask;
197 u64 valid_mask;
198};
199
200#define EVENT_EXTRA_REG(e, ms, m, vm) { \
201 .event = (e), \
202 .msr = (ms), \
203 .config_mask = (m), \
204 .valid_mask = (vm), \
205 }
206#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
207 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
208#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
209
178union perf_capabilities { 210union perf_capabilities {
179 struct { 211 struct {
180 u64 lbr_format : 6; 212 u64 lbr_format : 6;
@@ -219,6 +251,7 @@ struct x86_pmu {
219 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 251 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
220 struct perf_event *event); 252 struct perf_event *event);
221 struct event_constraint *event_constraints; 253 struct event_constraint *event_constraints;
254 struct event_constraint *percore_constraints;
222 void (*quirks)(void); 255 void (*quirks)(void);
223 int perfctr_second_write; 256 int perfctr_second_write;
224 257
@@ -247,6 +280,11 @@ struct x86_pmu {
247 */ 280 */
248 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ 281 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
249 int lbr_nr; /* hardware stack size */ 282 int lbr_nr; /* hardware stack size */
283
284 /*
285 * Extra registers for events
286 */
287 struct extra_reg *extra_regs;
250}; 288};
251 289
252static struct x86_pmu x86_pmu __read_mostly; 290static struct x86_pmu x86_pmu __read_mostly;
@@ -271,6 +309,10 @@ static u64 __read_mostly hw_cache_event_ids
271 [PERF_COUNT_HW_CACHE_MAX] 309 [PERF_COUNT_HW_CACHE_MAX]
272 [PERF_COUNT_HW_CACHE_OP_MAX] 310 [PERF_COUNT_HW_CACHE_OP_MAX]
273 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 311 [PERF_COUNT_HW_CACHE_RESULT_MAX];
312static u64 __read_mostly hw_cache_extra_regs
313 [PERF_COUNT_HW_CACHE_MAX]
314 [PERF_COUNT_HW_CACHE_OP_MAX]
315 [PERF_COUNT_HW_CACHE_RESULT_MAX];
274 316
275/* 317/*
276 * Propagate event elapsed time into the generic event. 318 * Propagate event elapsed time into the generic event.
@@ -298,7 +340,7 @@ x86_perf_event_update(struct perf_event *event)
298 */ 340 */
299again: 341again:
300 prev_raw_count = local64_read(&hwc->prev_count); 342 prev_raw_count = local64_read(&hwc->prev_count);
301 rdmsrl(hwc->event_base + idx, new_raw_count); 343 rdmsrl(hwc->event_base, new_raw_count);
302 344
303 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 345 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
304 new_raw_count) != prev_raw_count) 346 new_raw_count) != prev_raw_count)
@@ -321,6 +363,49 @@ again:
321 return new_raw_count; 363 return new_raw_count;
322} 364}
323 365
366/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */
367static inline int x86_pmu_addr_offset(int index)
368{
369 if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
370 return index << 1;
371 return index;
372}
373
374static inline unsigned int x86_pmu_config_addr(int index)
375{
376 return x86_pmu.eventsel + x86_pmu_addr_offset(index);
377}
378
379static inline unsigned int x86_pmu_event_addr(int index)
380{
381 return x86_pmu.perfctr + x86_pmu_addr_offset(index);
382}
383
384/*
385 * Find and validate any extra registers to set up.
386 */
387static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
388{
389 struct extra_reg *er;
390
391 event->hw.extra_reg = 0;
392 event->hw.extra_config = 0;
393
394 if (!x86_pmu.extra_regs)
395 return 0;
396
397 for (er = x86_pmu.extra_regs; er->msr; er++) {
398 if (er->event != (config & er->config_mask))
399 continue;
400 if (event->attr.config1 & ~er->valid_mask)
401 return -EINVAL;
402 event->hw.extra_reg = er->msr;
403 event->hw.extra_config = event->attr.config1;
404 break;
405 }
406 return 0;
407}
408
324static atomic_t active_events; 409static atomic_t active_events;
325static DEFINE_MUTEX(pmc_reserve_mutex); 410static DEFINE_MUTEX(pmc_reserve_mutex);
326 411
@@ -331,12 +416,12 @@ static bool reserve_pmc_hardware(void)
331 int i; 416 int i;
332 417
333 for (i = 0; i < x86_pmu.num_counters; i++) { 418 for (i = 0; i < x86_pmu.num_counters; i++) {
334 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 419 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
335 goto perfctr_fail; 420 goto perfctr_fail;
336 } 421 }
337 422
338 for (i = 0; i < x86_pmu.num_counters; i++) { 423 for (i = 0; i < x86_pmu.num_counters; i++) {
339 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 424 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
340 goto eventsel_fail; 425 goto eventsel_fail;
341 } 426 }
342 427
@@ -344,13 +429,13 @@ static bool reserve_pmc_hardware(void)
344 429
345eventsel_fail: 430eventsel_fail:
346 for (i--; i >= 0; i--) 431 for (i--; i >= 0; i--)
347 release_evntsel_nmi(x86_pmu.eventsel + i); 432 release_evntsel_nmi(x86_pmu_config_addr(i));
348 433
349 i = x86_pmu.num_counters; 434 i = x86_pmu.num_counters;
350 435
351perfctr_fail: 436perfctr_fail:
352 for (i--; i >= 0; i--) 437 for (i--; i >= 0; i--)
353 release_perfctr_nmi(x86_pmu.perfctr + i); 438 release_perfctr_nmi(x86_pmu_event_addr(i));
354 439
355 return false; 440 return false;
356} 441}
@@ -360,8 +445,8 @@ static void release_pmc_hardware(void)
360 int i; 445 int i;
361 446
362 for (i = 0; i < x86_pmu.num_counters; i++) { 447 for (i = 0; i < x86_pmu.num_counters; i++) {
363 release_perfctr_nmi(x86_pmu.perfctr + i); 448 release_perfctr_nmi(x86_pmu_event_addr(i));
364 release_evntsel_nmi(x86_pmu.eventsel + i); 449 release_evntsel_nmi(x86_pmu_config_addr(i));
365 } 450 }
366} 451}
367 452
@@ -382,7 +467,7 @@ static bool check_hw_exists(void)
382 * complain and bail. 467 * complain and bail.
383 */ 468 */
384 for (i = 0; i < x86_pmu.num_counters; i++) { 469 for (i = 0; i < x86_pmu.num_counters; i++) {
385 reg = x86_pmu.eventsel + i; 470 reg = x86_pmu_config_addr(i);
386 ret = rdmsrl_safe(reg, &val); 471 ret = rdmsrl_safe(reg, &val);
387 if (ret) 472 if (ret)
388 goto msr_fail; 473 goto msr_fail;
@@ -407,20 +492,25 @@ static bool check_hw_exists(void)
407 * that don't trap on the MSR access and always return 0s. 492 * that don't trap on the MSR access and always return 0s.
408 */ 493 */
409 val = 0xabcdUL; 494 val = 0xabcdUL;
410 ret = checking_wrmsrl(x86_pmu.perfctr, val); 495 ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
411 ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); 496 ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
412 if (ret || val != val_new) 497 if (ret || val != val_new)
413 goto msr_fail; 498 goto msr_fail;
414 499
415 return true; 500 return true;
416 501
417bios_fail: 502bios_fail:
418 printk(KERN_CONT "Broken BIOS detected, using software events only.\n"); 503 /*
504 * We still allow the PMU driver to operate:
505 */
506 printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
419 printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); 507 printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
420 return false; 508
509 return true;
421 510
422msr_fail: 511msr_fail:
423 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); 512 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
513
424 return false; 514 return false;
425} 515}
426 516
@@ -442,8 +532,9 @@ static inline int x86_pmu_initialized(void)
442} 532}
443 533
444static inline int 534static inline int
445set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) 535set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
446{ 536{
537 struct perf_event_attr *attr = &event->attr;
447 unsigned int cache_type, cache_op, cache_result; 538 unsigned int cache_type, cache_op, cache_result;
448 u64 config, val; 539 u64 config, val;
449 540
@@ -470,8 +561,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
470 return -EINVAL; 561 return -EINVAL;
471 562
472 hwc->config |= val; 563 hwc->config |= val;
473 564 attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
474 return 0; 565 return x86_pmu_extra_regs(val, event);
475} 566}
476 567
477static int x86_setup_perfctr(struct perf_event *event) 568static int x86_setup_perfctr(struct perf_event *event)
@@ -495,11 +586,15 @@ static int x86_setup_perfctr(struct perf_event *event)
495 return -EOPNOTSUPP; 586 return -EOPNOTSUPP;
496 } 587 }
497 588
589 /*
590 * Do not allow config1 (extended registers) to propagate,
591 * there's no sane user-space generalization yet:
592 */
498 if (attr->type == PERF_TYPE_RAW) 593 if (attr->type == PERF_TYPE_RAW)
499 return 0; 594 return 0;
500 595
501 if (attr->type == PERF_TYPE_HW_CACHE) 596 if (attr->type == PERF_TYPE_HW_CACHE)
502 return set_ext_hw_attr(hwc, attr); 597 return set_ext_hw_attr(hwc, event);
503 598
504 if (attr->config >= x86_pmu.max_events) 599 if (attr->config >= x86_pmu.max_events)
505 return -EINVAL; 600 return -EINVAL;
@@ -518,8 +613,8 @@ static int x86_setup_perfctr(struct perf_event *event)
518 /* 613 /*
519 * Branch tracing: 614 * Branch tracing:
520 */ 615 */
521 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 616 if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
522 (hwc->sample_period == 1)) { 617 !attr->freq && hwc->sample_period == 1) {
523 /* BTS is not supported by this architecture. */ 618 /* BTS is not supported by this architecture. */
524 if (!x86_pmu.bts_active) 619 if (!x86_pmu.bts_active)
525 return -EOPNOTSUPP; 620 return -EOPNOTSUPP;
@@ -617,11 +712,11 @@ static void x86_pmu_disable_all(void)
617 712
618 if (!test_bit(idx, cpuc->active_mask)) 713 if (!test_bit(idx, cpuc->active_mask))
619 continue; 714 continue;
620 rdmsrl(x86_pmu.eventsel + idx, val); 715 rdmsrl(x86_pmu_config_addr(idx), val);
621 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) 716 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
622 continue; 717 continue;
623 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; 718 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
624 wrmsrl(x86_pmu.eventsel + idx, val); 719 wrmsrl(x86_pmu_config_addr(idx), val);
625 } 720 }
626} 721}
627 722
@@ -642,21 +737,26 @@ static void x86_pmu_disable(struct pmu *pmu)
642 x86_pmu.disable_all(); 737 x86_pmu.disable_all();
643} 738}
644 739
740static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
741 u64 enable_mask)
742{
743 if (hwc->extra_reg)
744 wrmsrl(hwc->extra_reg, hwc->extra_config);
745 wrmsrl(hwc->config_base, hwc->config | enable_mask);
746}
747
645static void x86_pmu_enable_all(int added) 748static void x86_pmu_enable_all(int added)
646{ 749{
647 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 750 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
648 int idx; 751 int idx;
649 752
650 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 753 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
651 struct perf_event *event = cpuc->events[idx]; 754 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
652 u64 val;
653 755
654 if (!test_bit(idx, cpuc->active_mask)) 756 if (!test_bit(idx, cpuc->active_mask))
655 continue; 757 continue;
656 758
657 val = event->hw.config; 759 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
658 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
659 wrmsrl(x86_pmu.eventsel + idx, val);
660 } 760 }
661} 761}
662 762
@@ -821,15 +921,10 @@ static inline void x86_assign_hw_event(struct perf_event *event,
821 hwc->event_base = 0; 921 hwc->event_base = 0;
822 } else if (hwc->idx >= X86_PMC_IDX_FIXED) { 922 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
823 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 923 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
824 /* 924 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
825 * We set it so that event_base + idx in wrmsr/rdmsr maps to
826 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
827 */
828 hwc->event_base =
829 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
830 } else { 925 } else {
831 hwc->config_base = x86_pmu.eventsel; 926 hwc->config_base = x86_pmu_config_addr(hwc->idx);
832 hwc->event_base = x86_pmu.perfctr; 927 hwc->event_base = x86_pmu_event_addr(hwc->idx);
833 } 928 }
834} 929}
835 930
@@ -915,17 +1010,11 @@ static void x86_pmu_enable(struct pmu *pmu)
915 x86_pmu.enable_all(added); 1010 x86_pmu.enable_all(added);
916} 1011}
917 1012
918static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
919 u64 enable_mask)
920{
921 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
922}
923
924static inline void x86_pmu_disable_event(struct perf_event *event) 1013static inline void x86_pmu_disable_event(struct perf_event *event)
925{ 1014{
926 struct hw_perf_event *hwc = &event->hw; 1015 struct hw_perf_event *hwc = &event->hw;
927 1016
928 wrmsrl(hwc->config_base + hwc->idx, hwc->config); 1017 wrmsrl(hwc->config_base, hwc->config);
929} 1018}
930 1019
931static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1020static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -978,7 +1067,7 @@ x86_perf_event_set_period(struct perf_event *event)
978 */ 1067 */
979 local64_set(&hwc->prev_count, (u64)-left); 1068 local64_set(&hwc->prev_count, (u64)-left);
980 1069
981 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); 1070 wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
982 1071
983 /* 1072 /*
984 * Due to erratum on certan cpu we need 1073 * Due to erratum on certan cpu we need
@@ -986,7 +1075,7 @@ x86_perf_event_set_period(struct perf_event *event)
986 * is updated properly 1075 * is updated properly
987 */ 1076 */
988 if (x86_pmu.perfctr_second_write) { 1077 if (x86_pmu.perfctr_second_write) {
989 wrmsrl(hwc->event_base + idx, 1078 wrmsrl(hwc->event_base,
990 (u64)(-left) & x86_pmu.cntval_mask); 1079 (u64)(-left) & x86_pmu.cntval_mask);
991 } 1080 }
992 1081
@@ -1029,7 +1118,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
1029 1118
1030 /* 1119 /*
1031 * If group events scheduling transaction was started, 1120 * If group events scheduling transaction was started,
1032 * skip the schedulability test here, it will be peformed 1121 * skip the schedulability test here, it will be performed
1033 * at commit time (->commit_txn) as a whole 1122 * at commit time (->commit_txn) as a whole
1034 */ 1123 */
1035 if (cpuc->group_flag & PERF_EVENT_TXN) 1124 if (cpuc->group_flag & PERF_EVENT_TXN)
@@ -1113,8 +1202,8 @@ void perf_event_print_debug(void)
1113 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1202 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1114 1203
1115 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1204 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1116 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1205 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1117 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1206 rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1118 1207
1119 prev_left = per_cpu(pmc_prev_left[idx], cpu); 1208 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1120 1209
@@ -1199,6 +1288,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1199 1288
1200 cpuc = &__get_cpu_var(cpu_hw_events); 1289 cpuc = &__get_cpu_var(cpu_hw_events);
1201 1290
1291 /*
1292 * Some chipsets need to unmask the LVTPC in a particular spot
1293 * inside the nmi handler. As a result, the unmasking was pushed
1294 * into all the nmi handlers.
1295 *
1296 * This generic handler doesn't seem to have any issues where the
1297 * unmasking occurs so it was left at the top.
1298 */
1299 apic_write(APIC_LVTPC, APIC_DM_NMI);
1300
1202 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1301 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1203 if (!test_bit(idx, cpuc->active_mask)) { 1302 if (!test_bit(idx, cpuc->active_mask)) {
1204 /* 1303 /*
@@ -1285,8 +1384,6 @@ perf_event_nmi_handler(struct notifier_block *self,
1285 return NOTIFY_DONE; 1384 return NOTIFY_DONE;
1286 } 1385 }
1287 1386
1288 apic_write(APIC_LVTPC, APIC_DM_NMI);
1289
1290 handled = x86_pmu.handle_irq(args->regs); 1387 handled = x86_pmu.handle_irq(args->regs);
1291 if (!handled) 1388 if (!handled)
1292 return NOTIFY_DONE; 1389 return NOTIFY_DONE;
@@ -1389,7 +1486,7 @@ static void __init pmu_check_apic(void)
1389 pr_info("no hardware sampling interrupt available.\n"); 1486 pr_info("no hardware sampling interrupt available.\n");
1390} 1487}
1391 1488
1392int __init init_hw_perf_events(void) 1489static int __init init_hw_perf_events(void)
1393{ 1490{
1394 struct event_constraint *c; 1491 struct event_constraint *c;
1395 int err; 1492 int err;
@@ -1608,7 +1705,7 @@ out:
1608 return ret; 1705 return ret;
1609} 1706}
1610 1707
1611int x86_pmu_event_init(struct perf_event *event) 1708static int x86_pmu_event_init(struct perf_event *event)
1612{ 1709{
1613 struct pmu *tmp; 1710 struct pmu *tmp;
1614 int err; 1711 int err;
@@ -1710,7 +1807,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1710 1807
1711 perf_callchain_store(entry, regs->ip); 1808 perf_callchain_store(entry, regs->ip);
1712 1809
1713 dump_trace(NULL, regs, NULL, &backtrace_ops, entry); 1810 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1714} 1811}
1715 1812
1716#ifdef CONFIG_COMPAT 1813#ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 67e2202a6039..cf4e369cea67 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -8,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids
8 [ C(L1D) ] = { 8 [ C(L1D) ] = {
9 [ C(OP_READ) ] = { 9 [ C(OP_READ) ] = {
10 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ 10 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
11 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ 11 [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
12 }, 12 },
13 [ C(OP_WRITE) ] = { 13 [ C(OP_WRITE) ] = {
14 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ 14 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
@@ -127,6 +127,11 @@ static int amd_pmu_hw_config(struct perf_event *event)
127/* 127/*
128 * AMD64 events are detected based on their event codes. 128 * AMD64 events are detected based on their event codes.
129 */ 129 */
130static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
131{
132 return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
133}
134
130static inline int amd_is_nb_event(struct hw_perf_event *hwc) 135static inline int amd_is_nb_event(struct hw_perf_event *hwc)
131{ 136{
132 return (hwc->config & 0xe0) == 0xe0; 137 return (hwc->config & 0xe0) == 0xe0;
@@ -385,13 +390,195 @@ static __initconst const struct x86_pmu amd_pmu = {
385 .cpu_dead = amd_pmu_cpu_dead, 390 .cpu_dead = amd_pmu_cpu_dead,
386}; 391};
387 392
393/* AMD Family 15h */
394
395#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
396
397#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
398#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
399#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
400#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
401#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
402#define AMD_EVENT_EX_LS 0x000000C0ULL
403#define AMD_EVENT_DE 0x000000D0ULL
404#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
405
406/*
407 * AMD family 15h event code/PMC mappings:
408 *
409 * type = event_code & 0x0F0:
410 *
411 * 0x000 FP PERF_CTL[5:3]
412 * 0x010 FP PERF_CTL[5:3]
413 * 0x020 LS PERF_CTL[5:0]
414 * 0x030 LS PERF_CTL[5:0]
415 * 0x040 DC PERF_CTL[5:0]
416 * 0x050 DC PERF_CTL[5:0]
417 * 0x060 CU PERF_CTL[2:0]
418 * 0x070 CU PERF_CTL[2:0]
419 * 0x080 IC/DE PERF_CTL[2:0]
420 * 0x090 IC/DE PERF_CTL[2:0]
421 * 0x0A0 ---
422 * 0x0B0 ---
423 * 0x0C0 EX/LS PERF_CTL[5:0]
424 * 0x0D0 DE PERF_CTL[2:0]
425 * 0x0E0 NB NB_PERF_CTL[3:0]
426 * 0x0F0 NB NB_PERF_CTL[3:0]
427 *
428 * Exceptions:
429 *
430 * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
431 * 0x003 FP PERF_CTL[3]
432 * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
433 * 0x00B FP PERF_CTL[3]
434 * 0x00D FP PERF_CTL[3]
435 * 0x023 DE PERF_CTL[2:0]
436 * 0x02D LS PERF_CTL[3]
437 * 0x02E LS PERF_CTL[3,0]
438 * 0x043 CU PERF_CTL[2:0]
439 * 0x045 CU PERF_CTL[2:0]
440 * 0x046 CU PERF_CTL[2:0]
441 * 0x054 CU PERF_CTL[2:0]
442 * 0x055 CU PERF_CTL[2:0]
443 * 0x08F IC PERF_CTL[0]
444 * 0x187 DE PERF_CTL[0]
445 * 0x188 DE PERF_CTL[0]
446 * 0x0DB EX PERF_CTL[5:0]
447 * 0x0DC LS PERF_CTL[5:0]
448 * 0x0DD LS PERF_CTL[5:0]
449 * 0x0DE LS PERF_CTL[5:0]
450 * 0x0DF LS PERF_CTL[5:0]
451 * 0x1D6 EX PERF_CTL[5:0]
452 * 0x1D8 EX PERF_CTL[5:0]
453 *
454 * (*) depending on the umask all FPU counters may be used
455 */
456
457static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
458static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
459static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
460static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
461static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
462static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
463
464static struct event_constraint *
465amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
466{
467 struct hw_perf_event *hwc = &event->hw;
468 unsigned int event_code = amd_get_event_code(hwc);
469
470 switch (event_code & AMD_EVENT_TYPE_MASK) {
471 case AMD_EVENT_FP:
472 switch (event_code) {
473 case 0x000:
474 if (!(hwc->config & 0x0000F000ULL))
475 break;
476 if (!(hwc->config & 0x00000F00ULL))
477 break;
478 return &amd_f15_PMC3;
479 case 0x004:
480 if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
481 break;
482 return &amd_f15_PMC3;
483 case 0x003:
484 case 0x00B:
485 case 0x00D:
486 return &amd_f15_PMC3;
487 }
488 return &amd_f15_PMC53;
489 case AMD_EVENT_LS:
490 case AMD_EVENT_DC:
491 case AMD_EVENT_EX_LS:
492 switch (event_code) {
493 case 0x023:
494 case 0x043:
495 case 0x045:
496 case 0x046:
497 case 0x054:
498 case 0x055:
499 return &amd_f15_PMC20;
500 case 0x02D:
501 return &amd_f15_PMC3;
502 case 0x02E:
503 return &amd_f15_PMC30;
504 default:
505 return &amd_f15_PMC50;
506 }
507 case AMD_EVENT_CU:
508 case AMD_EVENT_IC_DE:
509 case AMD_EVENT_DE:
510 switch (event_code) {
511 case 0x08F:
512 case 0x187:
513 case 0x188:
514 return &amd_f15_PMC0;
515 case 0x0DB ... 0x0DF:
516 case 0x1D6:
517 case 0x1D8:
518 return &amd_f15_PMC50;
519 default:
520 return &amd_f15_PMC20;
521 }
522 case AMD_EVENT_NB:
523 /* not yet implemented */
524 return &emptyconstraint;
525 default:
526 return &emptyconstraint;
527 }
528}
529
530static __initconst const struct x86_pmu amd_pmu_f15h = {
531 .name = "AMD Family 15h",
532 .handle_irq = x86_pmu_handle_irq,
533 .disable_all = x86_pmu_disable_all,
534 .enable_all = x86_pmu_enable_all,
535 .enable = x86_pmu_enable_event,
536 .disable = x86_pmu_disable_event,
537 .hw_config = amd_pmu_hw_config,
538 .schedule_events = x86_schedule_events,
539 .eventsel = MSR_F15H_PERF_CTL,
540 .perfctr = MSR_F15H_PERF_CTR,
541 .event_map = amd_pmu_event_map,
542 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
543 .num_counters = 6,
544 .cntval_bits = 48,
545 .cntval_mask = (1ULL << 48) - 1,
546 .apic = 1,
547 /* use highest bit to detect overflow */
548 .max_period = (1ULL << 47) - 1,
549 .get_event_constraints = amd_get_event_constraints_f15h,
550 /* nortbridge counters not yet implemented: */
551#if 0
552 .put_event_constraints = amd_put_event_constraints,
553
554 .cpu_prepare = amd_pmu_cpu_prepare,
555 .cpu_starting = amd_pmu_cpu_starting,
556 .cpu_dead = amd_pmu_cpu_dead,
557#endif
558};
559
388static __init int amd_pmu_init(void) 560static __init int amd_pmu_init(void)
389{ 561{
390 /* Performance-monitoring supported from K7 and later: */ 562 /* Performance-monitoring supported from K7 and later: */
391 if (boot_cpu_data.x86 < 6) 563 if (boot_cpu_data.x86 < 6)
392 return -ENODEV; 564 return -ENODEV;
393 565
394 x86_pmu = amd_pmu; 566 /*
567 * If core performance counter extensions exists, it must be
568 * family 15h, otherwise fail. See x86_pmu_addr_offset().
569 */
570 switch (boot_cpu_data.x86) {
571 case 0x15:
572 if (!cpu_has_perfctr_core)
573 return -ENODEV;
574 x86_pmu = amd_pmu_f15h;
575 break;
576 default:
577 if (cpu_has_perfctr_core)
578 return -ENODEV;
579 x86_pmu = amd_pmu;
580 break;
581 }
395 582
396 /* Events are common for all AMDs */ 583 /* Events are common for all AMDs */
397 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 584 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 008835c1d79c..447a28de6f09 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,9 +1,31 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/*
15 * Per core state
16 * This used to coordinate shared registers for HT threads.
17 */
18struct intel_percore {
19 raw_spinlock_t lock; /* protect structure */
20 struct er_account regs[MAX_EXTRA_REGS];
21 int refcnt; /* number of threads */
22 unsigned core_id;
23};
24
3/* 25/*
4 * Intel PerfMon, used on Core and later. 26 * Intel PerfMon, used on Core and later.
5 */ 27 */
6static const u64 intel_perfmon_event_map[] = 28static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
7{ 29{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, 30 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 31 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
@@ -64,6 +86,18 @@ static struct event_constraint intel_nehalem_event_constraints[] =
64 EVENT_CONSTRAINT_END 86 EVENT_CONSTRAINT_END
65}; 87};
66 88
89static struct extra_reg intel_nehalem_extra_regs[] =
90{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
92 EVENT_EXTRA_END
93};
94
95static struct event_constraint intel_nehalem_percore_constraints[] =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
67static struct event_constraint intel_westmere_event_constraints[] = 101static struct event_constraint intel_westmere_event_constraints[] =
68{ 102{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -76,6 +110,33 @@ static struct event_constraint intel_westmere_event_constraints[] =
76 EVENT_CONSTRAINT_END 110 EVENT_CONSTRAINT_END
77}; 111};
78 112
113static struct event_constraint intel_snb_event_constraints[] =
114{
115 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END
124};
125
126static struct extra_reg intel_westmere_extra_regs[] =
127{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
130 EVENT_EXTRA_END
131};
132
133static struct event_constraint intel_westmere_percore_constraints[] =
134{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END
138};
139
79static struct event_constraint intel_gen_event_constraints[] = 140static struct event_constraint intel_gen_event_constraints[] =
80{ 141{
81 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 142 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -89,6 +150,103 @@ static u64 intel_pmu_event_map(int hw_event)
89 return intel_perfmon_event_map[hw_event]; 150 return intel_perfmon_event_map[hw_event];
90} 151}
91 152
153static __initconst const u64 snb_hw_cache_event_ids
154 [PERF_COUNT_HW_CACHE_MAX]
155 [PERF_COUNT_HW_CACHE_OP_MAX]
156 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
157{
158 [ C(L1D) ] = {
159 [ C(OP_READ) ] = {
160 [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
161 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
162 },
163 [ C(OP_WRITE) ] = {
164 [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
165 [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
166 },
167 [ C(OP_PREFETCH) ] = {
168 [ C(RESULT_ACCESS) ] = 0x0,
169 [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
170 },
171 },
172 [ C(L1I ) ] = {
173 [ C(OP_READ) ] = {
174 [ C(RESULT_ACCESS) ] = 0x0,
175 [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
176 },
177 [ C(OP_WRITE) ] = {
178 [ C(RESULT_ACCESS) ] = -1,
179 [ C(RESULT_MISS) ] = -1,
180 },
181 [ C(OP_PREFETCH) ] = {
182 [ C(RESULT_ACCESS) ] = 0x0,
183 [ C(RESULT_MISS) ] = 0x0,
184 },
185 },
186 [ C(LL ) ] = {
187 [ C(OP_READ) ] = {
188 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
189 [ C(RESULT_ACCESS) ] = 0x01b7,
190 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
191 [ C(RESULT_MISS) ] = 0x01b7,
192 },
193 [ C(OP_WRITE) ] = {
194 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
195 [ C(RESULT_ACCESS) ] = 0x01b7,
196 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
197 [ C(RESULT_MISS) ] = 0x01b7,
198 },
199 [ C(OP_PREFETCH) ] = {
200 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
201 [ C(RESULT_ACCESS) ] = 0x01b7,
202 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
203 [ C(RESULT_MISS) ] = 0x01b7,
204 },
205 },
206 [ C(DTLB) ] = {
207 [ C(OP_READ) ] = {
208 [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
209 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
210 },
211 [ C(OP_WRITE) ] = {
212 [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
213 [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
214 },
215 [ C(OP_PREFETCH) ] = {
216 [ C(RESULT_ACCESS) ] = 0x0,
217 [ C(RESULT_MISS) ] = 0x0,
218 },
219 },
220 [ C(ITLB) ] = {
221 [ C(OP_READ) ] = {
222 [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
223 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
224 },
225 [ C(OP_WRITE) ] = {
226 [ C(RESULT_ACCESS) ] = -1,
227 [ C(RESULT_MISS) ] = -1,
228 },
229 [ C(OP_PREFETCH) ] = {
230 [ C(RESULT_ACCESS) ] = -1,
231 [ C(RESULT_MISS) ] = -1,
232 },
233 },
234 [ C(BPU ) ] = {
235 [ C(OP_READ) ] = {
236 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
237 [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
238 },
239 [ C(OP_WRITE) ] = {
240 [ C(RESULT_ACCESS) ] = -1,
241 [ C(RESULT_MISS) ] = -1,
242 },
243 [ C(OP_PREFETCH) ] = {
244 [ C(RESULT_ACCESS) ] = -1,
245 [ C(RESULT_MISS) ] = -1,
246 },
247 },
248};
249
92static __initconst const u64 westmere_hw_cache_event_ids 250static __initconst const u64 westmere_hw_cache_event_ids
93 [PERF_COUNT_HW_CACHE_MAX] 251 [PERF_COUNT_HW_CACHE_MAX]
94 [PERF_COUNT_HW_CACHE_OP_MAX] 252 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -124,16 +282,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
124 }, 282 },
125 [ C(LL ) ] = { 283 [ C(LL ) ] = {
126 [ C(OP_READ) ] = { 284 [ C(OP_READ) ] = {
127 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 285 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
128 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 286 [ C(RESULT_ACCESS) ] = 0x01b7,
287 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
288 [ C(RESULT_MISS) ] = 0x01b7,
129 }, 289 },
290 /*
291 * Use RFO, not WRITEBACK, because a write miss would typically occur
292 * on RFO.
293 */
130 [ C(OP_WRITE) ] = { 294 [ C(OP_WRITE) ] = {
131 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 295 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
132 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 296 [ C(RESULT_ACCESS) ] = 0x01b7,
297 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
298 [ C(RESULT_MISS) ] = 0x01b7,
133 }, 299 },
134 [ C(OP_PREFETCH) ] = { 300 [ C(OP_PREFETCH) ] = {
135 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 301 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
136 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 302 [ C(RESULT_ACCESS) ] = 0x01b7,
303 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
304 [ C(RESULT_MISS) ] = 0x01b7,
137 }, 305 },
138 }, 306 },
139 [ C(DTLB) ] = { 307 [ C(DTLB) ] = {
@@ -180,6 +348,59 @@ static __initconst const u64 westmere_hw_cache_event_ids
180 }, 348 },
181}; 349};
182 350
351/*
352 * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
353 * See IA32 SDM Vol 3B 30.6.1.3
354 */
355
356#define NHM_DMND_DATA_RD (1 << 0)
357#define NHM_DMND_RFO (1 << 1)
358#define NHM_DMND_IFETCH (1 << 2)
359#define NHM_DMND_WB (1 << 3)
360#define NHM_PF_DATA_RD (1 << 4)
361#define NHM_PF_DATA_RFO (1 << 5)
362#define NHM_PF_IFETCH (1 << 6)
363#define NHM_OFFCORE_OTHER (1 << 7)
364#define NHM_UNCORE_HIT (1 << 8)
365#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
366#define NHM_OTHER_CORE_HITM (1 << 10)
367 /* reserved */
368#define NHM_REMOTE_CACHE_FWD (1 << 12)
369#define NHM_REMOTE_DRAM (1 << 13)
370#define NHM_LOCAL_DRAM (1 << 14)
371#define NHM_NON_DRAM (1 << 15)
372
373#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
374
375#define NHM_DMND_READ (NHM_DMND_DATA_RD)
376#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
377#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
378
379#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
380#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
381#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
382
383static __initconst const u64 nehalem_hw_cache_extra_regs
384 [PERF_COUNT_HW_CACHE_MAX]
385 [PERF_COUNT_HW_CACHE_OP_MAX]
386 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
387{
388 [ C(LL ) ] = {
389 [ C(OP_READ) ] = {
390 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
391 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
392 },
393 [ C(OP_WRITE) ] = {
394 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
395 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
396 },
397 [ C(OP_PREFETCH) ] = {
398 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
399 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
400 },
401 }
402};
403
183static __initconst const u64 nehalem_hw_cache_event_ids 404static __initconst const u64 nehalem_hw_cache_event_ids
184 [PERF_COUNT_HW_CACHE_MAX] 405 [PERF_COUNT_HW_CACHE_MAX]
185 [PERF_COUNT_HW_CACHE_OP_MAX] 406 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -187,12 +408,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids
187{ 408{
188 [ C(L1D) ] = { 409 [ C(L1D) ] = {
189 [ C(OP_READ) ] = { 410 [ C(OP_READ) ] = {
190 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ 411 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
191 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ 412 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
192 }, 413 },
193 [ C(OP_WRITE) ] = { 414 [ C(OP_WRITE) ] = {
194 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ 415 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
195 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ 416 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
196 }, 417 },
197 [ C(OP_PREFETCH) ] = { 418 [ C(OP_PREFETCH) ] = {
198 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ 419 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
@@ -215,16 +436,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids
215 }, 436 },
216 [ C(LL ) ] = { 437 [ C(LL ) ] = {
217 [ C(OP_READ) ] = { 438 [ C(OP_READ) ] = {
218 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 439 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
219 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 440 [ C(RESULT_ACCESS) ] = 0x01b7,
441 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
442 [ C(RESULT_MISS) ] = 0x01b7,
220 }, 443 },
444 /*
445 * Use RFO, not WRITEBACK, because a write miss would typically occur
446 * on RFO.
447 */
221 [ C(OP_WRITE) ] = { 448 [ C(OP_WRITE) ] = {
222 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 449 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
223 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 450 [ C(RESULT_ACCESS) ] = 0x01b7,
451 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
452 [ C(RESULT_MISS) ] = 0x01b7,
224 }, 453 },
225 [ C(OP_PREFETCH) ] = { 454 [ C(OP_PREFETCH) ] = {
226 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 455 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
227 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 456 [ C(RESULT_ACCESS) ] = 0x01b7,
457 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
458 [ C(RESULT_MISS) ] = 0x01b7,
228 }, 459 },
229 }, 460 },
230 [ C(DTLB) ] = { 461 [ C(DTLB) ] = {
@@ -691,8 +922,8 @@ static void intel_pmu_reset(void)
691 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 922 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
692 923
693 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 924 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
694 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 925 checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
695 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 926 checking_wrmsrl(x86_pmu_event_addr(idx), 0ull);
696 } 927 }
697 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) 928 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
698 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 929 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
@@ -719,6 +950,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
719 950
720 cpuc = &__get_cpu_var(cpu_hw_events); 951 cpuc = &__get_cpu_var(cpu_hw_events);
721 952
953 /*
954 * Some chipsets need to unmask the LVTPC in a particular spot
955 * inside the nmi handler. As a result, the unmasking was pushed
956 * into all the nmi handlers.
957 *
958 * This handler doesn't seem to have any issues with the unmasking
959 * so it was left at the top.
960 */
961 apic_write(APIC_LVTPC, APIC_DM_NMI);
962
722 intel_pmu_disable_all(); 963 intel_pmu_disable_all();
723 handled = intel_pmu_drain_bts_buffer(); 964 handled = intel_pmu_drain_bts_buffer();
724 status = intel_pmu_get_status(); 965 status = intel_pmu_get_status();
@@ -784,6 +1025,9 @@ intel_bts_constraints(struct perf_event *event)
784 struct hw_perf_event *hwc = &event->hw; 1025 struct hw_perf_event *hwc = &event->hw;
785 unsigned int hw_event, bts_event; 1026 unsigned int hw_event, bts_event;
786 1027
1028 if (event->attr.freq)
1029 return NULL;
1030
787 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; 1031 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
788 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 1032 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
789 1033
@@ -794,6 +1038,67 @@ intel_bts_constraints(struct perf_event *event)
794} 1038}
795 1039
796static struct event_constraint * 1040static struct event_constraint *
1041intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1042{
1043 struct hw_perf_event *hwc = &event->hw;
1044 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
1045 struct event_constraint *c;
1046 struct intel_percore *pc;
1047 struct er_account *era;
1048 int i;
1049 int free_slot;
1050 int found;
1051
1052 if (!x86_pmu.percore_constraints || hwc->extra_alloc)
1053 return NULL;
1054
1055 for (c = x86_pmu.percore_constraints; c->cmask; c++) {
1056 if (e != c->code)
1057 continue;
1058
1059 /*
1060 * Allocate resource per core.
1061 */
1062 pc = cpuc->per_core;
1063 if (!pc)
1064 break;
1065 c = &emptyconstraint;
1066 raw_spin_lock(&pc->lock);
1067 free_slot = -1;
1068 found = 0;
1069 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1070 era = &pc->regs[i];
1071 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1072 /* Allow sharing same config */
1073 if (hwc->extra_config == era->extra_config) {
1074 era->ref++;
1075 cpuc->percore_used = 1;
1076 hwc->extra_alloc = 1;
1077 c = NULL;
1078 }
1079 /* else conflict */
1080 found = 1;
1081 break;
1082 } else if (era->ref == 0 && free_slot == -1)
1083 free_slot = i;
1084 }
1085 if (!found && free_slot != -1) {
1086 era = &pc->regs[free_slot];
1087 era->ref = 1;
1088 era->extra_reg = hwc->extra_reg;
1089 era->extra_config = hwc->extra_config;
1090 cpuc->percore_used = 1;
1091 hwc->extra_alloc = 1;
1092 c = NULL;
1093 }
1094 raw_spin_unlock(&pc->lock);
1095 return c;
1096 }
1097
1098 return NULL;
1099}
1100
1101static struct event_constraint *
797intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1102intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
798{ 1103{
799 struct event_constraint *c; 1104 struct event_constraint *c;
@@ -806,9 +1111,51 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
806 if (c) 1111 if (c)
807 return c; 1112 return c;
808 1113
1114 c = intel_percore_constraints(cpuc, event);
1115 if (c)
1116 return c;
1117
809 return x86_get_event_constraints(cpuc, event); 1118 return x86_get_event_constraints(cpuc, event);
810} 1119}
811 1120
1121static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1122 struct perf_event *event)
1123{
1124 struct extra_reg *er;
1125 struct intel_percore *pc;
1126 struct er_account *era;
1127 struct hw_perf_event *hwc = &event->hw;
1128 int i, allref;
1129
1130 if (!cpuc->percore_used)
1131 return;
1132
1133 for (er = x86_pmu.extra_regs; er->msr; er++) {
1134 if (er->event != (hwc->config & er->config_mask))
1135 continue;
1136
1137 pc = cpuc->per_core;
1138 raw_spin_lock(&pc->lock);
1139 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1140 era = &pc->regs[i];
1141 if (era->ref > 0 &&
1142 era->extra_config == hwc->extra_config &&
1143 era->extra_reg == er->msr) {
1144 era->ref--;
1145 hwc->extra_alloc = 0;
1146 break;
1147 }
1148 }
1149 allref = 0;
1150 for (i = 0; i < MAX_EXTRA_REGS; i++)
1151 allref += pc->regs[i].ref;
1152 if (allref == 0)
1153 cpuc->percore_used = 0;
1154 raw_spin_unlock(&pc->lock);
1155 break;
1156 }
1157}
1158
812static int intel_pmu_hw_config(struct perf_event *event) 1159static int intel_pmu_hw_config(struct perf_event *event)
813{ 1160{
814 int ret = x86_pmu_hw_config(event); 1161 int ret = x86_pmu_hw_config(event);
@@ -880,20 +1227,67 @@ static __initconst const struct x86_pmu core_pmu = {
880 */ 1227 */
881 .max_period = (1ULL << 31) - 1, 1228 .max_period = (1ULL << 31) - 1,
882 .get_event_constraints = intel_get_event_constraints, 1229 .get_event_constraints = intel_get_event_constraints,
1230 .put_event_constraints = intel_put_event_constraints,
883 .event_constraints = intel_core_event_constraints, 1231 .event_constraints = intel_core_event_constraints,
884}; 1232};
885 1233
1234static int intel_pmu_cpu_prepare(int cpu)
1235{
1236 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1237
1238 if (!cpu_has_ht_siblings())
1239 return NOTIFY_OK;
1240
1241 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
1242 GFP_KERNEL, cpu_to_node(cpu));
1243 if (!cpuc->per_core)
1244 return NOTIFY_BAD;
1245
1246 raw_spin_lock_init(&cpuc->per_core->lock);
1247 cpuc->per_core->core_id = -1;
1248 return NOTIFY_OK;
1249}
1250
886static void intel_pmu_cpu_starting(int cpu) 1251static void intel_pmu_cpu_starting(int cpu)
887{ 1252{
1253 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1254 int core_id = topology_core_id(cpu);
1255 int i;
1256
888 init_debug_store_on_cpu(cpu); 1257 init_debug_store_on_cpu(cpu);
889 /* 1258 /*
890 * Deal with CPUs that don't clear their LBRs on power-up. 1259 * Deal with CPUs that don't clear their LBRs on power-up.
891 */ 1260 */
892 intel_pmu_lbr_reset(); 1261 intel_pmu_lbr_reset();
1262
1263 if (!cpu_has_ht_siblings())
1264 return;
1265
1266 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1267 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
1268
1269 if (pc && pc->core_id == core_id) {
1270 kfree(cpuc->per_core);
1271 cpuc->per_core = pc;
1272 break;
1273 }
1274 }
1275
1276 cpuc->per_core->core_id = core_id;
1277 cpuc->per_core->refcnt++;
893} 1278}
894 1279
895static void intel_pmu_cpu_dying(int cpu) 1280static void intel_pmu_cpu_dying(int cpu)
896{ 1281{
1282 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1283 struct intel_percore *pc = cpuc->per_core;
1284
1285 if (pc) {
1286 if (pc->core_id == -1 || --pc->refcnt == 0)
1287 kfree(pc);
1288 cpuc->per_core = NULL;
1289 }
1290
897 fini_debug_store_on_cpu(cpu); 1291 fini_debug_store_on_cpu(cpu);
898} 1292}
899 1293
@@ -918,7 +1312,9 @@ static __initconst const struct x86_pmu intel_pmu = {
918 */ 1312 */
919 .max_period = (1ULL << 31) - 1, 1313 .max_period = (1ULL << 31) - 1,
920 .get_event_constraints = intel_get_event_constraints, 1314 .get_event_constraints = intel_get_event_constraints,
1315 .put_event_constraints = intel_put_event_constraints,
921 1316
1317 .cpu_prepare = intel_pmu_cpu_prepare,
922 .cpu_starting = intel_pmu_cpu_starting, 1318 .cpu_starting = intel_pmu_cpu_starting,
923 .cpu_dying = intel_pmu_cpu_dying, 1319 .cpu_dying = intel_pmu_cpu_dying,
924}; 1320};
@@ -939,7 +1335,7 @@ static void intel_clovertown_quirks(void)
939 * AJ106 could possibly be worked around by not allowing LBR 1335 * AJ106 could possibly be worked around by not allowing LBR
940 * usage from PEBS, including the fixup. 1336 * usage from PEBS, including the fixup.
941 * AJ68 could possibly be worked around by always programming 1337 * AJ68 could possibly be worked around by always programming
942 * a pebs_event_reset[0] value and coping with the lost events. 1338 * a pebs_event_reset[0] value and coping with the lost events.
943 * 1339 *
944 * But taken together it might just make sense to not enable PEBS on 1340 * But taken together it might just make sense to not enable PEBS on
945 * these chips. 1341 * these chips.
@@ -1024,6 +1420,7 @@ static __init int intel_pmu_init(void)
1024 intel_pmu_lbr_init_core(); 1420 intel_pmu_lbr_init_core();
1025 1421
1026 x86_pmu.event_constraints = intel_core2_event_constraints; 1422 x86_pmu.event_constraints = intel_core2_event_constraints;
1423 x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
1027 pr_cont("Core2 events, "); 1424 pr_cont("Core2 events, ");
1028 break; 1425 break;
1029 1426
@@ -1032,11 +1429,28 @@ static __init int intel_pmu_init(void)
1032 case 46: /* 45 nm nehalem-ex, "Beckton" */ 1429 case 46: /* 45 nm nehalem-ex, "Beckton" */
1033 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 1430 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1034 sizeof(hw_cache_event_ids)); 1431 sizeof(hw_cache_event_ids));
1432 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1433 sizeof(hw_cache_extra_regs));
1035 1434
1036 intel_pmu_lbr_init_nhm(); 1435 intel_pmu_lbr_init_nhm();
1037 1436
1038 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1437 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1438 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1439 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1039 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1441 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1442
1443 if (ebx & 0x40) {
1444 /*
1445 * Erratum AAJ80 detected, we work it around by using
1446 * the BR_MISP_EXEC.ANY event. This will over-count
1447 * branch-misses, but it's still much better than the
1448 * architectural event which is often completely bogus:
1449 */
1450 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1451
1452 pr_cont("erratum AAJ80 worked around, ");
1453 }
1040 pr_cont("Nehalem events, "); 1454 pr_cont("Nehalem events, ");
1041 break; 1455 break;
1042 1456
@@ -1047,21 +1461,39 @@ static __init int intel_pmu_init(void)
1047 intel_pmu_lbr_init_atom(); 1461 intel_pmu_lbr_init_atom();
1048 1462
1049 x86_pmu.event_constraints = intel_gen_event_constraints; 1463 x86_pmu.event_constraints = intel_gen_event_constraints;
1464 x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
1050 pr_cont("Atom events, "); 1465 pr_cont("Atom events, ");
1051 break; 1466 break;
1052 1467
1053 case 37: /* 32 nm nehalem, "Clarkdale" */ 1468 case 37: /* 32 nm nehalem, "Clarkdale" */
1054 case 44: /* 32 nm nehalem, "Gulftown" */ 1469 case 44: /* 32 nm nehalem, "Gulftown" */
1470 case 47: /* 32 nm Xeon E7 */
1055 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 1471 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
1056 sizeof(hw_cache_event_ids)); 1472 sizeof(hw_cache_event_ids));
1473 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1474 sizeof(hw_cache_extra_regs));
1057 1475
1058 intel_pmu_lbr_init_nhm(); 1476 intel_pmu_lbr_init_nhm();
1059 1477
1060 x86_pmu.event_constraints = intel_westmere_event_constraints; 1478 x86_pmu.event_constraints = intel_westmere_event_constraints;
1479 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1061 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1480 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1481 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1482 x86_pmu.extra_regs = intel_westmere_extra_regs;
1062 pr_cont("Westmere events, "); 1483 pr_cont("Westmere events, ");
1063 break; 1484 break;
1064 1485
1486 case 42: /* SandyBridge */
1487 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1488 sizeof(hw_cache_event_ids));
1489
1490 intel_pmu_lbr_init_nhm();
1491
1492 x86_pmu.event_constraints = intel_snb_event_constraints;
1493 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1494 pr_cont("SandyBridge events, ");
1495 break;
1496
1065 default: 1497 default:
1066 /* 1498 /*
1067 * default constraints for v2 and up 1499 * default constraints for v2 and up
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index b7dcd9f2b8a0..bab491b8ee25 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -361,30 +361,70 @@ static int intel_pmu_drain_bts_buffer(void)
361/* 361/*
362 * PEBS 362 * PEBS
363 */ 363 */
364static struct event_constraint intel_core2_pebs_event_constraints[] = {
365 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
366 INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
367 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
368 INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
369 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
370 EVENT_CONSTRAINT_END
371};
372
373static struct event_constraint intel_atom_pebs_event_constraints[] = {
374 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
375 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
376 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
377 EVENT_CONSTRAINT_END
378};
364 379
365static struct event_constraint intel_core_pebs_events[] = { 380static struct event_constraint intel_nehalem_pebs_event_constraints[] = {
366 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ 381 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
367 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ 382 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
368 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ 383 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
369 PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ 384 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
370 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ 385 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
371 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 386 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
372 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ 387 INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
373 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 388 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
374 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ 389 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
390 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
391 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
375 EVENT_CONSTRAINT_END 392 EVENT_CONSTRAINT_END
376}; 393};
377 394
378static struct event_constraint intel_nehalem_pebs_events[] = { 395static struct event_constraint intel_westmere_pebs_event_constraints[] = {
379 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ 396 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
380 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ 397 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
381 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ 398 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
382 PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */ 399 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
383 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ 400 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
384 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 401 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
385 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ 402 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
386 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 403 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
387 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ 404 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
405 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
406 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
407 EVENT_CONSTRAINT_END
408};
409
410static struct event_constraint intel_snb_pebs_events[] = {
411 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
412 INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
413 INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
414 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
415 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
416 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
417 INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
418 INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
419 INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
420 INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
421 INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
422 INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
423 INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
424 INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
425 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
426 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
427 INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
388 EVENT_CONSTRAINT_END 428 EVENT_CONSTRAINT_END
389}; 429};
390 430
@@ -695,20 +735,17 @@ static void intel_ds_init(void)
695 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); 735 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
696 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); 736 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
697 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; 737 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
698 x86_pmu.pebs_constraints = intel_core_pebs_events;
699 break; 738 break;
700 739
701 case 1: 740 case 1:
702 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); 741 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
703 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); 742 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
704 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; 743 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
705 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
706 break; 744 break;
707 745
708 default: 746 default:
709 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); 747 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
710 x86_pmu.pebs = 0; 748 x86_pmu.pebs = 0;
711 break;
712 } 749 }
713 } 750 }
714} 751}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index f7a0993c1e7c..e93fcd55fae1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Netburst Perfomance Events (P4, old Xeon) 2 * Netburst Performance Events (P4, old Xeon)
3 * 3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> 4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> 5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
@@ -679,7 +679,7 @@ static int p4_validate_raw_event(struct perf_event *event)
679 */ 679 */
680 680
681 /* 681 /*
682 * if an event is shared accross the logical threads 682 * if an event is shared across the logical threads
683 * the user needs special permissions to be able to use it 683 * the user needs special permissions to be able to use it
684 */ 684 */
685 if (p4_ht_active() && p4_event_bind_map[v].shared) { 685 if (p4_ht_active() && p4_event_bind_map[v].shared) {
@@ -764,15 +764,21 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
764 u64 v; 764 u64 v;
765 765
766 /* an official way for overflow indication */ 766 /* an official way for overflow indication */
767 rdmsrl(hwc->config_base + hwc->idx, v); 767 rdmsrl(hwc->config_base, v);
768 if (v & P4_CCCR_OVF) { 768 if (v & P4_CCCR_OVF) {
769 wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF); 769 wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
770 return 1; 770 return 1;
771 } 771 }
772 772
773 /* it might be unflagged overflow */ 773 /*
774 rdmsrl(hwc->event_base + hwc->idx, v); 774 * In some circumstances the overflow might issue an NMI but did
775 if (!(v & ARCH_P4_CNTRVAL_MASK)) 775 * not set P4_CCCR_OVF bit. Because a counter holds a negative value
776 * we simply check for high bit being set, if it's cleared it means
777 * the counter has reached zero value and continued counting before
778 * real NMI signal was received:
779 */
780 rdmsrl(hwc->event_base, v);
781 if (!(v & ARCH_P4_UNFLAGGED_BIT))
776 return 1; 782 return 1;
777 783
778 return 0; 784 return 0;
@@ -785,13 +791,13 @@ static void p4_pmu_disable_pebs(void)
785 * 791 *
786 * It's still allowed that two threads setup same cache 792 * It's still allowed that two threads setup same cache
787 * events so we can't simply clear metrics until we knew 793 * events so we can't simply clear metrics until we knew
788 * noone is depending on us, so we need kind of counter 794 * no one is depending on us, so we need kind of counter
789 * for "ReplayEvent" users. 795 * for "ReplayEvent" users.
790 * 796 *
791 * What is more complex -- RAW events, if user (for some 797 * What is more complex -- RAW events, if user (for some
792 * reason) will pass some cache event metric with improper 798 * reason) will pass some cache event metric with improper
793 * event opcode -- it's fine from hardware point of view 799 * event opcode -- it's fine from hardware point of view
794 * but completely nonsence from "meaning" of such action. 800 * but completely nonsense from "meaning" of such action.
795 * 801 *
796 * So at moment let leave metrics turned on forever -- it's 802 * So at moment let leave metrics turned on forever -- it's
797 * ok for now but need to be revisited! 803 * ok for now but need to be revisited!
@@ -810,7 +816,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
810 * state we need to clear P4_CCCR_OVF, otherwise interrupt get 816 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
811 * asserted again and again 817 * asserted again and again
812 */ 818 */
813 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 819 (void)checking_wrmsrl(hwc->config_base,
814 (u64)(p4_config_unpack_cccr(hwc->config)) & 820 (u64)(p4_config_unpack_cccr(hwc->config)) &
815 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); 821 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
816} 822}
@@ -880,7 +886,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
880 p4_pmu_enable_pebs(hwc->config); 886 p4_pmu_enable_pebs(hwc->config);
881 887
882 (void)checking_wrmsrl(escr_addr, escr_conf); 888 (void)checking_wrmsrl(escr_addr, escr_conf);
883 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 889 (void)checking_wrmsrl(hwc->config_base,
884 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); 890 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
885} 891}
886 892
@@ -941,14 +947,23 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
941 if (!x86_perf_event_set_period(event)) 947 if (!x86_perf_event_set_period(event))
942 continue; 948 continue;
943 if (perf_event_overflow(event, 1, &data, regs)) 949 if (perf_event_overflow(event, 1, &data, regs))
944 p4_pmu_disable_event(event); 950 x86_pmu_stop(event, 0);
945 } 951 }
946 952
947 if (handled) { 953 if (handled)
948 /* p4 quirk: unmask it again */
949 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
950 inc_irq_stat(apic_perf_irqs); 954 inc_irq_stat(apic_perf_irqs);
951 } 955
956 /*
957 * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
958 * been observed that the OVF bit flag has to be cleared first _before_
959 * the LVTPC can be unmasked.
960 *
961 * The reason is the NMI line will continue to be asserted while the OVF
962 * bit is set. This causes a second NMI to generate if the LVTPC is
963 * unmasked before the OVF bit is cleared, leading to unknown NMI
964 * messages.
965 */
966 apic_write(APIC_LVTPC, APIC_DM_NMI);
952 967
953 return handled; 968 return handled;
954} 969}
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 34ba07be2cda..20c097e33860 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -68,7 +68,7 @@ p6_pmu_disable_event(struct perf_event *event)
68 if (cpuc->enabled) 68 if (cpuc->enabled)
69 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 69 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
70 70
71 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 71 (void)checking_wrmsrl(hwc->config_base, val);
72} 72}
73 73
74static void p6_pmu_enable_event(struct perf_event *event) 74static void p6_pmu_enable_event(struct perf_event *event)
@@ -81,7 +81,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
81 if (cpuc->enabled) 81 if (cpuc->enabled)
82 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 82 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
83 83
84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base, val);
85} 85}
86 86
87static __initconst const struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d5a236615501..966512b2cacf 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -46,6 +46,8 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
46 /* returns the bit offset of the performance counter register */ 46 /* returns the bit offset of the performance counter register */
47 switch (boot_cpu_data.x86_vendor) { 47 switch (boot_cpu_data.x86_vendor) {
48 case X86_VENDOR_AMD: 48 case X86_VENDOR_AMD:
49 if (msr >= MSR_F15H_PERF_CTR)
50 return (msr - MSR_F15H_PERF_CTR) >> 1;
49 return msr - MSR_K7_PERFCTR0; 51 return msr - MSR_K7_PERFCTR0;
50 case X86_VENDOR_INTEL: 52 case X86_VENDOR_INTEL:
51 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 53 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -70,6 +72,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
70 /* returns the bit offset of the event selection register */ 72 /* returns the bit offset of the event selection register */
71 switch (boot_cpu_data.x86_vendor) { 73 switch (boot_cpu_data.x86_vendor) {
72 case X86_VENDOR_AMD: 74 case X86_VENDOR_AMD:
75 if (msr >= MSR_F15H_PERF_CTL)
76 return (msr - MSR_F15H_PERF_CTL) >> 1;
73 return msr - MSR_K7_EVNTSEL0; 77 return msr - MSR_K7_EVNTSEL0;
74 case X86_VENDOR_INTEL: 78 case X86_VENDOR_INTEL:
75 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 79 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 227b0448960d..d22d0c4edcfd 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -86,7 +86,7 @@ static void __init vmware_platform_setup(void)
86} 86}
87 87
88/* 88/*
89 * While checking the dmi string infomation, just checking the product 89 * While checking the dmi string information, just checking the product
90 * serial key should be enough, as this will always have a VMware 90 * serial key should be enough, as this will always have a VMware
91 * specific string when running under VMware hypervisor. 91 * specific string when running under VMware hypervisor.
92 */ 92 */
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index d5cd13945d5a..642f75a68cd5 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -14,9 +14,6 @@
14 14
15static void *kdump_buf_page; 15static void *kdump_buf_page;
16 16
17/* Stores the physical address of elf header of crash image. */
18unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
19
20static inline bool is_crashed_pfn_valid(unsigned long pfn) 17static inline bool is_crashed_pfn_valid(unsigned long pfn)
21{ 18{
22#ifndef CONFIG_X86_PAE 19#ifndef CONFIG_X86_PAE
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 994828899e09..afa64adb75ee 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,9 +10,6 @@
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/io.h> 11#include <linux/io.h>
12 12
13/* Stores the physical address of elf header of crash image. */
14unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
15
16/** 13/**
17 * copy_oldmem_page - copy one page from "oldmem" 14 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied 15 * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 000000000000..e90f08458e6b
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,439 @@
1/*
2 * Architecture specific OF callbacks.
3 */
4#include <linux/bootmem.h>
5#include <linux/io.h>
6#include <linux/interrupt.h>
7#include <linux/list.h>
8#include <linux/of.h>
9#include <linux/of_fdt.h>
10#include <linux/of_address.h>
11#include <linux/of_platform.h>
12#include <linux/of_irq.h>
13#include <linux/slab.h>
14#include <linux/pci.h>
15#include <linux/of_pci.h>
16
17#include <asm/hpet.h>
18#include <asm/irq_controller.h>
19#include <asm/apic.h>
20#include <asm/pci_x86.h>
21
22__initdata u64 initial_dtb;
23char __initdata cmd_line[COMMAND_LINE_SIZE];
24static LIST_HEAD(irq_domains);
25static DEFINE_RAW_SPINLOCK(big_irq_lock);
26
27int __initdata of_ioapic;
28
29#ifdef CONFIG_X86_IO_APIC
30static void add_interrupt_host(struct irq_domain *ih)
31{
32 unsigned long flags;
33
34 raw_spin_lock_irqsave(&big_irq_lock, flags);
35 list_add(&ih->l, &irq_domains);
36 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
37}
38#endif
39
40static struct irq_domain *get_ih_from_node(struct device_node *controller)
41{
42 struct irq_domain *ih, *found = NULL;
43 unsigned long flags;
44
45 raw_spin_lock_irqsave(&big_irq_lock, flags);
46 list_for_each_entry(ih, &irq_domains, l) {
47 if (ih->controller == controller) {
48 found = ih;
49 break;
50 }
51 }
52 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
53 return found;
54}
55
56unsigned int irq_create_of_mapping(struct device_node *controller,
57 const u32 *intspec, unsigned int intsize)
58{
59 struct irq_domain *ih;
60 u32 virq, type;
61 int ret;
62
63 ih = get_ih_from_node(controller);
64 if (!ih)
65 return 0;
66 ret = ih->xlate(ih, intspec, intsize, &virq, &type);
67 if (ret)
68 return 0;
69 if (type == IRQ_TYPE_NONE)
70 return virq;
71 irq_set_irq_type(virq, type);
72 return virq;
73}
74EXPORT_SYMBOL_GPL(irq_create_of_mapping);
75
76unsigned long pci_address_to_pio(phys_addr_t address)
77{
78 /*
79 * The ioport address can be directly used by inX / outX
80 */
81 BUG_ON(address >= (1 << 16));
82 return (unsigned long)address;
83}
84EXPORT_SYMBOL_GPL(pci_address_to_pio);
85
86void __init early_init_dt_scan_chosen_arch(unsigned long node)
87{
88 BUG();
89}
90
91void __init early_init_dt_add_memory_arch(u64 base, u64 size)
92{
93 BUG();
94}
95
96void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
97{
98 return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
99}
100
101void __init add_dtb(u64 data)
102{
103 initial_dtb = data + offsetof(struct setup_data, data);
104}
105
106/*
107 * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
108 */
109static struct of_device_id __initdata ce4100_ids[] = {
110 { .compatible = "intel,ce4100-cp", },
111 { .compatible = "isa", },
112 { .compatible = "pci", },
113 {},
114};
115
116static int __init add_bus_probe(void)
117{
118 if (!of_have_populated_dt())
119 return 0;
120
121 return of_platform_bus_probe(NULL, ce4100_ids, NULL);
122}
123module_init(add_bus_probe);
124
125#ifdef CONFIG_PCI
126static int x86_of_pci_irq_enable(struct pci_dev *dev)
127{
128 struct of_irq oirq;
129 u32 virq;
130 int ret;
131 u8 pin;
132
133 ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
134 if (ret)
135 return ret;
136 if (!pin)
137 return 0;
138
139 ret = of_irq_map_pci(dev, &oirq);
140 if (ret)
141 return ret;
142
143 virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
144 oirq.size);
145 if (virq == 0)
146 return -EINVAL;
147 dev->irq = virq;
148 return 0;
149}
150
151static void x86_of_pci_irq_disable(struct pci_dev *dev)
152{
153}
154
155void __cpuinit x86_of_pci_init(void)
156{
157 struct device_node *np;
158
159 pcibios_enable_irq = x86_of_pci_irq_enable;
160 pcibios_disable_irq = x86_of_pci_irq_disable;
161
162 for_each_node_by_type(np, "pci") {
163 const void *prop;
164 struct pci_bus *bus;
165 unsigned int bus_min;
166 struct device_node *child;
167
168 prop = of_get_property(np, "bus-range", NULL);
169 if (!prop)
170 continue;
171 bus_min = be32_to_cpup(prop);
172
173 bus = pci_find_bus(0, bus_min);
174 if (!bus) {
175 printk(KERN_ERR "Can't find a node for bus %s.\n",
176 np->full_name);
177 continue;
178 }
179
180 if (bus->self)
181 bus->self->dev.of_node = np;
182 else
183 bus->dev.of_node = np;
184
185 for_each_child_of_node(np, child) {
186 struct pci_dev *dev;
187 u32 devfn;
188
189 prop = of_get_property(child, "reg", NULL);
190 if (!prop)
191 continue;
192
193 devfn = (be32_to_cpup(prop) >> 8) & 0xff;
194 dev = pci_get_slot(bus, devfn);
195 if (!dev)
196 continue;
197 dev->dev.of_node = child;
198 pci_dev_put(dev);
199 }
200 }
201}
202#endif
203
204static void __init dtb_setup_hpet(void)
205{
206#ifdef CONFIG_HPET_TIMER
207 struct device_node *dn;
208 struct resource r;
209 int ret;
210
211 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
212 if (!dn)
213 return;
214 ret = of_address_to_resource(dn, 0, &r);
215 if (ret) {
216 WARN_ON(1);
217 return;
218 }
219 hpet_address = r.start;
220#endif
221}
222
223static void __init dtb_lapic_setup(void)
224{
225#ifdef CONFIG_X86_LOCAL_APIC
226 struct device_node *dn;
227 struct resource r;
228 int ret;
229
230 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
231 if (!dn)
232 return;
233
234 ret = of_address_to_resource(dn, 0, &r);
235 if (WARN_ON(ret))
236 return;
237
238 /* Did the boot loader setup the local APIC ? */
239 if (!cpu_has_apic) {
240 if (apic_force_enable(r.start))
241 return;
242 }
243 smp_found_config = 1;
244 pic_mode = 1;
245 register_lapic_address(r.start);
246 generic_processor_info(boot_cpu_physical_apicid,
247 GET_APIC_VERSION(apic_read(APIC_LVR)));
248#endif
249}
250
251#ifdef CONFIG_X86_IO_APIC
252static unsigned int ioapic_id;
253
254static void __init dtb_add_ioapic(struct device_node *dn)
255{
256 struct resource r;
257 int ret;
258
259 ret = of_address_to_resource(dn, 0, &r);
260 if (ret) {
261 printk(KERN_ERR "Can't obtain address from node %s.\n",
262 dn->full_name);
263 return;
264 }
265 mp_register_ioapic(++ioapic_id, r.start, gsi_top);
266}
267
268static void __init dtb_ioapic_setup(void)
269{
270 struct device_node *dn;
271
272 for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
273 dtb_add_ioapic(dn);
274
275 if (nr_ioapics) {
276 of_ioapic = 1;
277 return;
278 }
279 printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
280}
281#else
282static void __init dtb_ioapic_setup(void) {}
283#endif
284
285static void __init dtb_apic_setup(void)
286{
287 dtb_lapic_setup();
288 dtb_ioapic_setup();
289}
290
291#ifdef CONFIG_OF_FLATTREE
292static void __init x86_flattree_get_config(void)
293{
294 u32 size, map_len;
295 void *new_dtb;
296
297 if (!initial_dtb)
298 return;
299
300 map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK),
301 (u64)sizeof(struct boot_param_header));
302
303 initial_boot_params = early_memremap(initial_dtb, map_len);
304 size = be32_to_cpu(initial_boot_params->totalsize);
305 if (map_len < size) {
306 early_iounmap(initial_boot_params, map_len);
307 initial_boot_params = early_memremap(initial_dtb, size);
308 map_len = size;
309 }
310
311 new_dtb = alloc_bootmem(size);
312 memcpy(new_dtb, initial_boot_params, size);
313 early_iounmap(initial_boot_params, map_len);
314
315 initial_boot_params = new_dtb;
316
317 /* root level address cells */
318 of_scan_flat_dt(early_init_dt_scan_root, NULL);
319
320 unflatten_device_tree();
321}
322#else
323static inline void x86_flattree_get_config(void) { }
324#endif
325
326void __init x86_dtb_init(void)
327{
328 x86_flattree_get_config();
329
330 if (!of_have_populated_dt())
331 return;
332
333 dtb_setup_hpet();
334 dtb_apic_setup();
335}
336
337#ifdef CONFIG_X86_IO_APIC
338
339struct of_ioapic_type {
340 u32 out_type;
341 u32 trigger;
342 u32 polarity;
343};
344
345static struct of_ioapic_type of_ioapic_type[] =
346{
347 {
348 .out_type = IRQ_TYPE_EDGE_RISING,
349 .trigger = IOAPIC_EDGE,
350 .polarity = 1,
351 },
352 {
353 .out_type = IRQ_TYPE_LEVEL_LOW,
354 .trigger = IOAPIC_LEVEL,
355 .polarity = 0,
356 },
357 {
358 .out_type = IRQ_TYPE_LEVEL_HIGH,
359 .trigger = IOAPIC_LEVEL,
360 .polarity = 1,
361 },
362 {
363 .out_type = IRQ_TYPE_EDGE_FALLING,
364 .trigger = IOAPIC_EDGE,
365 .polarity = 0,
366 },
367};
368
369static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
370 u32 *out_hwirq, u32 *out_type)
371{
372 struct io_apic_irq_attr attr;
373 struct of_ioapic_type *it;
374 u32 line, idx, type;
375
376 if (intsize < 2)
377 return -EINVAL;
378
379 line = *intspec;
380 idx = (u32) id->priv;
381 *out_hwirq = line + mp_gsi_routing[idx].gsi_base;
382
383 intspec++;
384 type = *intspec;
385
386 if (type >= ARRAY_SIZE(of_ioapic_type))
387 return -EINVAL;
388
389 it = of_ioapic_type + type;
390 *out_type = it->out_type;
391
392 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
393
394 return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
395}
396
397static void __init ioapic_add_ofnode(struct device_node *np)
398{
399 struct resource r;
400 int i, ret;
401
402 ret = of_address_to_resource(np, 0, &r);
403 if (ret) {
404 printk(KERN_ERR "Failed to obtain address for %s\n",
405 np->full_name);
406 return;
407 }
408
409 for (i = 0; i < nr_ioapics; i++) {
410 if (r.start == mp_ioapics[i].apicaddr) {
411 struct irq_domain *id;
412
413 id = kzalloc(sizeof(*id), GFP_KERNEL);
414 BUG_ON(!id);
415 id->controller = np;
416 id->xlate = ioapic_xlate;
417 id->priv = (void *)i;
418 add_interrupt_host(id);
419 return;
420 }
421 }
422 printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
423}
424
425void __init x86_add_irq_domains(void)
426{
427 struct device_node *dp;
428
429 if (!of_have_populated_dt())
430 return;
431
432 for_each_node_with_property(dp, "interrupt-controller") {
433 if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
434 ioapic_add_ofnode(dp);
435 }
436}
437#else
438void __init x86_add_irq_domains(void) { }
439#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index df20723a6a1b..e2a3f0606da4 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,7 +27,7 @@ static int die_counter;
27 27
28void printk_address(unsigned long address, int reliable) 28void printk_address(unsigned long address, int reliable)
29{ 29{
30 printk(" [<%p>] %s%pS\n", (void *) address, 30 printk(" [<%p>] %s%pB\n", (void *) address,
31 reliable ? "" : "? ", (void *) address); 31 reliable ? "" : "? ", (void *) address);
32} 32}
33 33
@@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = {
175 175
176void 176void
177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
178 unsigned long *stack, char *log_lvl) 178 unsigned long *stack, unsigned long bp, char *log_lvl)
179{ 179{
180 printk("%sCall Trace:\n", log_lvl); 180 printk("%sCall Trace:\n", log_lvl);
181 dump_trace(task, regs, stack, &print_trace_ops, log_lvl); 181 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
182} 182}
183 183
184void show_trace(struct task_struct *task, struct pt_regs *regs, 184void show_trace(struct task_struct *task, struct pt_regs *regs,
185 unsigned long *stack) 185 unsigned long *stack, unsigned long bp)
186{ 186{
187 show_trace_log_lvl(task, regs, stack, ""); 187 show_trace_log_lvl(task, regs, stack, bp, "");
188} 188}
189 189
190void show_stack(struct task_struct *task, unsigned long *sp) 190void show_stack(struct task_struct *task, unsigned long *sp)
191{ 191{
192 show_stack_log_lvl(task, NULL, sp, ""); 192 show_stack_log_lvl(task, NULL, sp, 0, "");
193} 193}
194 194
195/* 195/*
@@ -197,14 +197,16 @@ void show_stack(struct task_struct *task, unsigned long *sp)
197 */ 197 */
198void dump_stack(void) 198void dump_stack(void)
199{ 199{
200 unsigned long bp;
200 unsigned long stack; 201 unsigned long stack;
201 202
203 bp = stack_frame(current, NULL);
202 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 204 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
203 current->pid, current->comm, print_tainted(), 205 current->pid, current->comm, print_tainted(),
204 init_utsname()->release, 206 init_utsname()->release,
205 (int)strcspn(init_utsname()->version, " "), 207 (int)strcspn(init_utsname()->version, " "),
206 init_utsname()->version); 208 init_utsname()->version);
207 show_trace(NULL, NULL, &stack); 209 show_trace(NULL, NULL, &stack, bp);
208} 210}
209EXPORT_SYMBOL(dump_stack); 211EXPORT_SYMBOL(dump_stack);
210 212
@@ -320,41 +322,6 @@ void die(const char *str, struct pt_regs *regs, long err)
320 oops_end(flags, regs, sig); 322 oops_end(flags, regs, sig);
321} 323}
322 324
323void notrace __kprobes
324die_nmi(char *str, struct pt_regs *regs, int do_panic)
325{
326 unsigned long flags;
327
328 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
329 return;
330
331 /*
332 * We are in trouble anyway, lets at least try
333 * to get a message out.
334 */
335 flags = oops_begin();
336 printk(KERN_EMERG "%s", str);
337 printk(" on CPU%d, ip %08lx, registers:\n",
338 smp_processor_id(), regs->ip);
339 show_registers(regs);
340 oops_end(flags, regs, 0);
341 if (do_panic || panic_on_oops)
342 panic("Non maskable interrupt");
343 nmi_exit();
344 local_irq_enable();
345 do_exit(SIGBUS);
346}
347
348static int __init oops_setup(char *s)
349{
350 if (!s)
351 return -EINVAL;
352 if (!strcmp(s, "panic"))
353 panic_on_oops = 1;
354 return 0;
355}
356early_param("oops", oops_setup);
357
358static int __init kstack_setup(char *s) 325static int __init kstack_setup(char *s)
359{ 326{
360 if (!s) 327 if (!s)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 74cc1eda384b..3b97a80ce329 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,12 +17,11 @@
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19 19
20void dump_trace(struct task_struct *task, 20void dump_trace(struct task_struct *task, struct pt_regs *regs,
21 struct pt_regs *regs, unsigned long *stack, 21 unsigned long *stack, unsigned long bp,
22 const struct stacktrace_ops *ops, void *data) 22 const struct stacktrace_ops *ops, void *data)
23{ 23{
24 int graph = 0; 24 int graph = 0;
25 unsigned long bp;
26 25
27 if (!task) 26 if (!task)
28 task = current; 27 task = current;
@@ -35,7 +34,9 @@ void dump_trace(struct task_struct *task,
35 stack = (unsigned long *)task->thread.sp; 34 stack = (unsigned long *)task->thread.sp;
36 } 35 }
37 36
38 bp = stack_frame(task, regs); 37 if (!bp)
38 bp = stack_frame(task, regs);
39
39 for (;;) { 40 for (;;) {
40 struct thread_info *context; 41 struct thread_info *context;
41 42
@@ -55,7 +56,7 @@ EXPORT_SYMBOL(dump_trace);
55 56
56void 57void
57show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 58show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
58 unsigned long *sp, char *log_lvl) 59 unsigned long *sp, unsigned long bp, char *log_lvl)
59{ 60{
60 unsigned long *stack; 61 unsigned long *stack;
61 int i; 62 int i;
@@ -77,7 +78,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
77 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
78 } 79 }
79 printk(KERN_CONT "\n"); 80 printk(KERN_CONT "\n");
80 show_trace_log_lvl(task, regs, sp, log_lvl); 81 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
81} 82}
82 83
83 84
@@ -102,7 +103,7 @@ void show_registers(struct pt_regs *regs)
102 u8 *ip; 103 u8 *ip;
103 104
104 printk(KERN_EMERG "Stack:\n"); 105 printk(KERN_EMERG "Stack:\n");
105 show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG); 106 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
106 107
107 printk(KERN_EMERG "Code: "); 108 printk(KERN_EMERG "Code: ");
108 109
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a6b6fcf7f0ae..e71c98d3c0d2 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
140 */ 140 */
141 141
142void dump_trace(struct task_struct *task, 142void dump_trace(struct task_struct *task, struct pt_regs *regs,
143 struct pt_regs *regs, unsigned long *stack, 143 unsigned long *stack, unsigned long bp,
144 const struct stacktrace_ops *ops, void *data) 144 const struct stacktrace_ops *ops, void *data)
145{ 145{
146 const unsigned cpu = get_cpu(); 146 const unsigned cpu = get_cpu();
@@ -150,7 +150,6 @@ void dump_trace(struct task_struct *task,
150 struct thread_info *tinfo; 150 struct thread_info *tinfo;
151 int graph = 0; 151 int graph = 0;
152 unsigned long dummy; 152 unsigned long dummy;
153 unsigned long bp;
154 153
155 if (!task) 154 if (!task)
156 task = current; 155 task = current;
@@ -161,7 +160,8 @@ void dump_trace(struct task_struct *task,
161 stack = (unsigned long *)task->thread.sp; 160 stack = (unsigned long *)task->thread.sp;
162 } 161 }
163 162
164 bp = stack_frame(task, regs); 163 if (!bp)
164 bp = stack_frame(task, regs);
165 /* 165 /*
166 * Print function call entries in all stacks, starting at the 166 * Print function call entries in all stacks, starting at the
167 * current stack address. If the stacks consist of nested 167 * current stack address. If the stacks consist of nested
@@ -225,7 +225,7 @@ EXPORT_SYMBOL(dump_trace);
225 225
226void 226void
227show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 227show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
228 unsigned long *sp, char *log_lvl) 228 unsigned long *sp, unsigned long bp, char *log_lvl)
229{ 229{
230 unsigned long *irq_stack_end; 230 unsigned long *irq_stack_end;
231 unsigned long *irq_stack; 231 unsigned long *irq_stack;
@@ -269,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
269 preempt_enable(); 269 preempt_enable();
270 270
271 printk(KERN_CONT "\n"); 271 printk(KERN_CONT "\n");
272 show_trace_log_lvl(task, regs, sp, log_lvl); 272 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
273} 273}
274 274
275void show_registers(struct pt_regs *regs) 275void show_registers(struct pt_regs *regs)
@@ -298,7 +298,7 @@ void show_registers(struct pt_regs *regs)
298 298
299 printk(KERN_EMERG "Stack:\n"); 299 printk(KERN_EMERG "Stack:\n");
300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
301 KERN_EMERG); 301 0, KERN_EMERG);
302 302
303 printk(KERN_EMERG "Code: "); 303 printk(KERN_EMERG "Code: ");
304 304
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 294f26da0c0c..3e2ef8425316 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -11,6 +11,7 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/crash_dump.h>
14#include <linux/bootmem.h> 15#include <linux/bootmem.h>
15#include <linux/pfn.h> 16#include <linux/pfn.h>
16#include <linux/suspend.h> 17#include <linux/suspend.h>
@@ -667,21 +668,15 @@ __init void e820_setup_gap(void)
667 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of 668 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
668 * linked list of struct setup_data, which is parsed here. 669 * linked list of struct setup_data, which is parsed here.
669 */ 670 */
670void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) 671void __init parse_e820_ext(struct setup_data *sdata)
671{ 672{
672 u32 map_len;
673 int entries; 673 int entries;
674 struct e820entry *extmap; 674 struct e820entry *extmap;
675 675
676 entries = sdata->len / sizeof(struct e820entry); 676 entries = sdata->len / sizeof(struct e820entry);
677 map_len = sdata->len + sizeof(struct setup_data);
678 if (map_len > PAGE_SIZE)
679 sdata = early_ioremap(pa_data, map_len);
680 extmap = (struct e820entry *)(sdata->data); 677 extmap = (struct e820entry *)(sdata->data);
681 __append_e820_map(extmap, entries); 678 __append_e820_map(extmap, entries);
682 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 679 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
683 if (map_len > PAGE_SIZE)
684 early_iounmap(sdata, map_len);
685 printk(KERN_INFO "extended physical RAM map:\n"); 680 printk(KERN_INFO "extended physical RAM map:\n");
686 e820_print_map("extended"); 681 e820_print_map("extended");
687} 682}
@@ -847,15 +842,21 @@ static int __init parse_memopt(char *p)
847 if (!p) 842 if (!p)
848 return -EINVAL; 843 return -EINVAL;
849 844
850#ifdef CONFIG_X86_32
851 if (!strcmp(p, "nopentium")) { 845 if (!strcmp(p, "nopentium")) {
846#ifdef CONFIG_X86_32
852 setup_clear_cpu_cap(X86_FEATURE_PSE); 847 setup_clear_cpu_cap(X86_FEATURE_PSE);
853 return 0; 848 return 0;
854 } 849#else
850 printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
851 return -EINVAL;
855#endif 852#endif
853 }
856 854
857 userdef = 1; 855 userdef = 1;
858 mem_size = memparse(p, &p); 856 mem_size = memparse(p, &p);
857 /* don't remove all of memory when handling "mem={invalid}" param */
858 if (mem_size == 0)
859 return -EINVAL;
859 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); 860 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
860 861
861 return 0; 862 return 0;
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953dee..3755ef494390 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -143,15 +143,10 @@ static void __init ati_bugs(int num, int slot, int func)
143 143
144static u32 __init ati_sbx00_rev(int num, int slot, int func) 144static u32 __init ati_sbx00_rev(int num, int slot, int func)
145{ 145{
146 u32 old, d; 146 u32 d;
147 147
148 d = read_pci_config(num, slot, func, 0x70);
149 old = d;
150 d &= ~(1<<8);
151 write_pci_config(num, slot, func, 0x70, d);
152 d = read_pci_config(num, slot, func, 0x8); 148 d = read_pci_config(num, slot, func, 0x8);
153 d &= 0xff; 149 d &= 0xff;
154 write_pci_config(num, slot, func, 0x70, old);
155 150
156 return d; 151 return d;
157} 152}
@@ -160,11 +155,19 @@ static void __init ati_bugs_contd(int num, int slot, int func)
160{ 155{
161 u32 d, rev; 156 u32 d, rev;
162 157
163 if (acpi_use_timer_override) 158 rev = ati_sbx00_rev(num, slot, func);
159 if (rev >= 0x40)
160 acpi_fix_pin2_polarity = 1;
161
162 /*
163 * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
164 * SB700: revisions 0x39, 0x3a, ...
165 * SB800: revisions 0x40, 0x41, ...
166 */
167 if (rev >= 0x39)
164 return; 168 return;
165 169
166 rev = ati_sbx00_rev(num, slot, func); 170 if (acpi_use_timer_override)
167 if (rev > 0x13)
168 return; 171 return;
169 172
170 /* check for IRQ0 interrupt swap */ 173 /* check for IRQ0 interrupt swap */
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efad7ebb..5c1a91974918 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
65#define sysexit_audit syscall_exit_work 65#define sysexit_audit syscall_exit_work
66#endif 66#endif
67 67
68 .section .entry.text, "ax"
69
68/* 70/*
69 * We use macros for low-level operations which need to be overridden 71 * We use macros for low-level operations which need to be overridden
70 * for paravirtualization. The following will never clobber any registers: 72 * for paravirtualization. The following will never clobber any registers:
@@ -395,7 +397,7 @@ sysenter_past_esp:
395 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 397 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
396 * pushed above; +8 corresponds to copy_thread's esp0 setting. 398 * pushed above; +8 corresponds to copy_thread's esp0 setting.
397 */ 399 */
398 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp) 400 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
399 CFI_REL_OFFSET eip, 0 401 CFI_REL_OFFSET eip, 0
400 402
401 pushl_cfi %eax 403 pushl_cfi %eax
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
788 */ 790 */
789.section .init.rodata,"a" 791.section .init.rodata,"a"
790ENTRY(interrupt) 792ENTRY(interrupt)
791.text 793.section .entry.text, "ax"
792 .p2align 5 794 .p2align 5
793 .p2align CONFIG_X86_L1_CACHE_SHIFT 795 .p2align CONFIG_X86_L1_CACHE_SHIFT
794ENTRY(irq_entries_start) 796ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
807 .endif 809 .endif
808 .previous 810 .previous
809 .long 1b 811 .long 1b
810 .text 812 .section .entry.text, "ax"
811vector=vector+1 813vector=vector+1
812 .endif 814 .endif
813 .endr 815 .endr
@@ -1409,11 +1411,10 @@ END(general_protection)
1409#ifdef CONFIG_KVM_GUEST 1411#ifdef CONFIG_KVM_GUEST
1410ENTRY(async_page_fault) 1412ENTRY(async_page_fault)
1411 RING0_EC_FRAME 1413 RING0_EC_FRAME
1412 pushl $do_async_page_fault 1414 pushl_cfi $do_async_page_fault
1413 CFI_ADJUST_CFA_OFFSET 4
1414 jmp error_code 1415 jmp error_code
1415 CFI_ENDPROC 1416 CFI_ENDPROC
1416END(apf_page_fault) 1417END(async_page_fault)
1417#endif 1418#endif
1418 1419
1419/* 1420/*
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index aed1ffbeb0c9..8a445a0c989e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -18,7 +18,7 @@
18 * A note on terminology: 18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP 19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers upto R11. 21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
23 * 23 *
24 * Some macro usage: 24 * Some macro usage:
@@ -61,6 +61,8 @@
61#define __AUDIT_ARCH_LE 0x40000000 61#define __AUDIT_ARCH_LE 0x40000000
62 62
63 .code64 63 .code64
64 .section .entry.text, "ax"
65
64#ifdef CONFIG_FUNCTION_TRACER 66#ifdef CONFIG_FUNCTION_TRACER
65#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 68ENTRY(mcount)
@@ -420,7 +422,7 @@ ENTRY(ret_from_fork)
420END(ret_from_fork) 422END(ret_from_fork)
421 423
422/* 424/*
423 * System call entry. Upto 6 arguments in registers are supported. 425 * System call entry. Up to 6 arguments in registers are supported.
424 * 426 *
425 * SYSCALL does not save anything on the stack and does not change the 427 * SYSCALL does not save anything on the stack and does not change the
426 * stack pointer. 428 * stack pointer.
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
744 */ 746 */
745 .section .init.rodata,"a" 747 .section .init.rodata,"a"
746ENTRY(interrupt) 748ENTRY(interrupt)
747 .text 749 .section .entry.text
748 .p2align 5 750 .p2align 5
749 .p2align CONFIG_X86_L1_CACHE_SHIFT 751 .p2align CONFIG_X86_L1_CACHE_SHIFT
750ENTRY(irq_entries_start) 752ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
763 .endif 765 .endif
764 .previous 766 .previous
765 .quad 1b 767 .quad 1b
766 .text 768 .section .entry.text
767vector=vector+1 769vector=vector+1
768 .endif 770 .endif
769 .endr 771 .endr
@@ -975,9 +977,12 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
975 x86_platform_ipi smp_x86_platform_ipi 977 x86_platform_ipi smp_x86_platform_ipi
976 978
977#ifdef CONFIG_SMP 979#ifdef CONFIG_SMP
978.irpc idx, "01234567" 980.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
981 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
982.if NUM_INVALIDATE_TLB_VECTORS > \idx
979apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ 983apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
980 invalidate_interrupt\idx smp_invalidate_interrupt 984 invalidate_interrupt\idx smp_invalidate_interrupt
985.endif
981.endr 986.endr
982#endif 987#endif
983 988
@@ -1248,7 +1253,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1248 decl PER_CPU_VAR(irq_count) 1253 decl PER_CPU_VAR(irq_count)
1249 jmp error_exit 1254 jmp error_exit
1250 CFI_ENDPROC 1255 CFI_ENDPROC
1251END(do_hypervisor_callback) 1256END(xen_do_hypervisor_callback)
1252 1257
1253/* 1258/*
1254 * Hypervisor uses this for application faults while it executes. 1259 * Hypervisor uses this for application faults while it executes.
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 382eb2936d4d..a93742a57468 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -437,18 +437,19 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
437 return; 437 return;
438 } 438 }
439 439
440 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
441 frame_pointer) == -EBUSY) {
442 *parent = old;
443 return;
444 }
445
446 trace.func = self_addr; 440 trace.func = self_addr;
441 trace.depth = current->curr_ret_stack + 1;
447 442
448 /* Only trace if the calling function expects to */ 443 /* Only trace if the calling function expects to */
449 if (!ftrace_graph_entry(&trace)) { 444 if (!ftrace_graph_entry(&trace)) {
450 current->curr_ret_stack--;
451 *parent = old; 445 *parent = old;
446 return;
447 }
448
449 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
450 frame_pointer) == -EBUSY) {
451 *parent = old;
452 return;
452 } 453 }
453} 454}
454#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 455#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 7f138b3c3c52..d6d6bb361931 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -34,15 +34,6 @@ void __init i386_start_kernel(void)
34{ 34{
35 memblock_init(); 35 memblock_init();
36 36
37#ifdef CONFIG_X86_TRAMPOLINE
38 /*
39 * But first pinch a few for the stack/trampoline stuff
40 * FIXME: Don't need the extra page at 4K, but need to fix
41 * trampoline before removing it. (see the GDT stuff)
42 */
43 memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
44#endif
45
46 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 37 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
47 38
48#ifdef CONFIG_BLK_DEV_INITRD 39#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2d2673c28aff..5655c2272adb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -77,9 +77,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
77 /* Make NULL pointers segfault */ 77 /* Make NULL pointers segfault */
78 zap_identity_mappings(); 78 zap_identity_mappings();
79 79
80 /* Cleanup the over mapped high alias */
81 cleanup_highmap();
82
83 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; 80 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
84 81
85 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { 82 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 767d6c43de37..ce0be7cd085e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -73,7 +73,7 @@ MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
73 */ 73 */
74KERNEL_PAGES = LOWMEM_PAGES 74KERNEL_PAGES = LOWMEM_PAGES
75 75
76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm 76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
77RESERVE_BRK(pagetables, INIT_MAP_SIZE) 77RESERVE_BRK(pagetables, INIT_MAP_SIZE)
78 78
79/* 79/*
@@ -137,7 +137,7 @@ ENTRY(startup_32)
137 movsl 137 movsl
1381: 1381:
139 139
140#ifdef CONFIG_OLPC_OPENFIRMWARE 140#ifdef CONFIG_OLPC
141 /* save OFW's pgdir table for later use when calling into OFW */ 141 /* save OFW's pgdir table for later use when calling into OFW */
142 movl %cr3, %eax 142 movl %cr3, %eax
143 movl %eax, pa(olpc_ofw_pgd) 143 movl %eax, pa(olpc_ofw_pgd)
@@ -623,7 +623,7 @@ ENTRY(initial_code)
623 * BSS section 623 * BSS section
624 */ 624 */
625__PAGE_ALIGNED_BSS 625__PAGE_ALIGNED_BSS
626 .align PAGE_SIZE_asm 626 .align PAGE_SIZE
627#ifdef CONFIG_X86_PAE 627#ifdef CONFIG_X86_PAE
628initial_pg_pmd: 628initial_pg_pmd:
629 .fill 1024*KPMDS,4,0 629 .fill 1024*KPMDS,4,0
@@ -644,7 +644,7 @@ ENTRY(swapper_pg_dir)
644#ifdef CONFIG_X86_PAE 644#ifdef CONFIG_X86_PAE
645__PAGE_ALIGNED_DATA 645__PAGE_ALIGNED_DATA
646 /* Page-aligned for the benefit of paravirt? */ 646 /* Page-aligned for the benefit of paravirt? */
647 .align PAGE_SIZE_asm 647 .align PAGE_SIZE
648ENTRY(initial_page_table) 648ENTRY(initial_page_table)
649 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ 649 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
650# if KPMDS == 3 650# if KPMDS == 3
@@ -662,7 +662,7 @@ ENTRY(initial_page_table)
662# else 662# else
663# error "Kernel PMDs should be 1, 2 or 3" 663# error "Kernel PMDs should be 1, 2 or 3"
664# endif 664# endif
665 .align PAGE_SIZE_asm /* needs to be page-sized too */ 665 .align PAGE_SIZE /* needs to be page-sized too */
666#endif 666#endif
667 667
668.data 668.data
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 239046bd447f..e11e39478a49 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -136,10 +136,9 @@ ident_complete:
136 /* Fixup phys_base */ 136 /* Fixup phys_base */
137 addq %rbp, phys_base(%rip) 137 addq %rbp, phys_base(%rip)
138 138
139#ifdef CONFIG_X86_TRAMPOLINE 139 /* Fixup trampoline */
140 addq %rbp, trampoline_level4_pgt + 0(%rip) 140 addq %rbp, trampoline_level4_pgt + 0(%rip)
141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip) 141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
142#endif
143 142
144 /* Due to ENTRY(), sometimes the empty space gets filled with 143 /* Due to ENTRY(), sometimes the empty space gets filled with
145 * zeros. Better take a jmp than relying on empty space being 144 * zeros. Better take a jmp than relying on empty space being
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 4ff5968f12d2..bfe8f729e086 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -503,7 +503,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
503 if (!irq) 503 if (!irq)
504 return -EINVAL; 504 return -EINVAL;
505 505
506 set_irq_data(irq, dev); 506 irq_set_handler_data(irq, dev);
507 507
508 if (hpet_setup_msi_irq(irq)) 508 if (hpet_setup_msi_irq(irq))
509 return -EINVAL; 509 return -EINVAL;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e60c38cc0eed..12aff2537682 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -145,7 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_finit);
145 * The _current_ task is using the FPU for the first time 145 * The _current_ task is using the FPU for the first time
146 * so initialize it and set the mxcsr to its default 146 * so initialize it and set the mxcsr to its default
147 * value at reset if we support XMM instructions and then 147 * value at reset if we support XMM instructions and then
148 * remeber the current task has used the FPU. 148 * remember the current task has used the FPU.
149 */ 149 */
150int init_fpu(struct task_struct *tsk) 150int init_fpu(struct task_struct *tsk)
151{ 151{
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index b42ca694dc68..8eeaa81de066 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -10,7 +10,7 @@
10 */ 10 */
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/sysdev.h> 13#include <linux/syscore_ops.h>
14 14
15#include <asm/dma.h> 15#include <asm/dma.h>
16 16
@@ -21,7 +21,7 @@
21 * in asm/dma.h. 21 * in asm/dma.h.
22 */ 22 */
23 23
24static int i8237A_resume(struct sys_device *dev) 24static void i8237A_resume(void)
25{ 25{
26 unsigned long flags; 26 unsigned long flags;
27 int i; 27 int i;
@@ -41,31 +41,15 @@ static int i8237A_resume(struct sys_device *dev)
41 enable_dma(4); 41 enable_dma(4);
42 42
43 release_dma_lock(flags); 43 release_dma_lock(flags);
44
45 return 0;
46} 44}
47 45
48static int i8237A_suspend(struct sys_device *dev, pm_message_t state) 46static struct syscore_ops i8237_syscore_ops = {
49{
50 return 0;
51}
52
53static struct sysdev_class i8237_sysdev_class = {
54 .name = "i8237",
55 .suspend = i8237A_suspend,
56 .resume = i8237A_resume, 47 .resume = i8237A_resume,
57}; 48};
58 49
59static struct sys_device device_i8237A = { 50static int __init i8237A_init_ops(void)
60 .id = 0,
61 .cls = &i8237_sysdev_class,
62};
63
64static int __init i8237A_init_sysfs(void)
65{ 51{
66 int error = sysdev_class_register(&i8237_sysdev_class); 52 register_syscore_ops(&i8237_syscore_ops);
67 if (!error) 53 return 0;
68 error = sysdev_register(&device_i8237A);
69 return error;
70} 54}
71device_initcall(i8237A_init_sysfs); 55device_initcall(i8237A_init_ops);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 212fe6590aab..577e90cadaeb 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -117,79 +117,6 @@ void __init setup_pit_timer(void)
117} 117}
118 118
119#ifndef CONFIG_X86_64 119#ifndef CONFIG_X86_64
120/*
121 * Since the PIT overflows every tick, its not very useful
122 * to just read by itself. So use jiffies to emulate a free
123 * running counter:
124 */
125static cycle_t pit_read(struct clocksource *cs)
126{
127 static int old_count;
128 static u32 old_jifs;
129 unsigned long flags;
130 int count;
131 u32 jifs;
132
133 raw_spin_lock_irqsave(&i8253_lock, flags);
134 /*
135 * Although our caller may have the read side of xtime_lock,
136 * this is now a seqlock, and we are cheating in this routine
137 * by having side effects on state that we cannot undo if
138 * there is a collision on the seqlock and our caller has to
139 * retry. (Namely, old_jifs and old_count.) So we must treat
140 * jiffies as volatile despite the lock. We read jiffies
141 * before latching the timer count to guarantee that although
142 * the jiffies value might be older than the count (that is,
143 * the counter may underflow between the last point where
144 * jiffies was incremented and the point where we latch the
145 * count), it cannot be newer.
146 */
147 jifs = jiffies;
148 outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
149 count = inb_pit(PIT_CH0); /* read the latched count */
150 count |= inb_pit(PIT_CH0) << 8;
151
152 /* VIA686a test code... reset the latch if count > max + 1 */
153 if (count > LATCH) {
154 outb_pit(0x34, PIT_MODE);
155 outb_pit(LATCH & 0xff, PIT_CH0);
156 outb_pit(LATCH >> 8, PIT_CH0);
157 count = LATCH - 1;
158 }
159
160 /*
161 * It's possible for count to appear to go the wrong way for a
162 * couple of reasons:
163 *
164 * 1. The timer counter underflows, but we haven't handled the
165 * resulting interrupt and incremented jiffies yet.
166 * 2. Hardware problem with the timer, not giving us continuous time,
167 * the counter does small "jumps" upwards on some Pentium systems,
168 * (see c't 95/10 page 335 for Neptun bug.)
169 *
170 * Previous attempts to handle these cases intelligently were
171 * buggy, so we just do the simple thing now.
172 */
173 if (count > old_count && jifs == old_jifs)
174 count = old_count;
175
176 old_count = count;
177 old_jifs = jifs;
178
179 raw_spin_unlock_irqrestore(&i8253_lock, flags);
180
181 count = (LATCH - 1) - count;
182
183 return (cycle_t)(jifs * LATCH) + count;
184}
185
186static struct clocksource pit_cs = {
187 .name = "pit",
188 .rating = 110,
189 .read = pit_read,
190 .mask = CLOCKSOURCE_MASK(32),
191};
192
193static int __init init_pit_clocksource(void) 120static int __init init_pit_clocksource(void)
194{ 121{
195 /* 122 /*
@@ -203,8 +130,7 @@ static int __init init_pit_clocksource(void)
203 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) 130 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
204 return 0; 131 return 0;
205 132
206 return clocksource_register_hz(&pit_cs, CLOCK_TICK_RATE); 133 return clocksource_i8253_init();
207} 134}
208arch_initcall(init_pit_clocksource); 135arch_initcall(init_pit_clocksource);
209
210#endif /* !CONFIG_X86_64 */ 136#endif /* !CONFIG_X86_64 */
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 20757cb2efa3..65b8f5c2eebf 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -8,7 +8,7 @@
8#include <linux/random.h> 8#include <linux/random.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <linux/sysdev.h> 11#include <linux/syscore_ops.h>
12#include <linux/bitops.h> 12#include <linux/bitops.h>
13#include <linux/acpi.h> 13#include <linux/acpi.h>
14#include <linux/io.h> 14#include <linux/io.h>
@@ -112,7 +112,7 @@ static void make_8259A_irq(unsigned int irq)
112{ 112{
113 disable_irq_nosync(irq); 113 disable_irq_nosync(irq);
114 io_apic_irqs &= ~(1<<irq); 114 io_apic_irqs &= ~(1<<irq);
115 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, 115 irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
116 i8259A_chip.name); 116 i8259A_chip.name);
117 enable_irq(irq); 117 enable_irq(irq);
118} 118}
@@ -245,20 +245,19 @@ static void save_ELCR(char *trigger)
245 trigger[1] = inb(0x4d1) & 0xDE; 245 trigger[1] = inb(0x4d1) & 0xDE;
246} 246}
247 247
248static int i8259A_resume(struct sys_device *dev) 248static void i8259A_resume(void)
249{ 249{
250 init_8259A(i8259A_auto_eoi); 250 init_8259A(i8259A_auto_eoi);
251 restore_ELCR(irq_trigger); 251 restore_ELCR(irq_trigger);
252 return 0;
253} 252}
254 253
255static int i8259A_suspend(struct sys_device *dev, pm_message_t state) 254static int i8259A_suspend(void)
256{ 255{
257 save_ELCR(irq_trigger); 256 save_ELCR(irq_trigger);
258 return 0; 257 return 0;
259} 258}
260 259
261static int i8259A_shutdown(struct sys_device *dev) 260static void i8259A_shutdown(void)
262{ 261{
263 /* Put the i8259A into a quiescent state that 262 /* Put the i8259A into a quiescent state that
264 * the kernel initialization code can get it 263 * the kernel initialization code can get it
@@ -266,21 +265,14 @@ static int i8259A_shutdown(struct sys_device *dev)
266 */ 265 */
267 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 266 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
268 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ 267 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
269 return 0;
270} 268}
271 269
272static struct sysdev_class i8259_sysdev_class = { 270static struct syscore_ops i8259_syscore_ops = {
273 .name = "i8259",
274 .suspend = i8259A_suspend, 271 .suspend = i8259A_suspend,
275 .resume = i8259A_resume, 272 .resume = i8259A_resume,
276 .shutdown = i8259A_shutdown, 273 .shutdown = i8259A_shutdown,
277}; 274};
278 275
279static struct sys_device device_i8259A = {
280 .id = 0,
281 .cls = &i8259_sysdev_class,
282};
283
284static void mask_8259A(void) 276static void mask_8259A(void)
285{ 277{
286 unsigned long flags; 278 unsigned long flags;
@@ -399,17 +391,12 @@ struct legacy_pic default_legacy_pic = {
399 391
400struct legacy_pic *legacy_pic = &default_legacy_pic; 392struct legacy_pic *legacy_pic = &default_legacy_pic;
401 393
402static int __init i8259A_init_sysfs(void) 394static int __init i8259A_init_ops(void)
403{ 395{
404 int error; 396 if (legacy_pic == &default_legacy_pic)
405 397 register_syscore_ops(&i8259_syscore_ops);
406 if (legacy_pic != &default_legacy_pic)
407 return 0;
408 398
409 error = sysdev_class_register(&i8259_sysdev_class); 399 return 0;
410 if (!error)
411 error = sysdev_register(&device_i8259A);
412 return error;
413} 400}
414 401
415device_initcall(i8259A_init_sysfs); 402device_initcall(i8259A_init_ops);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8eec0ec59af2..8c968974253d 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,22 +14,9 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/bitmap.h>
17#include <asm/syscalls.h> 18#include <asm/syscalls.h>
18 19
19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
20static void set_bitmap(unsigned long *bitmap, unsigned int base,
21 unsigned int extent, int new_value)
22{
23 unsigned int i;
24
25 for (i = base; i < base + extent; i++) {
26 if (new_value)
27 __set_bit(i, bitmap);
28 else
29 __clear_bit(i, bitmap);
30 }
31}
32
33/* 20/*
34 * this changes the io permissions bitmap in the current task. 21 * this changes the io permissions bitmap in the current task.
35 */ 22 */
@@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
69 */ 56 */
70 tss = &per_cpu(init_tss, get_cpu()); 57 tss = &per_cpu(init_tss, get_cpu());
71 58
72 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); 59 if (turn_on)
60 bitmap_clear(t->io_bitmap_ptr, from, num);
61 else
62 bitmap_set(t->io_bitmap_ptr, from, num);
73 63
74 /* 64 /*
75 * Search for a (possibly new) maximum. This is simple and stupid, 65 * Search for a (possibly new) maximum. This is simple and stupid,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 387b6a0c9e81..1cb0b9fc78dc 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -8,6 +8,7 @@
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9#include <linux/smp.h> 9#include <linux/smp.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/delay.h>
11 12
12#include <asm/apic.h> 13#include <asm/apic.h>
13#include <asm/io_apic.h> 14#include <asm/io_apic.h>
@@ -44,9 +45,9 @@ void ack_bad_irq(unsigned int irq)
44 45
45#define irq_stats(x) (&per_cpu(irq_stat, x)) 46#define irq_stats(x) (&per_cpu(irq_stat, x))
46/* 47/*
47 * /proc/interrupts printing: 48 * /proc/interrupts printing for arch specific interrupts
48 */ 49 */
49static int show_other_interrupts(struct seq_file *p, int prec) 50int arch_show_interrupts(struct seq_file *p, int prec)
50{ 51{
51 int j; 52 int j;
52 53
@@ -122,59 +123,6 @@ static int show_other_interrupts(struct seq_file *p, int prec)
122 return 0; 123 return 0;
123} 124}
124 125
125int show_interrupts(struct seq_file *p, void *v)
126{
127 unsigned long flags, any_count = 0;
128 int i = *(loff_t *) v, j, prec;
129 struct irqaction *action;
130 struct irq_desc *desc;
131
132 if (i > nr_irqs)
133 return 0;
134
135 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
136 j *= 10;
137
138 if (i == nr_irqs)
139 return show_other_interrupts(p, prec);
140
141 /* print header */
142 if (i == 0) {
143 seq_printf(p, "%*s", prec + 8, "");
144 for_each_online_cpu(j)
145 seq_printf(p, "CPU%-8d", j);
146 seq_putc(p, '\n');
147 }
148
149 desc = irq_to_desc(i);
150 if (!desc)
151 return 0;
152
153 raw_spin_lock_irqsave(&desc->lock, flags);
154 for_each_online_cpu(j)
155 any_count |= kstat_irqs_cpu(i, j);
156 action = desc->action;
157 if (!action && !any_count)
158 goto out;
159
160 seq_printf(p, "%*d: ", prec, i);
161 for_each_online_cpu(j)
162 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
163 seq_printf(p, " %8s", desc->irq_data.chip->name);
164 seq_printf(p, "-%-8s", desc->name);
165
166 if (action) {
167 seq_printf(p, " %s", action->name);
168 while ((action = action->next) != NULL)
169 seq_printf(p, ", %s", action->name);
170 }
171
172 seq_putc(p, '\n');
173out:
174 raw_spin_unlock_irqrestore(&desc->lock, flags);
175 return 0;
176}
177
178/* 126/*
179 * /proc/stat helpers 127 * /proc/stat helpers
180 */ 128 */
@@ -276,15 +224,6 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
276 224
277EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 225EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
278 226
279#ifdef CONFIG_OF
280unsigned int irq_create_of_mapping(struct device_node *controller,
281 const u32 *intspec, unsigned int intsize)
282{
283 return intspec[0];
284}
285EXPORT_SYMBOL_GPL(irq_create_of_mapping);
286#endif
287
288#ifdef CONFIG_HOTPLUG_CPU 227#ifdef CONFIG_HOTPLUG_CPU
289/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 228/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
290void fixup_irqs(void) 229void fixup_irqs(void)
@@ -293,6 +232,7 @@ void fixup_irqs(void)
293 static int warned; 232 static int warned;
294 struct irq_desc *desc; 233 struct irq_desc *desc;
295 struct irq_data *data; 234 struct irq_data *data;
235 struct irq_chip *chip;
296 236
297 for_each_irq_desc(irq, desc) { 237 for_each_irq_desc(irq, desc) {
298 int break_affinity = 0; 238 int break_affinity = 0;
@@ -307,10 +247,10 @@ void fixup_irqs(void)
307 /* interrupt's are disabled at this point */ 247 /* interrupt's are disabled at this point */
308 raw_spin_lock(&desc->lock); 248 raw_spin_lock(&desc->lock);
309 249
310 data = &desc->irq_data; 250 data = irq_desc_get_irq_data(desc);
311 affinity = data->affinity; 251 affinity = data->affinity;
312 if (!irq_has_action(irq) || 252 if (!irq_has_action(irq) ||
313 cpumask_equal(affinity, cpu_online_mask)) { 253 cpumask_subset(affinity, cpu_online_mask)) {
314 raw_spin_unlock(&desc->lock); 254 raw_spin_unlock(&desc->lock);
315 continue; 255 continue;
316 } 256 }
@@ -327,16 +267,17 @@ void fixup_irqs(void)
327 affinity = cpu_all_mask; 267 affinity = cpu_all_mask;
328 } 268 }
329 269
330 if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask) 270 chip = irq_data_get_irq_chip(data);
331 data->chip->irq_mask(data); 271 if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
272 chip->irq_mask(data);
332 273
333 if (data->chip->irq_set_affinity) 274 if (chip->irq_set_affinity)
334 data->chip->irq_set_affinity(data, affinity, true); 275 chip->irq_set_affinity(data, affinity, true);
335 else if (!(warned++)) 276 else if (!(warned++))
336 set_affinity = 0; 277 set_affinity = 0;
337 278
338 if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask) 279 if (!irqd_can_move_in_process_context(data) && chip->irq_unmask)
339 data->chip->irq_unmask(data); 280 chip->irq_unmask(data);
340 281
341 raw_spin_unlock(&desc->lock); 282 raw_spin_unlock(&desc->lock);
342 283
@@ -368,10 +309,11 @@ void fixup_irqs(void)
368 irq = __this_cpu_read(vector_irq[vector]); 309 irq = __this_cpu_read(vector_irq[vector]);
369 310
370 desc = irq_to_desc(irq); 311 desc = irq_to_desc(irq);
371 data = &desc->irq_data; 312 data = irq_desc_get_irq_data(desc);
313 chip = irq_data_get_irq_chip(data);
372 raw_spin_lock(&desc->lock); 314 raw_spin_lock(&desc->lock);
373 if (data->chip->irq_retrigger) 315 if (chip->irq_retrigger)
374 data->chip->irq_retrigger(data); 316 chip->irq_retrigger(data);
375 raw_spin_unlock(&desc->lock); 317 raw_spin_unlock(&desc->lock);
376 } 318 }
377 } 319 }
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9974d21048fd..72090705a656 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -172,7 +172,7 @@ asmlinkage void do_softirq(void)
172 172
173 call_on_stack(__do_softirq, isp); 173 call_on_stack(__do_softirq, isp);
174 /* 174 /*
175 * Shouldnt happen, we returned above if in_interrupt(): 175 * Shouldn't happen, we returned above if in_interrupt():
176 */ 176 */
177 WARN_ON_ONCE(softirq_count()); 177 WARN_ON_ONCE(softirq_count());
178 } 178 }
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c752e973958d..f470e4ef993e 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -25,6 +25,7 @@
25#include <asm/setup.h> 25#include <asm/setup.h>
26#include <asm/i8259.h> 26#include <asm/i8259.h>
27#include <asm/traps.h> 27#include <asm/traps.h>
28#include <asm/prom.h>
28 29
29/* 30/*
30 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: 31 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -71,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
71static struct irqaction fpu_irq = { 72static struct irqaction fpu_irq = {
72 .handler = math_error_irq, 73 .handler = math_error_irq,
73 .name = "fpu", 74 .name = "fpu",
75 .flags = IRQF_NO_THREAD,
74}; 76};
75#endif 77#endif
76 78
@@ -80,6 +82,7 @@ static struct irqaction fpu_irq = {
80static struct irqaction irq2 = { 82static struct irqaction irq2 = {
81 .handler = no_action, 83 .handler = no_action,
82 .name = "cascade", 84 .name = "cascade",
85 .flags = IRQF_NO_THREAD,
83}; 86};
84 87
85DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 88DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@ -110,7 +113,7 @@ void __init init_ISA_irqs(void)
110 legacy_pic->init(0); 113 legacy_pic->init(0);
111 114
112 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) 115 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
113 set_irq_chip_and_handler_name(i, chip, handle_level_irq, name); 116 irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
114} 117}
115 118
116void __init init_IRQ(void) 119void __init init_IRQ(void)
@@ -118,6 +121,12 @@ void __init init_IRQ(void)
118 int i; 121 int i;
119 122
120 /* 123 /*
124 * We probably need a better place for this, but it works for
125 * now ...
126 */
127 x86_add_irq_domains();
128
129 /*
121 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. 130 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
122 * If these IRQ's are handled by legacy interrupt-controllers like PIC, 131 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
123 * then this configuration will likely be static after the boot. If 132 * then this configuration will likely be static after the boot. If
@@ -164,14 +173,77 @@ static void __init smp_intr_init(void)
164 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 173 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
165 174
166 /* IPIs for invalidation */ 175 /* IPIs for invalidation */
167 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); 176#define ALLOC_INVTLB_VEC(NR) \
168 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); 177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
169 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); 178 invalidate_interrupt##NR)
170 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); 179
171 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); 180 switch (NUM_INVALIDATE_TLB_VECTORS) {
172 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); 181 default:
173 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); 182 ALLOC_INVTLB_VEC(31);
174 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); 183 case 31:
184 ALLOC_INVTLB_VEC(30);
185 case 30:
186 ALLOC_INVTLB_VEC(29);
187 case 29:
188 ALLOC_INVTLB_VEC(28);
189 case 28:
190 ALLOC_INVTLB_VEC(27);
191 case 27:
192 ALLOC_INVTLB_VEC(26);
193 case 26:
194 ALLOC_INVTLB_VEC(25);
195 case 25:
196 ALLOC_INVTLB_VEC(24);
197 case 24:
198 ALLOC_INVTLB_VEC(23);
199 case 23:
200 ALLOC_INVTLB_VEC(22);
201 case 22:
202 ALLOC_INVTLB_VEC(21);
203 case 21:
204 ALLOC_INVTLB_VEC(20);
205 case 20:
206 ALLOC_INVTLB_VEC(19);
207 case 19:
208 ALLOC_INVTLB_VEC(18);
209 case 18:
210 ALLOC_INVTLB_VEC(17);
211 case 17:
212 ALLOC_INVTLB_VEC(16);
213 case 16:
214 ALLOC_INVTLB_VEC(15);
215 case 15:
216 ALLOC_INVTLB_VEC(14);
217 case 14:
218 ALLOC_INVTLB_VEC(13);
219 case 13:
220 ALLOC_INVTLB_VEC(12);
221 case 12:
222 ALLOC_INVTLB_VEC(11);
223 case 11:
224 ALLOC_INVTLB_VEC(10);
225 case 10:
226 ALLOC_INVTLB_VEC(9);
227 case 9:
228 ALLOC_INVTLB_VEC(8);
229 case 8:
230 ALLOC_INVTLB_VEC(7);
231 case 7:
232 ALLOC_INVTLB_VEC(6);
233 case 6:
234 ALLOC_INVTLB_VEC(5);
235 case 5:
236 ALLOC_INVTLB_VEC(4);
237 case 4:
238 ALLOC_INVTLB_VEC(3);
239 case 3:
240 ALLOC_INVTLB_VEC(2);
241 case 2:
242 ALLOC_INVTLB_VEC(1);
243 case 1:
244 ALLOC_INVTLB_VEC(0);
245 break;
246 }
175 247
176 /* IPI for generic function call */ 248 /* IPI for generic function call */
177 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 249 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -243,7 +315,7 @@ void __init native_init_IRQ(void)
243 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); 315 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
244 } 316 }
245 317
246 if (!acpi_ioapic) 318 if (!acpi_ioapic && !of_ioapic)
247 setup_irq(2, &irq2); 319 setup_irq(2, &irq2);
248 320
249#ifdef CONFIG_X86_32 321#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index a4130005028a..5f9ecff328b5 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -121,8 +121,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
121 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, 121 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
122 dbg_reg_def[regno].size); 122 dbg_reg_def[regno].size);
123 123
124 switch (regno) {
125#ifdef CONFIG_X86_32 124#ifdef CONFIG_X86_32
125 switch (regno) {
126 case GDB_SS: 126 case GDB_SS:
127 if (!user_mode_vm(regs)) 127 if (!user_mode_vm(regs))
128 *(unsigned long *)mem = __KERNEL_DS; 128 *(unsigned long *)mem = __KERNEL_DS;
@@ -135,8 +135,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
135 case GDB_FS: 135 case GDB_FS:
136 *(unsigned long *)mem = 0xFFFF; 136 *(unsigned long *)mem = 0xFFFF;
137 break; 137 break;
138#endif
139 } 138 }
139#endif
140 return dbg_reg_def[regno].name; 140 return dbg_reg_def[regno].name;
141} 141}
142 142
@@ -278,7 +278,7 @@ static int hw_break_release_slot(int breakno)
278 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); 278 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
279 if (dbg_release_bp_slot(*pevent)) 279 if (dbg_release_bp_slot(*pevent))
280 /* 280 /*
281 * The debugger is responisble for handing the retry on 281 * The debugger is responsible for handing the retry on
282 * remove failure. 282 * remove failure.
283 */ 283 */
284 return -1; 284 return -1;
@@ -533,15 +533,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
533 } 533 }
534 return NOTIFY_DONE; 534 return NOTIFY_DONE;
535 535
536 case DIE_NMIWATCHDOG:
537 if (atomic_read(&kgdb_active) != -1) {
538 /* KGDB CPU roundup: */
539 kgdb_nmicallback(raw_smp_processor_id(), regs);
540 return NOTIFY_STOP;
541 }
542 /* Enter debugger: */
543 break;
544
545 case DIE_DEBUG: 536 case DIE_DEBUG:
546 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { 537 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
547 if (user_mode(regs)) 538 if (user_mode(regs))
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index d91c477b3f62..c969fd9d1566 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
1276 if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) 1276 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1277 return 0; 1277 return 0;
1278 1278
1279 /*
1280 * Do not optimize in the entry code due to the unstable
1281 * stack handling.
1282 */
1283 if ((paddr >= (unsigned long )__entry_text_start) &&
1284 (paddr < (unsigned long )__entry_text_end))
1285 return 0;
1286
1279 /* Check there is enough space for a relative jump. */ 1287 /* Check there is enough space for a relative jump. */
1280 if (size - offset < RELATIVEJUMP_SIZE) 1288 if (size - offset < RELATIVEJUMP_SIZE)
1281 return 0; 1289 return 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8dc44662394b..33c07b0b122e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -493,7 +493,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
493 native_smp_prepare_boot_cpu(); 493 native_smp_prepare_boot_cpu();
494} 494}
495 495
496static void kvm_guest_cpu_online(void *dummy) 496static void __cpuinit kvm_guest_cpu_online(void *dummy)
497{ 497{
498 kvm_guest_cpu_init(); 498 kvm_guest_cpu_init();
499} 499}
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 63eaf6596233..177183cbb6ae 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -259,7 +259,7 @@ static int __init mca_init(void)
259 /* 259 /*
260 * WARNING: Be careful when making changes here. Putting an adapter 260 * WARNING: Be careful when making changes here. Putting an adapter
261 * and the motherboard simultaneously into setup mode may result in 261 * and the motherboard simultaneously into setup mode may result in
262 * damage to chips (according to The Indispensible PC Hardware Book 262 * damage to chips (according to The Indispensable PC Hardware Book
263 * by Hans-Peter Messmer). Also, we disable system interrupts (so 263 * by Hans-Peter Messmer). Also, we disable system interrupts (so
264 * that we are not disturbed in the middle of this). 264 * that we are not disturbed in the middle of this).
265 */ 265 */
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 0fe6d1a66c38..c5610384ab16 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,7 +66,6 @@ struct microcode_amd {
66 unsigned int mpb[0]; 66 unsigned int mpb[0];
67}; 67};
68 68
69#define UCODE_MAX_SIZE 2048
70#define UCODE_CONTAINER_SECTION_HDR 8 69#define UCODE_CONTAINER_SECTION_HDR 8
71#define UCODE_CONTAINER_HEADER_SIZE 12 70#define UCODE_CONTAINER_HEADER_SIZE 12
72 71
@@ -77,20 +76,20 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
77 struct cpuinfo_x86 *c = &cpu_data(cpu); 76 struct cpuinfo_x86 *c = &cpu_data(cpu);
78 u32 dummy; 77 u32 dummy;
79 78
80 memset(csig, 0, sizeof(*csig));
81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 79 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " 80 pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
83 "supported\n", cpu, c->x86);
84 return -1; 81 return -1;
85 } 82 }
83
86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 84 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); 85 pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
86
88 return 0; 87 return 0;
89} 88}
90 89
91static int get_matching_microcode(int cpu, void *mc, int rev) 90static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
91 int rev)
92{ 92{
93 struct microcode_header_amd *mc_header = mc;
94 unsigned int current_cpu_id; 93 unsigned int current_cpu_id;
95 u16 equiv_cpu_id = 0; 94 u16 equiv_cpu_id = 0;
96 unsigned int i = 0; 95 unsigned int i = 0;
@@ -109,17 +108,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
109 if (!equiv_cpu_id) 108 if (!equiv_cpu_id)
110 return 0; 109 return 0;
111 110
112 if (mc_header->processor_rev_id != equiv_cpu_id) 111 if (mc_hdr->processor_rev_id != equiv_cpu_id)
113 return 0; 112 return 0;
114 113
115 /* ucode might be chipset specific -- currently we don't support this */ 114 /* ucode might be chipset specific -- currently we don't support this */
116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 115 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
117 pr_err("CPU%d: loading of chipset specific code not yet supported\n", 116 pr_err("CPU%d: chipset specific code not yet supported\n",
118 cpu); 117 cpu);
119 return 0; 118 return 0;
120 } 119 }
121 120
122 if (mc_header->patch_id <= rev) 121 if (mc_hdr->patch_id <= rev)
123 return 0; 122 return 0;
124 123
125 return 1; 124 return 1;
@@ -144,71 +143,93 @@ static int apply_microcode_amd(int cpu)
144 143
145 /* check current patch id and patch's id for match */ 144 /* check current patch id and patch's id for match */
146 if (rev != mc_amd->hdr.patch_id) { 145 if (rev != mc_amd->hdr.patch_id) {
147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n", 146 pr_err("CPU%d: update failed for patch_level=0x%08x\n",
148 cpu, mc_amd->hdr.patch_id); 147 cpu, mc_amd->hdr.patch_id);
149 return -1; 148 return -1;
150 } 149 }
151 150
152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); 151 pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
153 uci->cpu_sig.rev = rev; 152 uci->cpu_sig.rev = rev;
154 153
155 return 0; 154 return 0;
156} 155}
157 156
158static void * 157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
159get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
160{ 158{
161 unsigned int total_size; 159 struct cpuinfo_x86 *c = &cpu_data(cpu);
162 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 160 unsigned int max_size, actual_size;
163 void *mc; 161
162#define F1XH_MPB_MAX_SIZE 2048
163#define F14H_MPB_MAX_SIZE 1824
164#define F15H_MPB_MAX_SIZE 4096
165
166 switch (c->x86) {
167 case 0x14:
168 max_size = F14H_MPB_MAX_SIZE;
169 break;
170 case 0x15:
171 max_size = F15H_MPB_MAX_SIZE;
172 break;
173 default:
174 max_size = F1XH_MPB_MAX_SIZE;
175 break;
176 }
164 177
165 get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR); 178 actual_size = buf[4] + (buf[5] << 8);
166 179
167 if (section_hdr[0] != UCODE_UCODE_TYPE) { 180 if (actual_size > size || actual_size > max_size) {
168 pr_err("error: invalid type field in container file section header\n"); 181 pr_err("section size mismatch\n");
169 return NULL; 182 return 0;
170 } 183 }
171 184
172 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 185 return actual_size;
186}
173 187
174 if (total_size > size || total_size > UCODE_MAX_SIZE) { 188static struct microcode_header_amd *
175 pr_err("error: size mismatch\n"); 189get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
176 return NULL; 190{
191 struct microcode_header_amd *mc = NULL;
192 unsigned int actual_size = 0;
193
194 if (buf[0] != UCODE_UCODE_TYPE) {
195 pr_err("invalid type field in container file section header\n");
196 goto out;
177 } 197 }
178 198
179 mc = vzalloc(UCODE_MAX_SIZE); 199 actual_size = verify_ucode_size(cpu, buf, size);
200 if (!actual_size)
201 goto out;
202
203 mc = vzalloc(actual_size);
180 if (!mc) 204 if (!mc)
181 return NULL; 205 goto out;
182 206
183 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size); 207 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
184 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; 208 *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
185 209
210out:
186 return mc; 211 return mc;
187} 212}
188 213
189static int install_equiv_cpu_table(const u8 *buf) 214static int install_equiv_cpu_table(const u8 *buf)
190{ 215{
191 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 216 unsigned int *ibuf = (unsigned int *)buf;
192 unsigned int *buf_pos = (unsigned int *)container_hdr; 217 unsigned int type = ibuf[1];
193 unsigned long size; 218 unsigned int size = ibuf[2];
194 219
195 get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE); 220 if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
196 221 pr_err("empty section/"
197 size = buf_pos[2]; 222 "invalid type field in container file section header\n");
198 223 return -EINVAL;
199 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
200 pr_err("error: invalid type field in container file section header\n");
201 return 0;
202 } 224 }
203 225
204 equiv_cpu_table = vmalloc(size); 226 equiv_cpu_table = vmalloc(size);
205 if (!equiv_cpu_table) { 227 if (!equiv_cpu_table) {
206 pr_err("failed to allocate equivalent CPU table\n"); 228 pr_err("failed to allocate equivalent CPU table\n");
207 return 0; 229 return -ENOMEM;
208 } 230 }
209 231
210 buf += UCODE_CONTAINER_HEADER_SIZE; 232 get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
211 get_ucode_data(equiv_cpu_table, buf, size);
212 233
213 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 234 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
214} 235}
@@ -223,16 +244,16 @@ static enum ucode_state
223generic_load_microcode(int cpu, const u8 *data, size_t size) 244generic_load_microcode(int cpu, const u8 *data, size_t size)
224{ 245{
225 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 246 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
247 struct microcode_header_amd *mc_hdr = NULL;
248 unsigned int mc_size, leftover;
249 int offset;
226 const u8 *ucode_ptr = data; 250 const u8 *ucode_ptr = data;
227 void *new_mc = NULL; 251 void *new_mc = NULL;
228 void *mc; 252 unsigned int new_rev = uci->cpu_sig.rev;
229 int new_rev = uci->cpu_sig.rev;
230 unsigned int leftover;
231 unsigned long offset;
232 enum ucode_state state = UCODE_OK; 253 enum ucode_state state = UCODE_OK;
233 254
234 offset = install_equiv_cpu_table(ucode_ptr); 255 offset = install_equiv_cpu_table(ucode_ptr);
235 if (!offset) { 256 if (offset < 0) {
236 pr_err("failed to create equivalent cpu table\n"); 257 pr_err("failed to create equivalent cpu table\n");
237 return UCODE_ERROR; 258 return UCODE_ERROR;
238 } 259 }
@@ -241,64 +262,65 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
241 leftover = size - offset; 262 leftover = size - offset;
242 263
243 while (leftover) { 264 while (leftover) {
244 unsigned int uninitialized_var(mc_size); 265 mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
245 struct microcode_header_amd *mc_header; 266 if (!mc_hdr)
246
247 mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
248 if (!mc)
249 break; 267 break;
250 268
251 mc_header = (struct microcode_header_amd *)mc; 269 if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
252 if (get_matching_microcode(cpu, mc, new_rev)) {
253 vfree(new_mc); 270 vfree(new_mc);
254 new_rev = mc_header->patch_id; 271 new_rev = mc_hdr->patch_id;
255 new_mc = mc; 272 new_mc = mc_hdr;
256 } else 273 } else
257 vfree(mc); 274 vfree(mc_hdr);
258 275
259 ucode_ptr += mc_size; 276 ucode_ptr += mc_size;
260 leftover -= mc_size; 277 leftover -= mc_size;
261 } 278 }
262 279
263 if (new_mc) { 280 if (!new_mc) {
264 if (!leftover) {
265 vfree(uci->mc);
266 uci->mc = new_mc;
267 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
268 cpu, new_rev, uci->cpu_sig.rev);
269 } else {
270 vfree(new_mc);
271 state = UCODE_ERROR;
272 }
273 } else
274 state = UCODE_NFOUND; 281 state = UCODE_NFOUND;
282 goto free_table;
283 }
275 284
285 if (!leftover) {
286 vfree(uci->mc);
287 uci->mc = new_mc;
288 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
289 cpu, uci->cpu_sig.rev, new_rev);
290 } else {
291 vfree(new_mc);
292 state = UCODE_ERROR;
293 }
294
295free_table:
276 free_equiv_cpu_table(); 296 free_equiv_cpu_table();
277 297
278 return state; 298 return state;
279} 299}
280 300
281static enum ucode_state request_microcode_fw(int cpu, struct device *device) 301static enum ucode_state request_microcode_amd(int cpu, struct device *device)
282{ 302{
283 const char *fw_name = "amd-ucode/microcode_amd.bin"; 303 const char *fw_name = "amd-ucode/microcode_amd.bin";
284 const struct firmware *firmware; 304 const struct firmware *fw;
285 enum ucode_state ret; 305 enum ucode_state ret = UCODE_NFOUND;
286 306
287 if (request_firmware(&firmware, fw_name, device)) { 307 if (request_firmware(&fw, fw_name, device)) {
288 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 308 pr_err("failed to load file %s\n", fw_name);
289 return UCODE_NFOUND; 309 goto out;
290 } 310 }
291 311
292 if (*(u32 *)firmware->data != UCODE_MAGIC) { 312 ret = UCODE_ERROR;
293 pr_err("invalid UCODE_MAGIC (0x%08x)\n", 313 if (*(u32 *)fw->data != UCODE_MAGIC) {
294 *(u32 *)firmware->data); 314 pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
295 return UCODE_ERROR; 315 goto fw_release;
296 } 316 }
297 317
298 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 318 ret = generic_load_microcode(cpu, fw->data, fw->size);
299 319
300 release_firmware(firmware); 320fw_release:
321 release_firmware(fw);
301 322
323out:
302 return ret; 324 return ret;
303} 325}
304 326
@@ -319,7 +341,7 @@ static void microcode_fini_cpu_amd(int cpu)
319 341
320static struct microcode_ops microcode_amd_ops = { 342static struct microcode_ops microcode_amd_ops = {
321 .request_microcode_user = request_microcode_user, 343 .request_microcode_user = request_microcode_user,
322 .request_microcode_fw = request_microcode_fw, 344 .request_microcode_fw = request_microcode_amd,
323 .collect_cpu_info = collect_cpu_info_amd, 345 .collect_cpu_info = collect_cpu_info_amd,
324 .apply_microcode = apply_microcode_amd, 346 .apply_microcode = apply_microcode_amd,
325 .microcode_fini_cpu = microcode_fini_cpu_amd, 347 .microcode_fini_cpu = microcode_fini_cpu_amd,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 1cca374a2bac..f9242800bc84 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -82,6 +82,7 @@
82#include <linux/cpu.h> 82#include <linux/cpu.h>
83#include <linux/fs.h> 83#include <linux/fs.h>
84#include <linux/mm.h> 84#include <linux/mm.h>
85#include <linux/syscore_ops.h>
85 86
86#include <asm/microcode.h> 87#include <asm/microcode.h>
87#include <asm/processor.h> 88#include <asm/processor.h>
@@ -417,8 +418,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
417 if (err) 418 if (err)
418 return err; 419 return err;
419 420
420 if (microcode_init_cpu(cpu) == UCODE_ERROR) 421 if (microcode_init_cpu(cpu) == UCODE_ERROR) {
421 err = -EINVAL; 422 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
423 return -EINVAL;
424 }
422 425
423 return err; 426 return err;
424} 427}
@@ -436,33 +439,25 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
436 return 0; 439 return 0;
437} 440}
438 441
439static int mc_sysdev_resume(struct sys_device *dev) 442static struct sysdev_driver mc_sysdev_driver = {
443 .add = mc_sysdev_add,
444 .remove = mc_sysdev_remove,
445};
446
447/**
448 * mc_bp_resume - Update boot CPU microcode during resume.
449 */
450static void mc_bp_resume(void)
440{ 451{
441 int cpu = dev->id; 452 int cpu = smp_processor_id();
442 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 453 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
443 454
444 if (!cpu_online(cpu))
445 return 0;
446
447 /*
448 * All non-bootup cpus are still disabled,
449 * so only CPU 0 will apply ucode here.
450 *
451 * Moreover, there can be no concurrent
452 * updates from any other places at this point.
453 */
454 WARN_ON(cpu != 0);
455
456 if (uci->valid && uci->mc) 455 if (uci->valid && uci->mc)
457 microcode_ops->apply_microcode(cpu); 456 microcode_ops->apply_microcode(cpu);
458
459 return 0;
460} 457}
461 458
462static struct sysdev_driver mc_sysdev_driver = { 459static struct syscore_ops mc_syscore_ops = {
463 .add = mc_sysdev_add, 460 .resume = mc_bp_resume,
464 .remove = mc_sysdev_remove,
465 .resume = mc_sysdev_resume,
466}; 461};
467 462
468static __cpuinit int 463static __cpuinit int
@@ -540,6 +535,7 @@ static int __init microcode_init(void)
540 if (error) 535 if (error)
541 return error; 536 return error;
542 537
538 register_syscore_ops(&mc_syscore_ops);
543 register_hotcpu_notifier(&mc_cpu_notifier); 539 register_hotcpu_notifier(&mc_cpu_notifier);
544 540
545 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 541 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
@@ -554,6 +550,7 @@ static void __exit microcode_exit(void)
554 microcode_dev_exit(); 550 microcode_dev_exit();
555 551
556 unregister_hotcpu_notifier(&mc_cpu_notifier); 552 unregister_hotcpu_notifier(&mc_cpu_notifier);
553 unregister_syscore_ops(&mc_syscore_ops);
557 554
558 get_online_cpus(); 555 get_online_cpus();
559 mutex_lock(&microcode_mutex); 556 mutex_lock(&microcode_mutex);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 01b0f6d06451..5a532ce646bf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -714,10 +714,6 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
714 *nr_m_spare += 1; 714 *nr_m_spare += 1;
715 } 715 }
716} 716}
717#else /* CONFIG_X86_IO_APIC */
718static
719inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
720#endif /* CONFIG_X86_IO_APIC */
721 717
722static int 718static int
723check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) 719check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
@@ -731,6 +727,10 @@ check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
731 727
732 return ret; 728 return ret;
733} 729}
730#else /* CONFIG_X86_IO_APIC */
731static
732inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
733#endif /* CONFIG_X86_IO_APIC */
734 734
735static int __init replace_intsrc_all(struct mpc_table *mpc, 735static int __init replace_intsrc_all(struct mpc_table *mpc,
736 unsigned long mpc_new_phys, 736 unsigned long mpc_new_phys,
@@ -883,7 +883,7 @@ static int __init update_mp_table(void)
883 883
884 if (!mpc_new_phys) { 884 if (!mpc_new_phys) {
885 unsigned char old, new; 885 unsigned char old, new;
886 /* check if we can change the postion */ 886 /* check if we can change the position */
887 mpc->checksum = 0; 887 mpc->checksum = 0;
888 old = mpf_checksum((unsigned char *)mpc, mpc->length); 888 old = mpf_checksum((unsigned char *)mpc, mpc->length);
889 mpc->checksum = 0xff; 889 mpc->checksum = 0xff;
@@ -892,7 +892,7 @@ static int __init update_mp_table(void)
892 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); 892 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
893 return 0; 893 return 0;
894 } 894 }
895 printk(KERN_INFO "use in-positon replacing\n"); 895 printk(KERN_INFO "use in-position replacing\n");
896 } else { 896 } else {
897 mpf->physptr = mpc_new_phys; 897 mpf->physptr = mpc_new_phys;
898 mpc_new = phys_to_virt(mpc_new_phys); 898 mpc_new = phys_to_virt(mpc_new_phys);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index f56a117cef68..e8c33a302006 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1279,7 +1279,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1279 1279
1280 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { 1280 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
1281 /* 1281 /*
1282 * FIXME: properly scan for devices accross the 1282 * FIXME: properly scan for devices across the
1283 * PCI-to-PCI bridge on every CalIOC2 port. 1283 * PCI-to-PCI bridge on every CalIOC2 port.
1284 */ 1284 */
1285 return 1; 1285 return 1;
@@ -1295,7 +1295,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1295 1295
1296/* 1296/*
1297 * calgary_init_bitmap_from_tce_table(): 1297 * calgary_init_bitmap_from_tce_table():
1298 * Funtion for kdump case. In the second/kdump kernel initialize 1298 * Function for kdump case. In the second/kdump kernel initialize
1299 * the bitmap based on the tce table entries obtained from first kernel 1299 * the bitmap based on the tce table entries obtained from first kernel
1300 */ 1300 */
1301static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) 1301static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c01ffa5b9b87..b117efd24f71 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,7 +27,7 @@
27#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
30#include <linux/sysdev.h> 30#include <linux/syscore_ops.h>
31#include <linux/io.h> 31#include <linux/io.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <asm/atomic.h> 33#include <asm/atomic.h>
@@ -81,6 +81,9 @@ static u32 gart_unmapped_entry;
81#define AGPEXTERN 81#define AGPEXTERN
82#endif 82#endif
83 83
84/* GART can only remap to physical addresses < 1TB */
85#define GART_MAX_PHYS_ADDR (1ULL << 40)
86
84/* backdoor interface to AGP driver */ 87/* backdoor interface to AGP driver */
85AGPEXTERN int agp_memory_reserved; 88AGPEXTERN int agp_memory_reserved;
86AGPEXTERN __u32 *agp_gatt_table; 89AGPEXTERN __u32 *agp_gatt_table;
@@ -212,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
212 size_t size, int dir, unsigned long align_mask) 215 size_t size, int dir, unsigned long align_mask)
213{ 216{
214 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); 217 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
215 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); 218 unsigned long iommu_page;
216 int i; 219 int i;
217 220
221 if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
222 return bad_dma_addr;
223
224 iommu_page = alloc_iommu(dev, npages, align_mask);
218 if (iommu_page == -1) { 225 if (iommu_page == -1) {
219 if (!nonforced_iommu(dev, phys_mem, size)) 226 if (!nonforced_iommu(dev, phys_mem, size))
220 return phys_mem; 227 return phys_mem;
@@ -589,7 +596,7 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
589 aperture_alloc = aper_alloc; 596 aperture_alloc = aper_alloc;
590} 597}
591 598
592static void gart_fixup_northbridges(struct sys_device *dev) 599static void gart_fixup_northbridges(void)
593{ 600{
594 int i; 601 int i;
595 602
@@ -613,33 +620,20 @@ static void gart_fixup_northbridges(struct sys_device *dev)
613 } 620 }
614} 621}
615 622
616static int gart_resume(struct sys_device *dev) 623static void gart_resume(void)
617{ 624{
618 pr_info("PCI-DMA: Resuming GART IOMMU\n"); 625 pr_info("PCI-DMA: Resuming GART IOMMU\n");
619 626
620 gart_fixup_northbridges(dev); 627 gart_fixup_northbridges();
621 628
622 enable_gart_translations(); 629 enable_gart_translations();
623
624 return 0;
625} 630}
626 631
627static int gart_suspend(struct sys_device *dev, pm_message_t state) 632static struct syscore_ops gart_syscore_ops = {
628{
629 return 0;
630}
631
632static struct sysdev_class gart_sysdev_class = {
633 .name = "gart",
634 .suspend = gart_suspend,
635 .resume = gart_resume, 633 .resume = gart_resume,
636 634
637}; 635};
638 636
639static struct sys_device device_gart = {
640 .cls = &gart_sysdev_class,
641};
642
643/* 637/*
644 * Private Northbridge GATT initialization in case we cannot use the 638 * Private Northbridge GATT initialization in case we cannot use the
645 * AGP driver for some reason. 639 * AGP driver for some reason.
@@ -650,7 +644,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
650 unsigned aper_base, new_aper_base; 644 unsigned aper_base, new_aper_base;
651 struct pci_dev *dev; 645 struct pci_dev *dev;
652 void *gatt; 646 void *gatt;
653 int i, error; 647 int i;
654 648
655 pr_info("PCI-DMA: Disabling AGP.\n"); 649 pr_info("PCI-DMA: Disabling AGP.\n");
656 650
@@ -685,12 +679,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
685 679
686 agp_gatt_table = gatt; 680 agp_gatt_table = gatt;
687 681
688 error = sysdev_class_register(&gart_sysdev_class); 682 register_syscore_ops(&gart_syscore_ops);
689 if (!error)
690 error = sysdev_register(&device_gart);
691 if (error)
692 panic("Could not register gart_sysdev -- "
693 "would corrupt data on next suspend");
694 683
695 flush_gart(); 684 flush_gart();
696 685
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff4554198981..d46cbe46b7ab 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -87,7 +87,7 @@ void exit_thread(void)
87void show_regs(struct pt_regs *regs) 87void show_regs(struct pt_regs *regs)
88{ 88{
89 show_registers(regs); 89 show_registers(regs);
90 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs)); 90 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
91} 91}
92 92
93void show_regs_common(void) 93void show_regs_common(void)
@@ -110,12 +110,9 @@ void show_regs_common(void)
110 init_utsname()->release, 110 init_utsname()->release,
111 (int)strcspn(init_utsname()->version, " "), 111 (int)strcspn(init_utsname()->version, " "),
112 init_utsname()->version); 112 init_utsname()->version);
113 printk(KERN_CONT " "); 113 printk(KERN_CONT " %s %s", vendor, product);
114 printk(KERN_CONT "%s %s", vendor, product); 114 if (board)
115 if (board) { 115 printk(KERN_CONT "/%s", board);
116 printk(KERN_CONT "/");
117 printk(KERN_CONT "%s", board);
118 }
119 printk(KERN_CONT "\n"); 116 printk(KERN_CONT "\n");
120} 117}
121 118
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bd387e8f73b4..6c9dd922ac0d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -501,6 +501,10 @@ void set_personality_64bit(void)
501 /* Make sure to be in 64bit mode */ 501 /* Make sure to be in 64bit mode */
502 clear_thread_flag(TIF_IA32); 502 clear_thread_flag(TIF_IA32);
503 503
504 /* Ensure the corresponding mm is not marked. */
505 if (current->mm)
506 current->mm->context.ia32_compat = 0;
507
504 /* TBD: overwrites user setup. Should have two bits. 508 /* TBD: overwrites user setup. Should have two bits.
505 But 64bit processes have always behaved this way, 509 But 64bit processes have always behaved this way,
506 so it's not too bad. The main problem is just that 510 so it's not too bad. The main problem is just that
@@ -516,6 +520,10 @@ void set_personality_ia32(void)
516 set_thread_flag(TIF_IA32); 520 set_thread_flag(TIF_IA32);
517 current->personality |= force_personality32; 521 current->personality |= force_personality32;
518 522
523 /* Mark the associated mm as containing 32-bit tasks. */
524 if (current->mm)
525 current->mm->context.ia32_compat = 1;
526
519 /* Prepare the first "return" to user space */ 527 /* Prepare the first "return" to user space */
520 current_thread_info()->status |= TS_COMPAT; 528 current_thread_info()->status |= TS_COMPAT;
521} 529}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 45892dc4b72a..f65e5b521dbd 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
608 unsigned len, type; 608 unsigned len, type;
609 struct perf_event *bp; 609 struct perf_event *bp;
610 610
611 if (ptrace_get_breakpoints(tsk) < 0)
612 return -ESRCH;
613
611 data &= ~DR_CONTROL_RESERVED; 614 data &= ~DR_CONTROL_RESERVED;
612 old_dr7 = ptrace_get_dr7(thread->ptrace_bps); 615 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
613restore: 616restore:
@@ -655,6 +658,9 @@ restore:
655 } 658 }
656 goto restore; 659 goto restore;
657 } 660 }
661
662 ptrace_put_breakpoints(tsk);
663
658 return ((orig_ret < 0) ? orig_ret : rc); 664 return ((orig_ret < 0) ? orig_ret : rc);
659} 665}
660 666
@@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
668 674
669 if (n < HBP_NUM) { 675 if (n < HBP_NUM) {
670 struct perf_event *bp; 676 struct perf_event *bp;
677
678 if (ptrace_get_breakpoints(tsk) < 0)
679 return -ESRCH;
680
671 bp = thread->ptrace_bps[n]; 681 bp = thread->ptrace_bps[n];
672 if (!bp) 682 if (!bp)
673 return 0; 683 val = 0;
674 val = bp->hw.info.address; 684 else
685 val = bp->hw.info.address;
686
687 ptrace_put_breakpoints(tsk);
675 } else if (n == 6) { 688 } else if (n == 6) {
676 val = thread->debugreg6; 689 val = thread->debugreg6;
677 } else if (n == 7) { 690 } else if (n == 7) {
@@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
686 struct perf_event *bp; 699 struct perf_event *bp;
687 struct thread_struct *t = &tsk->thread; 700 struct thread_struct *t = &tsk->thread;
688 struct perf_event_attr attr; 701 struct perf_event_attr attr;
702 int err = 0;
703
704 if (ptrace_get_breakpoints(tsk) < 0)
705 return -ESRCH;
689 706
690 if (!t->ptrace_bps[nr]) { 707 if (!t->ptrace_bps[nr]) {
691 ptrace_breakpoint_init(&attr); 708 ptrace_breakpoint_init(&attr);
@@ -709,24 +726,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
709 * writing for the user. And anyway this is the previous 726 * writing for the user. And anyway this is the previous
710 * behaviour. 727 * behaviour.
711 */ 728 */
712 if (IS_ERR(bp)) 729 if (IS_ERR(bp)) {
713 return PTR_ERR(bp); 730 err = PTR_ERR(bp);
731 goto put;
732 }
714 733
715 t->ptrace_bps[nr] = bp; 734 t->ptrace_bps[nr] = bp;
716 } else { 735 } else {
717 int err;
718
719 bp = t->ptrace_bps[nr]; 736 bp = t->ptrace_bps[nr];
720 737
721 attr = bp->attr; 738 attr = bp->attr;
722 attr.bp_addr = addr; 739 attr.bp_addr = addr;
723 err = modify_user_hw_breakpoint(bp, &attr); 740 err = modify_user_hw_breakpoint(bp, &attr);
724 if (err)
725 return err;
726 } 741 }
727 742
728 743put:
729 return 0; 744 ptrace_put_breakpoints(tsk);
745 return err;
730} 746}
731 747
732/* 748/*
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index fc7aae1e2bc7..08c44b08bf5b 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -6,6 +6,7 @@
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/tboot.h> 8#include <linux/tboot.h>
9#include <linux/delay.h>
9#include <acpi/reboot.h> 10#include <acpi/reboot.h>
10#include <asm/io.h> 11#include <asm/io.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
@@ -285,6 +286,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
285 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 286 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
286 }, 287 },
287 }, 288 },
289 { /* Handle problems with rebooting on VersaLogic Menlow boards */
290 .callback = set_bios_reboot,
291 .ident = "VersaLogic Menlow based board",
292 .matches = {
293 DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"),
294 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
295 },
296 },
288 { } 297 { }
289}; 298};
290 299
@@ -295,68 +304,16 @@ static int __init reboot_init(void)
295} 304}
296core_initcall(reboot_init); 305core_initcall(reboot_init);
297 306
298/* The following code and data reboots the machine by switching to real 307extern const unsigned char machine_real_restart_asm[];
299 mode and jumping to the BIOS reset entry point, as if the CPU has 308extern const u64 machine_real_restart_gdt[3];
300 really been reset. The previous version asked the keyboard
301 controller to pulse the CPU reset line, which is more thorough, but
302 doesn't work with at least one type of 486 motherboard. It is easy
303 to stop this code working; hence the copious comments. */
304static const unsigned long long
305real_mode_gdt_entries [3] =
306{
307 0x0000000000000000ULL, /* Null descriptor */
308 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
309 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
310};
311 309
312static const struct desc_ptr 310void machine_real_restart(unsigned int type)
313real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
314real_mode_idt = { 0x3ff, 0 };
315
316/* This is 16-bit protected mode code to disable paging and the cache,
317 switch to real mode and jump to the BIOS reset code.
318
319 The instruction that switches to real mode by writing to CR0 must be
320 followed immediately by a far jump instruction, which set CS to a
321 valid value for real mode, and flushes the prefetch queue to avoid
322 running instructions that have already been decoded in protected
323 mode.
324
325 Clears all the flags except ET, especially PG (paging), PE
326 (protected-mode enable) and TS (task switch for coprocessor state
327 save). Flushes the TLB after paging has been disabled. Sets CD and
328 NW, to disable the cache on a 486, and invalidates the cache. This
329 is more like the state of a 486 after reset. I don't know if
330 something else should be done for other chips.
331
332 More could be done here to set up the registers as if a CPU reset had
333 occurred; hopefully real BIOSs don't assume much. */
334static const unsigned char real_mode_switch [] =
335{
336 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
337 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
338 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */
339 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */
340 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
341 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
342 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
343 0x74, 0x02, /* jz f */
344 0x0f, 0x09, /* wbinvd */
345 0x24, 0x10, /* f: andb $0x10,al */
346 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
347};
348static const unsigned char jump_to_bios [] =
349{ 311{
350 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ 312 void *restart_va;
351}; 313 unsigned long restart_pa;
314 void (*restart_lowmem)(unsigned int);
315 u64 *lowmem_gdt;
352 316
353/*
354 * Switch to real mode and then execute the code
355 * specified by the code and length parameters.
356 * We assume that length will aways be less that 100!
357 */
358void machine_real_restart(const unsigned char *code, int length)
359{
360 local_irq_disable(); 317 local_irq_disable();
361 318
362 /* Write zero to CMOS register number 0x0f, which the BIOS POST 319 /* Write zero to CMOS register number 0x0f, which the BIOS POST
@@ -384,41 +341,23 @@ void machine_real_restart(const unsigned char *code, int length)
384 too. */ 341 too. */
385 *((unsigned short *)0x472) = reboot_mode; 342 *((unsigned short *)0x472) = reboot_mode;
386 343
387 /* For the switch to real mode, copy some code to low memory. It has 344 /* Patch the GDT in the low memory trampoline */
388 to be in the first 64k because it is running in 16-bit mode, and it 345 lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
389 has to have the same physical and virtual address, because it turns 346
390 off paging. Copy it near the end of the first page, out of the way 347 restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
391 of BIOS variables. */ 348 restart_pa = virt_to_phys(restart_va);
392 memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), 349 restart_lowmem = (void (*)(unsigned int))restart_pa;
393 real_mode_switch, sizeof (real_mode_switch)); 350
394 memcpy((void *)(0x1000 - 100), code, length); 351 /* GDT[0]: GDT self-pointer */
395 352 lowmem_gdt[0] =
396 /* Set up the IDT for real mode. */ 353 (u64)(sizeof(machine_real_restart_gdt) - 1) +
397 load_idt(&real_mode_idt); 354 ((u64)virt_to_phys(lowmem_gdt) << 16);
398 355 /* GDT[1]: 64K real mode code segment */
399 /* Set up a GDT from which we can load segment descriptors for real 356 lowmem_gdt[1] =
400 mode. The GDT is not used in real mode; it is just needed here to 357 GDT_ENTRY(0x009b, restart_pa, 0xffff);
401 prepare the descriptors. */ 358
402 load_gdt(&real_mode_gdt); 359 /* Jump to the identity-mapped low memory code */
403 360 restart_lowmem(type);
404 /* Load the data segment registers, and thus the descriptors ready for
405 real mode. The base address of each segment is 0x100, 16 times the
406 selector value being loaded here. This is so that the segment
407 registers don't have to be reloaded after switching to real mode:
408 the values are consistent for real mode operation already. */
409 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
410 "\tmovl %%eax,%%ds\n"
411 "\tmovl %%eax,%%es\n"
412 "\tmovl %%eax,%%fs\n"
413 "\tmovl %%eax,%%gs\n"
414 "\tmovl %%eax,%%ss" : : : "eax");
415
416 /* Jump to the 16-bit code that we copied earlier. It disables paging
417 and the cache, switches to real mode, and jumps to the BIOS reset
418 entry point. */
419 __asm__ __volatile__ ("ljmp $0x0008,%0"
420 :
421 : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
422} 361}
423#ifdef CONFIG_APM_MODULE 362#ifdef CONFIG_APM_MODULE
424EXPORT_SYMBOL(machine_real_restart); 363EXPORT_SYMBOL(machine_real_restart);
@@ -573,7 +512,7 @@ static void native_machine_emergency_restart(void)
573 512
574#ifdef CONFIG_X86_32 513#ifdef CONFIG_X86_32
575 case BOOT_BIOS: 514 case BOOT_BIOS:
576 machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); 515 machine_real_restart(MRR_BIOS);
577 516
578 reboot_type = BOOT_KBD; 517 reboot_type = BOOT_KBD;
579 break; 518 break;
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
new file mode 100644
index 000000000000..1d5c46df0d78
--- /dev/null
+++ b/arch/x86/kernel/reboot_32.S
@@ -0,0 +1,135 @@
1#include <linux/linkage.h>
2#include <linux/init.h>
3#include <asm/segment.h>
4#include <asm/page_types.h>
5
6/*
7 * The following code and data reboots the machine by switching to real
8 * mode and jumping to the BIOS reset entry point, as if the CPU has
9 * really been reset. The previous version asked the keyboard
10 * controller to pulse the CPU reset line, which is more thorough, but
11 * doesn't work with at least one type of 486 motherboard. It is easy
12 * to stop this code working; hence the copious comments.
13 *
14 * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
15 */
16 .section ".x86_trampoline","a"
17 .balign 16
18 .code32
19ENTRY(machine_real_restart_asm)
20r_base = .
21 /* Get our own relocated address */
22 call 1f
231: popl %ebx
24 subl $(1b - r_base), %ebx
25
26 /* Compute the equivalent real-mode segment */
27 movl %ebx, %ecx
28 shrl $4, %ecx
29
30 /* Patch post-real-mode segment jump */
31 movw (dispatch_table - r_base)(%ebx,%eax,2),%ax
32 movw %ax, (101f - r_base)(%ebx)
33 movw %cx, (102f - r_base)(%ebx)
34
35 /* Set up the IDT for real mode. */
36 lidtl (machine_real_restart_idt - r_base)(%ebx)
37
38 /*
39 * Set up a GDT from which we can load segment descriptors for real
40 * mode. The GDT is not used in real mode; it is just needed here to
41 * prepare the descriptors.
42 */
43 lgdtl (machine_real_restart_gdt - r_base)(%ebx)
44
45 /*
46 * Load the data segment registers with 16-bit compatible values
47 */
48 movl $16, %ecx
49 movl %ecx, %ds
50 movl %ecx, %es
51 movl %ecx, %fs
52 movl %ecx, %gs
53 movl %ecx, %ss
54 ljmpl $8, $1f - r_base
55
56/*
57 * This is 16-bit protected mode code to disable paging and the cache,
58 * switch to real mode and jump to the BIOS reset code.
59 *
60 * The instruction that switches to real mode by writing to CR0 must be
61 * followed immediately by a far jump instruction, which set CS to a
62 * valid value for real mode, and flushes the prefetch queue to avoid
63 * running instructions that have already been decoded in protected
64 * mode.
65 *
66 * Clears all the flags except ET, especially PG (paging), PE
67 * (protected-mode enable) and TS (task switch for coprocessor state
68 * save). Flushes the TLB after paging has been disabled. Sets CD and
69 * NW, to disable the cache on a 486, and invalidates the cache. This
70 * is more like the state of a 486 after reset. I don't know if
71 * something else should be done for other chips.
72 *
73 * More could be done here to set up the registers as if a CPU reset had
74 * occurred; hopefully real BIOSs don't assume much. This is not the
75 * actual BIOS entry point, anyway (that is at 0xfffffff0).
76 *
77 * Most of this work is probably excessive, but it is what is tested.
78 */
79 .code16
801:
81 xorl %ecx, %ecx
82 movl %cr0, %eax
83 andl $0x00000011, %eax
84 orl $0x60000000, %eax
85 movl %eax, %cr0
86 movl %ecx, %cr3
87 movl %cr0, %edx
88 andl $0x60000000, %edx /* If no cache bits -> no wbinvd */
89 jz 2f
90 wbinvd
912:
92 andb $0x10, %al
93 movl %eax, %cr0
94 .byte 0xea /* ljmpw */
95101: .word 0 /* Offset */
96102: .word 0 /* Segment */
97
98bios:
99 ljmpw $0xf000, $0xfff0
100
101apm:
102 movw $0x1000, %ax
103 movw %ax, %ss
104 movw $0xf000, %sp
105 movw $0x5307, %ax
106 movw $0x0001, %bx
107 movw $0x0003, %cx
108 int $0x15
109
110END(machine_real_restart_asm)
111
112 .balign 16
113 /* These must match <asm/reboot.h */
114dispatch_table:
115 .word bios - r_base
116 .word apm - r_base
117END(dispatch_table)
118
119 .balign 16
120machine_real_restart_idt:
121 .word 0xffff /* Length - real mode default value */
122 .long 0 /* Base - real mode default value */
123END(machine_real_restart_idt)
124
125 .balign 16
126ENTRY(machine_real_restart_gdt)
127 .quad 0 /* Self-pointer, filled in by PM code */
128 .quad 0 /* 16-bit code segment, filled in by PM code */
129 /*
130 * 16-bit data segment with the selector value 16 = 0x10 and
131 * base value 0x100; since this is consistent with real mode
132 * semantics we don't have to reload the segments once CR0.PE = 0.
133 */
134 .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
135END(machine_real_restart_gdt)
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 6f39cab052d5..3f2ad2640d85 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -6,6 +6,7 @@
6#include <linux/acpi.h> 6#include <linux/acpi.h>
7#include <linux/bcd.h> 7#include <linux/bcd.h>
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9#include <linux/of.h>
9 10
10#include <asm/vsyscall.h> 11#include <asm/vsyscall.h>
11#include <asm/x86_init.h> 12#include <asm/x86_init.h>
@@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void)
236 } 237 }
237 } 238 }
238#endif 239#endif
240 if (of_have_populated_dt())
241 return 0;
239 242
240 platform_device_register(&rtc_device); 243 platform_device_register(&rtc_device);
241 dev_info(&rtc_device.dev, 244 dev_info(&rtc_device.dev,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d3cfe26c0252..4be9b398470e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -113,6 +113,7 @@
113#endif 113#endif
114#include <asm/mce.h> 114#include <asm/mce.h>
115#include <asm/alternative.h> 115#include <asm/alternative.h>
116#include <asm/prom.h>
116 117
117/* 118/*
118 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -297,6 +298,9 @@ static void __init init_gbpages(void)
297static inline void init_gbpages(void) 298static inline void init_gbpages(void)
298{ 299{
299} 300}
301static void __init cleanup_highmap(void)
302{
303}
300#endif 304#endif
301 305
302static void __init reserve_brk(void) 306static void __init reserve_brk(void)
@@ -429,16 +433,30 @@ static void __init parse_setup_data(void)
429 return; 433 return;
430 pa_data = boot_params.hdr.setup_data; 434 pa_data = boot_params.hdr.setup_data;
431 while (pa_data) { 435 while (pa_data) {
432 data = early_memremap(pa_data, PAGE_SIZE); 436 u32 data_len, map_len;
437
438 map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
439 (u64)sizeof(struct setup_data));
440 data = early_memremap(pa_data, map_len);
441 data_len = data->len + sizeof(struct setup_data);
442 if (data_len > map_len) {
443 early_iounmap(data, map_len);
444 data = early_memremap(pa_data, data_len);
445 map_len = data_len;
446 }
447
433 switch (data->type) { 448 switch (data->type) {
434 case SETUP_E820_EXT: 449 case SETUP_E820_EXT:
435 parse_e820_ext(data, pa_data); 450 parse_e820_ext(data);
451 break;
452 case SETUP_DTB:
453 add_dtb(pa_data);
436 break; 454 break;
437 default: 455 default:
438 break; 456 break;
439 } 457 }
440 pa_data = data->next; 458 pa_data = data->next;
441 early_iounmap(data, PAGE_SIZE); 459 early_iounmap(data, map_len);
442 } 460 }
443} 461}
444 462
@@ -601,28 +619,6 @@ void __init reserve_standard_io_resources(void)
601 619
602} 620}
603 621
604/*
605 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
606 * is_kdump_kernel() to determine if we are booting after a panic. Hence
607 * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
608 */
609
610#ifdef CONFIG_CRASH_DUMP
611/* elfcorehdr= specifies the location of elf core header
612 * stored by the crashed kernel. This option will be passed
613 * by kexec loader to the capture kernel.
614 */
615static int __init setup_elfcorehdr(char *arg)
616{
617 char *end;
618 if (!arg)
619 return -EINVAL;
620 elfcorehdr_addr = memparse(arg, &end);
621 return end > arg ? 0 : -EINVAL;
622}
623early_param("elfcorehdr", setup_elfcorehdr);
624#endif
625
626static __init void reserve_ibft_region(void) 622static __init void reserve_ibft_region(void)
627{ 623{
628 unsigned long addr, size = 0; 624 unsigned long addr, size = 0;
@@ -680,15 +676,6 @@ static int __init parse_reservelow(char *p)
680 676
681early_param("reservelow", parse_reservelow); 677early_param("reservelow", parse_reservelow);
682 678
683static u64 __init get_max_mapped(void)
684{
685 u64 end = max_pfn_mapped;
686
687 end <<= PAGE_SHIFT;
688
689 return end;
690}
691
692/* 679/*
693 * Determine if we were loaded by an EFI loader. If so, then we have also been 680 * Determine if we were loaded by an EFI loader. If so, then we have also been
694 * passed the efi memmap, systab, etc., so we should use these data structures 681 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -704,8 +691,6 @@ static u64 __init get_max_mapped(void)
704 691
705void __init setup_arch(char **cmdline_p) 692void __init setup_arch(char **cmdline_p)
706{ 693{
707 int acpi = 0;
708 int amd = 0;
709 unsigned long flags; 694 unsigned long flags;
710 695
711#ifdef CONFIG_X86_32 696#ifdef CONFIG_X86_32
@@ -922,6 +907,8 @@ void __init setup_arch(char **cmdline_p)
922 */ 907 */
923 reserve_brk(); 908 reserve_brk();
924 909
910 cleanup_highmap();
911
925 memblock.current_limit = get_max_mapped(); 912 memblock.current_limit = get_max_mapped();
926 memblock_x86_fill(); 913 memblock_x86_fill();
927 914
@@ -935,15 +922,8 @@ void __init setup_arch(char **cmdline_p)
935 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", 922 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
936 max_pfn_mapped<<PAGE_SHIFT); 923 max_pfn_mapped<<PAGE_SHIFT);
937 924
938 reserve_trampoline_memory(); 925 setup_trampolines();
939 926
940#ifdef CONFIG_ACPI_SLEEP
941 /*
942 * Reserve low memory region for sleep support.
943 * even before init_memory_mapping
944 */
945 acpi_reserve_wakeup_memory();
946#endif
947 init_gbpages(); 927 init_gbpages();
948 928
949 /* max_pfn_mapped is updated here */ 929 /* max_pfn_mapped is updated here */
@@ -984,19 +964,7 @@ void __init setup_arch(char **cmdline_p)
984 964
985 early_acpi_boot_init(); 965 early_acpi_boot_init();
986 966
987#ifdef CONFIG_ACPI_NUMA 967 initmem_init();
988 /*
989 * Parse SRAT to discover nodes.
990 */
991 acpi = acpi_numa_init();
992#endif
993
994#ifdef CONFIG_AMD_NUMA
995 if (!acpi)
996 amd = !amd_numa_init(0, max_pfn);
997#endif
998
999 initmem_init(0, max_pfn, acpi, amd);
1000 memblock_find_dma_reserve(); 968 memblock_find_dma_reserve();
1001 dma32_reserve_bootmem(); 969 dma32_reserve_bootmem();
1002 970
@@ -1008,6 +976,11 @@ void __init setup_arch(char **cmdline_p)
1008 paging_init(); 976 paging_init();
1009 x86_init.paging.pagetable_setup_done(swapper_pg_dir); 977 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
1010 978
979 if (boot_cpu_data.cpuid_level >= 0) {
980 /* A CPU has %cr4 if and only if it has CPUID */
981 mmu_cr4_features = read_cr4();
982 }
983
1011#ifdef CONFIG_X86_32 984#ifdef CONFIG_X86_32
1012 /* sync back kernel address range */ 985 /* sync back kernel address range */
1013 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, 986 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
@@ -1029,8 +1002,8 @@ void __init setup_arch(char **cmdline_p)
1029 * Read APIC and some other early information from ACPI tables. 1002 * Read APIC and some other early information from ACPI tables.
1030 */ 1003 */
1031 acpi_boot_init(); 1004 acpi_boot_init();
1032
1033 sfi_init(); 1005 sfi_init();
1006 x86_dtb_init();
1034 1007
1035 /* 1008 /*
1036 * get boot-time SMP configuration: 1009 * get boot-time SMP configuration:
@@ -1040,9 +1013,7 @@ void __init setup_arch(char **cmdline_p)
1040 1013
1041 prefill_possible_map(); 1014 prefill_possible_map();
1042 1015
1043#ifdef CONFIG_X86_64
1044 init_cpu_to_node(); 1016 init_cpu_to_node();
1045#endif
1046 1017
1047 init_apic_mappings(); 1018 init_apic_mappings();
1048 ioapic_and_gsi_init(); 1019 ioapic_and_gsi_init();
@@ -1066,6 +1037,8 @@ void __init setup_arch(char **cmdline_p)
1066#endif 1037#endif
1067 x86_init.oem.banner(); 1038 x86_init.oem.banner();
1068 1039
1040 x86_init.timers.wallclock_init();
1041
1069 mcheck_init(); 1042 mcheck_init();
1070 1043
1071 local_irq_save(flags); 1044 local_irq_save(flags);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 002b79685f73..71f4727da373 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -225,10 +225,15 @@ void __init setup_per_cpu_areas(void)
225 per_cpu(x86_bios_cpu_apicid, cpu) = 225 per_cpu(x86_bios_cpu_apicid, cpu) =
226 early_per_cpu_map(x86_bios_cpu_apicid, cpu); 226 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
227#endif 227#endif
228#ifdef CONFIG_X86_32
229 per_cpu(x86_cpu_to_logical_apicid, cpu) =
230 early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
231#endif
228#ifdef CONFIG_X86_64 232#ifdef CONFIG_X86_64
229 per_cpu(irq_stack_ptr, cpu) = 233 per_cpu(irq_stack_ptr, cpu) =
230 per_cpu(irq_stack_union.irq_stack, cpu) + 234 per_cpu(irq_stack_union.irq_stack, cpu) +
231 IRQ_STACK_SIZE - 64; 235 IRQ_STACK_SIZE - 64;
236#endif
232#ifdef CONFIG_NUMA 237#ifdef CONFIG_NUMA
233 per_cpu(x86_cpu_to_node_map, cpu) = 238 per_cpu(x86_cpu_to_node_map, cpu) =
234 early_per_cpu_map(x86_cpu_to_node_map, cpu); 239 early_per_cpu_map(x86_cpu_to_node_map, cpu);
@@ -242,7 +247,6 @@ void __init setup_per_cpu_areas(void)
242 */ 247 */
243 set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); 248 set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
244#endif 249#endif
245#endif
246 /* 250 /*
247 * Up to this point, the boot CPU has been using .init.data 251 * Up to this point, the boot CPU has been using .init.data
248 * area. Reload any changed state for the boot CPU. 252 * area. Reload any changed state for the boot CPU.
@@ -256,7 +260,10 @@ void __init setup_per_cpu_areas(void)
256 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 260 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
257 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; 261 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
258#endif 262#endif
259#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) 263#ifdef CONFIG_X86_32
264 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
265#endif
266#ifdef CONFIG_NUMA
260 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 267 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
261#endif 268#endif
262 269
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 08776a953487..c2871d3c71b6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -64,6 +64,7 @@
64#include <asm/mtrr.h> 64#include <asm/mtrr.h>
65#include <asm/mwait.h> 65#include <asm/mwait.h>
66#include <asm/apic.h> 66#include <asm/apic.h>
67#include <asm/io_apic.h>
67#include <asm/setup.h> 68#include <asm/setup.h>
68#include <asm/uv/uv.h> 69#include <asm/uv/uv.h>
69#include <linux/mc146818rtc.h> 70#include <linux/mc146818rtc.h>
@@ -71,10 +72,6 @@
71#include <asm/smpboot_hooks.h> 72#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h> 73#include <asm/i8259.h>
73 74
74#ifdef CONFIG_X86_32
75u8 apicid_2_node[MAX_APICID];
76#endif
77
78/* State of each CPU */ 75/* State of each CPU */
79DEFINE_PER_CPU(int, cpu_state) = { 0 }; 76DEFINE_PER_CPU(int, cpu_state) = { 0 };
80 77
@@ -130,68 +127,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
130DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); 127DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
131EXPORT_PER_CPU_SYMBOL(cpu_core_map); 128EXPORT_PER_CPU_SYMBOL(cpu_core_map);
132 129
130DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
131
133/* Per CPU bogomips and other parameters */ 132/* Per CPU bogomips and other parameters */
134DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 133DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
135EXPORT_PER_CPU_SYMBOL(cpu_info); 134EXPORT_PER_CPU_SYMBOL(cpu_info);
136 135
137atomic_t init_deasserted; 136atomic_t init_deasserted;
138 137
139#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
140/* which node each logical CPU is on */
141int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
142EXPORT_SYMBOL(cpu_to_node_map);
143
144/* set up a mapping between cpu and node. */
145static void map_cpu_to_node(int cpu, int node)
146{
147 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
148 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
149 cpu_to_node_map[cpu] = node;
150}
151
152/* undo a mapping between cpu and node. */
153static void unmap_cpu_to_node(int cpu)
154{
155 int node;
156
157 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
158 for (node = 0; node < MAX_NUMNODES; node++)
159 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
160 cpu_to_node_map[cpu] = 0;
161}
162#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
163#define map_cpu_to_node(cpu, node) ({})
164#define unmap_cpu_to_node(cpu) ({})
165#endif
166
167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
171 { [0 ... NR_CPUS-1] = BAD_APICID };
172
173static void map_cpu_to_logical_apicid(void)
174{
175 int cpu = smp_processor_id();
176 int apicid = logical_smp_processor_id();
177 int node = apic->apicid_to_node(apicid);
178
179 if (!node_online(node))
180 node = first_online_node;
181
182 cpu_2_logical_apicid[cpu] = apicid;
183 map_cpu_to_node(cpu, node);
184}
185
186void numa_remove_cpu(int cpu)
187{
188 cpu_2_logical_apicid[cpu] = BAD_APICID;
189 unmap_cpu_to_node(cpu);
190}
191#else
192#define map_cpu_to_logical_apicid() do {} while (0)
193#endif
194
195/* 138/*
196 * Report back to the Boot Processor. 139 * Report back to the Boot Processor.
197 * Running on AP. 140 * Running on AP.
@@ -259,7 +202,6 @@ static void __cpuinit smp_callin(void)
259 apic->smp_callin_clear_local_apic(); 202 apic->smp_callin_clear_local_apic();
260 setup_local_APIC(); 203 setup_local_APIC();
261 end_local_APIC_setup(); 204 end_local_APIC_setup();
262 map_cpu_to_logical_apicid();
263 205
264 /* 206 /*
265 * Need to setup vector mappings before we enable interrupts. 207 * Need to setup vector mappings before we enable interrupts.
@@ -355,23 +297,6 @@ notrace static void __cpuinit start_secondary(void *unused)
355 cpu_idle(); 297 cpu_idle();
356} 298}
357 299
358#ifdef CONFIG_CPUMASK_OFFSTACK
359/* In this case, llc_shared_map is a pointer to a cpumask. */
360static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
361 const struct cpuinfo_x86 *src)
362{
363 struct cpumask *llc = dst->llc_shared_map;
364 *dst = *src;
365 dst->llc_shared_map = llc;
366}
367#else
368static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
369 const struct cpuinfo_x86 *src)
370{
371 *dst = *src;
372}
373#endif /* CONFIG_CPUMASK_OFFSTACK */
374
375/* 300/*
376 * The bootstrap kernel entry code has set these up. Save them for 301 * The bootstrap kernel entry code has set these up. Save them for
377 * a given CPU 302 * a given CPU
@@ -381,7 +306,7 @@ void __cpuinit smp_store_cpu_info(int id)
381{ 306{
382 struct cpuinfo_x86 *c = &cpu_data(id); 307 struct cpuinfo_x86 *c = &cpu_data(id);
383 308
384 copy_cpuinfo_x86(c, &boot_cpu_data); 309 *c = boot_cpu_data;
385 c->cpu_index = id; 310 c->cpu_index = id;
386 if (id != 0) 311 if (id != 0)
387 identify_secondary_cpu(c); 312 identify_secondary_cpu(c);
@@ -389,15 +314,12 @@ void __cpuinit smp_store_cpu_info(int id)
389 314
390static void __cpuinit link_thread_siblings(int cpu1, int cpu2) 315static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
391{ 316{
392 struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
393 struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
394
395 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); 317 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
396 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); 318 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
397 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); 319 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
398 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); 320 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
399 cpumask_set_cpu(cpu1, c2->llc_shared_map); 321 cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
400 cpumask_set_cpu(cpu2, c1->llc_shared_map); 322 cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
401} 323}
402 324
403 325
@@ -414,6 +336,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
414 336
415 if (cpu_has(c, X86_FEATURE_TOPOEXT)) { 337 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
416 if (c->phys_proc_id == o->phys_proc_id && 338 if (c->phys_proc_id == o->phys_proc_id &&
339 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
417 c->compute_unit_id == o->compute_unit_id) 340 c->compute_unit_id == o->compute_unit_id)
418 link_thread_siblings(cpu, i); 341 link_thread_siblings(cpu, i);
419 } else if (c->phys_proc_id == o->phys_proc_id && 342 } else if (c->phys_proc_id == o->phys_proc_id &&
@@ -425,7 +348,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
425 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 348 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
426 } 349 }
427 350
428 cpumask_set_cpu(cpu, c->llc_shared_map); 351 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
429 352
430 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { 353 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
431 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); 354 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
@@ -436,8 +359,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
436 for_each_cpu(i, cpu_sibling_setup_mask) { 359 for_each_cpu(i, cpu_sibling_setup_mask) {
437 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 360 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
438 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 361 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
439 cpumask_set_cpu(i, c->llc_shared_map); 362 cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
440 cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); 363 cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
441 } 364 }
442 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 365 if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
443 cpumask_set_cpu(i, cpu_core_mask(cpu)); 366 cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -476,7 +399,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
476 !(cpu_has(c, X86_FEATURE_AMD_DCM))) 399 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
477 return cpu_core_mask(cpu); 400 return cpu_core_mask(cpu);
478 else 401 else
479 return c->llc_shared_map; 402 return cpu_llc_shared_mask(cpu);
480} 403}
481 404
482static void impress_friends(void) 405static void impress_friends(void)
@@ -788,7 +711,7 @@ do_rest:
788 stack_start = c_idle.idle->thread.sp; 711 stack_start = c_idle.idle->thread.sp;
789 712
790 /* start_ip had better be page-aligned! */ 713 /* start_ip had better be page-aligned! */
791 start_ip = setup_trampoline(); 714 start_ip = trampoline_address();
792 715
793 /* So we see what's up */ 716 /* So we see what's up */
794 announce_cpu(cpu, apicid); 717 announce_cpu(cpu, apicid);
@@ -798,6 +721,8 @@ do_rest:
798 * the targeted processor. 721 * the targeted processor.
799 */ 722 */
800 723
724 printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
725
801 atomic_set(&init_deasserted, 0); 726 atomic_set(&init_deasserted, 0);
802 727
803 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 728 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -851,8 +776,8 @@ do_rest:
851 pr_debug("CPU%d: has booted.\n", cpu); 776 pr_debug("CPU%d: has booted.\n", cpu);
852 else { 777 else {
853 boot_error = 1; 778 boot_error = 1;
854 if (*((volatile unsigned char *)trampoline_base) 779 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
855 == 0xA5) 780 == 0xA5A5A5A5)
856 /* trampoline started but...? */ 781 /* trampoline started but...? */
857 pr_err("CPU%d: Stuck ??\n", cpu); 782 pr_err("CPU%d: Stuck ??\n", cpu);
858 else 783 else
@@ -878,7 +803,7 @@ do_rest:
878 } 803 }
879 804
880 /* mark "stuck" area as not stuck */ 805 /* mark "stuck" area as not stuck */
881 *((volatile unsigned long *)trampoline_base) = 0; 806 *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
882 807
883 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 808 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
884 /* 809 /*
@@ -945,6 +870,14 @@ int __cpuinit native_cpu_up(unsigned int cpu)
945 return 0; 870 return 0;
946} 871}
947 872
873/**
874 * arch_disable_smp_support() - disables SMP support for x86 at runtime
875 */
876void arch_disable_smp_support(void)
877{
878 disable_ioapic_support();
879}
880
948/* 881/*
949 * Fall back to non SMP mode after errors. 882 * Fall back to non SMP mode after errors.
950 * 883 *
@@ -960,7 +893,6 @@ static __init void disable_smp(void)
960 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 893 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
961 else 894 else
962 physid_set_mask_of_physid(0, &phys_cpu_present_map); 895 physid_set_mask_of_physid(0, &phys_cpu_present_map);
963 map_cpu_to_logical_apicid();
964 cpumask_set_cpu(0, cpu_sibling_mask(0)); 896 cpumask_set_cpu(0, cpu_sibling_mask(0));
965 cpumask_set_cpu(0, cpu_core_mask(0)); 897 cpumask_set_cpu(0, cpu_core_mask(0));
966} 898}
@@ -1045,7 +977,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1045 "(tell your hw vendor)\n"); 977 "(tell your hw vendor)\n");
1046 } 978 }
1047 smpboot_clear_io_apic(); 979 smpboot_clear_io_apic();
1048 arch_disable_smp_support(); 980 disable_ioapic_support();
1049 return -1; 981 return -1;
1050 } 982 }
1051 983
@@ -1089,21 +1021,19 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1089 1021
1090 preempt_disable(); 1022 preempt_disable();
1091 smp_cpu_index_default(); 1023 smp_cpu_index_default();
1092 memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info)); 1024
1093 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1094 mb();
1095 /* 1025 /*
1096 * Setup boot CPU information 1026 * Setup boot CPU information
1097 */ 1027 */
1098 smp_store_cpu_info(0); /* Final full version of the data */ 1028 smp_store_cpu_info(0); /* Final full version of the data */
1099#ifdef CONFIG_X86_32 1029 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1100 boot_cpu_logical_apicid = logical_smp_processor_id(); 1030 mb();
1101#endif 1031
1102 current_thread_info()->cpu = 0; /* needed? */ 1032 current_thread_info()->cpu = 0; /* needed? */
1103 for_each_possible_cpu(i) { 1033 for_each_possible_cpu(i) {
1104 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 1034 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1105 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 1035 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1106 zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); 1036 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1107 } 1037 }
1108 set_cpu_sibling_map(0); 1038 set_cpu_sibling_map(0);
1109 1039
@@ -1139,8 +1069,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1139 1069
1140 bsp_end_local_APIC_setup(); 1070 bsp_end_local_APIC_setup();
1141 1071
1142 map_cpu_to_logical_apicid();
1143
1144 if (apic->setup_portio_remap) 1072 if (apic->setup_portio_remap)
1145 apic->setup_portio_remap(); 1073 apic->setup_portio_remap();
1146 1074
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 938c8e10a19a..6515733a289d 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -73,7 +73,7 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
73 */ 73 */
74void save_stack_trace(struct stack_trace *trace) 74void save_stack_trace(struct stack_trace *trace)
75{ 75{
76 dump_trace(current, NULL, NULL, &save_stack_ops, trace); 76 dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
77 if (trace->nr_entries < trace->max_entries) 77 if (trace->nr_entries < trace->max_entries)
78 trace->entries[trace->nr_entries++] = ULONG_MAX; 78 trace->entries[trace->nr_entries++] = ULONG_MAX;
79} 79}
@@ -81,14 +81,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace);
81 81
82void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) 82void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
83{ 83{
84 dump_trace(current, regs, NULL, &save_stack_ops, trace); 84 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
85 if (trace->nr_entries < trace->max_entries) 85 if (trace->nr_entries < trace->max_entries)
86 trace->entries[trace->nr_entries++] = ULONG_MAX; 86 trace->entries[trace->nr_entries++] = ULONG_MAX;
87} 87}
88 88
89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
90{ 90{
91 dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); 91 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
92 if (trace->nr_entries < trace->max_entries) 92 if (trace->nr_entries < trace->max_entries)
93 trace->entries[trace->nr_entries++] = ULONG_MAX; 93 trace->entries[trace->nr_entries++] = ULONG_MAX;
94} 94}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 58de45ee08b6..7977f0cfe339 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block)
166 * Make sure block stepping (BTF) is not enabled unless it should be. 166 * Make sure block stepping (BTF) is not enabled unless it should be.
167 * Note that we don't try to worry about any is_setting_trap_flag() 167 * Note that we don't try to worry about any is_setting_trap_flag()
168 * instructions after the first when using block stepping. 168 * instructions after the first when using block stepping.
169 * So noone should try to use debugger block stepping in a program 169 * So no one should try to use debugger block stepping in a program
170 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
171 */ 171 */
172 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index b35786dc9b8f..abce34d5c79d 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -340,3 +340,7 @@ ENTRY(sys_call_table)
340 .long sys_fanotify_init 340 .long sys_fanotify_init
341 .long sys_fanotify_mark 341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */ 342 .long sys_prlimit64 /* 340 */
343 .long sys_name_to_handle_at
344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime
346 .long sys_syncfs
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 7e4515957a1c..8927486a4649 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -39,7 +39,7 @@ int __ref arch_register_cpu(int num)
39 /* 39 /*
40 * CPU0 cannot be offlined due to several 40 * CPU0 cannot be offlined due to several
41 * restrictions and assumptions in kernel. This basically 41 * restrictions and assumptions in kernel. This basically
42 * doesnt add a control file, one cannot attempt to offline 42 * doesn't add a control file, one cannot attempt to offline
43 * BSP. 43 * BSP.
44 * 44 *
45 * Also certain PCI quirks require not to enable hotplug control 45 * Also certain PCI quirks require not to enable hotplug control
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index a375616d77f7..a91ae7709b49 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -2,39 +2,41 @@
2#include <linux/memblock.h> 2#include <linux/memblock.h>
3 3
4#include <asm/trampoline.h> 4#include <asm/trampoline.h>
5#include <asm/cacheflush.h>
5#include <asm/pgtable.h> 6#include <asm/pgtable.h>
6 7
7#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) 8unsigned char *x86_trampoline_base;
8#define __trampinit
9#define __trampinitdata
10#else
11#define __trampinit __cpuinit
12#define __trampinitdata __cpuinitdata
13#endif
14 9
15/* ready for x86_64 and x86 */ 10void __init setup_trampolines(void)
16unsigned char *__trampinitdata trampoline_base;
17
18void __init reserve_trampoline_memory(void)
19{ 11{
20 phys_addr_t mem; 12 phys_addr_t mem;
13 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
21 14
22 /* Has to be in very low memory so we can execute real-mode AP code. */ 15 /* Has to be in very low memory so we can execute real-mode AP code. */
23 mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); 16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
24 if (mem == MEMBLOCK_ERROR) 17 if (mem == MEMBLOCK_ERROR)
25 panic("Cannot allocate trampoline\n"); 18 panic("Cannot allocate trampoline\n");
26 19
27 trampoline_base = __va(mem); 20 x86_trampoline_base = __va(mem);
28 memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); 21 memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE");
22
23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
24 x86_trampoline_base, (unsigned long long)mem, size);
25
26 memcpy(x86_trampoline_base, x86_trampoline_start, size);
29} 27}
30 28
31/* 29/*
32 * Currently trivial. Write the real->protected mode 30 * setup_trampolines() gets called very early, to guarantee the
33 * bootstrap into the page concerned. The caller 31 * availability of low memory. This is before the proper kernel page
34 * has made sure it's suitably aligned. 32 * tables are set up, so we cannot set page permissions in that
33 * function. Thus, we use an arch_initcall instead.
35 */ 34 */
36unsigned long __trampinit setup_trampoline(void) 35static int __init configure_trampolines(void)
37{ 36{
38 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 37 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
39 return virt_to_phys(trampoline_base); 38
39 set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
40 return 0;
40} 41}
42arch_initcall(configure_trampolines);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 8508237e8e43..451c0a7ef7fd 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -32,9 +32,11 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/page_types.h> 33#include <asm/page_types.h>
34 34
35/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 35#ifdef CONFIG_SMP
36__CPUINITRODATA 36
37.code16 37 .section ".x86_trampoline","a"
38 .balign PAGE_SIZE
39 .code16
38 40
39ENTRY(trampoline_data) 41ENTRY(trampoline_data)
40r_base = . 42r_base = .
@@ -44,7 +46,7 @@ r_base = .
44 46
45 cli # We should be safe anyway 47 cli # We should be safe anyway
46 48
47 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
48 # write marker for master knows we're running 50 # write marker for master knows we're running
49 51
50 /* GDT tables in non default location kernel can be beyond 16MB and 52 /* GDT tables in non default location kernel can be beyond 16MB and
@@ -72,5 +74,10 @@ boot_idt_descr:
72 .word 0 # idt limit = 0 74 .word 0 # idt limit = 0
73 .long 0 # idt base = 0L 75 .long 0 # idt base = 0L
74 76
77ENTRY(trampoline_status)
78 .long 0
79
75.globl trampoline_end 80.globl trampoline_end
76trampoline_end: 81trampoline_end:
82
83#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 075d130efcf9..09ff51799e96 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,13 +32,9 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34 34
35#ifdef CONFIG_ACPI_SLEEP 35 .section ".x86_trampoline","a"
36.section .rodata, "a", @progbits 36 .balign PAGE_SIZE
37#else 37 .code16
38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
39__CPUINITRODATA
40#endif
41.code16
42 38
43ENTRY(trampoline_data) 39ENTRY(trampoline_data)
44r_base = . 40r_base = .
@@ -50,7 +46,7 @@ r_base = .
50 mov %ax, %ss 46 mov %ax, %ss
51 47
52 48
53 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
54 # write marker for master knows we're running 50 # write marker for master knows we're running
55 51
56 # Setup stack 52 # Setup stack
@@ -64,10 +60,13 @@ r_base = .
64 movzx %ax, %esi # Find the 32bit trampoline location 60 movzx %ax, %esi # Find the 32bit trampoline location
65 shll $4, %esi 61 shll $4, %esi
66 62
67 # Fixup the vectors 63 # Fixup the absolute vectors
68 addl %esi, startup_32_vector - r_base 64 leal (startup_32 - r_base)(%esi), %eax
69 addl %esi, startup_64_vector - r_base 65 movl %eax, startup_32_vector - r_base
70 addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer 66 leal (startup_64 - r_base)(%esi), %eax
67 movl %eax, startup_64_vector - r_base
68 leal (tgdt - r_base)(%esi), %eax
69 movl %eax, (tgdt + 2 - r_base)
71 70
72 /* 71 /*
73 * GDT tables in non default location kernel can be beyond 16MB and 72 * GDT tables in non default location kernel can be beyond 16MB and
@@ -129,6 +128,7 @@ no_longmode:
129 jmp no_longmode 128 jmp no_longmode
130#include "verify_cpu.S" 129#include "verify_cpu.S"
131 130
131 .balign 4
132 # Careful these need to be in the same 64K segment as the above; 132 # Careful these need to be in the same 64K segment as the above;
133tidt: 133tidt:
134 .word 0 # idt limit = 0 134 .word 0 # idt limit = 0
@@ -156,6 +156,10 @@ startup_64_vector:
156 .long startup_64 - r_base 156 .long startup_64 - r_base
157 .word __KERNEL_CS, 0 157 .word __KERNEL_CS, 0
158 158
159 .balign 4
160ENTRY(trampoline_status)
161 .long 0
162
159trampoline_stack: 163trampoline_stack:
160 .org 0x1000 164 .org 0x1000
161trampoline_stack_end: 165trampoline_stack_end:
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ffe5755caa8b..9335bf7dd2e7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -427,7 +427,7 @@ unsigned long native_calibrate_tsc(void)
427 * the delta to the previous read. We keep track of the min 427 * the delta to the previous read. We keep track of the min
428 * and max values of that delta. The delta is mostly defined 428 * and max values of that delta. The delta is mostly defined
429 * by the IO time of the PIT access, so we can detect when a 429 * by the IO time of the PIT access, so we can detect when a
430 * SMI/SMM disturbance happend between the two reads. If the 430 * SMI/SMM disturbance happened between the two reads. If the
431 * maximum time is significantly larger than the minimum time, 431 * maximum time is significantly larger than the minimum time,
432 * then we discard the result and have another try. 432 * then we discard the result and have another try.
433 * 433 *
@@ -900,7 +900,7 @@ static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
900 * timer based, instead of loop based, we don't block the boot 900 * timer based, instead of loop based, we don't block the boot
901 * process while this longer calibration is done. 901 * process while this longer calibration is done.
902 * 902 *
903 * If there are any calibration anomolies (too many SMIs, etc), 903 * If there are any calibration anomalies (too many SMIs, etc),
904 * or the refined calibration is off by 1% of the fast early 904 * or the refined calibration is off by 1% of the fast early
905 * calibration, we throw out the new calibration and use the 905 * calibration, we throw out the new calibration and use the
906 * early calibration. 906 * early calibration.
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 0edefc19a113..b9242bacbe59 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -18,7 +18,7 @@
18 * This file is expected to run in 32bit code. Currently: 18 * This file is expected to run in 32bit code. Currently:
19 * 19 *
20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification 20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification
21 * arch/x86/kernel/trampoline_64.S: secondary processor verfication 21 * arch/x86/kernel/trampoline_64.S: secondary processor verification
22 * arch/x86/kernel/head_32.S: processor startup 22 * arch/x86/kernel/head_32.S: processor startup
23 * 23 *
24 * verify_cpu, returns the status of longmode and SSE in register %eax. 24 * verify_cpu, returns the status of longmode and SSE in register %eax.
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bf4700755184..624a2016198e 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -105,6 +105,7 @@ SECTIONS
105 SCHED_TEXT 105 SCHED_TEXT
106 LOCK_TEXT 106 LOCK_TEXT
107 KPROBES_TEXT 107 KPROBES_TEXT
108 ENTRY_TEXT
108 IRQENTRY_TEXT 109 IRQENTRY_TEXT
109 *(.fixup) 110 *(.fixup)
110 *(.gnu.warning) 111 *(.gnu.warning)
@@ -230,7 +231,7 @@ SECTIONS
230 * output PHDR, so the next output section - .init.text - should 231 * output PHDR, so the next output section - .init.text - should
231 * start another segment - init. 232 * start another segment - init.
232 */ 233 */
233 PERCPU_VADDR(0, :percpu) 234 PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
234#endif 235#endif
235 236
236 INIT_TEXT_SECTION(PAGE_SIZE) 237 INIT_TEXT_SECTION(PAGE_SIZE)
@@ -240,6 +241,18 @@ SECTIONS
240 241
241 INIT_DATA_SECTION(16) 242 INIT_DATA_SECTION(16)
242 243
244 /*
245 * Code and data for a variety of lowlevel trampolines, to be
246 * copied into base memory (< 1 MiB) during initialization.
247 * Since it is copied early, the main copy can be discarded
248 * afterwards.
249 */
250 .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
251 x86_trampoline_start = .;
252 *(.x86_trampoline)
253 x86_trampoline_end = .;
254 }
255
243 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 256 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
244 __x86_cpu_dev_start = .; 257 __x86_cpu_dev_start = .;
245 *(.x86_cpu_dev.init) 258 *(.x86_cpu_dev.init)
@@ -291,6 +304,7 @@ SECTIONS
291 *(.iommu_table) 304 *(.iommu_table)
292 __iommu_table_end = .; 305 __iommu_table_end = .;
293 } 306 }
307
294 . = ALIGN(8); 308 . = ALIGN(8);
295 /* 309 /*
296 * .exit.text is discard at runtime, not link time, to deal with 310 * .exit.text is discard at runtime, not link time, to deal with
@@ -305,7 +319,7 @@ SECTIONS
305 } 319 }
306 320
307#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 321#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
308 PERCPU(THREAD_SIZE) 322 PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE)
309#endif 323#endif
310 324
311 . = ALIGN(PAGE_SIZE); 325 . = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1b950d151e58..9796c2f3d074 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
52EXPORT_SYMBOL(memset); 52EXPORT_SYMBOL(memset);
53EXPORT_SYMBOL(memcpy); 53EXPORT_SYMBOL(memcpy);
54EXPORT_SYMBOL(__memcpy); 54EXPORT_SYMBOL(__memcpy);
55EXPORT_SYMBOL(memmove);
55 56
56EXPORT_SYMBOL(empty_zero_page); 57EXPORT_SYMBOL(empty_zero_page);
57#ifndef CONFIG_PARAVIRT 58#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ceb2911aa439..c11514e9128b 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = {
70 .setup_percpu_clockev = setup_boot_APIC_clock, 70 .setup_percpu_clockev = setup_boot_APIC_clock,
71 .tsc_pre_init = x86_init_noop, 71 .tsc_pre_init = x86_init_noop,
72 .timer_init = hpet_time_init, 72 .timer_init = hpet_time_init,
73 .wallclock_init = x86_init_noop,
73 }, 74 },
74 75
75 .iommu = { 76 .iommu = {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 547128546cc3..a3911343976b 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -53,7 +53,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
53 53
54 /* 54 /*
55 * None of the feature bits are in init state. So nothing else 55 * None of the feature bits are in init state. So nothing else
56 * to do for us, as the memory layout is upto date. 56 * to do for us, as the memory layout is up to date.
57 */ 57 */
58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask) 58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
59 return; 59 return;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index caf966781d25..0ad47b819a8b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -76,6 +76,7 @@
76#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 76#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
77#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 77#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
78/* Misc flags */ 78/* Misc flags */
79#define VendorSpecific (1<<22) /* Vendor specific instruction */
79#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ 80#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
80#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ 81#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
81#define Undefined (1<<25) /* No Such Instruction */ 82#define Undefined (1<<25) /* No Such Instruction */
@@ -877,7 +878,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
877 if (selector & 1 << 2) { 878 if (selector & 1 << 2) {
878 struct desc_struct desc; 879 struct desc_struct desc;
879 memset (dt, 0, sizeof *dt); 880 memset (dt, 0, sizeof *dt);
880 if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) 881 if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR,
882 ctxt->vcpu))
881 return; 883 return;
882 884
883 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ 885 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
@@ -929,6 +931,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
929 return ret; 931 return ret;
930} 932}
931 933
934/* Does not support long mode */
932static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 935static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
933 struct x86_emulate_ops *ops, 936 struct x86_emulate_ops *ops,
934 u16 selector, int seg) 937 u16 selector, int seg)
@@ -1040,7 +1043,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1040 } 1043 }
1041load: 1044load:
1042 ops->set_segment_selector(selector, seg, ctxt->vcpu); 1045 ops->set_segment_selector(selector, seg, ctxt->vcpu);
1043 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); 1046 ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
1044 return X86EMUL_CONTINUE; 1047 return X86EMUL_CONTINUE;
1045exception: 1048exception:
1046 emulate_exception(ctxt, err_vec, err_code, true); 1049 emulate_exception(ctxt, err_vec, err_code, true);
@@ -1560,7 +1563,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1560 struct desc_struct *ss) 1563 struct desc_struct *ss)
1561{ 1564{
1562 memset(cs, 0, sizeof(struct desc_struct)); 1565 memset(cs, 0, sizeof(struct desc_struct));
1563 ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); 1566 ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu);
1564 memset(ss, 0, sizeof(struct desc_struct)); 1567 memset(ss, 0, sizeof(struct desc_struct));
1565 1568
1566 cs->l = 0; /* will be adjusted later */ 1569 cs->l = 0; /* will be adjusted later */
@@ -1607,9 +1610,9 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1607 cs.d = 0; 1610 cs.d = 0;
1608 cs.l = 1; 1611 cs.l = 1;
1609 } 1612 }
1610 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1613 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1611 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1614 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1612 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1615 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1613 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1616 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1614 1617
1615 c->regs[VCPU_REGS_RCX] = c->eip; 1618 c->regs[VCPU_REGS_RCX] = c->eip;
@@ -1679,9 +1682,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1679 cs.l = 1; 1682 cs.l = 1;
1680 } 1683 }
1681 1684
1682 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1685 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1683 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1686 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1684 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1687 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1685 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1688 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1686 1689
1687 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 1690 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
@@ -1736,9 +1739,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1736 cs_sel |= SELECTOR_RPL_MASK; 1739 cs_sel |= SELECTOR_RPL_MASK;
1737 ss_sel |= SELECTOR_RPL_MASK; 1740 ss_sel |= SELECTOR_RPL_MASK;
1738 1741
1739 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1742 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1740 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1743 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1741 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1744 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1742 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1745 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1743 1746
1744 c->eip = c->regs[VCPU_REGS_RDX]; 1747 c->eip = c->regs[VCPU_REGS_RDX];
@@ -1764,24 +1767,28 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1764 u16 port, u16 len) 1767 u16 port, u16 len)
1765{ 1768{
1766 struct desc_struct tr_seg; 1769 struct desc_struct tr_seg;
1770 u32 base3;
1767 int r; 1771 int r;
1768 u16 io_bitmap_ptr; 1772 u16 io_bitmap_ptr, perm, bit_idx = port & 0x7;
1769 u8 perm, bit_idx = port & 0x7;
1770 unsigned mask = (1 << len) - 1; 1773 unsigned mask = (1 << len) - 1;
1774 unsigned long base;
1771 1775
1772 ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); 1776 ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu);
1773 if (!tr_seg.p) 1777 if (!tr_seg.p)
1774 return false; 1778 return false;
1775 if (desc_limit_scaled(&tr_seg) < 103) 1779 if (desc_limit_scaled(&tr_seg) < 103)
1776 return false; 1780 return false;
1777 r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, 1781 base = get_desc_base(&tr_seg);
1778 ctxt->vcpu, NULL); 1782#ifdef CONFIG_X86_64
1783 base |= ((u64)base3) << 32;
1784#endif
1785 r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL);
1779 if (r != X86EMUL_CONTINUE) 1786 if (r != X86EMUL_CONTINUE)
1780 return false; 1787 return false;
1781 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) 1788 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
1782 return false; 1789 return false;
1783 r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, 1790 r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu,
1784 &perm, 1, ctxt->vcpu, NULL); 1791 NULL);
1785 if (r != X86EMUL_CONTINUE) 1792 if (r != X86EMUL_CONTINUE)
1786 return false; 1793 return false;
1787 if ((perm >> bit_idx) & mask) 1794 if ((perm >> bit_idx) & mask)
@@ -2126,7 +2133,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2126 } 2133 }
2127 2134
2128 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); 2135 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
2129 ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); 2136 ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu);
2130 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); 2137 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2131 2138
2132 if (has_error_code) { 2139 if (has_error_code) {
@@ -2365,7 +2372,8 @@ static struct group_dual group7 = { {
2365 D(SrcMem16 | ModRM | Mov | Priv), 2372 D(SrcMem16 | ModRM | Mov | Priv),
2366 D(SrcMem | ModRM | ByteOp | Priv | NoAccess), 2373 D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
2367}, { 2374}, {
2368 D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), 2375 D(SrcNone | ModRM | Priv | VendorSpecific), N,
2376 N, D(SrcNone | ModRM | Priv | VendorSpecific),
2369 D(SrcNone | ModRM | DstMem | Mov), N, 2377 D(SrcNone | ModRM | DstMem | Mov), N,
2370 D(SrcMem16 | ModRM | Mov | Priv), N, 2378 D(SrcMem16 | ModRM | Mov | Priv), N,
2371} }; 2379} };
@@ -2489,7 +2497,7 @@ static struct opcode opcode_table[256] = {
2489static struct opcode twobyte_table[256] = { 2497static struct opcode twobyte_table[256] = {
2490 /* 0x00 - 0x0F */ 2498 /* 0x00 - 0x0F */
2491 N, GD(0, &group7), N, N, 2499 N, GD(0, &group7), N, N,
2492 N, D(ImplicitOps), D(ImplicitOps | Priv), N, 2500 N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N,
2493 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, 2501 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
2494 N, D(ImplicitOps | ModRM), N, N, 2502 N, D(ImplicitOps | ModRM), N, N,
2495 /* 0x10 - 0x1F */ 2503 /* 0x10 - 0x1F */
@@ -2502,7 +2510,8 @@ static struct opcode twobyte_table[256] = {
2502 /* 0x30 - 0x3F */ 2510 /* 0x30 - 0x3F */
2503 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), 2511 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
2504 D(ImplicitOps | Priv), N, 2512 D(ImplicitOps | Priv), N,
2505 D(ImplicitOps), D(ImplicitOps | Priv), N, N, 2513 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
2514 N, N,
2506 N, N, N, N, N, N, N, N, 2515 N, N, N, N, N, N, N, N,
2507 /* 0x40 - 0x4F */ 2516 /* 0x40 - 0x4F */
2508 X16(D(DstReg | SrcMem | ModRM | Mov)), 2517 X16(D(DstReg | SrcMem | ModRM | Mov)),
@@ -2741,6 +2750,9 @@ done_prefixes:
2741 if (c->d == 0 || (c->d & Undefined)) 2750 if (c->d == 0 || (c->d & Undefined))
2742 return -1; 2751 return -1;
2743 2752
2753 if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
2754 return -1;
2755
2744 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 2756 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2745 c->op_bytes = 8; 2757 c->op_bytes = 8;
2746 2758
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 3cece05e4ac4..19fe855e7953 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -62,9 +62,6 @@ static void pic_unlock(struct kvm_pic *s)
62 } 62 }
63 63
64 if (!found) 64 if (!found)
65 found = s->kvm->bsp_vcpu;
66
67 if (!found)
68 return; 65 return;
69 66
70 kvm_make_request(KVM_REQ_EVENT, found); 67 kvm_make_request(KVM_REQ_EVENT, found);
@@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s)
75static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 72static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
76{ 73{
77 s->isr &= ~(1 << irq); 74 s->isr &= ~(1 << irq);
78 s->isr_ack |= (1 << irq);
79 if (s != &s->pics_state->pics[0]) 75 if (s != &s->pics_state->pics[0])
80 irq += 8; 76 irq += 8;
81 /* 77 /*
@@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
89 pic_lock(s->pics_state); 85 pic_lock(s->pics_state);
90} 86}
91 87
92void kvm_pic_clear_isr_ack(struct kvm *kvm)
93{
94 struct kvm_pic *s = pic_irqchip(kvm);
95
96 pic_lock(s);
97 s->pics[0].isr_ack = 0xff;
98 s->pics[1].isr_ack = 0xff;
99 pic_unlock(s);
100}
101
102/* 88/*
103 * set irq level. If an edge is detected, then the IRR is set to 1 89 * set irq level. If an edge is detected, then the IRR is set to 1
104 */ 90 */
@@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
281 s->irr = 0; 267 s->irr = 0;
282 s->imr = 0; 268 s->imr = 0;
283 s->isr = 0; 269 s->isr = 0;
284 s->isr_ack = 0xff;
285 s->priority_add = 0; 270 s->priority_add = 0;
286 s->irq_base = 0; 271 s->irq_base = 0;
287 s->read_reg_select = 0; 272 s->read_reg_select = 0;
@@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this,
545 */ 530 */
546static void pic_irq_request(struct kvm *kvm, int level) 531static void pic_irq_request(struct kvm *kvm, int level)
547{ 532{
548 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
549 struct kvm_pic *s = pic_irqchip(kvm); 533 struct kvm_pic *s = pic_irqchip(kvm);
550 int irq = pic_get_irq(&s->pics[0]);
551 534
552 s->output = level; 535 if (!s->output)
553 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
554 s->pics[0].isr_ack &= ~(1 << irq);
555 s->wakeup_needed = true; 536 s->wakeup_needed = true;
556 } 537 s->output = level;
557} 538}
558 539
559static const struct kvm_io_device_ops picdev_ops = { 540static const struct kvm_io_device_ops picdev_ops = {
@@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
575 s->pics[1].elcr_mask = 0xde; 556 s->pics[1].elcr_mask = 0xde;
576 s->pics[0].pics_state = s; 557 s->pics[0].pics_state = s;
577 s->pics[1].pics_state = s; 558 s->pics[1].pics_state = s;
578 s->pics[0].isr_ack = 0xff;
579 s->pics[1].isr_ack = 0xff;
580 559
581 /* 560 /*
582 * Initialize PIO device 561 * Initialize PIO device
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93cf9d0d3653..2b2255b1f04b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
417 case APIC_DM_INIT: 417 case APIC_DM_INIT:
418 if (level) { 418 if (level) {
419 result = 1; 419 result = 1;
420 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
421 printk(KERN_DEBUG
422 "INIT on a runnable vcpu %d\n",
423 vcpu->vcpu_id);
424 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 420 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
425 kvm_make_request(KVM_REQ_EVENT, vcpu); 421 kvm_make_request(KVM_REQ_EVENT, vcpu);
426 kvm_vcpu_kick(vcpu); 422 kvm_vcpu_kick(vcpu);
@@ -875,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
875 871
876 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); 872 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
877 873
878 if (vcpu->arch.apic->regs_page) 874 if (vcpu->arch.apic->regs)
879 __free_page(vcpu->arch.apic->regs_page); 875 free_page((unsigned long)vcpu->arch.apic->regs);
880 876
881 kfree(vcpu->arch.apic); 877 kfree(vcpu->arch.apic);
882} 878}
@@ -1065,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1065 1061
1066 vcpu->arch.apic = apic; 1062 vcpu->arch.apic = apic;
1067 1063
1068 apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); 1064 apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
1069 if (apic->regs_page == NULL) { 1065 if (!apic->regs) {
1070 printk(KERN_ERR "malloc apic regs error for vcpu %x\n", 1066 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
1071 vcpu->vcpu_id); 1067 vcpu->vcpu_id);
1072 goto nomem_free_apic; 1068 goto nomem_free_apic;
1073 } 1069 }
1074 apic->regs = page_address(apic->regs_page);
1075 apic->vcpu = vcpu; 1070 apic->vcpu = vcpu;
1076 1071
1077 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, 1072 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f5fe32c5edad..52c9e6b9e725 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,7 +13,6 @@ struct kvm_lapic {
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending; 15 bool irr_pending;
16 struct page *regs_page;
17 void *regs; 16 void *regs;
18 gpa_t vapic_addr; 17 gpa_t vapic_addr;
19 struct page *vapic_page; 18 struct page *vapic_page;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f02b8edc3d44..22fae7593ee7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -111,9 +111,6 @@ module_param(oos_shadow, bool, 0644);
111#define PT64_LEVEL_SHIFT(level) \ 111#define PT64_LEVEL_SHIFT(level) \
112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) 112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
113 113
114#define PT64_LEVEL_MASK(level) \
115 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
116
117#define PT64_INDEX(address, level)\ 114#define PT64_INDEX(address, level)\
118 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) 115 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
119 116
@@ -123,8 +120,6 @@ module_param(oos_shadow, bool, 0644);
123#define PT32_LEVEL_SHIFT(level) \ 120#define PT32_LEVEL_SHIFT(level) \
124 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) 121 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
125 122
126#define PT32_LEVEL_MASK(level) \
127 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
128#define PT32_LVL_OFFSET_MASK(level) \ 123#define PT32_LVL_OFFSET_MASK(level) \
129 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ 124 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
130 * PT32_LEVEL_BITS))) - 1)) 125 * PT32_LEVEL_BITS))) - 1))
@@ -379,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
379static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 374static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
380 int min) 375 int min)
381{ 376{
382 struct page *page; 377 void *page;
383 378
384 if (cache->nobjs >= min) 379 if (cache->nobjs >= min)
385 return 0; 380 return 0;
386 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 381 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
387 page = alloc_page(GFP_KERNEL); 382 page = (void *)__get_free_page(GFP_KERNEL);
388 if (!page) 383 if (!page)
389 return -ENOMEM; 384 return -ENOMEM;
390 cache->objects[cache->nobjs++] = page_address(page); 385 cache->objects[cache->nobjs++] = page;
391 } 386 }
392 return 0; 387 return 0;
393} 388}
@@ -554,13 +549,23 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
554 return ret; 549 return ret;
555} 550}
556 551
557static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) 552static struct kvm_memory_slot *
553gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
554 bool no_dirty_log)
558{ 555{
559 struct kvm_memory_slot *slot; 556 struct kvm_memory_slot *slot;
560 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 557
561 if (slot && slot->dirty_bitmap) 558 slot = gfn_to_memslot(vcpu->kvm, gfn);
562 return true; 559 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
563 return false; 560 (no_dirty_log && slot->dirty_bitmap))
561 slot = NULL;
562
563 return slot;
564}
565
566static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
564} 569}
565 570
566static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 571static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -1032,9 +1037,9 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1032 ASSERT(is_empty_shadow_page(sp->spt)); 1037 ASSERT(is_empty_shadow_page(sp->spt));
1033 hlist_del(&sp->hash_link); 1038 hlist_del(&sp->hash_link);
1034 list_del(&sp->link); 1039 list_del(&sp->link);
1035 __free_page(virt_to_page(sp->spt)); 1040 free_page((unsigned long)sp->spt);
1036 if (!sp->role.direct) 1041 if (!sp->role.direct)
1037 __free_page(virt_to_page(sp->gfns)); 1042 free_page((unsigned long)sp->gfns);
1038 kmem_cache_free(mmu_page_header_cache, sp); 1043 kmem_cache_free(mmu_page_header_cache, sp);
1039 kvm_mod_used_mmu_pages(kvm, -1); 1044 kvm_mod_used_mmu_pages(kvm, -1);
1040} 1045}
@@ -1199,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1199{ 1204{
1200} 1205}
1201 1206
1207static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
1208 struct kvm_mmu_page *sp, u64 *spte,
1209 const void *pte, unsigned long mmu_seq)
1210{
1211 WARN_ON(1);
1212}
1213
1202#define KVM_PAGE_ARRAY_NR 16 1214#define KVM_PAGE_ARRAY_NR 16
1203 1215
1204struct kvm_mmu_pages { 1216struct kvm_mmu_pages {
@@ -2150,26 +2162,13 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2150{ 2162{
2151} 2163}
2152 2164
2153static struct kvm_memory_slot *
2154pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
2155{
2156 struct kvm_memory_slot *slot;
2157
2158 slot = gfn_to_memslot(vcpu->kvm, gfn);
2159 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
2160 (no_dirty_log && slot->dirty_bitmap))
2161 slot = NULL;
2162
2163 return slot;
2164}
2165
2166static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2165static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2167 bool no_dirty_log) 2166 bool no_dirty_log)
2168{ 2167{
2169 struct kvm_memory_slot *slot; 2168 struct kvm_memory_slot *slot;
2170 unsigned long hva; 2169 unsigned long hva;
2171 2170
2172 slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); 2171 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2173 if (!slot) { 2172 if (!slot) {
2174 get_page(bad_page); 2173 get_page(bad_page);
2175 return page_to_pfn(bad_page); 2174 return page_to_pfn(bad_page);
@@ -2190,7 +2189,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2190 gfn_t gfn; 2189 gfn_t gfn;
2191 2190
2192 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); 2191 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2193 if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) 2192 if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
2194 return -1; 2193 return -1;
2195 2194
2196 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); 2195 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
@@ -2804,6 +2803,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2804 context->prefetch_page = nonpaging_prefetch_page; 2803 context->prefetch_page = nonpaging_prefetch_page;
2805 context->sync_page = nonpaging_sync_page; 2804 context->sync_page = nonpaging_sync_page;
2806 context->invlpg = nonpaging_invlpg; 2805 context->invlpg = nonpaging_invlpg;
2806 context->update_pte = nonpaging_update_pte;
2807 context->root_level = 0; 2807 context->root_level = 0;
2808 context->shadow_root_level = PT32E_ROOT_LEVEL; 2808 context->shadow_root_level = PT32E_ROOT_LEVEL;
2809 context->root_hpa = INVALID_PAGE; 2809 context->root_hpa = INVALID_PAGE;
@@ -2933,6 +2933,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2933 context->prefetch_page = paging64_prefetch_page; 2933 context->prefetch_page = paging64_prefetch_page;
2934 context->sync_page = paging64_sync_page; 2934 context->sync_page = paging64_sync_page;
2935 context->invlpg = paging64_invlpg; 2935 context->invlpg = paging64_invlpg;
2936 context->update_pte = paging64_update_pte;
2936 context->free = paging_free; 2937 context->free = paging_free;
2937 context->root_level = level; 2938 context->root_level = level;
2938 context->shadow_root_level = level; 2939 context->shadow_root_level = level;
@@ -2961,6 +2962,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
2961 context->prefetch_page = paging32_prefetch_page; 2962 context->prefetch_page = paging32_prefetch_page;
2962 context->sync_page = paging32_sync_page; 2963 context->sync_page = paging32_sync_page;
2963 context->invlpg = paging32_invlpg; 2964 context->invlpg = paging32_invlpg;
2965 context->update_pte = paging32_update_pte;
2964 context->root_level = PT32_ROOT_LEVEL; 2966 context->root_level = PT32_ROOT_LEVEL;
2965 context->shadow_root_level = PT32E_ROOT_LEVEL; 2967 context->shadow_root_level = PT32E_ROOT_LEVEL;
2966 context->root_hpa = INVALID_PAGE; 2968 context->root_hpa = INVALID_PAGE;
@@ -2985,6 +2987,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2985 context->prefetch_page = nonpaging_prefetch_page; 2987 context->prefetch_page = nonpaging_prefetch_page;
2986 context->sync_page = nonpaging_sync_page; 2988 context->sync_page = nonpaging_sync_page;
2987 context->invlpg = nonpaging_invlpg; 2989 context->invlpg = nonpaging_invlpg;
2990 context->update_pte = nonpaging_update_pte;
2988 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2991 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2989 context->root_hpa = INVALID_PAGE; 2992 context->root_hpa = INVALID_PAGE;
2990 context->direct_map = true; 2993 context->direct_map = true;
@@ -3089,8 +3092,6 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3089 3092
3090static int init_kvm_mmu(struct kvm_vcpu *vcpu) 3093static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3091{ 3094{
3092 vcpu->arch.update_pte.pfn = bad_pfn;
3093
3094 if (mmu_is_nested(vcpu)) 3095 if (mmu_is_nested(vcpu))
3095 return init_kvm_nested_mmu(vcpu); 3096 return init_kvm_nested_mmu(vcpu);
3096 else if (tdp_enabled) 3097 else if (tdp_enabled)
@@ -3164,7 +3165,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3164static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3165 struct kvm_mmu_page *sp, 3166 struct kvm_mmu_page *sp,
3166 u64 *spte, 3167 u64 *spte,
3167 const void *new) 3168 const void *new, unsigned long mmu_seq)
3168{ 3169{
3169 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 3170 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
3170 ++vcpu->kvm->stat.mmu_pde_zapped; 3171 ++vcpu->kvm->stat.mmu_pde_zapped;
@@ -3172,10 +3173,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3172 } 3173 }
3173 3174
3174 ++vcpu->kvm->stat.mmu_pte_updated; 3175 ++vcpu->kvm->stat.mmu_pte_updated;
3175 if (!sp->role.cr4_pae) 3176 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq);
3176 paging32_update_pte(vcpu, sp, spte, new);
3177 else
3178 paging64_update_pte(vcpu, sp, spte, new);
3179} 3177}
3180 3178
3181static bool need_remote_flush(u64 old, u64 new) 3179static bool need_remote_flush(u64 old, u64 new)
@@ -3210,28 +3208,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
3210 return !!(spte && (*spte & shadow_accessed_mask)); 3208 return !!(spte && (*spte & shadow_accessed_mask));
3211} 3209}
3212 3210
3213static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3214 u64 gpte)
3215{
3216 gfn_t gfn;
3217 pfn_t pfn;
3218
3219 if (!is_present_gpte(gpte))
3220 return;
3221 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
3222
3223 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
3224 smp_rmb();
3225 pfn = gfn_to_pfn(vcpu->kvm, gfn);
3226
3227 if (is_error_pfn(pfn)) {
3228 kvm_release_pfn_clean(pfn);
3229 return;
3230 }
3231 vcpu->arch.update_pte.gfn = gfn;
3232 vcpu->arch.update_pte.pfn = pfn;
3233}
3234
3235static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) 3211static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
3236{ 3212{
3237 u64 *spte = vcpu->arch.last_pte_updated; 3213 u64 *spte = vcpu->arch.last_pte_updated;
@@ -3253,21 +3229,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3253 struct kvm_mmu_page *sp; 3229 struct kvm_mmu_page *sp;
3254 struct hlist_node *node; 3230 struct hlist_node *node;
3255 LIST_HEAD(invalid_list); 3231 LIST_HEAD(invalid_list);
3256 u64 entry, gentry; 3232 unsigned long mmu_seq;
3257 u64 *spte; 3233 u64 entry, gentry, *spte;
3258 unsigned offset = offset_in_page(gpa); 3234 unsigned pte_size, page_offset, misaligned, quadrant, offset;
3259 unsigned pte_size; 3235 int level, npte, invlpg_counter, r, flooded = 0;
3260 unsigned page_offset;
3261 unsigned misaligned;
3262 unsigned quadrant;
3263 int level;
3264 int flooded = 0;
3265 int npte;
3266 int r;
3267 int invlpg_counter;
3268 bool remote_flush, local_flush, zap_page; 3236 bool remote_flush, local_flush, zap_page;
3269 3237
3270 zap_page = remote_flush = local_flush = false; 3238 zap_page = remote_flush = local_flush = false;
3239 offset = offset_in_page(gpa);
3271 3240
3272 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 3241 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
3273 3242
@@ -3275,9 +3244,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3275 3244
3276 /* 3245 /*
3277 * Assume that the pte write on a page table of the same type 3246 * Assume that the pte write on a page table of the same type
3278 * as the current vcpu paging mode. This is nearly always true 3247 * as the current vcpu paging mode since we update the sptes only
3279 * (might be false while changing modes). Note it is verified later 3248 * when they have the same mode.
3280 * by update_pte().
3281 */ 3249 */
3282 if ((is_pae(vcpu) && bytes == 4) || !new) { 3250 if ((is_pae(vcpu) && bytes == 4) || !new) {
3283 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 3251 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
@@ -3303,15 +3271,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3303 break; 3271 break;
3304 } 3272 }
3305 3273
3306 mmu_guess_page_from_pte_write(vcpu, gpa, gentry); 3274 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3275 smp_rmb();
3276
3307 spin_lock(&vcpu->kvm->mmu_lock); 3277 spin_lock(&vcpu->kvm->mmu_lock);
3308 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) 3278 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3309 gentry = 0; 3279 gentry = 0;
3310 kvm_mmu_access_page(vcpu, gfn);
3311 kvm_mmu_free_some_pages(vcpu); 3280 kvm_mmu_free_some_pages(vcpu);
3312 ++vcpu->kvm->stat.mmu_pte_write; 3281 ++vcpu->kvm->stat.mmu_pte_write;
3313 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); 3282 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
3314 if (guest_initiated) { 3283 if (guest_initiated) {
3284 kvm_mmu_access_page(vcpu, gfn);
3315 if (gfn == vcpu->arch.last_pt_write_gfn 3285 if (gfn == vcpu->arch.last_pt_write_gfn
3316 && !last_updated_pte_accessed(vcpu)) { 3286 && !last_updated_pte_accessed(vcpu)) {
3317 ++vcpu->arch.last_pt_write_count; 3287 ++vcpu->arch.last_pt_write_count;
@@ -3375,7 +3345,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3375 if (gentry && 3345 if (gentry &&
3376 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3346 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3377 & mask.word)) 3347 & mask.word))
3378 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 3348 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry,
3349 mmu_seq);
3379 if (!remote_flush && need_remote_flush(entry, *spte)) 3350 if (!remote_flush && need_remote_flush(entry, *spte))
3380 remote_flush = true; 3351 remote_flush = true;
3381 ++spte; 3352 ++spte;
@@ -3385,10 +3356,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3385 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3356 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3386 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); 3357 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
3387 spin_unlock(&vcpu->kvm->mmu_lock); 3358 spin_unlock(&vcpu->kvm->mmu_lock);
3388 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
3389 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
3390 vcpu->arch.update_pte.pfn = bad_pfn;
3391 }
3392} 3359}
3393 3360
3394int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 3361int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3538,14 +3505,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3538 if (!test_bit(slot, sp->slot_bitmap)) 3505 if (!test_bit(slot, sp->slot_bitmap))
3539 continue; 3506 continue;
3540 3507
3541 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3542 continue;
3543
3544 pt = sp->spt; 3508 pt = sp->spt;
3545 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3509 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3510 if (!is_shadow_present_pte(pt[i]) ||
3511 !is_last_spte(pt[i], sp->role.level))
3512 continue;
3513
3514 if (is_large_pte(pt[i])) {
3515 drop_spte(kvm, &pt[i],
3516 shadow_trap_nonpresent_pte);
3517 --kvm->stat.lpages;
3518 continue;
3519 }
3520
3546 /* avoid RMW */ 3521 /* avoid RMW */
3547 if (is_writable_pte(pt[i])) 3522 if (is_writable_pte(pt[i]))
3548 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); 3523 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3524 }
3549 } 3525 }
3550 kvm_flush_remote_tlbs(kvm); 3526 kvm_flush_remote_tlbs(kvm);
3551} 3527}
@@ -3583,7 +3559,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3583 if (nr_to_scan == 0) 3559 if (nr_to_scan == 0)
3584 goto out; 3560 goto out;
3585 3561
3586 spin_lock(&kvm_lock); 3562 raw_spin_lock(&kvm_lock);
3587 3563
3588 list_for_each_entry(kvm, &vm_list, vm_list) { 3564 list_for_each_entry(kvm, &vm_list, vm_list) {
3589 int idx, freed_pages; 3565 int idx, freed_pages;
@@ -3606,7 +3582,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3606 if (kvm_freed) 3582 if (kvm_freed)
3607 list_move_tail(&kvm_freed->vm_list, &vm_list); 3583 list_move_tail(&kvm_freed->vm_list, &vm_list);
3608 3584
3609 spin_unlock(&kvm_lock); 3585 raw_spin_unlock(&kvm_lock);
3610 3586
3611out: 3587out:
3612 return percpu_counter_read_positive(&kvm_total_used_mmu_pages); 3588 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bccc24c4181..c6397795d865 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -31,7 +31,6 @@
31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) 31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) 32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
34 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
35 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
36 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
37 #define PT_MAX_FULL_LEVELS 4 36 #define PT_MAX_FULL_LEVELS 4
@@ -48,7 +47,6 @@
48 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) 47 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
49 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) 48 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
50 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
51 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
52 #define PT_LEVEL_BITS PT32_LEVEL_BITS 50 #define PT_LEVEL_BITS PT32_LEVEL_BITS
53 #define PT_MAX_FULL_LEVELS 2 51 #define PT_MAX_FULL_LEVELS 2
54 #define CMPXCHG cmpxchg 52 #define CMPXCHG cmpxchg
@@ -327,7 +325,7 @@ no_present:
327} 325}
328 326
329static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 327static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
330 u64 *spte, const void *pte) 328 u64 *spte, const void *pte, unsigned long mmu_seq)
331{ 329{
332 pt_element_t gpte; 330 pt_element_t gpte;
333 unsigned pte_access; 331 unsigned pte_access;
@@ -339,16 +337,16 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
339 337
340 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 338 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
341 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 339 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
342 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 340 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
341 if (is_error_pfn(pfn)) {
342 kvm_release_pfn_clean(pfn);
343 return; 343 return;
344 pfn = vcpu->arch.update_pte.pfn; 344 }
345 if (is_error_pfn(pfn)) 345 if (mmu_notifier_retry(vcpu, mmu_seq))
346 return;
347 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
348 return; 346 return;
349 kvm_get_pfn(pfn); 347
350 /* 348 /*
351 * we call mmu_set_spte() with host_writable = true beacuse that 349 * we call mmu_set_spte() with host_writable = true because that
352 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 350 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
353 */ 351 */
354 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 352 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -829,7 +827,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
829#undef FNAME 827#undef FNAME
830#undef PT_BASE_ADDR_MASK 828#undef PT_BASE_ADDR_MASK
831#undef PT_INDEX 829#undef PT_INDEX
832#undef PT_LEVEL_MASK
833#undef PT_LVL_ADDR_MASK 830#undef PT_LVL_ADDR_MASK
834#undef PT_LVL_OFFSET_MASK 831#undef PT_LVL_OFFSET_MASK
835#undef PT_LEVEL_BITS 832#undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 54ce246a383e..6bb15d583e47 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -135,6 +135,8 @@ struct vcpu_svm {
135 135
136 u32 *msrpm; 136 u32 *msrpm;
137 137
138 ulong nmi_iret_rip;
139
138 struct nested_state nested; 140 struct nested_state nested;
139 141
140 bool nmi_singlestep; 142 bool nmi_singlestep;
@@ -1153,8 +1155,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1153 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); 1155 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1154 load_gs_index(svm->host.gs); 1156 load_gs_index(svm->host.gs);
1155#else 1157#else
1158#ifdef CONFIG_X86_32_LAZY_GS
1156 loadsegment(gs, svm->host.gs); 1159 loadsegment(gs, svm->host.gs);
1157#endif 1160#endif
1161#endif
1158 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1162 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1159 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1163 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1160} 1164}
@@ -2653,6 +2657,7 @@ static int iret_interception(struct vcpu_svm *svm)
2653 ++svm->vcpu.stat.nmi_window_exits; 2657 ++svm->vcpu.stat.nmi_window_exits;
2654 clr_intercept(svm, INTERCEPT_IRET); 2658 clr_intercept(svm, INTERCEPT_IRET);
2655 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2659 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2660 svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2656 return 1; 2661 return 1;
2657} 2662}
2658 2663
@@ -2777,6 +2782,8 @@ static int dr_interception(struct vcpu_svm *svm)
2777 kvm_register_write(&svm->vcpu, reg, val); 2782 kvm_register_write(&svm->vcpu, reg, val);
2778 } 2783 }
2779 2784
2785 skip_emulated_instruction(&svm->vcpu);
2786
2780 return 1; 2787 return 1;
2781} 2788}
2782 2789
@@ -3472,7 +3479,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3472 3479
3473 svm->int3_injected = 0; 3480 svm->int3_injected = 0;
3474 3481
3475 if (svm->vcpu.arch.hflags & HF_IRET_MASK) { 3482 /*
3483 * If we've made progress since setting HF_IRET_MASK, we've
3484 * executed an IRET and can allow NMI injection.
3485 */
3486 if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3487 && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3476 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3488 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3477 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3489 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3478 } 3490 }
@@ -3639,19 +3651,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3639 wrmsrl(MSR_GS_BASE, svm->host.gs_base); 3651 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3640#else 3652#else
3641 loadsegment(fs, svm->host.fs); 3653 loadsegment(fs, svm->host.fs);
3654#ifndef CONFIG_X86_32_LAZY_GS
3655 loadsegment(gs, svm->host.gs);
3656#endif
3642#endif 3657#endif
3643 3658
3644 reload_tss(vcpu); 3659 reload_tss(vcpu);
3645 3660
3646 local_irq_disable(); 3661 local_irq_disable();
3647 3662
3648 stgi();
3649
3650 vcpu->arch.cr2 = svm->vmcb->save.cr2; 3663 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3651 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 3664 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3652 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 3665 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3653 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 3666 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3654 3667
3668 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3669 kvm_before_handle_nmi(&svm->vcpu);
3670
3671 stgi();
3672
3673 /* Any pending NMI will happen here */
3674
3675 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3676 kvm_after_handle_nmi(&svm->vcpu);
3677
3655 sync_cr8_to_lapic(vcpu); 3678 sync_cr8_to_lapic(vcpu);
3656 3679
3657 svm->next_rip = 0; 3680 svm->next_rip = 0;
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index fc7a101c4a35..abd86e865be3 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -25,7 +25,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
25 25
26 /* 26 /*
27 * There is a race window between reading and incrementing, but we do 27 * There is a race window between reading and incrementing, but we do
28 * not care about potentially loosing timer events in the !reinject 28 * not care about potentially losing timer events in the !reinject
29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked 29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
30 * in vcpu_enter_guest. 30 * in vcpu_enter_guest.
31 */ 31 */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 1357d7cf4ec8..db932760ea82 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -62,21 +62,21 @@ TRACE_EVENT(kvm_hv_hypercall,
62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), 62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
63 63
64 TP_STRUCT__entry( 64 TP_STRUCT__entry(
65 __field( __u16, code )
66 __field( bool, fast )
67 __field( __u16, rep_cnt ) 65 __field( __u16, rep_cnt )
68 __field( __u16, rep_idx ) 66 __field( __u16, rep_idx )
69 __field( __u64, ingpa ) 67 __field( __u64, ingpa )
70 __field( __u64, outgpa ) 68 __field( __u64, outgpa )
69 __field( __u16, code )
70 __field( bool, fast )
71 ), 71 ),
72 72
73 TP_fast_assign( 73 TP_fast_assign(
74 __entry->code = code;
75 __entry->fast = fast;
76 __entry->rep_cnt = rep_cnt; 74 __entry->rep_cnt = rep_cnt;
77 __entry->rep_idx = rep_idx; 75 __entry->rep_idx = rep_idx;
78 __entry->ingpa = ingpa; 76 __entry->ingpa = ingpa;
79 __entry->outgpa = outgpa; 77 __entry->outgpa = outgpa;
78 __entry->code = code;
79 __entry->fast = fast;
80 ), 80 ),
81 81
82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", 82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bf89ec2cfb82..5b4cdcbd154c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -93,14 +93,14 @@ module_param(yield_on_hlt, bool, S_IRUGO);
93 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 93 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
94 * ple_gap: upper bound on the amount of time between two successive 94 * ple_gap: upper bound on the amount of time between two successive
95 * executions of PAUSE in a loop. Also indicate if ple enabled. 95 * executions of PAUSE in a loop. Also indicate if ple enabled.
96 * According to test, this time is usually small than 41 cycles. 96 * According to test, this time is usually smaller than 128 cycles.
97 * ple_window: upper bound on the amount of time a guest is allowed to execute 97 * ple_window: upper bound on the amount of time a guest is allowed to execute
98 * in a PAUSE loop. Tests indicate that most spinlocks are held for 98 * in a PAUSE loop. Tests indicate that most spinlocks are held for
99 * less than 2^12 cycles 99 * less than 2^12 cycles
100 * Time is measured based on a counter that runs at the same rate as the TSC, 100 * Time is measured based on a counter that runs at the same rate as the TSC,
101 * refer SDM volume 3b section 21.6.13 & 22.1.3. 101 * refer SDM volume 3b section 21.6.13 & 22.1.3.
102 */ 102 */
103#define KVM_VMX_DEFAULT_PLE_GAP 41 103#define KVM_VMX_DEFAULT_PLE_GAP 128
104#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 104#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
105static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; 105static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
106module_param(ple_gap, int, S_IRUGO); 106module_param(ple_gap, int, S_IRUGO);
@@ -176,11 +176,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
176 return container_of(vcpu, struct vcpu_vmx, vcpu); 176 return container_of(vcpu, struct vcpu_vmx, vcpu);
177} 177}
178 178
179static int init_rmode(struct kvm *kvm);
180static u64 construct_eptp(unsigned long root_hpa); 179static u64 construct_eptp(unsigned long root_hpa);
181static void kvm_cpu_vmxon(u64 addr); 180static void kvm_cpu_vmxon(u64 addr);
182static void kvm_cpu_vmxoff(void); 181static void kvm_cpu_vmxoff(void);
183static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 182static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
183static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
184 184
185static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
186static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1333,19 +1333,25 @@ static __init int vmx_disabled_by_bios(void)
1333 1333
1334 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1334 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1335 if (msr & FEATURE_CONTROL_LOCKED) { 1335 if (msr & FEATURE_CONTROL_LOCKED) {
1336 /* launched w/ TXT and VMX disabled */
1336 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 1337 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1337 && tboot_enabled()) 1338 && tboot_enabled())
1338 return 1; 1339 return 1;
1340 /* launched w/o TXT and VMX only enabled w/ TXT */
1339 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 1341 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1342 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1340 && !tboot_enabled()) { 1343 && !tboot_enabled()) {
1341 printk(KERN_WARNING "kvm: disable TXT in the BIOS or " 1344 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1342 " activate TXT before enabling KVM\n"); 1345 "activate TXT before enabling KVM\n");
1343 return 1; 1346 return 1;
1344 } 1347 }
1348 /* launched w/o TXT and VMX disabled */
1349 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1350 && !tboot_enabled())
1351 return 1;
1345 } 1352 }
1346 1353
1347 return 0; 1354 return 0;
1348 /* locked but not enabled */
1349} 1355}
1350 1356
1351static void kvm_cpu_vmxon(u64 addr) 1357static void kvm_cpu_vmxon(u64 addr)
@@ -1683,6 +1689,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1683 vmx->emulation_required = 1; 1689 vmx->emulation_required = 1;
1684 vmx->rmode.vm86_active = 0; 1690 vmx->rmode.vm86_active = 0;
1685 1691
1692 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
1686 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); 1693 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
1687 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); 1694 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
1688 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); 1695 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
@@ -1756,6 +1763,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1756 vmx->emulation_required = 1; 1763 vmx->emulation_required = 1;
1757 vmx->rmode.vm86_active = 1; 1764 vmx->rmode.vm86_active = 1;
1758 1765
1766 /*
1767 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
1768 * vcpu. Call it here with phys address pointing 16M below 4G.
1769 */
1770 if (!vcpu->kvm->arch.tss_addr) {
1771 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
1772 "called before entering vcpu\n");
1773 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
1774 vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
1775 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1776 }
1777
1778 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
1759 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1779 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1760 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1780 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1761 1781
@@ -1794,7 +1814,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1794 1814
1795continue_rmode: 1815continue_rmode:
1796 kvm_mmu_reset_context(vcpu); 1816 kvm_mmu_reset_context(vcpu);
1797 init_rmode(vcpu->kvm);
1798} 1817}
1799 1818
1800static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1819static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -2030,23 +2049,40 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2030 vmcs_writel(GUEST_CR4, hw_cr4); 2049 vmcs_writel(GUEST_CR4, hw_cr4);
2031} 2050}
2032 2051
2033static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2034{
2035 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2036
2037 return vmcs_readl(sf->base);
2038}
2039
2040static void vmx_get_segment(struct kvm_vcpu *vcpu, 2052static void vmx_get_segment(struct kvm_vcpu *vcpu,
2041 struct kvm_segment *var, int seg) 2053 struct kvm_segment *var, int seg)
2042{ 2054{
2055 struct vcpu_vmx *vmx = to_vmx(vcpu);
2043 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2056 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2057 struct kvm_save_segment *save;
2044 u32 ar; 2058 u32 ar;
2045 2059
2060 if (vmx->rmode.vm86_active
2061 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
2062 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
2063 || seg == VCPU_SREG_GS)
2064 && !emulate_invalid_guest_state) {
2065 switch (seg) {
2066 case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
2067 case VCPU_SREG_ES: save = &vmx->rmode.es; break;
2068 case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
2069 case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
2070 case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
2071 default: BUG();
2072 }
2073 var->selector = save->selector;
2074 var->base = save->base;
2075 var->limit = save->limit;
2076 ar = save->ar;
2077 if (seg == VCPU_SREG_TR
2078 || var->selector == vmcs_read16(sf->selector))
2079 goto use_saved_rmode_seg;
2080 }
2046 var->base = vmcs_readl(sf->base); 2081 var->base = vmcs_readl(sf->base);
2047 var->limit = vmcs_read32(sf->limit); 2082 var->limit = vmcs_read32(sf->limit);
2048 var->selector = vmcs_read16(sf->selector); 2083 var->selector = vmcs_read16(sf->selector);
2049 ar = vmcs_read32(sf->ar_bytes); 2084 ar = vmcs_read32(sf->ar_bytes);
2085use_saved_rmode_seg:
2050 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) 2086 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
2051 ar = 0; 2087 ar = 0;
2052 var->type = ar & 15; 2088 var->type = ar & 15;
@@ -2060,6 +2096,18 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
2060 var->unusable = (ar >> 16) & 1; 2096 var->unusable = (ar >> 16) & 1;
2061} 2097}
2062 2098
2099static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2100{
2101 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2102 struct kvm_segment s;
2103
2104 if (to_vmx(vcpu)->rmode.vm86_active) {
2105 vmx_get_segment(vcpu, &s, seg);
2106 return s.base;
2107 }
2108 return vmcs_readl(sf->base);
2109}
2110
2063static int vmx_get_cpl(struct kvm_vcpu *vcpu) 2111static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2064{ 2112{
2065 if (!is_protmode(vcpu)) 2113 if (!is_protmode(vcpu))
@@ -2101,6 +2149,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2101 u32 ar; 2149 u32 ar;
2102 2150
2103 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 2151 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
2152 vmcs_write16(sf->selector, var->selector);
2104 vmx->rmode.tr.selector = var->selector; 2153 vmx->rmode.tr.selector = var->selector;
2105 vmx->rmode.tr.base = var->base; 2154 vmx->rmode.tr.base = var->base;
2106 vmx->rmode.tr.limit = var->limit; 2155 vmx->rmode.tr.limit = var->limit;
@@ -2361,11 +2410,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
2361 2410
2362static int init_rmode_tss(struct kvm *kvm) 2411static int init_rmode_tss(struct kvm *kvm)
2363{ 2412{
2364 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 2413 gfn_t fn;
2365 u16 data = 0; 2414 u16 data = 0;
2366 int ret = 0; 2415 int r, idx, ret = 0;
2367 int r;
2368 2416
2417 idx = srcu_read_lock(&kvm->srcu);
2418 fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
2369 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 2419 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2370 if (r < 0) 2420 if (r < 0)
2371 goto out; 2421 goto out;
@@ -2389,12 +2439,13 @@ static int init_rmode_tss(struct kvm *kvm)
2389 2439
2390 ret = 1; 2440 ret = 1;
2391out: 2441out:
2442 srcu_read_unlock(&kvm->srcu, idx);
2392 return ret; 2443 return ret;
2393} 2444}
2394 2445
2395static int init_rmode_identity_map(struct kvm *kvm) 2446static int init_rmode_identity_map(struct kvm *kvm)
2396{ 2447{
2397 int i, r, ret; 2448 int i, idx, r, ret;
2398 pfn_t identity_map_pfn; 2449 pfn_t identity_map_pfn;
2399 u32 tmp; 2450 u32 tmp;
2400 2451
@@ -2409,6 +2460,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2409 return 1; 2460 return 1;
2410 ret = 0; 2461 ret = 0;
2411 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 2462 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
2463 idx = srcu_read_lock(&kvm->srcu);
2412 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 2464 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2413 if (r < 0) 2465 if (r < 0)
2414 goto out; 2466 goto out;
@@ -2424,6 +2476,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2424 kvm->arch.ept_identity_pagetable_done = true; 2476 kvm->arch.ept_identity_pagetable_done = true;
2425 ret = 1; 2477 ret = 1;
2426out: 2478out:
2479 srcu_read_unlock(&kvm->srcu, idx);
2427 return ret; 2480 return ret;
2428} 2481}
2429 2482
@@ -2699,22 +2752,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2699 return 0; 2752 return 0;
2700} 2753}
2701 2754
2702static int init_rmode(struct kvm *kvm)
2703{
2704 int idx, ret = 0;
2705
2706 idx = srcu_read_lock(&kvm->srcu);
2707 if (!init_rmode_tss(kvm))
2708 goto exit;
2709 if (!init_rmode_identity_map(kvm))
2710 goto exit;
2711
2712 ret = 1;
2713exit:
2714 srcu_read_unlock(&kvm->srcu, idx);
2715 return ret;
2716}
2717
2718static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2755static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2719{ 2756{
2720 struct vcpu_vmx *vmx = to_vmx(vcpu); 2757 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2722,10 +2759,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2722 int ret; 2759 int ret;
2723 2760
2724 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2761 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2725 if (!init_rmode(vmx->vcpu.kvm)) {
2726 ret = -ENOMEM;
2727 goto out;
2728 }
2729 2762
2730 vmx->rmode.vm86_active = 0; 2763 vmx->rmode.vm86_active = 0;
2731 2764
@@ -2805,7 +2838,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2805 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 2838 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
2806 if (vm_need_tpr_shadow(vmx->vcpu.kvm)) 2839 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
2807 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 2840 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
2808 page_to_phys(vmx->vcpu.arch.apic->regs_page)); 2841 __pa(vmx->vcpu.arch.apic->regs));
2809 vmcs_write32(TPR_THRESHOLD, 0); 2842 vmcs_write32(TPR_THRESHOLD, 0);
2810 } 2843 }
2811 2844
@@ -2971,6 +3004,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2971 if (ret) 3004 if (ret)
2972 return ret; 3005 return ret;
2973 kvm->arch.tss_addr = addr; 3006 kvm->arch.tss_addr = addr;
3007 if (!init_rmode_tss(kvm))
3008 return -ENOMEM;
3009
2974 return 0; 3010 return 0;
2975} 3011}
2976 3012
@@ -3962,7 +3998,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
3962#define Q "l" 3998#define Q "l"
3963#endif 3999#endif
3964 4000
3965static void vmx_vcpu_run(struct kvm_vcpu *vcpu) 4001static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
3966{ 4002{
3967 struct vcpu_vmx *vmx = to_vmx(vcpu); 4003 struct vcpu_vmx *vmx = to_vmx(vcpu);
3968 4004
@@ -3991,6 +4027,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3991 asm( 4027 asm(
3992 /* Store host registers */ 4028 /* Store host registers */
3993 "push %%"R"dx; push %%"R"bp;" 4029 "push %%"R"dx; push %%"R"bp;"
4030 "push %%"R"cx \n\t" /* placeholder for guest rcx */
3994 "push %%"R"cx \n\t" 4031 "push %%"R"cx \n\t"
3995 "cmp %%"R"sp, %c[host_rsp](%0) \n\t" 4032 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
3996 "je 1f \n\t" 4033 "je 1f \n\t"
@@ -4032,10 +4069,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4032 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 4069 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
4033 ".Lkvm_vmx_return: " 4070 ".Lkvm_vmx_return: "
4034 /* Save guest registers, load host registers, keep flags */ 4071 /* Save guest registers, load host registers, keep flags */
4035 "xchg %0, (%%"R"sp) \n\t" 4072 "mov %0, %c[wordsize](%%"R"sp) \n\t"
4073 "pop %0 \n\t"
4036 "mov %%"R"ax, %c[rax](%0) \n\t" 4074 "mov %%"R"ax, %c[rax](%0) \n\t"
4037 "mov %%"R"bx, %c[rbx](%0) \n\t" 4075 "mov %%"R"bx, %c[rbx](%0) \n\t"
4038 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" 4076 "pop"Q" %c[rcx](%0) \n\t"
4039 "mov %%"R"dx, %c[rdx](%0) \n\t" 4077 "mov %%"R"dx, %c[rdx](%0) \n\t"
4040 "mov %%"R"si, %c[rsi](%0) \n\t" 4078 "mov %%"R"si, %c[rsi](%0) \n\t"
4041 "mov %%"R"di, %c[rdi](%0) \n\t" 4079 "mov %%"R"di, %c[rdi](%0) \n\t"
@@ -4053,7 +4091,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4053 "mov %%cr2, %%"R"ax \n\t" 4091 "mov %%cr2, %%"R"ax \n\t"
4054 "mov %%"R"ax, %c[cr2](%0) \n\t" 4092 "mov %%"R"ax, %c[cr2](%0) \n\t"
4055 4093
4056 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" 4094 "pop %%"R"bp; pop %%"R"dx \n\t"
4057 "setbe %c[fail](%0) \n\t" 4095 "setbe %c[fail](%0) \n\t"
4058 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 4096 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
4059 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 4097 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
@@ -4076,7 +4114,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4076 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), 4114 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
4077 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), 4115 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
4078#endif 4116#endif
4079 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 4117 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
4118 [wordsize]"i"(sizeof(ulong))
4080 : "cc", "memory" 4119 : "cc", "memory"
4081 , R"ax", R"bx", R"di", R"si" 4120 , R"ax", R"bx", R"di", R"si"
4082#ifdef CONFIG_X86_64 4121#ifdef CONFIG_X86_64
@@ -4183,8 +4222,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4183 if (!kvm->arch.ept_identity_map_addr) 4222 if (!kvm->arch.ept_identity_map_addr)
4184 kvm->arch.ept_identity_map_addr = 4223 kvm->arch.ept_identity_map_addr =
4185 VMX_EPT_IDENTITY_PAGETABLE_ADDR; 4224 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4225 err = -ENOMEM;
4186 if (alloc_identity_pagetable(kvm) != 0) 4226 if (alloc_identity_pagetable(kvm) != 0)
4187 goto free_vmcs; 4227 goto free_vmcs;
4228 if (!init_rmode_identity_map(kvm))
4229 goto free_vmcs;
4188 } 4230 }
4189 4231
4190 return &vmx->vcpu; 4232 return &vmx->vcpu;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bcc0efce85bf..934b4c6b0bf9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -81,9 +81,10 @@
81 * - enable LME and LMA per default on 64 bit KVM 81 * - enable LME and LMA per default on 64 bit KVM
82 */ 82 */
83#ifdef CONFIG_X86_64 83#ifdef CONFIG_X86_64
84static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 84static
85u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
85#else 86#else
86static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 87static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
87#endif 88#endif
88 89
89#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 90#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
@@ -360,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
360 361
361void kvm_inject_nmi(struct kvm_vcpu *vcpu) 362void kvm_inject_nmi(struct kvm_vcpu *vcpu)
362{ 363{
364 kvm_make_request(KVM_REQ_NMI, vcpu);
363 kvm_make_request(KVM_REQ_EVENT, vcpu); 365 kvm_make_request(KVM_REQ_EVENT, vcpu);
364 vcpu->arch.nmi_pending = 1;
365} 366}
366EXPORT_SYMBOL_GPL(kvm_inject_nmi); 367EXPORT_SYMBOL_GPL(kvm_inject_nmi);
367 368
@@ -525,8 +526,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
525 526
526 kvm_x86_ops->set_cr0(vcpu, cr0); 527 kvm_x86_ops->set_cr0(vcpu, cr0);
527 528
528 if ((cr0 ^ old_cr0) & X86_CR0_PG) 529 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
529 kvm_clear_async_pf_completion_queue(vcpu); 530 kvm_clear_async_pf_completion_queue(vcpu);
531 kvm_async_pf_hash_reset(vcpu);
532 }
530 533
531 if ((cr0 ^ old_cr0) & update_bits) 534 if ((cr0 ^ old_cr0) & update_bits)
532 kvm_mmu_reset_context(vcpu); 535 kvm_mmu_reset_context(vcpu);
@@ -1017,7 +1020,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1017 unsigned long flags; 1020 unsigned long flags;
1018 s64 sdiff; 1021 s64 sdiff;
1019 1022
1020 spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1023 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1021 offset = data - native_read_tsc(); 1024 offset = data - native_read_tsc();
1022 ns = get_kernel_ns(); 1025 ns = get_kernel_ns();
1023 elapsed = ns - kvm->arch.last_tsc_nsec; 1026 elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -1028,7 +1031,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1028 /* 1031 /*
1029 * Special case: close write to TSC within 5 seconds of 1032 * Special case: close write to TSC within 5 seconds of
1030 * another CPU is interpreted as an attempt to synchronize 1033 * another CPU is interpreted as an attempt to synchronize
1031 * The 5 seconds is to accomodate host load / swapping as 1034 * The 5 seconds is to accommodate host load / swapping as
1032 * well as any reset of TSC during the boot process. 1035 * well as any reset of TSC during the boot process.
1033 * 1036 *
1034 * In that case, for a reliable TSC, we can match TSC offsets, 1037 * In that case, for a reliable TSC, we can match TSC offsets,
@@ -1050,7 +1053,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1050 kvm->arch.last_tsc_write = data; 1053 kvm->arch.last_tsc_write = data;
1051 kvm->arch.last_tsc_offset = offset; 1054 kvm->arch.last_tsc_offset = offset;
1052 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1055 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1053 spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1056 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1054 1057
1055 /* Reset of TSC must disable overshoot protection below */ 1058 /* Reset of TSC must disable overshoot protection below */
1056 vcpu->arch.hv_clock.tsc_timestamp = 0; 1059 vcpu->arch.hv_clock.tsc_timestamp = 0;
@@ -1453,6 +1456,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1453 return 0; 1456 return 0;
1454} 1457}
1455 1458
1459static void kvmclock_reset(struct kvm_vcpu *vcpu)
1460{
1461 if (vcpu->arch.time_page) {
1462 kvm_release_page_dirty(vcpu->arch.time_page);
1463 vcpu->arch.time_page = NULL;
1464 }
1465}
1466
1456int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1467int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1457{ 1468{
1458 switch (msr) { 1469 switch (msr) {
@@ -1510,10 +1521,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1510 break; 1521 break;
1511 case MSR_KVM_SYSTEM_TIME_NEW: 1522 case MSR_KVM_SYSTEM_TIME_NEW:
1512 case MSR_KVM_SYSTEM_TIME: { 1523 case MSR_KVM_SYSTEM_TIME: {
1513 if (vcpu->arch.time_page) { 1524 kvmclock_reset(vcpu);
1514 kvm_release_page_dirty(vcpu->arch.time_page);
1515 vcpu->arch.time_page = NULL;
1516 }
1517 1525
1518 vcpu->arch.time = data; 1526 vcpu->arch.time = data;
1519 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 1527 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -1592,6 +1600,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1592 } else 1600 } else
1593 return set_msr_hyperv(vcpu, msr, data); 1601 return set_msr_hyperv(vcpu, msr, data);
1594 break; 1602 break;
1603 case MSR_IA32_BBL_CR_CTL3:
1604 /* Drop writes to this legacy MSR -- see rdmsr
1605 * counterpart for further detail.
1606 */
1607 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1608 break;
1595 default: 1609 default:
1596 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1610 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1597 return xen_hvm_config(vcpu, data); 1611 return xen_hvm_config(vcpu, data);
@@ -1846,6 +1860,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1846 } else 1860 } else
1847 return get_msr_hyperv(vcpu, msr, pdata); 1861 return get_msr_hyperv(vcpu, msr, pdata);
1848 break; 1862 break;
1863 case MSR_IA32_BBL_CR_CTL3:
1864 /* This legacy MSR exists but isn't fully documented in current
1865 * silicon. It is however accessed by winxp in very narrow
1866 * scenarios where it sets bit #19, itself documented as
1867 * a "reserved" bit. Best effort attempt to source coherent
1868 * read data here should the balance of the register be
1869 * interpreted by the guest:
1870 *
1871 * L2 cache control register 3: 64GB range, 256KB size,
1872 * enabled, latency 0x1, configured
1873 */
1874 data = 0xbe702111;
1875 break;
1849 default: 1876 default:
1850 if (!ignore_msrs) { 1877 if (!ignore_msrs) {
1851 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1878 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -2100,8 +2127,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2100 if (check_tsc_unstable()) { 2127 if (check_tsc_unstable()) {
2101 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); 2128 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
2102 vcpu->arch.tsc_catchup = 1; 2129 vcpu->arch.tsc_catchup = 1;
2103 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2104 } 2130 }
2131 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2105 if (vcpu->cpu != cpu) 2132 if (vcpu->cpu != cpu)
2106 kvm_migrate_timers(vcpu); 2133 kvm_migrate_timers(vcpu);
2107 vcpu->cpu = cpu; 2134 vcpu->cpu = cpu;
@@ -2368,9 +2395,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2368 int i; 2395 int i;
2369 2396
2370 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2397 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2371 for (i = 1; *nent < maxnent; ++i) { 2398 for (i = 1; *nent < maxnent && i < 64; ++i) {
2372 if (entry[i - 1].eax == 0 && i != 2) 2399 if (entry[i].eax == 0)
2373 break; 2400 continue;
2374 do_cpuid_1_ent(&entry[i], function, i); 2401 do_cpuid_1_ent(&entry[i], function, i);
2375 entry[i].flags |= 2402 entry[i].flags |=
2376 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2403 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
@@ -2575,9 +2602,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2575 if (mce->status & MCI_STATUS_UC) { 2602 if (mce->status & MCI_STATUS_UC) {
2576 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2603 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2577 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 2604 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2578 printk(KERN_DEBUG "kvm: set_mce: "
2579 "injects mce exception while "
2580 "previous one is in progress!\n");
2581 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2605 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2582 return 0; 2606 return 0;
2583 } 2607 }
@@ -2648,8 +2672,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2648 vcpu->arch.interrupt.pending = events->interrupt.injected; 2672 vcpu->arch.interrupt.pending = events->interrupt.injected;
2649 vcpu->arch.interrupt.nr = events->interrupt.nr; 2673 vcpu->arch.interrupt.nr = events->interrupt.nr;
2650 vcpu->arch.interrupt.soft = events->interrupt.soft; 2674 vcpu->arch.interrupt.soft = events->interrupt.soft;
2651 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2652 kvm_pic_clear_isr_ack(vcpu->kvm);
2653 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 2675 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2654 kvm_x86_ops->set_interrupt_shadow(vcpu, 2676 kvm_x86_ops->set_interrupt_shadow(vcpu,
2655 events->interrupt.shadow); 2677 events->interrupt.shadow);
@@ -4140,8 +4162,8 @@ static unsigned long emulator_get_cached_segment_base(int seg,
4140 return get_segment_base(vcpu, seg); 4162 return get_segment_base(vcpu, seg);
4141} 4163}
4142 4164
4143static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 4165static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
4144 struct kvm_vcpu *vcpu) 4166 int seg, struct kvm_vcpu *vcpu)
4145{ 4167{
4146 struct kvm_segment var; 4168 struct kvm_segment var;
4147 4169
@@ -4154,6 +4176,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
4154 var.limit >>= 12; 4176 var.limit >>= 12;
4155 set_desc_limit(desc, var.limit); 4177 set_desc_limit(desc, var.limit);
4156 set_desc_base(desc, (unsigned long)var.base); 4178 set_desc_base(desc, (unsigned long)var.base);
4179#ifdef CONFIG_X86_64
4180 if (base3)
4181 *base3 = var.base >> 32;
4182#endif
4157 desc->type = var.type; 4183 desc->type = var.type;
4158 desc->s = var.s; 4184 desc->s = var.s;
4159 desc->dpl = var.dpl; 4185 desc->dpl = var.dpl;
@@ -4166,8 +4192,8 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
4166 return true; 4192 return true;
4167} 4193}
4168 4194
4169static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, 4195static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
4170 struct kvm_vcpu *vcpu) 4196 int seg, struct kvm_vcpu *vcpu)
4171{ 4197{
4172 struct kvm_segment var; 4198 struct kvm_segment var;
4173 4199
@@ -4175,6 +4201,9 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
4175 kvm_get_segment(vcpu, &var, seg); 4201 kvm_get_segment(vcpu, &var, seg);
4176 4202
4177 var.base = get_desc_base(desc); 4203 var.base = get_desc_base(desc);
4204#ifdef CONFIG_X86_64
4205 var.base |= ((u64)base3) << 32;
4206#endif
4178 var.limit = get_desc_limit(desc); 4207 var.limit = get_desc_limit(desc);
4179 if (desc->g) 4208 if (desc->g)
4180 var.limit = (var.limit << 12) | 0xfff; 4209 var.limit = (var.limit << 12) | 0xfff;
@@ -4390,41 +4419,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4390 vcpu->arch.emulate_ctxt.have_exception = false; 4419 vcpu->arch.emulate_ctxt.have_exception = false;
4391 vcpu->arch.emulate_ctxt.perm_ok = false; 4420 vcpu->arch.emulate_ctxt.perm_ok = false;
4392 4421
4422 vcpu->arch.emulate_ctxt.only_vendor_specific_insn
4423 = emulation_type & EMULTYPE_TRAP_UD;
4424
4393 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); 4425 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4394 if (r == X86EMUL_PROPAGATE_FAULT)
4395 goto done;
4396 4426
4397 trace_kvm_emulate_insn_start(vcpu); 4427 trace_kvm_emulate_insn_start(vcpu);
4398
4399 /* Only allow emulation of specific instructions on #UD
4400 * (namely VMMCALL, sysenter, sysexit, syscall)*/
4401 if (emulation_type & EMULTYPE_TRAP_UD) {
4402 if (!c->twobyte)
4403 return EMULATE_FAIL;
4404 switch (c->b) {
4405 case 0x01: /* VMMCALL */
4406 if (c->modrm_mod != 3 || c->modrm_rm != 1)
4407 return EMULATE_FAIL;
4408 break;
4409 case 0x34: /* sysenter */
4410 case 0x35: /* sysexit */
4411 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4412 return EMULATE_FAIL;
4413 break;
4414 case 0x05: /* syscall */
4415 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4416 return EMULATE_FAIL;
4417 break;
4418 default:
4419 return EMULATE_FAIL;
4420 }
4421
4422 if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
4423 return EMULATE_FAIL;
4424 }
4425
4426 ++vcpu->stat.insn_emulation; 4428 ++vcpu->stat.insn_emulation;
4427 if (r) { 4429 if (r) {
4430 if (emulation_type & EMULTYPE_TRAP_UD)
4431 return EMULATE_FAIL;
4428 if (reexecute_instruction(vcpu, cr2)) 4432 if (reexecute_instruction(vcpu, cr2))
4429 return EMULATE_DONE; 4433 return EMULATE_DONE;
4430 if (emulation_type & EMULTYPE_SKIP) 4434 if (emulation_type & EMULTYPE_SKIP)
@@ -4452,7 +4456,6 @@ restart:
4452 return handle_emulation_failure(vcpu); 4456 return handle_emulation_failure(vcpu);
4453 } 4457 }
4454 4458
4455done:
4456 if (vcpu->arch.emulate_ctxt.have_exception) { 4459 if (vcpu->arch.emulate_ctxt.have_exception) {
4457 inject_emulated_exception(vcpu); 4460 inject_emulated_exception(vcpu);
4458 r = EMULATE_DONE; 4461 r = EMULATE_DONE;
@@ -4562,7 +4565,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4562 4565
4563 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); 4566 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4564 4567
4565 spin_lock(&kvm_lock); 4568 raw_spin_lock(&kvm_lock);
4566 list_for_each_entry(kvm, &vm_list, vm_list) { 4569 list_for_each_entry(kvm, &vm_list, vm_list) {
4567 kvm_for_each_vcpu(i, vcpu, kvm) { 4570 kvm_for_each_vcpu(i, vcpu, kvm) {
4568 if (vcpu->cpu != freq->cpu) 4571 if (vcpu->cpu != freq->cpu)
@@ -4572,7 +4575,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4572 send_ipi = 1; 4575 send_ipi = 1;
4573 } 4576 }
4574 } 4577 }
4575 spin_unlock(&kvm_lock); 4578 raw_spin_unlock(&kvm_lock);
4576 4579
4577 if (freq->old < freq->new && send_ipi) { 4580 if (freq->old < freq->new && send_ipi) {
4578 /* 4581 /*
@@ -4955,12 +4958,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
4955 best = e; 4958 best = e;
4956 break; 4959 break;
4957 } 4960 }
4958 /*
4959 * Both basic or both extended?
4960 */
4961 if (((e->function ^ function) & 0x80000000) == 0)
4962 if (!best || e->function > best->function)
4963 best = e;
4964 } 4961 }
4965 return best; 4962 return best;
4966} 4963}
@@ -4980,6 +4977,27 @@ not_found:
4980 return 36; 4977 return 36;
4981} 4978}
4982 4979
4980/*
4981 * If no match is found, check whether we exceed the vCPU's limit
4982 * and return the content of the highest valid _standard_ leaf instead.
4983 * This is to satisfy the CPUID specification.
4984 */
4985static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
4986 u32 function, u32 index)
4987{
4988 struct kvm_cpuid_entry2 *maxlevel;
4989
4990 maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
4991 if (!maxlevel || maxlevel->eax >= function)
4992 return NULL;
4993 if (function & 0x80000000) {
4994 maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
4995 if (!maxlevel)
4996 return NULL;
4997 }
4998 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
4999}
5000
4983void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 5001void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
4984{ 5002{
4985 u32 function, index; 5003 u32 function, index;
@@ -4992,6 +5010,10 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
4992 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 5010 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
4993 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 5011 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
4994 best = kvm_find_cpuid_entry(vcpu, function, index); 5012 best = kvm_find_cpuid_entry(vcpu, function, index);
5013
5014 if (!best)
5015 best = check_cpuid_limit(vcpu, function, index);
5016
4995 if (best) { 5017 if (best) {
4996 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 5018 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
4997 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 5019 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
@@ -5185,6 +5207,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5185 r = 1; 5207 r = 1;
5186 goto out; 5208 goto out;
5187 } 5209 }
5210 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5211 vcpu->arch.nmi_pending = true;
5188 } 5212 }
5189 5213
5190 r = kvm_mmu_reload(vcpu); 5214 r = kvm_mmu_reload(vcpu);
@@ -5213,14 +5237,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5213 kvm_load_guest_fpu(vcpu); 5237 kvm_load_guest_fpu(vcpu);
5214 kvm_load_guest_xcr0(vcpu); 5238 kvm_load_guest_xcr0(vcpu);
5215 5239
5216 atomic_set(&vcpu->guest_mode, 1); 5240 vcpu->mode = IN_GUEST_MODE;
5217 smp_wmb(); 5241
5242 /* We should set ->mode before check ->requests,
5243 * see the comment in make_all_cpus_request.
5244 */
5245 smp_mb();
5218 5246
5219 local_irq_disable(); 5247 local_irq_disable();
5220 5248
5221 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests 5249 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
5222 || need_resched() || signal_pending(current)) { 5250 || need_resched() || signal_pending(current)) {
5223 atomic_set(&vcpu->guest_mode, 0); 5251 vcpu->mode = OUTSIDE_GUEST_MODE;
5224 smp_wmb(); 5252 smp_wmb();
5225 local_irq_enable(); 5253 local_irq_enable();
5226 preempt_enable(); 5254 preempt_enable();
@@ -5256,7 +5284,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5256 5284
5257 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); 5285 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
5258 5286
5259 atomic_set(&vcpu->guest_mode, 0); 5287 vcpu->mode = OUTSIDE_GUEST_MODE;
5260 smp_wmb(); 5288 smp_wmb();
5261 local_irq_enable(); 5289 local_irq_enable();
5262 5290
@@ -5574,7 +5602,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5574 struct kvm_sregs *sregs) 5602 struct kvm_sregs *sregs)
5575{ 5603{
5576 int mmu_reset_needed = 0; 5604 int mmu_reset_needed = 0;
5577 int pending_vec, max_bits; 5605 int pending_vec, max_bits, idx;
5578 struct desc_ptr dt; 5606 struct desc_ptr dt;
5579 5607
5580 dt.size = sregs->idt.limit; 5608 dt.size = sregs->idt.limit;
@@ -5603,10 +5631,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5603 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5631 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5604 if (sregs->cr4 & X86_CR4_OSXSAVE) 5632 if (sregs->cr4 & X86_CR4_OSXSAVE)
5605 update_cpuid(vcpu); 5633 update_cpuid(vcpu);
5634
5635 idx = srcu_read_lock(&vcpu->kvm->srcu);
5606 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5636 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5607 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 5637 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5608 mmu_reset_needed = 1; 5638 mmu_reset_needed = 1;
5609 } 5639 }
5640 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5610 5641
5611 if (mmu_reset_needed) 5642 if (mmu_reset_needed)
5612 kvm_mmu_reset_context(vcpu); 5643 kvm_mmu_reset_context(vcpu);
@@ -5617,8 +5648,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5617 if (pending_vec < max_bits) { 5648 if (pending_vec < max_bits) {
5618 kvm_queue_interrupt(vcpu, pending_vec, false); 5649 kvm_queue_interrupt(vcpu, pending_vec, false);
5619 pr_debug("Set back pending irq %d\n", pending_vec); 5650 pr_debug("Set back pending irq %d\n", pending_vec);
5620 if (irqchip_in_kernel(vcpu->kvm))
5621 kvm_pic_clear_isr_ack(vcpu->kvm);
5622 } 5651 }
5623 5652
5624 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5653 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -5814,10 +5843,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5814 5843
5815void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5844void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5816{ 5845{
5817 if (vcpu->arch.time_page) { 5846 kvmclock_reset(vcpu);
5818 kvm_release_page_dirty(vcpu->arch.time_page);
5819 vcpu->arch.time_page = NULL;
5820 }
5821 5847
5822 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 5848 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
5823 fx_free(vcpu); 5849 fx_free(vcpu);
@@ -5878,6 +5904,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5878 kvm_make_request(KVM_REQ_EVENT, vcpu); 5904 kvm_make_request(KVM_REQ_EVENT, vcpu);
5879 vcpu->arch.apf.msr_val = 0; 5905 vcpu->arch.apf.msr_val = 0;
5880 5906
5907 kvmclock_reset(vcpu);
5908
5881 kvm_clear_async_pf_completion_queue(vcpu); 5909 kvm_clear_async_pf_completion_queue(vcpu);
5882 kvm_async_pf_hash_reset(vcpu); 5910 kvm_async_pf_hash_reset(vcpu);
5883 vcpu->arch.apf.halted = false; 5911 vcpu->arch.apf.halted = false;
@@ -6005,7 +6033,7 @@ int kvm_arch_init_vm(struct kvm *kvm)
6005 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6033 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
6006 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6034 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
6007 6035
6008 spin_lock_init(&kvm->arch.tsc_write_lock); 6036 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
6009 6037
6010 return 0; 6038 return 0;
6011} 6039}
@@ -6103,7 +6131,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6103 int user_alloc) 6131 int user_alloc)
6104{ 6132{
6105 6133
6106 int npages = mem->memory_size >> PAGE_SHIFT; 6134 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6107 6135
6108 if (!user_alloc && !old.user_alloc && old.rmap && !npages) { 6136 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
6109 int ret; 6137 int ret;
@@ -6118,12 +6146,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6118 "failed to munmap memory\n"); 6146 "failed to munmap memory\n");
6119 } 6147 }
6120 6148
6149 if (!kvm->arch.n_requested_mmu_pages)
6150 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6151
6121 spin_lock(&kvm->mmu_lock); 6152 spin_lock(&kvm->mmu_lock);
6122 if (!kvm->arch.n_requested_mmu_pages) { 6153 if (nr_mmu_pages)
6123 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6124 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6154 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6125 }
6126
6127 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6155 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6128 spin_unlock(&kvm->mmu_lock); 6156 spin_unlock(&kvm->mmu_lock);
6129} 6157}
@@ -6157,7 +6185,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
6157 6185
6158 me = get_cpu(); 6186 me = get_cpu();
6159 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 6187 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
6160 if (atomic_xchg(&vcpu->guest_mode, 0)) 6188 if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
6161 smp_send_reschedule(cpu); 6189 smp_send_reschedule(cpu);
6162 put_cpu(); 6190 put_cpu();
6163} 6191}
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 5b96fd95bdab..4e0068ead6b4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -397,7 +397,7 @@ static void lguest_load_tr_desc(void)
397 * instead we just use the real "cpuid" instruction. Then I pretty much turned 397 * instead we just use the real "cpuid" instruction. Then I pretty much turned
398 * off feature bits until the Guest booted. (Don't say that: you'll damage 398 * off feature bits until the Guest booted. (Don't say that: you'll damage
399 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is 399 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
400 * hardly future proof.) Noone's listening! They don't like you anyway, 400 * hardly future proof.) No one's listening! They don't like you anyway,
401 * parenthetic weirdo! 401 * parenthetic weirdo!
402 * 402 *
403 * Replacing the cpuid so we can turn features off is great for the kernel, but 403 * Replacing the cpuid so we can turn features off is great for the kernel, but
@@ -847,7 +847,7 @@ static void __init lguest_init_IRQ(void)
847void lguest_setup_irq(unsigned int irq) 847void lguest_setup_irq(unsigned int irq)
848{ 848{
849 irq_alloc_desc_at(irq, 0); 849 irq_alloc_desc_at(irq, 0);
850 set_irq_chip_and_handler_name(irq, &lguest_irq_controller, 850 irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
851 handle_level_irq, "level"); 851 handle_level_irq, "level");
852} 852}
853 853
@@ -993,7 +993,7 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
993static void lguest_time_init(void) 993static void lguest_time_init(void)
994{ 994{
995 /* Set up the timer interrupt (0) to go to our simple timer routine */ 995 /* Set up the timer interrupt (0) to go to our simple timer routine */
996 set_irq_handler(0, lguest_time_irq); 996 irq_set_handler(0, lguest_time_irq);
997 997
998 clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); 998 clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
999 999
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index e10cf070ede0..f2479f19ddde 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -42,4 +42,5 @@ else
42 lib-y += memmove_64.o memset_64.o 42 lib-y += memmove_64.o memset_64.o
43 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o 43 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
44 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o 44 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
45 lib-y += cmpxchg16b_emu.o
45endif 46endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 2cda60a06e65..e8e7e0d06f42 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -15,14 +15,12 @@
15 15
16/* if you want SMP support, implement these with real spinlocks */ 16/* if you want SMP support, implement these with real spinlocks */
17.macro LOCK reg 17.macro LOCK reg
18 pushfl 18 pushfl_cfi
19 CFI_ADJUST_CFA_OFFSET 4
20 cli 19 cli
21.endm 20.endm
22 21
23.macro UNLOCK reg 22.macro UNLOCK reg
24 popfl 23 popfl_cfi
25 CFI_ADJUST_CFA_OFFSET -4
26.endm 24.endm
27 25
28#define BEGIN(op) \ 26#define BEGIN(op) \
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 71e080de3352..391a083674b4 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -14,14 +14,12 @@
14#include <asm/dwarf2.h> 14#include <asm/dwarf2.h>
15 15
16.macro SAVE reg 16.macro SAVE reg
17 pushl %\reg 17 pushl_cfi %\reg
18 CFI_ADJUST_CFA_OFFSET 4
19 CFI_REL_OFFSET \reg, 0 18 CFI_REL_OFFSET \reg, 0
20.endm 19.endm
21 20
22.macro RESTORE reg 21.macro RESTORE reg
23 popl %\reg 22 popl_cfi %\reg
24 CFI_ADJUST_CFA_OFFSET -4
25 CFI_RESTORE \reg 23 CFI_RESTORE \reg
26.endm 24.endm
27 25
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index adbccd0bbb78..78d16a554db0 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -50,11 +50,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
50 */ 50 */
51ENTRY(csum_partial) 51ENTRY(csum_partial)
52 CFI_STARTPROC 52 CFI_STARTPROC
53 pushl %esi 53 pushl_cfi %esi
54 CFI_ADJUST_CFA_OFFSET 4
55 CFI_REL_OFFSET esi, 0 54 CFI_REL_OFFSET esi, 0
56 pushl %ebx 55 pushl_cfi %ebx
57 CFI_ADJUST_CFA_OFFSET 4
58 CFI_REL_OFFSET ebx, 0 56 CFI_REL_OFFSET ebx, 0
59 movl 20(%esp),%eax # Function arg: unsigned int sum 57 movl 20(%esp),%eax # Function arg: unsigned int sum
60 movl 16(%esp),%ecx # Function arg: int len 58 movl 16(%esp),%ecx # Function arg: int len
@@ -132,11 +130,9 @@ ENTRY(csum_partial)
132 jz 8f 130 jz 8f
133 roll $8, %eax 131 roll $8, %eax
1348: 1328:
135 popl %ebx 133 popl_cfi %ebx
136 CFI_ADJUST_CFA_OFFSET -4
137 CFI_RESTORE ebx 134 CFI_RESTORE ebx
138 popl %esi 135 popl_cfi %esi
139 CFI_ADJUST_CFA_OFFSET -4
140 CFI_RESTORE esi 136 CFI_RESTORE esi
141 ret 137 ret
142 CFI_ENDPROC 138 CFI_ENDPROC
@@ -148,11 +144,9 @@ ENDPROC(csum_partial)
148 144
149ENTRY(csum_partial) 145ENTRY(csum_partial)
150 CFI_STARTPROC 146 CFI_STARTPROC
151 pushl %esi 147 pushl_cfi %esi
152 CFI_ADJUST_CFA_OFFSET 4
153 CFI_REL_OFFSET esi, 0 148 CFI_REL_OFFSET esi, 0
154 pushl %ebx 149 pushl_cfi %ebx
155 CFI_ADJUST_CFA_OFFSET 4
156 CFI_REL_OFFSET ebx, 0 150 CFI_REL_OFFSET ebx, 0
157 movl 20(%esp),%eax # Function arg: unsigned int sum 151 movl 20(%esp),%eax # Function arg: unsigned int sum
158 movl 16(%esp),%ecx # Function arg: int len 152 movl 16(%esp),%ecx # Function arg: int len
@@ -260,11 +254,9 @@ ENTRY(csum_partial)
260 jz 90f 254 jz 90f
261 roll $8, %eax 255 roll $8, %eax
26290: 25690:
263 popl %ebx 257 popl_cfi %ebx
264 CFI_ADJUST_CFA_OFFSET -4
265 CFI_RESTORE ebx 258 CFI_RESTORE ebx
266 popl %esi 259 popl_cfi %esi
267 CFI_ADJUST_CFA_OFFSET -4
268 CFI_RESTORE esi 260 CFI_RESTORE esi
269 ret 261 ret
270 CFI_ENDPROC 262 CFI_ENDPROC
@@ -309,14 +301,11 @@ ENTRY(csum_partial_copy_generic)
309 CFI_STARTPROC 301 CFI_STARTPROC
310 subl $4,%esp 302 subl $4,%esp
311 CFI_ADJUST_CFA_OFFSET 4 303 CFI_ADJUST_CFA_OFFSET 4
312 pushl %edi 304 pushl_cfi %edi
313 CFI_ADJUST_CFA_OFFSET 4
314 CFI_REL_OFFSET edi, 0 305 CFI_REL_OFFSET edi, 0
315 pushl %esi 306 pushl_cfi %esi
316 CFI_ADJUST_CFA_OFFSET 4
317 CFI_REL_OFFSET esi, 0 307 CFI_REL_OFFSET esi, 0
318 pushl %ebx 308 pushl_cfi %ebx
319 CFI_ADJUST_CFA_OFFSET 4
320 CFI_REL_OFFSET ebx, 0 309 CFI_REL_OFFSET ebx, 0
321 movl ARGBASE+16(%esp),%eax # sum 310 movl ARGBASE+16(%esp),%eax # sum
322 movl ARGBASE+12(%esp),%ecx # len 311 movl ARGBASE+12(%esp),%ecx # len
@@ -426,17 +415,13 @@ DST( movb %cl, (%edi) )
426 415
427.previous 416.previous
428 417
429 popl %ebx 418 popl_cfi %ebx
430 CFI_ADJUST_CFA_OFFSET -4
431 CFI_RESTORE ebx 419 CFI_RESTORE ebx
432 popl %esi 420 popl_cfi %esi
433 CFI_ADJUST_CFA_OFFSET -4
434 CFI_RESTORE esi 421 CFI_RESTORE esi
435 popl %edi 422 popl_cfi %edi
436 CFI_ADJUST_CFA_OFFSET -4
437 CFI_RESTORE edi 423 CFI_RESTORE edi
438 popl %ecx # equivalent to addl $4,%esp 424 popl_cfi %ecx # equivalent to addl $4,%esp
439 CFI_ADJUST_CFA_OFFSET -4
440 ret 425 ret
441 CFI_ENDPROC 426 CFI_ENDPROC
442ENDPROC(csum_partial_copy_generic) 427ENDPROC(csum_partial_copy_generic)
@@ -459,14 +444,11 @@ ENDPROC(csum_partial_copy_generic)
459 444
460ENTRY(csum_partial_copy_generic) 445ENTRY(csum_partial_copy_generic)
461 CFI_STARTPROC 446 CFI_STARTPROC
462 pushl %ebx 447 pushl_cfi %ebx
463 CFI_ADJUST_CFA_OFFSET 4
464 CFI_REL_OFFSET ebx, 0 448 CFI_REL_OFFSET ebx, 0
465 pushl %edi 449 pushl_cfi %edi
466 CFI_ADJUST_CFA_OFFSET 4
467 CFI_REL_OFFSET edi, 0 450 CFI_REL_OFFSET edi, 0
468 pushl %esi 451 pushl_cfi %esi
469 CFI_ADJUST_CFA_OFFSET 4
470 CFI_REL_OFFSET esi, 0 452 CFI_REL_OFFSET esi, 0
471 movl ARGBASE+4(%esp),%esi #src 453 movl ARGBASE+4(%esp),%esi #src
472 movl ARGBASE+8(%esp),%edi #dst 454 movl ARGBASE+8(%esp),%edi #dst
@@ -527,14 +509,11 @@ DST( movb %dl, (%edi) )
527 jmp 7b 509 jmp 7b
528.previous 510.previous
529 511
530 popl %esi 512 popl_cfi %esi
531 CFI_ADJUST_CFA_OFFSET -4
532 CFI_RESTORE esi 513 CFI_RESTORE esi
533 popl %edi 514 popl_cfi %edi
534 CFI_ADJUST_CFA_OFFSET -4
535 CFI_RESTORE edi 515 CFI_RESTORE edi
536 popl %ebx 516 popl_cfi %ebx
537 CFI_ADJUST_CFA_OFFSET -4
538 CFI_RESTORE ebx 517 CFI_RESTORE ebx
539 ret 518 ret
540 CFI_ENDPROC 519 CFI_ENDPROC
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
new file mode 100644
index 000000000000..1e572c507d06
--- /dev/null
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -0,0 +1,65 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; version 2
5 * of the License.
6 *
7 */
8#include <linux/linkage.h>
9#include <asm/alternative-asm.h>
10#include <asm/frame.h>
11#include <asm/dwarf2.h>
12
13#ifdef CONFIG_SMP
14#define SEG_PREFIX %gs:
15#else
16#define SEG_PREFIX
17#endif
18
19.text
20
21/*
22 * Inputs:
23 * %rsi : memory location to compare
24 * %rax : low 64 bits of old value
25 * %rdx : high 64 bits of old value
26 * %rbx : low 64 bits of new value
27 * %rcx : high 64 bits of new value
28 * %al : Operation successful
29 */
30ENTRY(this_cpu_cmpxchg16b_emu)
31CFI_STARTPROC
32
33#
34# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
35# via the ZF. Caller will access %al to get result.
36#
37# Note that this is only useful for a cpuops operation. Meaning that we
38# do *not* have a fully atomic operation but just an operation that is
39# *atomic* on a single cpu (as provided by the this_cpu_xx class of
40# macros).
41#
42this_cpu_cmpxchg16b_emu:
43 pushf
44 cli
45
46 cmpq SEG_PREFIX(%rsi), %rax
47 jne not_same
48 cmpq SEG_PREFIX 8(%rsi), %rdx
49 jne not_same
50
51 movq %rbx, SEG_PREFIX(%rsi)
52 movq %rcx, SEG_PREFIX 8(%rsi)
53
54 popf
55 mov $1, %al
56 ret
57
58 not_same:
59 popf
60 xor %al,%al
61 ret
62
63CFI_ENDPROC
64
65ENDPROC(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a460158b5ac5..99e482615195 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -117,7 +117,7 @@ ENDPROC(bad_from_user)
117 * rdx count 117 * rdx count
118 * 118 *
119 * Output: 119 * Output:
120 * eax uncopied bytes or 0 if successfull. 120 * eax uncopied bytes or 0 if successful.
121 */ 121 */
122ENTRY(copy_user_generic_unrolled) 122ENTRY(copy_user_generic_unrolled)
123 CFI_STARTPROC 123 CFI_STARTPROC
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index f0dba36578ea..fb903b758da8 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 2 * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
3 * 3 *
4 * This file is subject to the terms and conditions of the GNU General Public 4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file COPYING in the main directory of this archive 5 * License. See the file COPYING in the main directory of this archive
6 * for more details. No warranty for anything given at all. 6 * for more details. No warranty for anything given at all.
@@ -11,82 +11,82 @@
11 11
12/* 12/*
13 * Checksum copy with exception handling. 13 * Checksum copy with exception handling.
14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
15 * destination is zeroed. 15 * destination is zeroed.
16 * 16 *
17 * Input 17 * Input
18 * rdi source 18 * rdi source
19 * rsi destination 19 * rsi destination
20 * edx len (32bit) 20 * edx len (32bit)
21 * ecx sum (32bit) 21 * ecx sum (32bit)
22 * r8 src_err_ptr (int) 22 * r8 src_err_ptr (int)
23 * r9 dst_err_ptr (int) 23 * r9 dst_err_ptr (int)
24 * 24 *
25 * Output 25 * Output
26 * eax 64bit sum. undefined in case of exception. 26 * eax 64bit sum. undefined in case of exception.
27 * 27 *
28 * Wrappers need to take care of valid exception sum and zeroing. 28 * Wrappers need to take care of valid exception sum and zeroing.
29 * They also should align source or destination to 8 bytes. 29 * They also should align source or destination to 8 bytes.
30 */ 30 */
31 31
32 .macro source 32 .macro source
3310: 3310:
34 .section __ex_table,"a" 34 .section __ex_table, "a"
35 .align 8 35 .align 8
36 .quad 10b,.Lbad_source 36 .quad 10b, .Lbad_source
37 .previous 37 .previous
38 .endm 38 .endm
39 39
40 .macro dest 40 .macro dest
4120: 4120:
42 .section __ex_table,"a" 42 .section __ex_table, "a"
43 .align 8 43 .align 8
44 .quad 20b,.Lbad_dest 44 .quad 20b, .Lbad_dest
45 .previous 45 .previous
46 .endm 46 .endm
47 47
48 .macro ignore L=.Lignore 48 .macro ignore L=.Lignore
4930: 4930:
50 .section __ex_table,"a" 50 .section __ex_table, "a"
51 .align 8 51 .align 8
52 .quad 30b,\L 52 .quad 30b, \L
53 .previous 53 .previous
54 .endm 54 .endm
55 55
56 56
57ENTRY(csum_partial_copy_generic) 57ENTRY(csum_partial_copy_generic)
58 CFI_STARTPROC 58 CFI_STARTPROC
59 cmpl $3*64,%edx 59 cmpl $3*64, %edx
60 jle .Lignore 60 jle .Lignore
61 61
62.Lignore: 62.Lignore:
63 subq $7*8,%rsp 63 subq $7*8, %rsp
64 CFI_ADJUST_CFA_OFFSET 7*8 64 CFI_ADJUST_CFA_OFFSET 7*8
65 movq %rbx,2*8(%rsp) 65 movq %rbx, 2*8(%rsp)
66 CFI_REL_OFFSET rbx, 2*8 66 CFI_REL_OFFSET rbx, 2*8
67 movq %r12,3*8(%rsp) 67 movq %r12, 3*8(%rsp)
68 CFI_REL_OFFSET r12, 3*8 68 CFI_REL_OFFSET r12, 3*8
69 movq %r14,4*8(%rsp) 69 movq %r14, 4*8(%rsp)
70 CFI_REL_OFFSET r14, 4*8 70 CFI_REL_OFFSET r14, 4*8
71 movq %r13,5*8(%rsp) 71 movq %r13, 5*8(%rsp)
72 CFI_REL_OFFSET r13, 5*8 72 CFI_REL_OFFSET r13, 5*8
73 movq %rbp,6*8(%rsp) 73 movq %rbp, 6*8(%rsp)
74 CFI_REL_OFFSET rbp, 6*8 74 CFI_REL_OFFSET rbp, 6*8
75 75
76 movq %r8,(%rsp) 76 movq %r8, (%rsp)
77 movq %r9,1*8(%rsp) 77 movq %r9, 1*8(%rsp)
78
79 movl %ecx,%eax
80 movl %edx,%ecx
81 78
82 xorl %r9d,%r9d 79 movl %ecx, %eax
83 movq %rcx,%r12 80 movl %edx, %ecx
84 81
85 shrq $6,%r12 82 xorl %r9d, %r9d
86 jz .Lhandle_tail /* < 64 */ 83 movq %rcx, %r12
84
85 shrq $6, %r12
86 jz .Lhandle_tail /* < 64 */
87 87
88 clc 88 clc
89 89
90 /* main loop. clear in 64 byte blocks */ 90 /* main loop. clear in 64 byte blocks */
91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ 91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
92 /* r11: temp3, rdx: temp4, r12 loopcnt */ 92 /* r11: temp3, rdx: temp4, r12 loopcnt */
@@ -94,156 +94,156 @@ ENTRY(csum_partial_copy_generic)
94 .p2align 4 94 .p2align 4
95.Lloop: 95.Lloop:
96 source 96 source
97 movq (%rdi),%rbx 97 movq (%rdi), %rbx
98 source 98 source
99 movq 8(%rdi),%r8 99 movq 8(%rdi), %r8
100 source 100 source
101 movq 16(%rdi),%r11 101 movq 16(%rdi), %r11
102 source 102 source
103 movq 24(%rdi),%rdx 103 movq 24(%rdi), %rdx
104 104
105 source 105 source
106 movq 32(%rdi),%r10 106 movq 32(%rdi), %r10
107 source 107 source
108 movq 40(%rdi),%rbp 108 movq 40(%rdi), %rbp
109 source 109 source
110 movq 48(%rdi),%r14 110 movq 48(%rdi), %r14
111 source 111 source
112 movq 56(%rdi),%r13 112 movq 56(%rdi), %r13
113 113
114 ignore 2f 114 ignore 2f
115 prefetcht0 5*64(%rdi) 115 prefetcht0 5*64(%rdi)
1162: 1162:
117 adcq %rbx,%rax 117 adcq %rbx, %rax
118 adcq %r8,%rax 118 adcq %r8, %rax
119 adcq %r11,%rax 119 adcq %r11, %rax
120 adcq %rdx,%rax 120 adcq %rdx, %rax
121 adcq %r10,%rax 121 adcq %r10, %rax
122 adcq %rbp,%rax 122 adcq %rbp, %rax
123 adcq %r14,%rax 123 adcq %r14, %rax
124 adcq %r13,%rax 124 adcq %r13, %rax
125 125
126 decl %r12d 126 decl %r12d
127 127
128 dest 128 dest
129 movq %rbx,(%rsi) 129 movq %rbx, (%rsi)
130 dest 130 dest
131 movq %r8,8(%rsi) 131 movq %r8, 8(%rsi)
132 dest 132 dest
133 movq %r11,16(%rsi) 133 movq %r11, 16(%rsi)
134 dest 134 dest
135 movq %rdx,24(%rsi) 135 movq %rdx, 24(%rsi)
136 136
137 dest 137 dest
138 movq %r10,32(%rsi) 138 movq %r10, 32(%rsi)
139 dest 139 dest
140 movq %rbp,40(%rsi) 140 movq %rbp, 40(%rsi)
141 dest 141 dest
142 movq %r14,48(%rsi) 142 movq %r14, 48(%rsi)
143 dest 143 dest
144 movq %r13,56(%rsi) 144 movq %r13, 56(%rsi)
145 145
1463: 1463:
147
148 leaq 64(%rdi),%rdi
149 leaq 64(%rsi),%rsi
150 147
151 jnz .Lloop 148 leaq 64(%rdi), %rdi
149 leaq 64(%rsi), %rsi
152 150
153 adcq %r9,%rax 151 jnz .Lloop
154 152
155 /* do last upto 56 bytes */ 153 adcq %r9, %rax
154
155 /* do last up to 56 bytes */
156.Lhandle_tail: 156.Lhandle_tail:
157 /* ecx: count */ 157 /* ecx: count */
158 movl %ecx,%r10d 158 movl %ecx, %r10d
159 andl $63,%ecx 159 andl $63, %ecx
160 shrl $3,%ecx 160 shrl $3, %ecx
161 jz .Lfold 161 jz .Lfold
162 clc 162 clc
163 .p2align 4 163 .p2align 4
164.Lloop_8: 164.Lloop_8:
165 source 165 source
166 movq (%rdi),%rbx 166 movq (%rdi), %rbx
167 adcq %rbx,%rax 167 adcq %rbx, %rax
168 decl %ecx 168 decl %ecx
169 dest 169 dest
170 movq %rbx,(%rsi) 170 movq %rbx, (%rsi)
171 leaq 8(%rsi),%rsi /* preserve carry */ 171 leaq 8(%rsi), %rsi /* preserve carry */
172 leaq 8(%rdi),%rdi 172 leaq 8(%rdi), %rdi
173 jnz .Lloop_8 173 jnz .Lloop_8
174 adcq %r9,%rax /* add in carry */ 174 adcq %r9, %rax /* add in carry */
175 175
176.Lfold: 176.Lfold:
177 /* reduce checksum to 32bits */ 177 /* reduce checksum to 32bits */
178 movl %eax,%ebx 178 movl %eax, %ebx
179 shrq $32,%rax 179 shrq $32, %rax
180 addl %ebx,%eax 180 addl %ebx, %eax
181 adcl %r9d,%eax 181 adcl %r9d, %eax
182 182
183 /* do last upto 6 bytes */ 183 /* do last up to 6 bytes */
184.Lhandle_7: 184.Lhandle_7:
185 movl %r10d,%ecx 185 movl %r10d, %ecx
186 andl $7,%ecx 186 andl $7, %ecx
187 shrl $1,%ecx 187 shrl $1, %ecx
188 jz .Lhandle_1 188 jz .Lhandle_1
189 movl $2,%edx 189 movl $2, %edx
190 xorl %ebx,%ebx 190 xorl %ebx, %ebx
191 clc 191 clc
192 .p2align 4 192 .p2align 4
193.Lloop_1: 193.Lloop_1:
194 source 194 source
195 movw (%rdi),%bx 195 movw (%rdi), %bx
196 adcl %ebx,%eax 196 adcl %ebx, %eax
197 decl %ecx 197 decl %ecx
198 dest 198 dest
199 movw %bx,(%rsi) 199 movw %bx, (%rsi)
200 leaq 2(%rdi),%rdi 200 leaq 2(%rdi), %rdi
201 leaq 2(%rsi),%rsi 201 leaq 2(%rsi), %rsi
202 jnz .Lloop_1 202 jnz .Lloop_1
203 adcl %r9d,%eax /* add in carry */ 203 adcl %r9d, %eax /* add in carry */
204 204
205 /* handle last odd byte */ 205 /* handle last odd byte */
206.Lhandle_1: 206.Lhandle_1:
207 testl $1,%r10d 207 testl $1, %r10d
208 jz .Lende 208 jz .Lende
209 xorl %ebx,%ebx 209 xorl %ebx, %ebx
210 source 210 source
211 movb (%rdi),%bl 211 movb (%rdi), %bl
212 dest 212 dest
213 movb %bl,(%rsi) 213 movb %bl, (%rsi)
214 addl %ebx,%eax 214 addl %ebx, %eax
215 adcl %r9d,%eax /* carry */ 215 adcl %r9d, %eax /* carry */
216 216
217 CFI_REMEMBER_STATE 217 CFI_REMEMBER_STATE
218.Lende: 218.Lende:
219 movq 2*8(%rsp),%rbx 219 movq 2*8(%rsp), %rbx
220 CFI_RESTORE rbx 220 CFI_RESTORE rbx
221 movq 3*8(%rsp),%r12 221 movq 3*8(%rsp), %r12
222 CFI_RESTORE r12 222 CFI_RESTORE r12
223 movq 4*8(%rsp),%r14 223 movq 4*8(%rsp), %r14
224 CFI_RESTORE r14 224 CFI_RESTORE r14
225 movq 5*8(%rsp),%r13 225 movq 5*8(%rsp), %r13
226 CFI_RESTORE r13 226 CFI_RESTORE r13
227 movq 6*8(%rsp),%rbp 227 movq 6*8(%rsp), %rbp
228 CFI_RESTORE rbp 228 CFI_RESTORE rbp
229 addq $7*8,%rsp 229 addq $7*8, %rsp
230 CFI_ADJUST_CFA_OFFSET -7*8 230 CFI_ADJUST_CFA_OFFSET -7*8
231 ret 231 ret
232 CFI_RESTORE_STATE 232 CFI_RESTORE_STATE
233 233
234 /* Exception handlers. Very simple, zeroing is done in the wrappers */ 234 /* Exception handlers. Very simple, zeroing is done in the wrappers */
235.Lbad_source: 235.Lbad_source:
236 movq (%rsp),%rax 236 movq (%rsp), %rax
237 testq %rax,%rax 237 testq %rax, %rax
238 jz .Lende 238 jz .Lende
239 movl $-EFAULT,(%rax) 239 movl $-EFAULT, (%rax)
240 jmp .Lende 240 jmp .Lende
241 241
242.Lbad_dest: 242.Lbad_dest:
243 movq 8(%rsp),%rax 243 movq 8(%rsp), %rax
244 testq %rax,%rax 244 testq %rax, %rax
245 jz .Lende 245 jz .Lende
246 movl $-EFAULT,(%rax) 246 movl $-EFAULT, (%rax)
247 jmp .Lende 247 jmp .Lende
248 CFI_ENDPROC 248 CFI_ENDPROC
249ENDPROC(csum_partial_copy_generic) 249ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index bf51144d97e1..9845371c5c36 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
84 count64--; 84 count64--;
85 } 85 }
86 86
87 /* last upto 7 8byte blocks */ 87 /* last up to 7 8byte blocks */
88 count %= 8; 88 count %= 8;
89 while (count) { 89 while (count) {
90 asm("addq %1,%0\n\t" 90 asm("addq %1,%0\n\t"
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 000000000000..0ecb8433e5a8
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,197 @@
1/*
2 * Normally compiler builtins are used, but sometimes the compiler calls out
3 * of line code. Based on asm-i386/string.h.
4 *
5 * This assembly file is re-written from memmove_64.c file.
6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
7 */
8#define _STRING_C
9#include <linux/linkage.h>
10#include <asm/dwarf2.h>
11
12#undef memmove
13
14/*
15 * Implement memmove(). This can handle overlap between src and dst.
16 *
17 * Input:
18 * rdi: dest
19 * rsi: src
20 * rdx: count
21 *
22 * Output:
23 * rax: dest
24 */
25ENTRY(memmove)
26 CFI_STARTPROC
27 /* Handle more 32bytes in loop */
28 mov %rdi, %rax
29 cmp $0x20, %rdx
30 jb 1f
31
32 /* Decide forward/backward copy mode */
33 cmp %rdi, %rsi
34 jb 2f
35
36 /*
37 * movsq instruction have many startup latency
38 * so we handle small size by general register.
39 */
40 cmp $680, %rdx
41 jb 3f
42 /*
43 * movsq instruction is only good for aligned case.
44 */
45
46 cmpb %dil, %sil
47 je 4f
483:
49 sub $0x20, %rdx
50 /*
51 * We gobble 32byts forward in each loop.
52 */
535:
54 sub $0x20, %rdx
55 movq 0*8(%rsi), %r11
56 movq 1*8(%rsi), %r10
57 movq 2*8(%rsi), %r9
58 movq 3*8(%rsi), %r8
59 leaq 4*8(%rsi), %rsi
60
61 movq %r11, 0*8(%rdi)
62 movq %r10, 1*8(%rdi)
63 movq %r9, 2*8(%rdi)
64 movq %r8, 3*8(%rdi)
65 leaq 4*8(%rdi), %rdi
66 jae 5b
67 addq $0x20, %rdx
68 jmp 1f
69 /*
70 * Handle data forward by movsq.
71 */
72 .p2align 4
734:
74 movq %rdx, %rcx
75 movq -8(%rsi, %rdx), %r11
76 lea -8(%rdi, %rdx), %r10
77 shrq $3, %rcx
78 rep movsq
79 movq %r11, (%r10)
80 jmp 13f
81 /*
82 * Handle data backward by movsq.
83 */
84 .p2align 4
857:
86 movq %rdx, %rcx
87 movq (%rsi), %r11
88 movq %rdi, %r10
89 leaq -8(%rsi, %rdx), %rsi
90 leaq -8(%rdi, %rdx), %rdi
91 shrq $3, %rcx
92 std
93 rep movsq
94 cld
95 movq %r11, (%r10)
96 jmp 13f
97
98 /*
99 * Start to prepare for backward copy.
100 */
101 .p2align 4
1022:
103 cmp $680, %rdx
104 jb 6f
105 cmp %dil, %sil
106 je 7b
1076:
108 /*
109 * Calculate copy position to tail.
110 */
111 addq %rdx, %rsi
112 addq %rdx, %rdi
113 subq $0x20, %rdx
114 /*
115 * We gobble 32byts backward in each loop.
116 */
1178:
118 subq $0x20, %rdx
119 movq -1*8(%rsi), %r11
120 movq -2*8(%rsi), %r10
121 movq -3*8(%rsi), %r9
122 movq -4*8(%rsi), %r8
123 leaq -4*8(%rsi), %rsi
124
125 movq %r11, -1*8(%rdi)
126 movq %r10, -2*8(%rdi)
127 movq %r9, -3*8(%rdi)
128 movq %r8, -4*8(%rdi)
129 leaq -4*8(%rdi), %rdi
130 jae 8b
131 /*
132 * Calculate copy position to head.
133 */
134 addq $0x20, %rdx
135 subq %rdx, %rsi
136 subq %rdx, %rdi
1371:
138 cmpq $16, %rdx
139 jb 9f
140 /*
141 * Move data from 16 bytes to 31 bytes.
142 */
143 movq 0*8(%rsi), %r11
144 movq 1*8(%rsi), %r10
145 movq -2*8(%rsi, %rdx), %r9
146 movq -1*8(%rsi, %rdx), %r8
147 movq %r11, 0*8(%rdi)
148 movq %r10, 1*8(%rdi)
149 movq %r9, -2*8(%rdi, %rdx)
150 movq %r8, -1*8(%rdi, %rdx)
151 jmp 13f
152 .p2align 4
1539:
154 cmpq $8, %rdx
155 jb 10f
156 /*
157 * Move data from 8 bytes to 15 bytes.
158 */
159 movq 0*8(%rsi), %r11
160 movq -1*8(%rsi, %rdx), %r10
161 movq %r11, 0*8(%rdi)
162 movq %r10, -1*8(%rdi, %rdx)
163 jmp 13f
16410:
165 cmpq $4, %rdx
166 jb 11f
167 /*
168 * Move data from 4 bytes to 7 bytes.
169 */
170 movl (%rsi), %r11d
171 movl -4(%rsi, %rdx), %r10d
172 movl %r11d, (%rdi)
173 movl %r10d, -4(%rdi, %rdx)
174 jmp 13f
17511:
176 cmp $2, %rdx
177 jb 12f
178 /*
179 * Move data from 2 bytes to 3 bytes.
180 */
181 movw (%rsi), %r11w
182 movw -2(%rsi, %rdx), %r10w
183 movw %r11w, (%rdi)
184 movw %r10w, -2(%rdi, %rdx)
185 jmp 13f
18612:
187 cmp $1, %rdx
188 jb 13f
189 /*
190 * Move data for 1 byte.
191 */
192 movb (%rsi), %r11b
193 movb %r11b, (%rdi)
19413:
195 retq
196 CFI_ENDPROC
197ENDPROC(memmove)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 6d0f0ec41b34..000000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,192 +0,0 @@
1/* Normally compiler builtins are used, but sometimes the compiler calls out
2 of line code. Based on asm-i386/string.h.
3 */
4#define _STRING_C
5#include <linux/string.h>
6#include <linux/module.h>
7
8#undef memmove
9void *memmove(void *dest, const void *src, size_t count)
10{
11 unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
12 char *ret;
13
14 __asm__ __volatile__(
15 /* Handle more 32bytes in loop */
16 "mov %2, %3\n\t"
17 "cmp $0x20, %0\n\t"
18 "jb 1f\n\t"
19
20 /* Decide forward/backward copy mode */
21 "cmp %2, %1\n\t"
22 "jb 2f\n\t"
23
24 /*
25 * movsq instruction have many startup latency
26 * so we handle small size by general register.
27 */
28 "cmp $680, %0\n\t"
29 "jb 3f\n\t"
30 /*
31 * movsq instruction is only good for aligned case.
32 */
33 "cmpb %%dil, %%sil\n\t"
34 "je 4f\n\t"
35 "3:\n\t"
36 "sub $0x20, %0\n\t"
37 /*
38 * We gobble 32byts forward in each loop.
39 */
40 "5:\n\t"
41 "sub $0x20, %0\n\t"
42 "movq 0*8(%1), %4\n\t"
43 "movq 1*8(%1), %5\n\t"
44 "movq 2*8(%1), %6\n\t"
45 "movq 3*8(%1), %7\n\t"
46 "leaq 4*8(%1), %1\n\t"
47
48 "movq %4, 0*8(%2)\n\t"
49 "movq %5, 1*8(%2)\n\t"
50 "movq %6, 2*8(%2)\n\t"
51 "movq %7, 3*8(%2)\n\t"
52 "leaq 4*8(%2), %2\n\t"
53 "jae 5b\n\t"
54 "addq $0x20, %0\n\t"
55 "jmp 1f\n\t"
56 /*
57 * Handle data forward by movsq.
58 */
59 ".p2align 4\n\t"
60 "4:\n\t"
61 "movq %0, %8\n\t"
62 "movq -8(%1, %0), %4\n\t"
63 "lea -8(%2, %0), %5\n\t"
64 "shrq $3, %8\n\t"
65 "rep movsq\n\t"
66 "movq %4, (%5)\n\t"
67 "jmp 13f\n\t"
68 /*
69 * Handle data backward by movsq.
70 */
71 ".p2align 4\n\t"
72 "7:\n\t"
73 "movq %0, %8\n\t"
74 "movq (%1), %4\n\t"
75 "movq %2, %5\n\t"
76 "leaq -8(%1, %0), %1\n\t"
77 "leaq -8(%2, %0), %2\n\t"
78 "shrq $3, %8\n\t"
79 "std\n\t"
80 "rep movsq\n\t"
81 "cld\n\t"
82 "movq %4, (%5)\n\t"
83 "jmp 13f\n\t"
84
85 /*
86 * Start to prepare for backward copy.
87 */
88 ".p2align 4\n\t"
89 "2:\n\t"
90 "cmp $680, %0\n\t"
91 "jb 6f \n\t"
92 "cmp %%dil, %%sil\n\t"
93 "je 7b \n\t"
94 "6:\n\t"
95 /*
96 * Calculate copy position to tail.
97 */
98 "addq %0, %1\n\t"
99 "addq %0, %2\n\t"
100 "subq $0x20, %0\n\t"
101 /*
102 * We gobble 32byts backward in each loop.
103 */
104 "8:\n\t"
105 "subq $0x20, %0\n\t"
106 "movq -1*8(%1), %4\n\t"
107 "movq -2*8(%1), %5\n\t"
108 "movq -3*8(%1), %6\n\t"
109 "movq -4*8(%1), %7\n\t"
110 "leaq -4*8(%1), %1\n\t"
111
112 "movq %4, -1*8(%2)\n\t"
113 "movq %5, -2*8(%2)\n\t"
114 "movq %6, -3*8(%2)\n\t"
115 "movq %7, -4*8(%2)\n\t"
116 "leaq -4*8(%2), %2\n\t"
117 "jae 8b\n\t"
118 /*
119 * Calculate copy position to head.
120 */
121 "addq $0x20, %0\n\t"
122 "subq %0, %1\n\t"
123 "subq %0, %2\n\t"
124 "1:\n\t"
125 "cmpq $16, %0\n\t"
126 "jb 9f\n\t"
127 /*
128 * Move data from 16 bytes to 31 bytes.
129 */
130 "movq 0*8(%1), %4\n\t"
131 "movq 1*8(%1), %5\n\t"
132 "movq -2*8(%1, %0), %6\n\t"
133 "movq -1*8(%1, %0), %7\n\t"
134 "movq %4, 0*8(%2)\n\t"
135 "movq %5, 1*8(%2)\n\t"
136 "movq %6, -2*8(%2, %0)\n\t"
137 "movq %7, -1*8(%2, %0)\n\t"
138 "jmp 13f\n\t"
139 ".p2align 4\n\t"
140 "9:\n\t"
141 "cmpq $8, %0\n\t"
142 "jb 10f\n\t"
143 /*
144 * Move data from 8 bytes to 15 bytes.
145 */
146 "movq 0*8(%1), %4\n\t"
147 "movq -1*8(%1, %0), %5\n\t"
148 "movq %4, 0*8(%2)\n\t"
149 "movq %5, -1*8(%2, %0)\n\t"
150 "jmp 13f\n\t"
151 "10:\n\t"
152 "cmpq $4, %0\n\t"
153 "jb 11f\n\t"
154 /*
155 * Move data from 4 bytes to 7 bytes.
156 */
157 "movl (%1), %4d\n\t"
158 "movl -4(%1, %0), %5d\n\t"
159 "movl %4d, (%2)\n\t"
160 "movl %5d, -4(%2, %0)\n\t"
161 "jmp 13f\n\t"
162 "11:\n\t"
163 "cmp $2, %0\n\t"
164 "jb 12f\n\t"
165 /*
166 * Move data from 2 bytes to 3 bytes.
167 */
168 "movw (%1), %4w\n\t"
169 "movw -2(%1, %0), %5w\n\t"
170 "movw %4w, (%2)\n\t"
171 "movw %5w, -2(%2, %0)\n\t"
172 "jmp 13f\n\t"
173 "12:\n\t"
174 "cmp $1, %0\n\t"
175 "jb 13f\n\t"
176 /*
177 * Move data for 1 byte.
178 */
179 "movb (%1), %4b\n\t"
180 "movb %4b, (%2)\n\t"
181 "13:\n\t"
182 : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
183 "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
184 :"0" (count),
185 "1" (src),
186 "2" (dest)
187 :"memory");
188
189 return ret;
190
191}
192EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
index 41fcf00e49df..67743977398b 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem_64.S
@@ -23,43 +23,50 @@
23#include <asm/dwarf2.h> 23#include <asm/dwarf2.h>
24 24
25#define save_common_regs \ 25#define save_common_regs \
26 pushq %rdi; \ 26 pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
27 pushq %rsi; \ 27 pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
28 pushq %rcx; \ 28 pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
29 pushq %r8; \ 29 pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \
30 pushq %r9; \ 30 pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \
31 pushq %r10; \ 31 pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
32 pushq %r11 32 pushq_cfi %r11; CFI_REL_OFFSET r11, 0
33 33
34#define restore_common_regs \ 34#define restore_common_regs \
35 popq %r11; \ 35 popq_cfi %r11; CFI_RESTORE r11; \
36 popq %r10; \ 36 popq_cfi %r10; CFI_RESTORE r10; \
37 popq %r9; \ 37 popq_cfi %r9; CFI_RESTORE r9; \
38 popq %r8; \ 38 popq_cfi %r8; CFI_RESTORE r8; \
39 popq %rcx; \ 39 popq_cfi %rcx; CFI_RESTORE rcx; \
40 popq %rsi; \ 40 popq_cfi %rsi; CFI_RESTORE rsi; \
41 popq %rdi 41 popq_cfi %rdi; CFI_RESTORE rdi
42 42
43/* Fix up special calling conventions */ 43/* Fix up special calling conventions */
44ENTRY(call_rwsem_down_read_failed) 44ENTRY(call_rwsem_down_read_failed)
45 CFI_STARTPROC
45 save_common_regs 46 save_common_regs
46 pushq %rdx 47 pushq_cfi %rdx
48 CFI_REL_OFFSET rdx, 0
47 movq %rax,%rdi 49 movq %rax,%rdi
48 call rwsem_down_read_failed 50 call rwsem_down_read_failed
49 popq %rdx 51 popq_cfi %rdx
52 CFI_RESTORE rdx
50 restore_common_regs 53 restore_common_regs
51 ret 54 ret
52 ENDPROC(call_rwsem_down_read_failed) 55 CFI_ENDPROC
56ENDPROC(call_rwsem_down_read_failed)
53 57
54ENTRY(call_rwsem_down_write_failed) 58ENTRY(call_rwsem_down_write_failed)
59 CFI_STARTPROC
55 save_common_regs 60 save_common_regs
56 movq %rax,%rdi 61 movq %rax,%rdi
57 call rwsem_down_write_failed 62 call rwsem_down_write_failed
58 restore_common_regs 63 restore_common_regs
59 ret 64 ret
60 ENDPROC(call_rwsem_down_write_failed) 65 CFI_ENDPROC
66ENDPROC(call_rwsem_down_write_failed)
61 67
62ENTRY(call_rwsem_wake) 68ENTRY(call_rwsem_wake)
69 CFI_STARTPROC
63 decl %edx /* do nothing if still outstanding active readers */ 70 decl %edx /* do nothing if still outstanding active readers */
64 jnz 1f 71 jnz 1f
65 save_common_regs 72 save_common_regs
@@ -67,15 +74,20 @@ ENTRY(call_rwsem_wake)
67 call rwsem_wake 74 call rwsem_wake
68 restore_common_regs 75 restore_common_regs
691: ret 761: ret
70 ENDPROC(call_rwsem_wake) 77 CFI_ENDPROC
78ENDPROC(call_rwsem_wake)
71 79
72/* Fix up special calling conventions */ 80/* Fix up special calling conventions */
73ENTRY(call_rwsem_downgrade_wake) 81ENTRY(call_rwsem_downgrade_wake)
82 CFI_STARTPROC
74 save_common_regs 83 save_common_regs
75 pushq %rdx 84 pushq_cfi %rdx
85 CFI_REL_OFFSET rdx, 0
76 movq %rax,%rdi 86 movq %rax,%rdi
77 call rwsem_downgrade_wake 87 call rwsem_downgrade_wake
78 popq %rdx 88 popq_cfi %rdx
89 CFI_RESTORE rdx
79 restore_common_regs 90 restore_common_regs
80 ret 91 ret
81 ENDPROC(call_rwsem_downgrade_wake) 92 CFI_ENDPROC
93ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 648fe4741782..06691daa4108 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -36,7 +36,7 @@
36 */ 36 */
37#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
38ENTRY(__write_lock_failed) 38ENTRY(__write_lock_failed)
39 CFI_STARTPROC simple 39 CFI_STARTPROC
40 FRAME 40 FRAME
412: LOCK_PREFIX 412: LOCK_PREFIX
42 addl $ RW_LOCK_BIAS,(%eax) 42 addl $ RW_LOCK_BIAS,(%eax)
@@ -74,29 +74,23 @@ ENTRY(__read_lock_failed)
74/* Fix up special calling conventions */ 74/* Fix up special calling conventions */
75ENTRY(call_rwsem_down_read_failed) 75ENTRY(call_rwsem_down_read_failed)
76 CFI_STARTPROC 76 CFI_STARTPROC
77 push %ecx 77 pushl_cfi %ecx
78 CFI_ADJUST_CFA_OFFSET 4
79 CFI_REL_OFFSET ecx,0 78 CFI_REL_OFFSET ecx,0
80 push %edx 79 pushl_cfi %edx
81 CFI_ADJUST_CFA_OFFSET 4
82 CFI_REL_OFFSET edx,0 80 CFI_REL_OFFSET edx,0
83 call rwsem_down_read_failed 81 call rwsem_down_read_failed
84 pop %edx 82 popl_cfi %edx
85 CFI_ADJUST_CFA_OFFSET -4 83 popl_cfi %ecx
86 pop %ecx
87 CFI_ADJUST_CFA_OFFSET -4
88 ret 84 ret
89 CFI_ENDPROC 85 CFI_ENDPROC
90 ENDPROC(call_rwsem_down_read_failed) 86 ENDPROC(call_rwsem_down_read_failed)
91 87
92ENTRY(call_rwsem_down_write_failed) 88ENTRY(call_rwsem_down_write_failed)
93 CFI_STARTPROC 89 CFI_STARTPROC
94 push %ecx 90 pushl_cfi %ecx
95 CFI_ADJUST_CFA_OFFSET 4
96 CFI_REL_OFFSET ecx,0 91 CFI_REL_OFFSET ecx,0
97 calll rwsem_down_write_failed 92 calll rwsem_down_write_failed
98 pop %ecx 93 popl_cfi %ecx
99 CFI_ADJUST_CFA_OFFSET -4
100 ret 94 ret
101 CFI_ENDPROC 95 CFI_ENDPROC
102 ENDPROC(call_rwsem_down_write_failed) 96 ENDPROC(call_rwsem_down_write_failed)
@@ -105,12 +99,10 @@ ENTRY(call_rwsem_wake)
105 CFI_STARTPROC 99 CFI_STARTPROC
106 decw %dx /* do nothing if still outstanding active readers */ 100 decw %dx /* do nothing if still outstanding active readers */
107 jnz 1f 101 jnz 1f
108 push %ecx 102 pushl_cfi %ecx
109 CFI_ADJUST_CFA_OFFSET 4
110 CFI_REL_OFFSET ecx,0 103 CFI_REL_OFFSET ecx,0
111 call rwsem_wake 104 call rwsem_wake
112 pop %ecx 105 popl_cfi %ecx
113 CFI_ADJUST_CFA_OFFSET -4
1141: ret 1061: ret
115 CFI_ENDPROC 107 CFI_ENDPROC
116 ENDPROC(call_rwsem_wake) 108 ENDPROC(call_rwsem_wake)
@@ -118,17 +110,13 @@ ENTRY(call_rwsem_wake)
118/* Fix up special calling conventions */ 110/* Fix up special calling conventions */
119ENTRY(call_rwsem_downgrade_wake) 111ENTRY(call_rwsem_downgrade_wake)
120 CFI_STARTPROC 112 CFI_STARTPROC
121 push %ecx 113 pushl_cfi %ecx
122 CFI_ADJUST_CFA_OFFSET 4
123 CFI_REL_OFFSET ecx,0 114 CFI_REL_OFFSET ecx,0
124 push %edx 115 pushl_cfi %edx
125 CFI_ADJUST_CFA_OFFSET 4
126 CFI_REL_OFFSET edx,0 116 CFI_REL_OFFSET edx,0
127 call rwsem_downgrade_wake 117 call rwsem_downgrade_wake
128 pop %edx 118 popl_cfi %edx
129 CFI_ADJUST_CFA_OFFSET -4 119 popl_cfi %ecx
130 pop %ecx
131 CFI_ADJUST_CFA_OFFSET -4
132 ret 120 ret
133 CFI_ENDPROC 121 CFI_ENDPROC
134 ENDPROC(call_rwsem_downgrade_wake) 122 ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 650b11e00ecc..2930ae05d773 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -7,24 +7,6 @@
7 7
8 #include <linux/linkage.h> 8 #include <linux/linkage.h>
9 9
10#define ARCH_TRACE_IRQS_ON \
11 pushl %eax; \
12 pushl %ecx; \
13 pushl %edx; \
14 call trace_hardirqs_on; \
15 popl %edx; \
16 popl %ecx; \
17 popl %eax;
18
19#define ARCH_TRACE_IRQS_OFF \
20 pushl %eax; \
21 pushl %ecx; \
22 pushl %edx; \
23 call trace_hardirqs_off; \
24 popl %edx; \
25 popl %ecx; \
26 popl %eax;
27
28#ifdef CONFIG_TRACE_IRQFLAGS 10#ifdef CONFIG_TRACE_IRQFLAGS
29 /* put return address in eax (arg1) */ 11 /* put return address in eax (arg1) */
30 .macro thunk_ra name,func 12 .macro thunk_ra name,func
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index bf9a7d5a5428..782b082c9ff7 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -22,26 +22,6 @@
22 CFI_ENDPROC 22 CFI_ENDPROC
23 .endm 23 .endm
24 24
25 /* rdi: arg1 ... normal C conventions. rax is passed from C. */
26 .macro thunk_retrax name,func
27 .globl \name
28\name:
29 CFI_STARTPROC
30 SAVE_ARGS
31 call \func
32 jmp restore_norax
33 CFI_ENDPROC
34 .endm
35
36
37 .section .sched.text, "ax"
38#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
39 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
40 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
41 thunk rwsem_wake_thunk,rwsem_wake
42 thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
43#endif
44
45#ifdef CONFIG_TRACE_IRQFLAGS 25#ifdef CONFIG_TRACE_IRQFLAGS
46 /* put return address in rdi (arg1) */ 26 /* put return address in rdi (arg1) */
47 .macro thunk_ra name,func 27 .macro thunk_ra name,func
@@ -72,10 +52,3 @@ restore:
72 RESTORE_ARGS 52 RESTORE_ARGS
73 ret 53 ret
74 CFI_ENDPROC 54 CFI_ENDPROC
75
76 CFI_STARTPROC
77 SAVE_ARGS
78restore_norax:
79 RESTORE_ARGS 1
80 ret
81 CFI_ENDPROC
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 09df2f9a3d69..3e608edf9958 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
28 29
29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
30 31
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index f21962c435ed..0919c26820d4 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -26,9 +26,7 @@
26#include <asm/apic.h> 26#include <asm/apic.h>
27#include <asm/amd_nb.h> 27#include <asm/amd_nb.h>
28 28
29static struct bootnode __initdata nodes[8];
30static unsigned char __initdata nodeids[8]; 29static unsigned char __initdata nodeids[8];
31static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
32 30
33static __init int find_northbridge(void) 31static __init int find_northbridge(void)
34{ 32{
@@ -51,7 +49,7 @@ static __init int find_northbridge(void)
51 return num; 49 return num;
52 } 50 }
53 51
54 return -1; 52 return -ENOENT;
55} 53}
56 54
57static __init void early_get_boot_cpu_id(void) 55static __init void early_get_boot_cpu_id(void)
@@ -69,17 +67,18 @@ static __init void early_get_boot_cpu_id(void)
69#endif 67#endif
70} 68}
71 69
72int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) 70int __init amd_numa_init(void)
73{ 71{
74 unsigned long start = PFN_PHYS(start_pfn); 72 unsigned long start = PFN_PHYS(0);
75 unsigned long end = PFN_PHYS(end_pfn); 73 unsigned long end = PFN_PHYS(max_pfn);
76 unsigned numnodes; 74 unsigned numnodes;
77 unsigned long prevbase; 75 unsigned long prevbase;
78 int i, nb, found = 0; 76 int i, j, nb;
79 u32 nodeid, reg; 77 u32 nodeid, reg;
78 unsigned int bits, cores, apicid_base;
80 79
81 if (!early_pci_allowed()) 80 if (!early_pci_allowed())
82 return -1; 81 return -EINVAL;
83 82
84 nb = find_northbridge(); 83 nb = find_northbridge();
85 if (nb < 0) 84 if (nb < 0)
@@ -90,7 +89,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
90 reg = read_pci_config(0, nb, 0, 0x60); 89 reg = read_pci_config(0, nb, 0, 0x60);
91 numnodes = ((reg >> 4) & 0xF) + 1; 90 numnodes = ((reg >> 4) & 0xF) + 1;
92 if (numnodes <= 1) 91 if (numnodes <= 1)
93 return -1; 92 return -ENOENT;
94 93
95 pr_info("Number of physical nodes %d\n", numnodes); 94 pr_info("Number of physical nodes %d\n", numnodes);
96 95
@@ -121,9 +120,9 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
121 if ((base >> 8) & 3 || (limit >> 8) & 3) { 120 if ((base >> 8) & 3 || (limit >> 8) & 3) {
122 pr_err("Node %d using interleaving mode %lx/%lx\n", 121 pr_err("Node %d using interleaving mode %lx/%lx\n",
123 nodeid, (base >> 8) & 3, (limit >> 8) & 3); 122 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
124 return -1; 123 return -EINVAL;
125 } 124 }
126 if (node_isset(nodeid, nodes_parsed)) { 125 if (node_isset(nodeid, numa_nodes_parsed)) {
127 pr_info("Node %d already present, skipping\n", 126 pr_info("Node %d already present, skipping\n",
128 nodeid); 127 nodeid);
129 continue; 128 continue;
@@ -160,117 +159,28 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
160 if (prevbase > base) { 159 if (prevbase > base) {
161 pr_err("Node map not sorted %lx,%lx\n", 160 pr_err("Node map not sorted %lx,%lx\n",
162 prevbase, base); 161 prevbase, base);
163 return -1; 162 return -EINVAL;
164 } 163 }
165 164
166 pr_info("Node %d MemBase %016lx Limit %016lx\n", 165 pr_info("Node %d MemBase %016lx Limit %016lx\n",
167 nodeid, base, limit); 166 nodeid, base, limit);
168 167
169 found++;
170
171 nodes[nodeid].start = base;
172 nodes[nodeid].end = limit;
173
174 prevbase = base; 168 prevbase = base;
175 169 numa_add_memblk(nodeid, base, limit);
176 node_set(nodeid, nodes_parsed); 170 node_set(nodeid, numa_nodes_parsed);
177 }
178
179 if (!found)
180 return -1;
181 return 0;
182}
183
184#ifdef CONFIG_NUMA_EMU
185static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
186 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
187};
188
189void __init amd_get_nodes(struct bootnode *physnodes)
190{
191 int i;
192
193 for_each_node_mask(i, nodes_parsed) {
194 physnodes[i].start = nodes[i].start;
195 physnodes[i].end = nodes[i].end;
196 } 171 }
197}
198
199static int __init find_node_by_addr(unsigned long addr)
200{
201 int ret = NUMA_NO_NODE;
202 int i;
203
204 for (i = 0; i < 8; i++)
205 if (addr >= nodes[i].start && addr < nodes[i].end) {
206 ret = i;
207 break;
208 }
209 return ret;
210}
211 172
212/* 173 if (!nodes_weight(numa_nodes_parsed))
213 * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be 174 return -ENOENT;
214 * setup to represent the physical topology but reflect the emulated
215 * environment. For each emulated node, the real node which it appears on is
216 * found and a fake pxm to nid mapping is created which mirrors the actual
217 * locality. node_distance() then represents the correct distances between
218 * emulated nodes by using the fake acpi mappings to pxms.
219 */
220void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
221{
222 unsigned int bits;
223 unsigned int cores;
224 unsigned int apicid_base = 0;
225 int i;
226 175
176 /*
177 * We seem to have valid NUMA configuration. Map apicids to nodes
178 * using the coreid bits from early_identify_cpu.
179 */
227 bits = boot_cpu_data.x86_coreid_bits; 180 bits = boot_cpu_data.x86_coreid_bits;
228 cores = 1 << bits; 181 cores = 1 << bits;
229 early_get_boot_cpu_id();
230 if (boot_cpu_physical_apicid > 0)
231 apicid_base = boot_cpu_physical_apicid;
232
233 for (i = 0; i < nr_nodes; i++) {
234 int index;
235 int nid;
236 int j;
237
238 nid = find_node_by_addr(nodes[i].start);
239 if (nid == NUMA_NO_NODE)
240 continue;
241
242 index = nodeids[nid] << bits;
243 if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
244 for (j = apicid_base; j < cores + apicid_base; j++)
245 fake_apicid_to_node[index + j] = i;
246#ifdef CONFIG_ACPI_NUMA
247 __acpi_map_pxm_to_node(nid, i);
248#endif
249 }
250 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
251}
252#endif /* CONFIG_NUMA_EMU */
253
254int __init amd_scan_nodes(void)
255{
256 unsigned int bits;
257 unsigned int cores;
258 unsigned int apicid_base;
259 int i;
260
261 BUG_ON(nodes_empty(nodes_parsed));
262 node_possible_map = nodes_parsed;
263 memnode_shift = compute_hash_shift(nodes, 8, NULL);
264 if (memnode_shift < 0) {
265 pr_err("No NUMA node hash function found. Contact maintainer\n");
266 return -1;
267 }
268 pr_info("Using node hash shift of %d\n", memnode_shift);
269
270 /* use the coreid bits from early_identify_cpu */
271 bits = boot_cpu_data.x86_coreid_bits;
272 cores = (1<<bits);
273 apicid_base = 0; 182 apicid_base = 0;
183
274 /* get the APIC ID of the BSP early for systems with apicid lifting */ 184 /* get the APIC ID of the BSP early for systems with apicid lifting */
275 early_get_boot_cpu_id(); 185 early_get_boot_cpu_id();
276 if (boot_cpu_physical_apicid > 0) { 186 if (boot_cpu_physical_apicid > 0) {
@@ -278,17 +188,9 @@ int __init amd_scan_nodes(void)
278 apicid_base = boot_cpu_physical_apicid; 188 apicid_base = boot_cpu_physical_apicid;
279 } 189 }
280 190
281 for_each_node_mask(i, node_possible_map) { 191 for_each_node_mask(i, numa_nodes_parsed)
282 int j;
283
284 memblock_x86_register_active_regions(i,
285 nodes[i].start >> PAGE_SHIFT,
286 nodes[i].end >> PAGE_SHIFT);
287 for (j = apicid_base; j < cores + apicid_base; j++) 192 for (j = apicid_base; j < cores + apicid_base; j++)
288 apicid_to_node[(i << bits) + j] = i; 193 set_apicid_to_node((i << bits) + j, i);
289 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
290 }
291 194
292 numa_init_array();
293 return 0; 195 return 0;
294} 196}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7d90ceb882a4..20e3f8702d1e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,15 +229,14 @@ void vmalloc_sync_all(void)
229 for (address = VMALLOC_START & PMD_MASK; 229 for (address = VMALLOC_START & PMD_MASK;
230 address >= TASK_SIZE && address < FIXADDR_TOP; 230 address >= TASK_SIZE && address < FIXADDR_TOP;
231 address += PMD_SIZE) { 231 address += PMD_SIZE) {
232
233 unsigned long flags;
234 struct page *page; 232 struct page *page;
235 233
236 spin_lock_irqsave(&pgd_lock, flags); 234 spin_lock(&pgd_lock);
237 list_for_each_entry(page, &pgd_list, lru) { 235 list_for_each_entry(page, &pgd_list, lru) {
238 spinlock_t *pgt_lock; 236 spinlock_t *pgt_lock;
239 pmd_t *ret; 237 pmd_t *ret;
240 238
239 /* the pgt_lock only for Xen */
241 pgt_lock = &pgd_page_get_mm(page)->page_table_lock; 240 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
242 241
243 spin_lock(pgt_lock); 242 spin_lock(pgt_lock);
@@ -247,7 +246,7 @@ void vmalloc_sync_all(void)
247 if (!ret) 246 if (!ret)
248 break; 247 break;
249 } 248 }
250 spin_unlock_irqrestore(&pgd_lock, flags); 249 spin_unlock(&pgd_lock);
251 } 250 }
252} 251}
253 252
@@ -828,6 +827,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
828 unsigned long address, unsigned int fault) 827 unsigned long address, unsigned int fault)
829{ 828{
830 if (fault & VM_FAULT_OOM) { 829 if (fault & VM_FAULT_OOM) {
830 /* Kernel mode? Handle exceptions or die: */
831 if (!(error_code & PF_USER)) {
832 up_read(&current->mm->mmap_sem);
833 no_context(regs, error_code, address);
834 return;
835 }
836
831 out_of_memory(regs, error_code, address); 837 out_of_memory(regs, error_code, address);
832 } else { 838 } else {
833 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| 839 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 069ce7c37c01..d4203988504a 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -326,7 +326,7 @@ try_again:
326 if (mm->free_area_cache < len) 326 if (mm->free_area_cache < len)
327 goto fail; 327 goto fail;
328 328
329 /* either no address requested or cant fit in requested address hole */ 329 /* either no address requested or can't fit in requested address hole */
330 addr = (mm->free_area_cache - len) & huge_page_mask(h); 330 addr = (mm->free_area_cache - len) & huge_page_mask(h);
331 do { 331 do {
332 /* 332 /*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 947f42abe820..286d289b039b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,9 +18,9 @@
18 18
19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
20 20
21unsigned long __initdata e820_table_start; 21unsigned long __initdata pgt_buf_start;
22unsigned long __meminitdata e820_table_end; 22unsigned long __meminitdata pgt_buf_end;
23unsigned long __meminitdata e820_table_top; 23unsigned long __meminitdata pgt_buf_top;
24 24
25int after_bootmem; 25int after_bootmem;
26 26
@@ -33,7 +33,7 @@ int direct_gbpages
33static void __init find_early_table_space(unsigned long end, int use_pse, 33static void __init find_early_table_space(unsigned long end, int use_pse,
34 int use_gbpages) 34 int use_gbpages)
35{ 35{
36 unsigned long puds, pmds, ptes, tables, start; 36 unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
37 phys_addr_t base; 37 phys_addr_t base;
38 38
39 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 39 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
@@ -65,29 +65,20 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
65#ifdef CONFIG_X86_32 65#ifdef CONFIG_X86_32
66 /* for fixmap */ 66 /* for fixmap */
67 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 67 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
68#endif
69 68
70 /* 69 good_end = max_pfn_mapped << PAGE_SHIFT;
71 * RED-PEN putting page tables only on node 0 could
72 * cause a hotspot and fill up ZONE_DMA. The page tables
73 * need roughly 0.5KB per GB.
74 */
75#ifdef CONFIG_X86_32
76 start = 0x7000;
77#else
78 start = 0x8000;
79#endif 70#endif
80 base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, 71
81 tables, PAGE_SIZE); 72 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
82 if (base == MEMBLOCK_ERROR) 73 if (base == MEMBLOCK_ERROR)
83 panic("Cannot find space for the kernel page tables"); 74 panic("Cannot find space for the kernel page tables");
84 75
85 e820_table_start = base >> PAGE_SHIFT; 76 pgt_buf_start = base >> PAGE_SHIFT;
86 e820_table_end = e820_table_start; 77 pgt_buf_end = pgt_buf_start;
87 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); 78 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
88 79
89 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", 80 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
90 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); 81 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
91} 82}
92 83
93struct map_range { 84struct map_range {
@@ -279,30 +270,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
279 load_cr3(swapper_pg_dir); 270 load_cr3(swapper_pg_dir);
280#endif 271#endif
281 272
282#ifdef CONFIG_X86_64
283 if (!after_bootmem && !start) {
284 pud_t *pud;
285 pmd_t *pmd;
286
287 mmu_cr4_features = read_cr4();
288
289 /*
290 * _brk_end cannot change anymore, but it and _end may be
291 * located on different 2M pages. cleanup_highmap(), however,
292 * can only consider _end when it runs, so destroy any
293 * mappings beyond _brk_end here.
294 */
295 pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
296 pmd = pmd_offset(pud, _brk_end - 1);
297 while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
298 pmd_clear(pmd);
299 }
300#endif
301 __flush_tlb_all(); 273 __flush_tlb_all();
302 274
303 if (!after_bootmem && e820_table_end > e820_table_start) 275 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
304 memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, 276 memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
305 e820_table_end << PAGE_SHIFT, "PGTABLE"); 277 pgt_buf_end << PAGE_SHIFT, "PGTABLE");
306 278
307 if (!after_bootmem) 279 if (!after_bootmem)
308 early_memtest(start, end); 280 early_memtest(start, end);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c821074b7f0b..80088f994193 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false;
62 62
63static __init void *alloc_low_page(void) 63static __init void *alloc_low_page(void)
64{ 64{
65 unsigned long pfn = e820_table_end++; 65 unsigned long pfn = pgt_buf_end++;
66 void *adr; 66 void *adr;
67 67
68 if (pfn >= e820_table_top) 68 if (pfn >= pgt_buf_top)
69 panic("alloc_low_page: ran out of memory"); 69 panic("alloc_low_page: ran out of memory");
70 70
71 adr = __va(pfn * PAGE_SIZE); 71 adr = __va(pfn * PAGE_SIZE);
@@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
166 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start 166 && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
167 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { 167 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
168 pte_t *newpte; 168 pte_t *newpte;
169 int i; 169 int i;
170 170
@@ -644,8 +644,7 @@ void __init find_low_pfn_range(void)
644} 644}
645 645
646#ifndef CONFIG_NEED_MULTIPLE_NODES 646#ifndef CONFIG_NEED_MULTIPLE_NODES
647void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 647void __init initmem_init(void)
648 int acpi, int k8)
649{ 648{
650#ifdef CONFIG_HIGHMEM 649#ifdef CONFIG_HIGHMEM
651 highstart_pfn = highend_pfn = max_pfn; 650 highstart_pfn = highend_pfn = max_pfn;
@@ -918,7 +917,7 @@ static void mark_nxdata_nx(void)
918{ 917{
919 /* 918 /*
920 * When this called, init has already been executed and released, 919 * When this called, init has already been executed and released,
921 * so everything past _etext sould be NX. 920 * so everything past _etext should be NX.
922 */ 921 */
923 unsigned long start = PFN_ALIGN(_etext); 922 unsigned long start = PFN_ALIGN(_etext);
924 /* 923 /*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 71a59296af80..794233587287 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -51,6 +51,8 @@
51#include <asm/numa.h> 51#include <asm/numa.h>
52#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
53#include <asm/init.h> 53#include <asm/init.h>
54#include <asm/uv/uv.h>
55#include <asm/setup.h>
54 56
55static int __init parse_direct_gbpages_off(char *arg) 57static int __init parse_direct_gbpages_off(char *arg)
56{ 58{
@@ -105,18 +107,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)
105 107
106 for (address = start; address <= end; address += PGDIR_SIZE) { 108 for (address = start; address <= end; address += PGDIR_SIZE) {
107 const pgd_t *pgd_ref = pgd_offset_k(address); 109 const pgd_t *pgd_ref = pgd_offset_k(address);
108 unsigned long flags;
109 struct page *page; 110 struct page *page;
110 111
111 if (pgd_none(*pgd_ref)) 112 if (pgd_none(*pgd_ref))
112 continue; 113 continue;
113 114
114 spin_lock_irqsave(&pgd_lock, flags); 115 spin_lock(&pgd_lock);
115 list_for_each_entry(page, &pgd_list, lru) { 116 list_for_each_entry(page, &pgd_list, lru) {
116 pgd_t *pgd; 117 pgd_t *pgd;
117 spinlock_t *pgt_lock; 118 spinlock_t *pgt_lock;
118 119
119 pgd = (pgd_t *)page_address(page) + pgd_index(address); 120 pgd = (pgd_t *)page_address(page) + pgd_index(address);
121 /* the pgt_lock only for Xen */
120 pgt_lock = &pgd_page_get_mm(page)->page_table_lock; 122 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
121 spin_lock(pgt_lock); 123 spin_lock(pgt_lock);
122 124
@@ -128,7 +130,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
128 130
129 spin_unlock(pgt_lock); 131 spin_unlock(pgt_lock);
130 } 132 }
131 spin_unlock_irqrestore(&pgd_lock, flags); 133 spin_unlock(&pgd_lock);
132 } 134 }
133} 135}
134 136
@@ -293,18 +295,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
293 * to the compile time generated pmds. This results in invalid pmds up 295 * to the compile time generated pmds. This results in invalid pmds up
294 * to the point where we hit the physaddr 0 mapping. 296 * to the point where we hit the physaddr 0 mapping.
295 * 297 *
296 * We limit the mappings to the region from _text to _end. _end is 298 * We limit the mappings to the region from _text to _brk_end. _brk_end
297 * rounded up to the 2MB boundary. This catches the invalid pmds as 299 * is rounded up to the 2MB boundary. This catches the invalid pmds as
298 * well, as they are located before _text: 300 * well, as they are located before _text:
299 */ 301 */
300void __init cleanup_highmap(void) 302void __init cleanup_highmap(void)
301{ 303{
302 unsigned long vaddr = __START_KERNEL_map; 304 unsigned long vaddr = __START_KERNEL_map;
303 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; 305 unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
306 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
304 pmd_t *pmd = level2_kernel_pgt; 307 pmd_t *pmd = level2_kernel_pgt;
305 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
306 308
307 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { 309 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
308 if (pmd_none(*pmd)) 310 if (pmd_none(*pmd))
309 continue; 311 continue;
310 if (vaddr < (unsigned long) _text || vaddr > end) 312 if (vaddr < (unsigned long) _text || vaddr > end)
@@ -314,7 +316,7 @@ void __init cleanup_highmap(void)
314 316
315static __ref void *alloc_low_page(unsigned long *phys) 317static __ref void *alloc_low_page(unsigned long *phys)
316{ 318{
317 unsigned long pfn = e820_table_end++; 319 unsigned long pfn = pgt_buf_end++;
318 void *adr; 320 void *adr;
319 321
320 if (after_bootmem) { 322 if (after_bootmem) {
@@ -324,7 +326,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
324 return adr; 326 return adr;
325 } 327 }
326 328
327 if (pfn >= e820_table_top) 329 if (pfn >= pgt_buf_top)
328 panic("alloc_low_page: ran out of memory"); 330 panic("alloc_low_page: ran out of memory");
329 331
330 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 332 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -333,12 +335,28 @@ static __ref void *alloc_low_page(unsigned long *phys)
333 return adr; 335 return adr;
334} 336}
335 337
338static __ref void *map_low_page(void *virt)
339{
340 void *adr;
341 unsigned long phys, left;
342
343 if (after_bootmem)
344 return virt;
345
346 phys = __pa(virt);
347 left = phys & (PAGE_SIZE - 1);
348 adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
349 adr = (void *)(((unsigned long)adr) | left);
350
351 return adr;
352}
353
336static __ref void unmap_low_page(void *adr) 354static __ref void unmap_low_page(void *adr)
337{ 355{
338 if (after_bootmem) 356 if (after_bootmem)
339 return; 357 return;
340 358
341 early_iounmap(adr, PAGE_SIZE); 359 early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
342} 360}
343 361
344static unsigned long __meminit 362static unsigned long __meminit
@@ -386,15 +404,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
386} 404}
387 405
388static unsigned long __meminit 406static unsigned long __meminit
389phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
390 pgprot_t prot)
391{
392 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
393
394 return phys_pte_init(pte, address, end, prot);
395}
396
397static unsigned long __meminit
398phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 407phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
399 unsigned long page_size_mask, pgprot_t prot) 408 unsigned long page_size_mask, pgprot_t prot)
400{ 409{
@@ -420,8 +429,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
420 if (pmd_val(*pmd)) { 429 if (pmd_val(*pmd)) {
421 if (!pmd_large(*pmd)) { 430 if (!pmd_large(*pmd)) {
422 spin_lock(&init_mm.page_table_lock); 431 spin_lock(&init_mm.page_table_lock);
423 last_map_addr = phys_pte_update(pmd, address, 432 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
433 last_map_addr = phys_pte_init(pte, address,
424 end, prot); 434 end, prot);
435 unmap_low_page(pte);
425 spin_unlock(&init_mm.page_table_lock); 436 spin_unlock(&init_mm.page_table_lock);
426 continue; 437 continue;
427 } 438 }
@@ -468,18 +479,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
468} 479}
469 480
470static unsigned long __meminit 481static unsigned long __meminit
471phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
472 unsigned long page_size_mask, pgprot_t prot)
473{
474 pmd_t *pmd = pmd_offset(pud, 0);
475 unsigned long last_map_addr;
476
477 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
478 __flush_tlb_all();
479 return last_map_addr;
480}
481
482static unsigned long __meminit
483phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, 482phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
484 unsigned long page_size_mask) 483 unsigned long page_size_mask)
485{ 484{
@@ -504,8 +503,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
504 503
505 if (pud_val(*pud)) { 504 if (pud_val(*pud)) {
506 if (!pud_large(*pud)) { 505 if (!pud_large(*pud)) {
507 last_map_addr = phys_pmd_update(pud, addr, end, 506 pmd = map_low_page(pmd_offset(pud, 0));
507 last_map_addr = phys_pmd_init(pmd, addr, end,
508 page_size_mask, prot); 508 page_size_mask, prot);
509 unmap_low_page(pmd);
510 __flush_tlb_all();
509 continue; 511 continue;
510 } 512 }
511 /* 513 /*
@@ -553,17 +555,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
553 return last_map_addr; 555 return last_map_addr;
554} 556}
555 557
556static unsigned long __meminit
557phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
558 unsigned long page_size_mask)
559{
560 pud_t *pud;
561
562 pud = (pud_t *)pgd_page_vaddr(*pgd);
563
564 return phys_pud_init(pud, addr, end, page_size_mask);
565}
566
567unsigned long __meminit 558unsigned long __meminit
568kernel_physical_mapping_init(unsigned long start, 559kernel_physical_mapping_init(unsigned long start,
569 unsigned long end, 560 unsigned long end,
@@ -587,8 +578,10 @@ kernel_physical_mapping_init(unsigned long start,
587 next = end; 578 next = end;
588 579
589 if (pgd_val(*pgd)) { 580 if (pgd_val(*pgd)) {
590 last_map_addr = phys_pud_update(pgd, __pa(start), 581 pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
582 last_map_addr = phys_pud_init(pud, __pa(start),
591 __pa(end), page_size_mask); 583 __pa(end), page_size_mask);
584 unmap_low_page(pud);
592 continue; 585 continue;
593 } 586 }
594 587
@@ -612,10 +605,9 @@ kernel_physical_mapping_init(unsigned long start,
612} 605}
613 606
614#ifndef CONFIG_NUMA 607#ifndef CONFIG_NUMA
615void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 608void __init initmem_init(void)
616 int acpi, int k8)
617{ 609{
618 memblock_x86_register_active_regions(0, start_pfn, end_pfn); 610 memblock_x86_register_active_regions(0, 0, max_pfn);
619} 611}
620#endif 612#endif
621 613
@@ -870,18 +862,18 @@ static struct vm_area_struct gate_vma = {
870 .vm_flags = VM_READ | VM_EXEC 862 .vm_flags = VM_READ | VM_EXEC
871}; 863};
872 864
873struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 865struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
874{ 866{
875#ifdef CONFIG_IA32_EMULATION 867#ifdef CONFIG_IA32_EMULATION
876 if (test_tsk_thread_flag(tsk, TIF_IA32)) 868 if (!mm || mm->context.ia32_compat)
877 return NULL; 869 return NULL;
878#endif 870#endif
879 return &gate_vma; 871 return &gate_vma;
880} 872}
881 873
882int in_gate_area(struct task_struct *task, unsigned long addr) 874int in_gate_area(struct mm_struct *mm, unsigned long addr)
883{ 875{
884 struct vm_area_struct *vma = get_gate_vma(task); 876 struct vm_area_struct *vma = get_gate_vma(mm);
885 877
886 if (!vma) 878 if (!vma)
887 return 0; 879 return 0;
@@ -890,11 +882,11 @@ int in_gate_area(struct task_struct *task, unsigned long addr)
890} 882}
891 883
892/* 884/*
893 * Use this when you have no reliable task/vma, typically from interrupt 885 * Use this when you have no reliable mm, typically from interrupt
894 * context. It is less reliable than using the task's vma and may give 886 * context. It is less reliable than using a task's mm and may give
895 * false positives: 887 * false positives.
896 */ 888 */
897int in_gate_area_no_task(unsigned long addr) 889int in_gate_area_no_mm(unsigned long addr)
898{ 890{
899 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); 891 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
900} 892}
@@ -908,6 +900,19 @@ const char *arch_vma_name(struct vm_area_struct *vma)
908 return NULL; 900 return NULL;
909} 901}
910 902
903#ifdef CONFIG_X86_UV
904#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS)
905
906unsigned long memory_block_size_bytes(void)
907{
908 if (is_uv_system()) {
909 printk(KERN_INFO "UV: memory block size 2GB\n");
910 return 2UL * 1024 * 1024 * 1024;
911 }
912 return MIN_MEMORY_BLOCK_SIZE;
913}
914#endif
915
911#ifdef CONFIG_SPARSEMEM_VMEMMAP 916#ifdef CONFIG_SPARSEMEM_VMEMMAP
912/* 917/*
913 * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 918 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ebf6d7887a38..745258dfc4dc 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -26,12 +26,50 @@ static __init int numa_setup(char *opt)
26early_param("numa", numa_setup); 26early_param("numa", numa_setup);
27 27
28/* 28/*
29 * Which logical CPUs are on which nodes 29 * apicid, cpu, node mappings
30 */ 30 */
31s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
32 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
33};
34
31cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 35cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
32EXPORT_SYMBOL(node_to_cpumask_map); 36EXPORT_SYMBOL(node_to_cpumask_map);
33 37
34/* 38/*
39 * Map cpu index to node index
40 */
41DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
42EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44void __cpuinit numa_set_node(int cpu, int node)
45{
46 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
47
48 /* early setting, no percpu area yet */
49 if (cpu_to_node_map) {
50 cpu_to_node_map[cpu] = node;
51 return;
52 }
53
54#ifdef CONFIG_DEBUG_PER_CPU_MAPS
55 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
56 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
57 dump_stack();
58 return;
59 }
60#endif
61 per_cpu(x86_cpu_to_node_map, cpu) = node;
62
63 if (node != NUMA_NO_NODE)
64 set_cpu_numa_node(cpu, node);
65}
66
67void __cpuinit numa_clear_node(int cpu)
68{
69 numa_set_node(cpu, NUMA_NO_NODE);
70}
71
72/*
35 * Allocate node_to_cpumask_map based on number of available nodes 73 * Allocate node_to_cpumask_map based on number of available nodes
36 * Requires node_possible_map to be valid. 74 * Requires node_possible_map to be valid.
37 * 75 *
@@ -57,7 +95,169 @@ void __init setup_node_to_cpumask_map(void)
57 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 95 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
58} 96}
59 97
60#ifdef CONFIG_DEBUG_PER_CPU_MAPS 98/*
99 * There are unfortunately some poorly designed mainboards around that
100 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
101 * mapping. To avoid this fill in the mapping for all possible CPUs,
102 * as the number of CPUs is not known yet. We round robin the existing
103 * nodes.
104 */
105void __init numa_init_array(void)
106{
107 int rr, i;
108
109 rr = first_node(node_online_map);
110 for (i = 0; i < nr_cpu_ids; i++) {
111 if (early_cpu_to_node(i) != NUMA_NO_NODE)
112 continue;
113 numa_set_node(i, rr);
114 rr = next_node(rr, node_online_map);
115 if (rr == MAX_NUMNODES)
116 rr = first_node(node_online_map);
117 }
118}
119
120static __init int find_near_online_node(int node)
121{
122 int n, val;
123 int min_val = INT_MAX;
124 int best_node = -1;
125
126 for_each_online_node(n) {
127 val = node_distance(node, n);
128
129 if (val < min_val) {
130 min_val = val;
131 best_node = n;
132 }
133 }
134
135 return best_node;
136}
137
138/*
139 * Setup early cpu_to_node.
140 *
141 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
142 * and apicid_to_node[] tables have valid entries for a CPU.
143 * This means we skip cpu_to_node[] initialisation for NUMA
144 * emulation and faking node case (when running a kernel compiled
145 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
146 * is already initialized in a round robin manner at numa_init_array,
147 * prior to this call, and this initialization is good enough
148 * for the fake NUMA cases.
149 *
150 * Called before the per_cpu areas are setup.
151 */
152void __init init_cpu_to_node(void)
153{
154 int cpu;
155 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
156
157 BUG_ON(cpu_to_apicid == NULL);
158
159 for_each_possible_cpu(cpu) {
160 int node = numa_cpu_node(cpu);
161
162 if (node == NUMA_NO_NODE)
163 continue;
164 if (!node_online(node))
165 node = find_near_online_node(node);
166 numa_set_node(cpu, node);
167 }
168}
169
170#ifndef CONFIG_DEBUG_PER_CPU_MAPS
171
172# ifndef CONFIG_NUMA_EMU
173void __cpuinit numa_add_cpu(int cpu)
174{
175 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
176}
177
178void __cpuinit numa_remove_cpu(int cpu)
179{
180 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
181}
182# endif /* !CONFIG_NUMA_EMU */
183
184#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
185
186int __cpu_to_node(int cpu)
187{
188 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
189 printk(KERN_WARNING
190 "cpu_to_node(%d): usage too early!\n", cpu);
191 dump_stack();
192 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
193 }
194 return per_cpu(x86_cpu_to_node_map, cpu);
195}
196EXPORT_SYMBOL(__cpu_to_node);
197
198/*
199 * Same function as cpu_to_node() but used if called before the
200 * per_cpu areas are setup.
201 */
202int early_cpu_to_node(int cpu)
203{
204 if (early_per_cpu_ptr(x86_cpu_to_node_map))
205 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
206
207 if (!cpu_possible(cpu)) {
208 printk(KERN_WARNING
209 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
210 dump_stack();
211 return NUMA_NO_NODE;
212 }
213 return per_cpu(x86_cpu_to_node_map, cpu);
214}
215
216void debug_cpumask_set_cpu(int cpu, int node, bool enable)
217{
218 struct cpumask *mask;
219 char buf[64];
220
221 if (node == NUMA_NO_NODE) {
222 /* early_cpu_to_node() already emits a warning and trace */
223 return;
224 }
225 mask = node_to_cpumask_map[node];
226 if (!mask) {
227 pr_err("node_to_cpumask_map[%i] NULL\n", node);
228 dump_stack();
229 return;
230 }
231
232 if (enable)
233 cpumask_set_cpu(cpu, mask);
234 else
235 cpumask_clear_cpu(cpu, mask);
236
237 cpulist_scnprintf(buf, sizeof(buf), mask);
238 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
239 enable ? "numa_add_cpu" : "numa_remove_cpu",
240 cpu, node, buf);
241 return;
242}
243
244# ifndef CONFIG_NUMA_EMU
245static void __cpuinit numa_set_cpumask(int cpu, bool enable)
246{
247 debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
248}
249
250void __cpuinit numa_add_cpu(int cpu)
251{
252 numa_set_cpumask(cpu, true);
253}
254
255void __cpuinit numa_remove_cpu(int cpu)
256{
257 numa_set_cpumask(cpu, false);
258}
259# endif /* !CONFIG_NUMA_EMU */
260
61/* 261/*
62 * Returns a pointer to the bitmask of CPUs on Node 'node'. 262 * Returns a pointer to the bitmask of CPUs on Node 'node'.
63 */ 263 */
@@ -80,4 +280,5 @@ const struct cpumask *cpumask_of_node(int node)
80 return node_to_cpumask_map[node]; 280 return node_to_cpumask_map[node];
81} 281}
82EXPORT_SYMBOL(cpumask_of_node); 282EXPORT_SYMBOL(cpumask_of_node);
83#endif 283
284#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 84a3e4c9f277..bde3906420df 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
110 110
111static unsigned long kva_start_pfn; 111static unsigned long kva_start_pfn;
112static unsigned long kva_pages; 112static unsigned long kva_pages;
113
114int __cpuinit numa_cpu_node(int cpu)
115{
116 return apic->x86_32_numa_cpu_node(cpu);
117}
118
113/* 119/*
114 * FLAT - support for basic PC memory model with discontig enabled, essentially 120 * FLAT - support for basic PC memory model with discontig enabled, essentially
115 * a single node with all available processors in it with a flat 121 * a single node with all available processors in it with a flat
@@ -346,8 +352,7 @@ static void init_remap_allocator(int nid)
346 (ulong) node_remap_end_vaddr[nid]); 352 (ulong) node_remap_end_vaddr[nid]);
347} 353}
348 354
349void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 355void __init initmem_init(void)
350 int acpi, int k8)
351{ 356{
352 int nid; 357 int nid;
353 long kva_target_pfn; 358 long kva_target_pfn;
@@ -361,6 +366,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
361 */ 366 */
362 367
363 get_memcfg_numa(); 368 get_memcfg_numa();
369 numa_init_array();
364 370
365 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); 371 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
366 372
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 95ea1551eebc..85b52fc03084 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -13,31 +13,30 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/nodemask.h> 14#include <linux/nodemask.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/acpi.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/proto.h> 19#include <asm/proto.h>
19#include <asm/dma.h> 20#include <asm/dma.h>
20#include <asm/numa.h>
21#include <asm/acpi.h> 21#include <asm/acpi.h>
22#include <asm/amd_nb.h> 22#include <asm/amd_nb.h>
23 23
24#include "numa_internal.h"
25
24struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
25EXPORT_SYMBOL(node_data); 27EXPORT_SYMBOL(node_data);
26 28
27struct memnode memnode; 29nodemask_t numa_nodes_parsed __initdata;
28 30
29s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 31struct memnode memnode;
30 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
31};
32 32
33static unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
35 35
36/* 36static struct numa_meminfo numa_meminfo __initdata;
37 * Map cpu index to node index 37
38 */ 38static int numa_distance_cnt;
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 39static u8 *numa_distance;
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
41 40
42/* 41/*
43 * Given a shift value, try to populate memnodemap[] 42 * Given a shift value, try to populate memnodemap[]
@@ -46,16 +45,15 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
46 * 0 if memnodmap[] too small (of shift too small) 45 * 0 if memnodmap[] too small (of shift too small)
47 * -1 if node overlap or lost ram (shift too big) 46 * -1 if node overlap or lost ram (shift too big)
48 */ 47 */
49static int __init populate_memnodemap(const struct bootnode *nodes, 48static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
50 int numnodes, int shift, int *nodeids)
51{ 49{
52 unsigned long addr, end; 50 unsigned long addr, end;
53 int i, res = -1; 51 int i, res = -1;
54 52
55 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); 53 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
56 for (i = 0; i < numnodes; i++) { 54 for (i = 0; i < mi->nr_blks; i++) {
57 addr = nodes[i].start; 55 addr = mi->blk[i].start;
58 end = nodes[i].end; 56 end = mi->blk[i].end;
59 if (addr >= end) 57 if (addr >= end)
60 continue; 58 continue;
61 if ((end >> shift) >= memnodemapsize) 59 if ((end >> shift) >= memnodemapsize)
@@ -63,12 +61,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
63 do { 61 do {
64 if (memnodemap[addr >> shift] != NUMA_NO_NODE) 62 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
65 return -1; 63 return -1;
66 64 memnodemap[addr >> shift] = mi->blk[i].nid;
67 if (!nodeids)
68 memnodemap[addr >> shift] = i;
69 else
70 memnodemap[addr >> shift] = nodeids[i];
71
72 addr += (1UL << shift); 65 addr += (1UL << shift);
73 } while (addr < end); 66 } while (addr < end);
74 res = 1; 67 res = 1;
@@ -86,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
86 79
87 addr = 0x8000; 80 addr = 0x8000;
88 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 81 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
89 nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, 82 nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
90 nodemap_size, L1_CACHE_BYTES); 83 nodemap_size, L1_CACHE_BYTES);
91 if (nodemap_addr == MEMBLOCK_ERROR) { 84 if (nodemap_addr == MEMBLOCK_ERROR) {
92 printk(KERN_ERR 85 printk(KERN_ERR
@@ -106,16 +99,15 @@ static int __init allocate_cachealigned_memnodemap(void)
106 * The LSB of all start and end addresses in the node map is the value of the 99 * The LSB of all start and end addresses in the node map is the value of the
107 * maximum possible shift. 100 * maximum possible shift.
108 */ 101 */
109static int __init extract_lsb_from_nodes(const struct bootnode *nodes, 102static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
110 int numnodes)
111{ 103{
112 int i, nodes_used = 0; 104 int i, nodes_used = 0;
113 unsigned long start, end; 105 unsigned long start, end;
114 unsigned long bitfield = 0, memtop = 0; 106 unsigned long bitfield = 0, memtop = 0;
115 107
116 for (i = 0; i < numnodes; i++) { 108 for (i = 0; i < mi->nr_blks; i++) {
117 start = nodes[i].start; 109 start = mi->blk[i].start;
118 end = nodes[i].end; 110 end = mi->blk[i].end;
119 if (start >= end) 111 if (start >= end)
120 continue; 112 continue;
121 bitfield |= start; 113 bitfield |= start;
@@ -131,18 +123,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
131 return i; 123 return i;
132} 124}
133 125
134int __init compute_hash_shift(struct bootnode *nodes, int numnodes, 126static int __init compute_hash_shift(const struct numa_meminfo *mi)
135 int *nodeids)
136{ 127{
137 int shift; 128 int shift;
138 129
139 shift = extract_lsb_from_nodes(nodes, numnodes); 130 shift = extract_lsb_from_nodes(mi);
140 if (allocate_cachealigned_memnodemap()) 131 if (allocate_cachealigned_memnodemap())
141 return -1; 132 return -1;
142 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", 133 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
143 shift); 134 shift);
144 135
145 if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { 136 if (populate_memnodemap(mi, shift) != 1) {
146 printk(KERN_INFO "Your memory is not aligned you need to " 137 printk(KERN_INFO "Your memory is not aligned you need to "
147 "rebuild your kernel with a bigger NODEMAPSIZE " 138 "rebuild your kernel with a bigger NODEMAPSIZE "
148 "shift=%d\n", shift); 139 "shift=%d\n", shift);
@@ -188,6 +179,63 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
188 return NULL; 179 return NULL;
189} 180}
190 181
182static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
183 struct numa_meminfo *mi)
184{
185 /* ignore zero length blks */
186 if (start == end)
187 return 0;
188
189 /* whine about and ignore invalid blks */
190 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
191 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
192 nid, start, end);
193 return 0;
194 }
195
196 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
197 pr_err("NUMA: too many memblk ranges\n");
198 return -EINVAL;
199 }
200
201 mi->blk[mi->nr_blks].start = start;
202 mi->blk[mi->nr_blks].end = end;
203 mi->blk[mi->nr_blks].nid = nid;
204 mi->nr_blks++;
205 return 0;
206}
207
208/**
209 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
210 * @idx: Index of memblk to remove
211 * @mi: numa_meminfo to remove memblk from
212 *
213 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
214 * decrementing @mi->nr_blks.
215 */
216void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
217{
218 mi->nr_blks--;
219 memmove(&mi->blk[idx], &mi->blk[idx + 1],
220 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
221}
222
223/**
224 * numa_add_memblk - Add one numa_memblk to numa_meminfo
225 * @nid: NUMA node ID of the new memblk
226 * @start: Start address of the new memblk
227 * @end: End address of the new memblk
228 *
229 * Add a new memblk to the default numa_meminfo.
230 *
231 * RETURNS:
232 * 0 on success, -errno on failure.
233 */
234int __init numa_add_memblk(int nid, u64 start, u64 end)
235{
236 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
237}
238
191/* Initialize bootmem allocator for a node */ 239/* Initialize bootmem allocator for a node */
192void __init 240void __init
193setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 241setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
@@ -234,696 +282,386 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
234 node_set_online(nodeid); 282 node_set_online(nodeid);
235} 283}
236 284
237/* 285/**
238 * There are unfortunately some poorly designed mainboards around that 286 * numa_cleanup_meminfo - Cleanup a numa_meminfo
239 * only connect memory to a single CPU. This breaks the 1:1 cpu->node 287 * @mi: numa_meminfo to clean up
240 * mapping. To avoid this fill in the mapping for all possible CPUs, 288 *
241 * as the number of CPUs is not known yet. We round robin the existing 289 * Sanitize @mi by merging and removing unncessary memblks. Also check for
242 * nodes. 290 * conflicts and clear unused memblks.
291 *
292 * RETURNS:
293 * 0 on success, -errno on failure.
243 */ 294 */
244void __init numa_init_array(void) 295int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
245{ 296{
246 int rr, i; 297 const u64 low = 0;
298 const u64 high = (u64)max_pfn << PAGE_SHIFT;
299 int i, j, k;
247 300
248 rr = first_node(node_online_map); 301 for (i = 0; i < mi->nr_blks; i++) {
249 for (i = 0; i < nr_cpu_ids; i++) { 302 struct numa_memblk *bi = &mi->blk[i];
250 if (early_cpu_to_node(i) != NUMA_NO_NODE)
251 continue;
252 numa_set_node(i, rr);
253 rr = next_node(rr, node_online_map);
254 if (rr == MAX_NUMNODES)
255 rr = first_node(node_online_map);
256 }
257}
258
259#ifdef CONFIG_NUMA_EMU
260/* Numa emulation */
261static struct bootnode nodes[MAX_NUMNODES] __initdata;
262static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
263static char *cmdline __initdata;
264 303
265void __init numa_emu_cmdline(char *str) 304 /* make sure all blocks are inside the limits */
266{ 305 bi->start = max(bi->start, low);
267 cmdline = str; 306 bi->end = min(bi->end, high);
268}
269 307
270static int __init setup_physnodes(unsigned long start, unsigned long end, 308 /* and there's no empty block */
271 int acpi, int amd) 309 if (bi->start >= bi->end) {
272{ 310 numa_remove_memblk_from(i--, mi);
273 int ret = 0;
274 int i;
275
276 memset(physnodes, 0, sizeof(physnodes));
277#ifdef CONFIG_ACPI_NUMA
278 if (acpi)
279 acpi_get_nodes(physnodes, start, end);
280#endif
281#ifdef CONFIG_AMD_NUMA
282 if (amd)
283 amd_get_nodes(physnodes);
284#endif
285 /*
286 * Basic sanity checking on the physical node map: there may be errors
287 * if the SRAT or AMD code incorrectly reported the topology or the mem=
288 * kernel parameter is used.
289 */
290 for (i = 0; i < MAX_NUMNODES; i++) {
291 if (physnodes[i].start == physnodes[i].end)
292 continue;
293 if (physnodes[i].start > end) {
294 physnodes[i].end = physnodes[i].start;
295 continue;
296 }
297 if (physnodes[i].end < start) {
298 physnodes[i].start = physnodes[i].end;
299 continue; 311 continue;
300 } 312 }
301 if (physnodes[i].start < start)
302 physnodes[i].start = start;
303 if (physnodes[i].end > end)
304 physnodes[i].end = end;
305 ret++;
306 }
307
308 /*
309 * If no physical topology was detected, a single node is faked to cover
310 * the entire address space.
311 */
312 if (!ret) {
313 physnodes[ret].start = start;
314 physnodes[ret].end = end;
315 ret = 1;
316 }
317 return ret;
318}
319
320static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
321{
322 int i;
323
324 BUG_ON(acpi && amd);
325#ifdef CONFIG_ACPI_NUMA
326 if (acpi)
327 acpi_fake_nodes(nodes, nr_nodes);
328#endif
329#ifdef CONFIG_AMD_NUMA
330 if (amd)
331 amd_fake_nodes(nodes, nr_nodes);
332#endif
333 if (!acpi && !amd)
334 for (i = 0; i < nr_cpu_ids; i++)
335 numa_set_node(i, 0);
336}
337
338/*
339 * Setups up nid to range from addr to addr + size. If the end
340 * boundary is greater than max_addr, then max_addr is used instead.
341 * The return value is 0 if there is additional memory left for
342 * allocation past addr and -1 otherwise. addr is adjusted to be at
343 * the end of the node.
344 */
345static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
346{
347 int ret = 0;
348 nodes[nid].start = *addr;
349 *addr += size;
350 if (*addr >= max_addr) {
351 *addr = max_addr;
352 ret = -1;
353 }
354 nodes[nid].end = *addr;
355 node_set(nid, node_possible_map);
356 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
357 nodes[nid].start, nodes[nid].end,
358 (nodes[nid].end - nodes[nid].start) >> 20);
359 return ret;
360}
361
362/*
363 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
364 * to max_addr. The return value is the number of nodes allocated.
365 */
366static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
367{
368 nodemask_t physnode_mask = NODE_MASK_NONE;
369 u64 size;
370 int big;
371 int ret = 0;
372 int i;
373
374 if (nr_nodes <= 0)
375 return -1;
376 if (nr_nodes > MAX_NUMNODES) {
377 pr_info("numa=fake=%d too large, reducing to %d\n",
378 nr_nodes, MAX_NUMNODES);
379 nr_nodes = MAX_NUMNODES;
380 }
381
382 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
383 /*
384 * Calculate the number of big nodes that can be allocated as a result
385 * of consolidating the remainder.
386 */
387 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
388 FAKE_NODE_MIN_SIZE;
389
390 size &= FAKE_NODE_MIN_HASH_MASK;
391 if (!size) {
392 pr_err("Not enough memory for each node. "
393 "NUMA emulation disabled.\n");
394 return -1;
395 }
396 313
397 for (i = 0; i < MAX_NUMNODES; i++) 314 for (j = i + 1; j < mi->nr_blks; j++) {
398 if (physnodes[i].start != physnodes[i].end) 315 struct numa_memblk *bj = &mi->blk[j];
399 node_set(i, physnode_mask); 316 unsigned long start, end;
400
401 /*
402 * Continue to fill physical nodes with fake nodes until there is no
403 * memory left on any of them.
404 */
405 while (nodes_weight(physnode_mask)) {
406 for_each_node_mask(i, physnode_mask) {
407 u64 end = physnodes[i].start + size;
408 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
409
410 if (ret < big)
411 end += FAKE_NODE_MIN_SIZE;
412 317
413 /* 318 /*
414 * Continue to add memory to this fake node if its 319 * See whether there are overlapping blocks. Whine
415 * non-reserved memory is less than the per-node size. 320 * about but allow overlaps of the same nid. They
321 * will be merged below.
416 */ 322 */
417 while (end - physnodes[i].start - 323 if (bi->end > bj->start && bi->start < bj->end) {
418 memblock_x86_hole_size(physnodes[i].start, end) < size) { 324 if (bi->nid != bj->nid) {
419 end += FAKE_NODE_MIN_SIZE; 325 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
420 if (end > physnodes[i].end) { 326 bi->nid, bi->start, bi->end,
421 end = physnodes[i].end; 327 bj->nid, bj->start, bj->end);
422 break; 328 return -EINVAL;
423 } 329 }
330 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
331 bi->nid, bi->start, bi->end,
332 bj->start, bj->end);
424 } 333 }
425 334
426 /* 335 /*
427 * If there won't be at least FAKE_NODE_MIN_SIZE of 336 * Join together blocks on the same node, holes
428 * non-reserved memory in ZONE_DMA32 for the next node, 337 * between which don't overlap with memory on other
429 * this one must extend to the boundary. 338 * nodes.
430 */ 339 */
431 if (end < dma32_end && dma32_end - end - 340 if (bi->nid != bj->nid)
432 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 341 continue;
433 end = dma32_end; 342 start = max(min(bi->start, bj->start), low);
434 343 end = min(max(bi->end, bj->end), high);
435 /* 344 for (k = 0; k < mi->nr_blks; k++) {
436 * If there won't be enough non-reserved memory for the 345 struct numa_memblk *bk = &mi->blk[k];
437 * next node, this one must extend to the end of the 346
438 * physical node. 347 if (bi->nid == bk->nid)
439 */ 348 continue;
440 if (physnodes[i].end - end - 349 if (start < bk->end && end > bk->start)
441 memblock_x86_hole_size(end, physnodes[i].end) < size) 350 break;
442 end = physnodes[i].end; 351 }
443 352 if (k < mi->nr_blks)
444 /* 353 continue;
445 * Avoid allocating more nodes than requested, which can 354 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
446 * happen as a result of rounding down each node's size 355 bi->nid, bi->start, bi->end, bj->start, bj->end,
447 * to FAKE_NODE_MIN_SIZE. 356 start, end);
448 */ 357 bi->start = start;
449 if (nodes_weight(physnode_mask) + ret >= nr_nodes) 358 bi->end = end;
450 end = physnodes[i].end; 359 numa_remove_memblk_from(j--, mi);
451
452 if (setup_node_range(ret++, &physnodes[i].start,
453 end - physnodes[i].start,
454 physnodes[i].end) < 0)
455 node_clear(i, physnode_mask);
456 } 360 }
457 } 361 }
458 return ret;
459}
460 362
461/* 363 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
462 * Returns the end address of a node so that there is at least `size' amount of 364 mi->blk[i].start = mi->blk[i].end = 0;
463 * non-reserved memory or `max_addr' is reached. 365 mi->blk[i].nid = NUMA_NO_NODE;
464 */
465static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
466{
467 u64 end = start + size;
468
469 while (end - start - memblock_x86_hole_size(start, end) < size) {
470 end += FAKE_NODE_MIN_SIZE;
471 if (end > max_addr) {
472 end = max_addr;
473 break;
474 }
475 } 366 }
476 return end; 367
368 return 0;
477} 369}
478 370
479/* 371/*
480 * Sets up fake nodes of `size' interleaved over physical nodes ranging from 372 * Set nodes, which have memory in @mi, in *@nodemask.
481 * `addr' to `max_addr'. The return value is the number of nodes allocated.
482 */ 373 */
483static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) 374static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
375 const struct numa_meminfo *mi)
484{ 376{
485 nodemask_t physnode_mask = NODE_MASK_NONE;
486 u64 min_size;
487 int ret = 0;
488 int i; 377 int i;
489 378
490 if (!size) 379 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
491 return -1; 380 if (mi->blk[i].start != mi->blk[i].end &&
492 /* 381 mi->blk[i].nid != NUMA_NO_NODE)
493 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is 382 node_set(mi->blk[i].nid, *nodemask);
494 * increased accordingly if the requested size is too small. This 383}
495 * creates a uniform distribution of node sizes across the entire
496 * machine (but not necessarily over physical nodes).
497 */
498 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
499 MAX_NUMNODES;
500 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
501 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
502 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
503 FAKE_NODE_MIN_HASH_MASK;
504 if (size < min_size) {
505 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
506 size >> 20, min_size >> 20);
507 size = min_size;
508 }
509 size &= FAKE_NODE_MIN_HASH_MASK;
510
511 for (i = 0; i < MAX_NUMNODES; i++)
512 if (physnodes[i].start != physnodes[i].end)
513 node_set(i, physnode_mask);
514 /*
515 * Fill physical nodes with fake nodes of size until there is no memory
516 * left on any of them.
517 */
518 while (nodes_weight(physnode_mask)) {
519 for_each_node_mask(i, physnode_mask) {
520 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
521 u64 end;
522
523 end = find_end_of_node(physnodes[i].start,
524 physnodes[i].end, size);
525 /*
526 * If there won't be at least FAKE_NODE_MIN_SIZE of
527 * non-reserved memory in ZONE_DMA32 for the next node,
528 * this one must extend to the boundary.
529 */
530 if (end < dma32_end && dma32_end - end -
531 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
532 end = dma32_end;
533 384
534 /* 385/**
535 * If there won't be enough non-reserved memory for the 386 * numa_reset_distance - Reset NUMA distance table
536 * next node, this one must extend to the end of the 387 *
537 * physical node. 388 * The current table is freed. The next numa_set_distance() call will
538 */ 389 * create a new one.
539 if (physnodes[i].end - end - 390 */
540 memblock_x86_hole_size(end, physnodes[i].end) < size) 391void __init numa_reset_distance(void)
541 end = physnodes[i].end; 392{
393 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
542 394
543 /* 395 /* numa_distance could be 1LU marking allocation failure, test cnt */
544 * Setup the fake node that will be allocated as bootmem 396 if (numa_distance_cnt)
545 * later. If setup_node_range() returns non-zero, there 397 memblock_x86_free_range(__pa(numa_distance),
546 * is no more memory available on this physical node. 398 __pa(numa_distance) + size);
547 */ 399 numa_distance_cnt = 0;
548 if (setup_node_range(ret++, &physnodes[i].start, 400 numa_distance = NULL; /* enable table creation */
549 end - physnodes[i].start,
550 physnodes[i].end) < 0)
551 node_clear(i, physnode_mask);
552 }
553 }
554 return ret;
555} 401}
556 402
557/* 403static int __init numa_alloc_distance(void)
558 * Sets up the system RAM area from start_pfn to last_pfn according to the
559 * numa=fake command-line option.
560 */
561static int __init numa_emulation(unsigned long start_pfn,
562 unsigned long last_pfn, int acpi, int amd)
563{ 404{
564 u64 addr = start_pfn << PAGE_SHIFT; 405 nodemask_t nodes_parsed;
565 u64 max_addr = last_pfn << PAGE_SHIFT; 406 size_t size;
566 int num_nodes; 407 int i, j, cnt = 0;
567 int i; 408 u64 phys;
568 409
569 /* 410 /* size the new table and allocate it */
570 * If the numa=fake command-line contains a 'M' or 'G', it represents 411 nodes_parsed = numa_nodes_parsed;
571 * the fixed node size. Otherwise, if it is just a single number N, 412 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
572 * split the system RAM into N fake nodes.
573 */
574 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
575 u64 size;
576 413
577 size = memparse(cmdline, &cmdline); 414 for_each_node_mask(i, nodes_parsed)
578 num_nodes = split_nodes_size_interleave(addr, max_addr, size); 415 cnt = i;
579 } else { 416 cnt++;
580 unsigned long n; 417 size = cnt * cnt * sizeof(numa_distance[0]);
581 418
582 n = simple_strtoul(cmdline, NULL, 0); 419 phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
583 num_nodes = split_nodes_interleave(addr, max_addr, n); 420 size, PAGE_SIZE);
421 if (phys == MEMBLOCK_ERROR) {
422 pr_warning("NUMA: Warning: can't allocate distance table!\n");
423 /* don't retry until explicitly reset */
424 numa_distance = (void *)1LU;
425 return -ENOMEM;
584 } 426 }
427 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
585 428
586 if (num_nodes < 0) 429 numa_distance = __va(phys);
587 return num_nodes; 430 numa_distance_cnt = cnt;
588 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); 431
589 if (memnode_shift < 0) { 432 /* fill with the default distances */
590 memnode_shift = 0; 433 for (i = 0; i < cnt; i++)
591 printk(KERN_ERR "No NUMA hash function found. NUMA emulation " 434 for (j = 0; j < cnt; j++)
592 "disabled.\n"); 435 numa_distance[i * cnt + j] = i == j ?
593 return -1; 436 LOCAL_DISTANCE : REMOTE_DISTANCE;
594 } 437 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
595 438
596 /*
597 * We need to vacate all active ranges that may have been registered for
598 * the e820 memory map.
599 */
600 remove_all_active_ranges();
601 for_each_node_mask(i, node_possible_map) {
602 memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
603 nodes[i].end >> PAGE_SHIFT);
604 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
605 }
606 setup_physnodes(addr, max_addr, acpi, amd);
607 fake_physnodes(acpi, amd, num_nodes);
608 numa_init_array();
609 return 0; 439 return 0;
610} 440}
611#endif /* CONFIG_NUMA_EMU */
612 441
613void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, 442/**
614 int acpi, int amd) 443 * numa_set_distance - Set NUMA distance from one NUMA to another
444 * @from: the 'from' node to set distance
445 * @to: the 'to' node to set distance
446 * @distance: NUMA distance
447 *
448 * Set the distance from node @from to @to to @distance. If distance table
449 * doesn't exist, one which is large enough to accommodate all the currently
450 * known nodes will be created.
451 *
452 * If such table cannot be allocated, a warning is printed and further
453 * calls are ignored until the distance table is reset with
454 * numa_reset_distance().
455 *
456 * If @from or @to is higher than the highest known node at the time of
457 * table creation or @distance doesn't make sense, the call is ignored.
458 * This is to allow simplification of specific NUMA config implementations.
459 */
460void __init numa_set_distance(int from, int to, int distance)
615{ 461{
616 int i; 462 if (!numa_distance && numa_alloc_distance() < 0)
617
618 nodes_clear(node_possible_map);
619 nodes_clear(node_online_map);
620
621#ifdef CONFIG_NUMA_EMU
622 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
623 acpi, amd);
624 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
625 return; 463 return;
626 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
627 acpi, amd);
628 nodes_clear(node_possible_map);
629 nodes_clear(node_online_map);
630#endif
631 464
632#ifdef CONFIG_ACPI_NUMA 465 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
633 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 466 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
634 last_pfn << PAGE_SHIFT)) 467 from, to, distance);
635 return; 468 return;
636 nodes_clear(node_possible_map); 469 }
637 nodes_clear(node_online_map);
638#endif
639 470
640#ifdef CONFIG_AMD_NUMA 471 if ((u8)distance != distance ||
641 if (!numa_off && amd && !amd_scan_nodes()) 472 (from == to && distance != LOCAL_DISTANCE)) {
473 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
474 from, to, distance);
642 return; 475 return;
643 nodes_clear(node_possible_map); 476 }
644 nodes_clear(node_online_map);
645#endif
646 printk(KERN_INFO "%s\n",
647 numa_off ? "NUMA turned off" : "No NUMA configuration found");
648 477
649 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 478 numa_distance[from * numa_distance_cnt + to] = distance;
650 start_pfn << PAGE_SHIFT,
651 last_pfn << PAGE_SHIFT);
652 /* setup dummy node covering all memory */
653 memnode_shift = 63;
654 memnodemap = memnode.embedded_map;
655 memnodemap[0] = 0;
656 node_set_online(0);
657 node_set(0, node_possible_map);
658 for (i = 0; i < nr_cpu_ids; i++)
659 numa_set_node(i, 0);
660 memblock_x86_register_active_regions(0, start_pfn, last_pfn);
661 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
662} 479}
663 480
664unsigned long __init numa_free_all_bootmem(void) 481int __node_distance(int from, int to)
665{ 482{
666 unsigned long pages = 0; 483 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
667 int i; 484 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
485 return numa_distance[from * numa_distance_cnt + to];
486}
487EXPORT_SYMBOL(__node_distance);
668 488
669 for_each_online_node(i) 489/*
670 pages += free_all_bootmem_node(NODE_DATA(i)); 490 * Sanity check to catch more bad NUMA configurations (they are amazingly
491 * common). Make sure the nodes cover all memory.
492 */
493static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
494{
495 unsigned long numaram, e820ram;
496 int i;
671 497
672 pages += free_all_memory_core_early(MAX_NUMNODES); 498 numaram = 0;
499 for (i = 0; i < mi->nr_blks; i++) {
500 unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
501 unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
502 numaram += e - s;
503 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
504 if ((long)numaram < 0)
505 numaram = 0;
506 }
673 507
674 return pages; 508 e820ram = max_pfn - (memblock_x86_hole_size(0,
509 max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
510 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
511 if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
512 printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
513 (numaram << PAGE_SHIFT) >> 20,
514 (e820ram << PAGE_SHIFT) >> 20);
515 return false;
516 }
517 return true;
675} 518}
676 519
677#ifdef CONFIG_NUMA 520static int __init numa_register_memblks(struct numa_meminfo *mi)
678
679static __init int find_near_online_node(int node)
680{ 521{
681 int n, val; 522 int i, nid;
682 int min_val = INT_MAX;
683 int best_node = -1;
684 523
685 for_each_online_node(n) { 524 /* Account for nodes with cpus and no memory */
686 val = node_distance(node, n); 525 node_possible_map = numa_nodes_parsed;
526 numa_nodemask_from_meminfo(&node_possible_map, mi);
527 if (WARN_ON(nodes_empty(node_possible_map)))
528 return -EINVAL;
529
530 memnode_shift = compute_hash_shift(mi);
531 if (memnode_shift < 0) {
532 printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
533 return -EINVAL;
534 }
687 535
688 if (val < min_val) { 536 for (i = 0; i < mi->nr_blks; i++)
689 min_val = val; 537 memblock_x86_register_active_regions(mi->blk[i].nid,
690 best_node = n; 538 mi->blk[i].start >> PAGE_SHIFT,
539 mi->blk[i].end >> PAGE_SHIFT);
540
541 /* for out of order entries */
542 sort_node_map();
543 if (!numa_meminfo_cover_memory(mi))
544 return -EINVAL;
545
546 /* Finally register nodes. */
547 for_each_node_mask(nid, node_possible_map) {
548 u64 start = (u64)max_pfn << PAGE_SHIFT;
549 u64 end = 0;
550
551 for (i = 0; i < mi->nr_blks; i++) {
552 if (nid != mi->blk[i].nid)
553 continue;
554 start = min(mi->blk[i].start, start);
555 end = max(mi->blk[i].end, end);
691 } 556 }
557
558 if (start < end)
559 setup_node_bootmem(nid, start, end);
692 } 560 }
693 561
694 return best_node; 562 return 0;
695} 563}
696 564
697/* 565/**
698 * Setup early cpu_to_node. 566 * dummy_numma_init - Fallback dummy NUMA init
699 * 567 *
700 * Populate cpu_to_node[] only if x86_cpu_to_apicid[], 568 * Used if there's no underlying NUMA architecture, NUMA initialization
701 * and apicid_to_node[] tables have valid entries for a CPU. 569 * fails, or NUMA is disabled on the command line.
702 * This means we skip cpu_to_node[] initialisation for NUMA
703 * emulation and faking node case (when running a kernel compiled
704 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
705 * is already initialized in a round robin manner at numa_init_array,
706 * prior to this call, and this initialization is good enough
707 * for the fake NUMA cases.
708 * 570 *
709 * Called before the per_cpu areas are setup. 571 * Must online at least one node and add memory blocks that cover all
572 * allowed memory. This function must not fail.
710 */ 573 */
711void __init init_cpu_to_node(void) 574static int __init dummy_numa_init(void)
712{ 575{
713 int cpu; 576 printk(KERN_INFO "%s\n",
714 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 577 numa_off ? "NUMA turned off" : "No NUMA configuration found");
715 578 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
716 BUG_ON(cpu_to_apicid == NULL); 579 0LU, max_pfn << PAGE_SHIFT);
717 580
718 for_each_possible_cpu(cpu) { 581 node_set(0, numa_nodes_parsed);
719 int node; 582 numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
720 u16 apicid = cpu_to_apicid[cpu];
721 583
722 if (apicid == BAD_APICID) 584 return 0;
723 continue;
724 node = apicid_to_node[apicid];
725 if (node == NUMA_NO_NODE)
726 continue;
727 if (!node_online(node))
728 node = find_near_online_node(node);
729 numa_set_node(cpu, node);
730 }
731} 585}
732#endif
733 586
734 587static int __init numa_init(int (*init_func)(void))
735void __cpuinit numa_set_node(int cpu, int node)
736{ 588{
737 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 589 int i;
738 590 int ret;
739 /* early setting, no percpu area yet */
740 if (cpu_to_node_map) {
741 cpu_to_node_map[cpu] = node;
742 return;
743 }
744
745#ifdef CONFIG_DEBUG_PER_CPU_MAPS
746 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
747 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
748 dump_stack();
749 return;
750 }
751#endif
752 per_cpu(x86_cpu_to_node_map, cpu) = node;
753 591
754 if (node != NUMA_NO_NODE) 592 for (i = 0; i < MAX_LOCAL_APIC; i++)
755 set_cpu_numa_node(cpu, node); 593 set_apicid_to_node(i, NUMA_NO_NODE);
756}
757 594
758void __cpuinit numa_clear_node(int cpu) 595 nodes_clear(numa_nodes_parsed);
759{ 596 nodes_clear(node_possible_map);
760 numa_set_node(cpu, NUMA_NO_NODE); 597 nodes_clear(node_online_map);
761} 598 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
599 remove_all_active_ranges();
600 numa_reset_distance();
762 601
763#ifndef CONFIG_DEBUG_PER_CPU_MAPS 602 ret = init_func();
603 if (ret < 0)
604 return ret;
605 ret = numa_cleanup_meminfo(&numa_meminfo);
606 if (ret < 0)
607 return ret;
764 608
765#ifndef CONFIG_NUMA_EMU 609 numa_emulation(&numa_meminfo, numa_distance_cnt);
766void __cpuinit numa_add_cpu(int cpu)
767{
768 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
769}
770 610
771void __cpuinit numa_remove_cpu(int cpu) 611 ret = numa_register_memblks(&numa_meminfo);
772{ 612 if (ret < 0)
773 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 613 return ret;
774}
775#else
776void __cpuinit numa_add_cpu(int cpu)
777{
778 unsigned long addr;
779 u16 apicid;
780 int physnid;
781 int nid = NUMA_NO_NODE;
782 614
783 apicid = early_per_cpu(x86_cpu_to_apicid, cpu); 615 for (i = 0; i < nr_cpu_ids; i++) {
784 if (apicid != BAD_APICID) 616 int nid = early_cpu_to_node(i);
785 nid = apicid_to_node[apicid];
786 if (nid == NUMA_NO_NODE)
787 nid = early_cpu_to_node(cpu);
788 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
789
790 /*
791 * Use the starting address of the emulated node to find which physical
792 * node it is allocated on.
793 */
794 addr = node_start_pfn(nid) << PAGE_SHIFT;
795 for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
796 if (addr >= physnodes[physnid].start &&
797 addr < physnodes[physnid].end)
798 break;
799 617
800 /* 618 if (nid == NUMA_NO_NODE)
801 * Map the cpu to each emulated node that is allocated on the physical 619 continue;
802 * node of the cpu's apic id. 620 if (!node_online(nid))
803 */ 621 numa_clear_node(i);
804 for_each_online_node(nid) {
805 addr = node_start_pfn(nid) << PAGE_SHIFT;
806 if (addr >= physnodes[physnid].start &&
807 addr < physnodes[physnid].end)
808 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
809 } 622 }
623 numa_init_array();
624 return 0;
810} 625}
811 626
812void __cpuinit numa_remove_cpu(int cpu) 627void __init initmem_init(void)
813{ 628{
814 int i; 629 int ret;
815 630
816 for_each_online_node(i) 631 if (!numa_off) {
817 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 632#ifdef CONFIG_ACPI_NUMA
818} 633 ret = numa_init(x86_acpi_numa_init);
819#endif /* !CONFIG_NUMA_EMU */ 634 if (!ret)
820 635 return;
821#else /* CONFIG_DEBUG_PER_CPU_MAPS */ 636#endif
822static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) 637#ifdef CONFIG_AMD_NUMA
823{ 638 ret = numa_init(amd_numa_init);
824 int node = early_cpu_to_node(cpu); 639 if (!ret)
825 struct cpumask *mask; 640 return;
826 char buf[64]; 641#endif
827
828 mask = node_to_cpumask_map[node];
829 if (!mask) {
830 pr_err("node_to_cpumask_map[%i] NULL\n", node);
831 dump_stack();
832 return NULL;
833 } 642 }
834 643
835 cpulist_scnprintf(buf, sizeof(buf), mask); 644 numa_init(dummy_numa_init);
836 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
837 enable ? "numa_add_cpu" : "numa_remove_cpu",
838 cpu, node, buf);
839 return mask;
840} 645}
841 646
842/* 647unsigned long __init numa_free_all_bootmem(void)
843 * --------- debug versions of the numa functions ---------
844 */
845#ifndef CONFIG_NUMA_EMU
846static void __cpuinit numa_set_cpumask(int cpu, int enable)
847{
848 struct cpumask *mask;
849
850 mask = debug_cpumask_set_cpu(cpu, enable);
851 if (!mask)
852 return;
853
854 if (enable)
855 cpumask_set_cpu(cpu, mask);
856 else
857 cpumask_clear_cpu(cpu, mask);
858}
859#else
860static void __cpuinit numa_set_cpumask(int cpu, int enable)
861{ 648{
862 int node = early_cpu_to_node(cpu); 649 unsigned long pages = 0;
863 struct cpumask *mask;
864 int i; 650 int i;
865 651
866 for_each_online_node(i) { 652 for_each_online_node(i)
867 unsigned long addr; 653 pages += free_all_bootmem_node(NODE_DATA(i));
868
869 addr = node_start_pfn(i) << PAGE_SHIFT;
870 if (addr < physnodes[node].start ||
871 addr >= physnodes[node].end)
872 continue;
873 mask = debug_cpumask_set_cpu(cpu, enable);
874 if (!mask)
875 return;
876
877 if (enable)
878 cpumask_set_cpu(cpu, mask);
879 else
880 cpumask_clear_cpu(cpu, mask);
881 }
882}
883#endif /* CONFIG_NUMA_EMU */
884 654
885void __cpuinit numa_add_cpu(int cpu) 655 pages += free_all_memory_core_early(MAX_NUMNODES);
886{
887 numa_set_cpumask(cpu, 1);
888}
889 656
890void __cpuinit numa_remove_cpu(int cpu) 657 return pages;
891{
892 numa_set_cpumask(cpu, 0);
893} 658}
894 659
895int __cpu_to_node(int cpu) 660int __cpuinit numa_cpu_node(int cpu)
896{ 661{
897 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 662 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
898 printk(KERN_WARNING
899 "cpu_to_node(%d): usage too early!\n", cpu);
900 dump_stack();
901 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
902 }
903 return per_cpu(x86_cpu_to_node_map, cpu);
904}
905EXPORT_SYMBOL(__cpu_to_node);
906 663
907/* 664 if (apicid != BAD_APICID)
908 * Same function as cpu_to_node() but used if called before the 665 return __apicid_to_node[apicid];
909 * per_cpu areas are setup. 666 return NUMA_NO_NODE;
910 */
911int early_cpu_to_node(int cpu)
912{
913 if (early_per_cpu_ptr(x86_cpu_to_node_map))
914 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
915
916 if (!cpu_possible(cpu)) {
917 printk(KERN_WARNING
918 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
919 dump_stack();
920 return NUMA_NO_NODE;
921 }
922 return per_cpu(x86_cpu_to_node_map, cpu);
923} 667}
924
925/*
926 * --------- end of debug versions of the numa functions ---------
927 */
928
929#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..de84cc140379
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,486 @@
1/*
2 * NUMA emulation
3 */
4#include <linux/kernel.h>
5#include <linux/errno.h>
6#include <linux/topology.h>
7#include <linux/memblock.h>
8#include <asm/dma.h>
9
10#include "numa_internal.h"
11
12static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
13static char *emu_cmdline __initdata;
14
15void __init numa_emu_cmdline(char *str)
16{
17 emu_cmdline = str;
18}
19
20static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
21{
22 int i;
23
24 for (i = 0; i < mi->nr_blks; i++)
25 if (mi->blk[i].nid == nid)
26 return i;
27 return -ENOENT;
28}
29
30/*
31 * Sets up nid to range from @start to @end. The return value is -errno if
32 * something went wrong, 0 otherwise.
33 */
34static int __init emu_setup_memblk(struct numa_meminfo *ei,
35 struct numa_meminfo *pi,
36 int nid, int phys_blk, u64 size)
37{
38 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
39 struct numa_memblk *pb = &pi->blk[phys_blk];
40
41 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
42 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
43 return -EINVAL;
44 }
45
46 ei->nr_blks++;
47 eb->start = pb->start;
48 eb->end = pb->start + size;
49 eb->nid = nid;
50
51 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
52 emu_nid_to_phys[nid] = pb->nid;
53
54 pb->start += size;
55 if (pb->start >= pb->end) {
56 WARN_ON_ONCE(pb->start > pb->end);
57 numa_remove_memblk_from(phys_blk, pi);
58 }
59
60 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
61 eb->start, eb->end, (eb->end - eb->start) >> 20);
62 return 0;
63}
64
65/*
66 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
67 * to max_addr. The return value is the number of nodes allocated.
68 */
69static int __init split_nodes_interleave(struct numa_meminfo *ei,
70 struct numa_meminfo *pi,
71 u64 addr, u64 max_addr, int nr_nodes)
72{
73 nodemask_t physnode_mask = NODE_MASK_NONE;
74 u64 size;
75 int big;
76 int nid = 0;
77 int i, ret;
78
79 if (nr_nodes <= 0)
80 return -1;
81 if (nr_nodes > MAX_NUMNODES) {
82 pr_info("numa=fake=%d too large, reducing to %d\n",
83 nr_nodes, MAX_NUMNODES);
84 nr_nodes = MAX_NUMNODES;
85 }
86
87 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
88 /*
89 * Calculate the number of big nodes that can be allocated as a result
90 * of consolidating the remainder.
91 */
92 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
93 FAKE_NODE_MIN_SIZE;
94
95 size &= FAKE_NODE_MIN_HASH_MASK;
96 if (!size) {
97 pr_err("Not enough memory for each node. "
98 "NUMA emulation disabled.\n");
99 return -1;
100 }
101
102 for (i = 0; i < pi->nr_blks; i++)
103 node_set(pi->blk[i].nid, physnode_mask);
104
105 /*
106 * Continue to fill physical nodes with fake nodes until there is no
107 * memory left on any of them.
108 */
109 while (nodes_weight(physnode_mask)) {
110 for_each_node_mask(i, physnode_mask) {
111 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
112 u64 start, limit, end;
113 int phys_blk;
114
115 phys_blk = emu_find_memblk_by_nid(i, pi);
116 if (phys_blk < 0) {
117 node_clear(i, physnode_mask);
118 continue;
119 }
120 start = pi->blk[phys_blk].start;
121 limit = pi->blk[phys_blk].end;
122 end = start + size;
123
124 if (nid < big)
125 end += FAKE_NODE_MIN_SIZE;
126
127 /*
128 * Continue to add memory to this fake node if its
129 * non-reserved memory is less than the per-node size.
130 */
131 while (end - start -
132 memblock_x86_hole_size(start, end) < size) {
133 end += FAKE_NODE_MIN_SIZE;
134 if (end > limit) {
135 end = limit;
136 break;
137 }
138 }
139
140 /*
141 * If there won't be at least FAKE_NODE_MIN_SIZE of
142 * non-reserved memory in ZONE_DMA32 for the next node,
143 * this one must extend to the boundary.
144 */
145 if (end < dma32_end && dma32_end - end -
146 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
147 end = dma32_end;
148
149 /*
150 * If there won't be enough non-reserved memory for the
151 * next node, this one must extend to the end of the
152 * physical node.
153 */
154 if (limit - end -
155 memblock_x86_hole_size(end, limit) < size)
156 end = limit;
157
158 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
159 phys_blk,
160 min(end, limit) - start);
161 if (ret < 0)
162 return ret;
163 }
164 }
165 return 0;
166}
167
168/*
169 * Returns the end address of a node so that there is at least `size' amount of
170 * non-reserved memory or `max_addr' is reached.
171 */
172static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
173{
174 u64 end = start + size;
175
176 while (end - start - memblock_x86_hole_size(start, end) < size) {
177 end += FAKE_NODE_MIN_SIZE;
178 if (end > max_addr) {
179 end = max_addr;
180 break;
181 }
182 }
183 return end;
184}
185
186/*
187 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
188 * `addr' to `max_addr'. The return value is the number of nodes allocated.
189 */
190static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
191 struct numa_meminfo *pi,
192 u64 addr, u64 max_addr, u64 size)
193{
194 nodemask_t physnode_mask = NODE_MASK_NONE;
195 u64 min_size;
196 int nid = 0;
197 int i, ret;
198
199 if (!size)
200 return -1;
201 /*
202 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
203 * increased accordingly if the requested size is too small. This
204 * creates a uniform distribution of node sizes across the entire
205 * machine (but not necessarily over physical nodes).
206 */
207 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
208 MAX_NUMNODES;
209 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
210 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
211 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
212 FAKE_NODE_MIN_HASH_MASK;
213 if (size < min_size) {
214 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
215 size >> 20, min_size >> 20);
216 size = min_size;
217 }
218 size &= FAKE_NODE_MIN_HASH_MASK;
219
220 for (i = 0; i < pi->nr_blks; i++)
221 node_set(pi->blk[i].nid, physnode_mask);
222
223 /*
224 * Fill physical nodes with fake nodes of size until there is no memory
225 * left on any of them.
226 */
227 while (nodes_weight(physnode_mask)) {
228 for_each_node_mask(i, physnode_mask) {
229 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
230 u64 start, limit, end;
231 int phys_blk;
232
233 phys_blk = emu_find_memblk_by_nid(i, pi);
234 if (phys_blk < 0) {
235 node_clear(i, physnode_mask);
236 continue;
237 }
238 start = pi->blk[phys_blk].start;
239 limit = pi->blk[phys_blk].end;
240
241 end = find_end_of_node(start, limit, size);
242 /*
243 * If there won't be at least FAKE_NODE_MIN_SIZE of
244 * non-reserved memory in ZONE_DMA32 for the next node,
245 * this one must extend to the boundary.
246 */
247 if (end < dma32_end && dma32_end - end -
248 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
249 end = dma32_end;
250
251 /*
252 * If there won't be enough non-reserved memory for the
253 * next node, this one must extend to the end of the
254 * physical node.
255 */
256 if (limit - end -
257 memblock_x86_hole_size(end, limit) < size)
258 end = limit;
259
260 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
261 phys_blk,
262 min(end, limit) - start);
263 if (ret < 0)
264 return ret;
265 }
266 }
267 return 0;
268}
269
270/**
271 * numa_emulation - Emulate NUMA nodes
272 * @numa_meminfo: NUMA configuration to massage
273 * @numa_dist_cnt: The size of the physical NUMA distance table
274 *
275 * Emulate NUMA nodes according to the numa=fake kernel parameter.
276 * @numa_meminfo contains the physical memory configuration and is modified
277 * to reflect the emulated configuration on success. @numa_dist_cnt is
278 * used to determine the size of the physical distance table.
279 *
280 * On success, the following modifications are made.
281 *
282 * - @numa_meminfo is updated to reflect the emulated nodes.
283 *
284 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
285 * emulated nodes.
286 *
287 * - NUMA distance table is rebuilt to represent distances between emulated
288 * nodes. The distances are determined considering how emulated nodes
289 * are mapped to physical nodes and match the actual distances.
290 *
291 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
292 * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
293 *
294 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
295 * identity mapping and no other modification is made.
296 */
297void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
298{
299 static struct numa_meminfo ei __initdata;
300 static struct numa_meminfo pi __initdata;
301 const u64 max_addr = max_pfn << PAGE_SHIFT;
302 u8 *phys_dist = NULL;
303 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
304 int max_emu_nid, dfl_phys_nid;
305 int i, j, ret;
306
307 if (!emu_cmdline)
308 goto no_emu;
309
310 memset(&ei, 0, sizeof(ei));
311 pi = *numa_meminfo;
312
313 for (i = 0; i < MAX_NUMNODES; i++)
314 emu_nid_to_phys[i] = NUMA_NO_NODE;
315
316 /*
317 * If the numa=fake command-line contains a 'M' or 'G', it represents
318 * the fixed node size. Otherwise, if it is just a single number N,
319 * split the system RAM into N fake nodes.
320 */
321 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
322 u64 size;
323
324 size = memparse(emu_cmdline, &emu_cmdline);
325 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
326 } else {
327 unsigned long n;
328
329 n = simple_strtoul(emu_cmdline, NULL, 0);
330 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
331 }
332
333 if (ret < 0)
334 goto no_emu;
335
336 if (numa_cleanup_meminfo(&ei) < 0) {
337 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
338 goto no_emu;
339 }
340
341 /* copy the physical distance table */
342 if (numa_dist_cnt) {
343 u64 phys;
344
345 phys = memblock_find_in_range(0,
346 (u64)max_pfn_mapped << PAGE_SHIFT,
347 phys_size, PAGE_SIZE);
348 if (phys == MEMBLOCK_ERROR) {
349 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
350 goto no_emu;
351 }
352 memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
353 phys_dist = __va(phys);
354
355 for (i = 0; i < numa_dist_cnt; i++)
356 for (j = 0; j < numa_dist_cnt; j++)
357 phys_dist[i * numa_dist_cnt + j] =
358 node_distance(i, j);
359 }
360
361 /*
362 * Determine the max emulated nid and the default phys nid to use
363 * for unmapped nodes.
364 */
365 max_emu_nid = 0;
366 dfl_phys_nid = NUMA_NO_NODE;
367 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
368 if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
369 max_emu_nid = i;
370 if (dfl_phys_nid == NUMA_NO_NODE)
371 dfl_phys_nid = emu_nid_to_phys[i];
372 }
373 }
374 if (dfl_phys_nid == NUMA_NO_NODE) {
375 pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
376 goto no_emu;
377 }
378
379 /* commit */
380 *numa_meminfo = ei;
381
382 /*
383 * Transform __apicid_to_node table to use emulated nids by
384 * reverse-mapping phys_nid. The maps should always exist but fall
385 * back to zero just in case.
386 */
387 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
388 if (__apicid_to_node[i] == NUMA_NO_NODE)
389 continue;
390 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
391 if (__apicid_to_node[i] == emu_nid_to_phys[j])
392 break;
393 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
394 }
395
396 /* make sure all emulated nodes are mapped to a physical node */
397 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
398 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
399 emu_nid_to_phys[i] = dfl_phys_nid;
400
401 /* transform distance table */
402 numa_reset_distance();
403 for (i = 0; i < max_emu_nid + 1; i++) {
404 for (j = 0; j < max_emu_nid + 1; j++) {
405 int physi = emu_nid_to_phys[i];
406 int physj = emu_nid_to_phys[j];
407 int dist;
408
409 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
410 dist = physi == physj ?
411 LOCAL_DISTANCE : REMOTE_DISTANCE;
412 else
413 dist = phys_dist[physi * numa_dist_cnt + physj];
414
415 numa_set_distance(i, j, dist);
416 }
417 }
418
419 /* free the copied physical distance table */
420 if (phys_dist)
421 memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
422 return;
423
424no_emu:
425 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
426 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
427 emu_nid_to_phys[i] = i;
428}
429
430#ifndef CONFIG_DEBUG_PER_CPU_MAPS
431void __cpuinit numa_add_cpu(int cpu)
432{
433 int physnid, nid;
434
435 nid = early_cpu_to_node(cpu);
436 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
437
438 physnid = emu_nid_to_phys[nid];
439
440 /*
441 * Map the cpu to each emulated node that is allocated on the physical
442 * node of the cpu's apic id.
443 */
444 for_each_online_node(nid)
445 if (emu_nid_to_phys[nid] == physnid)
446 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
447}
448
449void __cpuinit numa_remove_cpu(int cpu)
450{
451 int i;
452
453 for_each_online_node(i)
454 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
455}
456#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
457static void __cpuinit numa_set_cpumask(int cpu, bool enable)
458{
459 int nid, physnid;
460
461 nid = early_cpu_to_node(cpu);
462 if (nid == NUMA_NO_NODE) {
463 /* early_cpu_to_node() already emits a warning and trace */
464 return;
465 }
466
467 physnid = emu_nid_to_phys[nid];
468
469 for_each_online_node(nid) {
470 if (emu_nid_to_phys[nid] != physnid)
471 continue;
472
473 debug_cpumask_set_cpu(cpu, nid, enable);
474 }
475}
476
477void __cpuinit numa_add_cpu(int cpu)
478{
479 numa_set_cpumask(cpu, true);
480}
481
482void __cpuinit numa_remove_cpu(int cpu)
483{
484 numa_set_cpumask(cpu, false);
485}
486#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000000..ef2d97377d7c
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,31 @@
1#ifndef __X86_MM_NUMA_INTERNAL_H
2#define __X86_MM_NUMA_INTERNAL_H
3
4#include <linux/types.h>
5#include <asm/numa.h>
6
7struct numa_memblk {
8 u64 start;
9 u64 end;
10 int nid;
11};
12
13struct numa_meminfo {
14 int nr_blks;
15 struct numa_memblk blk[NR_NODE_MEMBLKS];
16};
17
18void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void);
21
22#ifdef CONFIG_NUMA_EMU
23void __init numa_emulation(struct numa_meminfo *numa_meminfo,
24 int numa_dist_cnt);
25#else
26static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
27 int numa_dist_cnt)
28{ }
29#endif
30
31#endif /* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index d343b3c81f3c..f9e526742fa1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -57,12 +57,10 @@ static unsigned long direct_pages_count[PG_LEVEL_NUM];
57 57
58void update_page_count(int level, unsigned long pages) 58void update_page_count(int level, unsigned long pages)
59{ 59{
60 unsigned long flags;
61
62 /* Protect against CPA */ 60 /* Protect against CPA */
63 spin_lock_irqsave(&pgd_lock, flags); 61 spin_lock(&pgd_lock);
64 direct_pages_count[level] += pages; 62 direct_pages_count[level] += pages;
65 spin_unlock_irqrestore(&pgd_lock, flags); 63 spin_unlock(&pgd_lock);
66} 64}
67 65
68static void split_page_count(int level) 66static void split_page_count(int level)
@@ -312,7 +310,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
312 * these shared mappings are made of small page mappings. 310 * these shared mappings are made of small page mappings.
313 * Thus this don't enforce !RW mapping for small page kernel 311 * Thus this don't enforce !RW mapping for small page kernel
314 * text mapping logic will help Linux Xen parvirt guest boot 312 * text mapping logic will help Linux Xen parvirt guest boot
315 * aswell. 313 * as well.
316 */ 314 */
317 if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) 315 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
318 pgprot_val(forbidden) |= _PAGE_RW; 316 pgprot_val(forbidden) |= _PAGE_RW;
@@ -394,7 +392,7 @@ static int
394try_preserve_large_page(pte_t *kpte, unsigned long address, 392try_preserve_large_page(pte_t *kpte, unsigned long address,
395 struct cpa_data *cpa) 393 struct cpa_data *cpa)
396{ 394{
397 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; 395 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
398 pte_t new_pte, old_pte, *tmp; 396 pte_t new_pte, old_pte, *tmp;
399 pgprot_t old_prot, new_prot, req_prot; 397 pgprot_t old_prot, new_prot, req_prot;
400 int i, do_split = 1; 398 int i, do_split = 1;
@@ -403,7 +401,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
403 if (cpa->force_split) 401 if (cpa->force_split)
404 return 1; 402 return 1;
405 403
406 spin_lock_irqsave(&pgd_lock, flags); 404 spin_lock(&pgd_lock);
407 /* 405 /*
408 * Check for races, another CPU might have split this page 406 * Check for races, another CPU might have split this page
409 * up already: 407 * up already:
@@ -498,14 +496,14 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
498 } 496 }
499 497
500out_unlock: 498out_unlock:
501 spin_unlock_irqrestore(&pgd_lock, flags); 499 spin_unlock(&pgd_lock);
502 500
503 return do_split; 501 return do_split;
504} 502}
505 503
506static int split_large_page(pte_t *kpte, unsigned long address) 504static int split_large_page(pte_t *kpte, unsigned long address)
507{ 505{
508 unsigned long flags, pfn, pfninc = 1; 506 unsigned long pfn, pfninc = 1;
509 unsigned int i, level; 507 unsigned int i, level;
510 pte_t *pbase, *tmp; 508 pte_t *pbase, *tmp;
511 pgprot_t ref_prot; 509 pgprot_t ref_prot;
@@ -519,7 +517,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
519 if (!base) 517 if (!base)
520 return -ENOMEM; 518 return -ENOMEM;
521 519
522 spin_lock_irqsave(&pgd_lock, flags); 520 spin_lock(&pgd_lock);
523 /* 521 /*
524 * Check for races, another CPU might have split this page 522 * Check for races, another CPU might have split this page
525 * up for us already: 523 * up for us already:
@@ -591,7 +589,7 @@ out_unlock:
591 */ 589 */
592 if (base) 590 if (base)
593 __free_page(base); 591 __free_page(base);
594 spin_unlock_irqrestore(&pgd_lock, flags); 592 spin_unlock(&pgd_lock);
595 593
596 return 0; 594 return 0;
597} 595}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 500242d3c96d..8573b83a63d0 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -121,14 +121,12 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
121 121
122static void pgd_dtor(pgd_t *pgd) 122static void pgd_dtor(pgd_t *pgd)
123{ 123{
124 unsigned long flags; /* can be called from interrupt context */
125
126 if (SHARED_KERNEL_PMD) 124 if (SHARED_KERNEL_PMD)
127 return; 125 return;
128 126
129 spin_lock_irqsave(&pgd_lock, flags); 127 spin_lock(&pgd_lock);
130 pgd_list_del(pgd); 128 pgd_list_del(pgd);
131 spin_unlock_irqrestore(&pgd_lock, flags); 129 spin_unlock(&pgd_lock);
132} 130}
133 131
134/* 132/*
@@ -170,8 +168,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
170 * section 8.1: in PAE mode we explicitly have to flush the 168 * section 8.1: in PAE mode we explicitly have to flush the
171 * TLB via cr3 if the top-level pgd is changed... 169 * TLB via cr3 if the top-level pgd is changed...
172 */ 170 */
173 if (mm == current->active_mm) 171 flush_tlb_mm(mm);
174 write_cr3(read_cr3());
175} 172}
176#else /* !CONFIG_X86_PAE */ 173#else /* !CONFIG_X86_PAE */
177 174
@@ -260,7 +257,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
260{ 257{
261 pgd_t *pgd; 258 pgd_t *pgd;
262 pmd_t *pmds[PREALLOCATED_PMDS]; 259 pmd_t *pmds[PREALLOCATED_PMDS];
263 unsigned long flags;
264 260
265 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); 261 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
266 262
@@ -280,12 +276,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
280 * respect to anything walking the pgd_list, so that they 276 * respect to anything walking the pgd_list, so that they
281 * never see a partially populated pgd. 277 * never see a partially populated pgd.
282 */ 278 */
283 spin_lock_irqsave(&pgd_lock, flags); 279 spin_lock(&pgd_lock);
284 280
285 pgd_ctor(mm, pgd); 281 pgd_ctor(mm, pgd);
286 pgd_prepopulate_pmd(mm, pgd, pmds); 282 pgd_prepopulate_pmd(mm, pgd, pmds);
287 283
288 spin_unlock_irqrestore(&pgd_lock, flags); 284 spin_unlock(&pgd_lock);
289 285
290 return pgd; 286 return pgd;
291 287
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index ae96e7b8051d..364f36bdfad8 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -57,7 +57,7 @@ struct node_memory_chunk_s {
57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; 57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
58 58
59static int __initdata num_memory_chunks; /* total number of memory chunks */ 59static int __initdata num_memory_chunks; /* total number of memory chunks */
60static u8 __initdata apicid_to_pxm[MAX_APICID]; 60static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
61 61
62int acpi_numa __initdata; 62int acpi_numa __initdata;
63 63
@@ -211,10 +211,12 @@ int __init get_memcfg_from_srat(void)
211{ 211{
212 int i, j, nid; 212 int i, j, nid;
213 213
214
215 if (srat_disabled()) 214 if (srat_disabled())
216 goto out_fail; 215 goto out_fail;
217 216
217 if (acpi_numa_init() < 0)
218 goto out_fail;
219
218 if (num_memory_chunks == 0) { 220 if (num_memory_chunks == 0) {
219 printk(KERN_DEBUG 221 printk(KERN_DEBUG
220 "could not find any ACPI SRAT memory areas.\n"); 222 "could not find any ACPI SRAT memory areas.\n");
@@ -254,8 +256,8 @@ int __init get_memcfg_from_srat(void)
254 printk(KERN_DEBUG "Number of memory chunks in system = %d\n", 256 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
255 num_memory_chunks); 257 num_memory_chunks);
256 258
257 for (i = 0; i < MAX_APICID; i++) 259 for (i = 0; i < MAX_LOCAL_APIC; i++)
258 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); 260 set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
259 261
260 for (j = 0; j < num_memory_chunks; j++){ 262 for (j = 0; j < num_memory_chunks; j++){
261 struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; 263 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 603d285d1daa..8e9d3394f6d4 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -26,88 +26,34 @@
26 26
27int acpi_numa __initdata; 27int acpi_numa __initdata;
28 28
29static struct acpi_table_slit *acpi_slit;
30
31static nodemask_t nodes_parsed __initdata;
32static nodemask_t cpu_nodes_parsed __initdata;
33static struct bootnode nodes[MAX_NUMNODES] __initdata;
34static struct bootnode nodes_add[MAX_NUMNODES]; 29static struct bootnode nodes_add[MAX_NUMNODES];
35 30
36static int num_node_memblks __initdata;
37static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
38static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
39
40static __init int setup_node(int pxm) 31static __init int setup_node(int pxm)
41{ 32{
42 return acpi_map_pxm_to_node(pxm); 33 return acpi_map_pxm_to_node(pxm);
43} 34}
44 35
45static __init int conflicting_memblks(unsigned long start, unsigned long end)
46{
47 int i;
48 for (i = 0; i < num_node_memblks; i++) {
49 struct bootnode *nd = &node_memblk_range[i];
50 if (nd->start == nd->end)
51 continue;
52 if (nd->end > start && nd->start < end)
53 return memblk_nodeid[i];
54 if (nd->end == end && nd->start == start)
55 return memblk_nodeid[i];
56 }
57 return -1;
58}
59
60static __init void cutoff_node(int i, unsigned long start, unsigned long end)
61{
62 struct bootnode *nd = &nodes[i];
63
64 if (nd->start < start) {
65 nd->start = start;
66 if (nd->end < nd->start)
67 nd->start = nd->end;
68 }
69 if (nd->end > end) {
70 nd->end = end;
71 if (nd->start > nd->end)
72 nd->start = nd->end;
73 }
74}
75
76static __init void bad_srat(void) 36static __init void bad_srat(void)
77{ 37{
78 int i;
79 printk(KERN_ERR "SRAT: SRAT not used.\n"); 38 printk(KERN_ERR "SRAT: SRAT not used.\n");
80 acpi_numa = -1; 39 acpi_numa = -1;
81 for (i = 0; i < MAX_LOCAL_APIC; i++) 40 memset(nodes_add, 0, sizeof(nodes_add));
82 apicid_to_node[i] = NUMA_NO_NODE;
83 for (i = 0; i < MAX_NUMNODES; i++) {
84 nodes[i].start = nodes[i].end = 0;
85 nodes_add[i].start = nodes_add[i].end = 0;
86 }
87 remove_all_active_ranges();
88} 41}
89 42
90static __init inline int srat_disabled(void) 43static __init inline int srat_disabled(void)
91{ 44{
92 return numa_off || acpi_numa < 0; 45 return acpi_numa < 0;
93} 46}
94 47
95/* Callback for SLIT parsing */ 48/* Callback for SLIT parsing */
96void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 49void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
97{ 50{
98 unsigned length; 51 int i, j;
99 unsigned long phys;
100
101 length = slit->header.length;
102 phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
103 PAGE_SIZE);
104
105 if (phys == MEMBLOCK_ERROR)
106 panic(" Can not save slit!\n");
107 52
108 acpi_slit = __va(phys); 53 for (i = 0; i < slit->locality_count; i++)
109 memcpy(acpi_slit, slit, length); 54 for (j = 0; j < slit->locality_count; j++)
110 memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); 55 numa_set_distance(pxm_to_node(i), pxm_to_node(j),
56 slit->entry[slit->locality_count * i + j]);
111} 57}
112 58
113/* Callback for Proximity Domain -> x2APIC mapping */ 59/* Callback for Proximity Domain -> x2APIC mapping */
@@ -138,8 +84,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
138 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); 84 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
139 return; 85 return;
140 } 86 }
141 apicid_to_node[apic_id] = node; 87 set_apicid_to_node(apic_id, node);
142 node_set(node, cpu_nodes_parsed); 88 node_set(node, numa_nodes_parsed);
143 acpi_numa = 1; 89 acpi_numa = 1;
144 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", 90 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
145 pxm, apic_id, node); 91 pxm, apic_id, node);
@@ -178,8 +124,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
178 return; 124 return;
179 } 125 }
180 126
181 apicid_to_node[apic_id] = node; 127 set_apicid_to_node(apic_id, node);
182 node_set(node, cpu_nodes_parsed); 128 node_set(node, numa_nodes_parsed);
183 acpi_numa = 1; 129 acpi_numa = 1;
184 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", 130 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
185 pxm, apic_id, node); 131 pxm, apic_id, node);
@@ -241,7 +187,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
241 } 187 }
242 188
243 if (changed) { 189 if (changed) {
244 node_set(node, cpu_nodes_parsed); 190 node_set(node, numa_nodes_parsed);
245 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", 191 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
246 nd->start, nd->end); 192 nd->start, nd->end);
247 } 193 }
@@ -251,10 +197,8 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
251void __init 197void __init
252acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 198acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
253{ 199{
254 struct bootnode *nd, oldnode;
255 unsigned long start, end; 200 unsigned long start, end;
256 int node, pxm; 201 int node, pxm;
257 int i;
258 202
259 if (srat_disabled()) 203 if (srat_disabled())
260 return; 204 return;
@@ -276,300 +220,31 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
276 bad_srat(); 220 bad_srat();
277 return; 221 return;
278 } 222 }
279 i = conflicting_memblks(start, end); 223
280 if (i == node) { 224 if (numa_add_memblk(node, start, end) < 0) {
281 printk(KERN_WARNING
282 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
283 pxm, start, end, nodes[i].start, nodes[i].end);
284 } else if (i >= 0) {
285 printk(KERN_ERR
286 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
287 pxm, start, end, node_to_pxm(i),
288 nodes[i].start, nodes[i].end);
289 bad_srat(); 225 bad_srat();
290 return; 226 return;
291 } 227 }
292 nd = &nodes[node];
293 oldnode = *nd;
294 if (!node_test_and_set(node, nodes_parsed)) {
295 nd->start = start;
296 nd->end = end;
297 } else {
298 if (start < nd->start)
299 nd->start = start;
300 if (nd->end < end)
301 nd->end = end;
302 }
303 228
304 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 229 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
305 start, end); 230 start, end);
306 231
307 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { 232 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
308 update_nodes_add(node, start, end); 233 update_nodes_add(node, start, end);
309 /* restore nodes[node] */
310 *nd = oldnode;
311 if ((nd->start | nd->end) == 0)
312 node_clear(node, nodes_parsed);
313 }
314
315 node_memblk_range[num_node_memblks].start = start;
316 node_memblk_range[num_node_memblks].end = end;
317 memblk_nodeid[num_node_memblks] = node;
318 num_node_memblks++;
319}
320
321/* Sanity check to catch more bad SRATs (they are amazingly common).
322 Make sure the PXMs cover all memory. */
323static int __init nodes_cover_memory(const struct bootnode *nodes)
324{
325 int i;
326 unsigned long pxmram, e820ram;
327
328 pxmram = 0;
329 for_each_node_mask(i, nodes_parsed) {
330 unsigned long s = nodes[i].start >> PAGE_SHIFT;
331 unsigned long e = nodes[i].end >> PAGE_SHIFT;
332 pxmram += e - s;
333 pxmram -= __absent_pages_in_range(i, s, e);
334 if ((long)pxmram < 0)
335 pxmram = 0;
336 }
337
338 e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
339 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
340 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
341 printk(KERN_ERR
342 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
343 (pxmram << PAGE_SHIFT) >> 20,
344 (e820ram << PAGE_SHIFT) >> 20);
345 return 0;
346 }
347 return 1;
348} 234}
349 235
350void __init acpi_numa_arch_fixup(void) {} 236void __init acpi_numa_arch_fixup(void) {}
351 237
352#ifdef CONFIG_NUMA_EMU 238int __init x86_acpi_numa_init(void)
353void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
354 unsigned long end)
355{
356 int i;
357
358 for_each_node_mask(i, nodes_parsed) {
359 cutoff_node(i, start, end);
360 physnodes[i].start = nodes[i].start;
361 physnodes[i].end = nodes[i].end;
362 }
363}
364#endif /* CONFIG_NUMA_EMU */
365
366/* Use the information discovered above to actually set up the nodes. */
367int __init acpi_scan_nodes(unsigned long start, unsigned long end)
368{ 239{
369 int i; 240 int ret;
370
371 if (acpi_numa <= 0)
372 return -1;
373
374 /* First clean up the node list */
375 for (i = 0; i < MAX_NUMNODES; i++)
376 cutoff_node(i, start, end);
377
378 /*
379 * Join together blocks on the same node, holes between
380 * which don't overlap with memory on other nodes.
381 */
382 for (i = 0; i < num_node_memblks; ++i) {
383 int j, k;
384
385 for (j = i + 1; j < num_node_memblks; ++j) {
386 unsigned long start, end;
387
388 if (memblk_nodeid[i] != memblk_nodeid[j])
389 continue;
390 start = min(node_memblk_range[i].end,
391 node_memblk_range[j].end);
392 end = max(node_memblk_range[i].start,
393 node_memblk_range[j].start);
394 for (k = 0; k < num_node_memblks; ++k) {
395 if (memblk_nodeid[i] == memblk_nodeid[k])
396 continue;
397 if (start < node_memblk_range[k].end &&
398 end > node_memblk_range[k].start)
399 break;
400 }
401 if (k < num_node_memblks)
402 continue;
403 start = min(node_memblk_range[i].start,
404 node_memblk_range[j].start);
405 end = max(node_memblk_range[i].end,
406 node_memblk_range[j].end);
407 printk(KERN_INFO "SRAT: Node %d "
408 "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
409 memblk_nodeid[i],
410 node_memblk_range[i].start,
411 node_memblk_range[i].end,
412 node_memblk_range[j].start,
413 node_memblk_range[j].end,
414 start, end);
415 node_memblk_range[i].start = start;
416 node_memblk_range[i].end = end;
417 k = --num_node_memblks - j;
418 memmove(memblk_nodeid + j, memblk_nodeid + j+1,
419 k * sizeof(*memblk_nodeid));
420 memmove(node_memblk_range + j, node_memblk_range + j+1,
421 k * sizeof(*node_memblk_range));
422 --j;
423 }
424 }
425
426 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
427 memblk_nodeid);
428 if (memnode_shift < 0) {
429 printk(KERN_ERR
430 "SRAT: No NUMA node hash function found. Contact maintainer\n");
431 bad_srat();
432 return -1;
433 }
434
435 for (i = 0; i < num_node_memblks; i++)
436 memblock_x86_register_active_regions(memblk_nodeid[i],
437 node_memblk_range[i].start >> PAGE_SHIFT,
438 node_memblk_range[i].end >> PAGE_SHIFT);
439
440 /* for out of order entries in SRAT */
441 sort_node_map();
442 if (!nodes_cover_memory(nodes)) {
443 bad_srat();
444 return -1;
445 }
446 241
447 /* Account for nodes with cpus and no memory */ 242 ret = acpi_numa_init();
448 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); 243 if (ret < 0)
449 244 return ret;
450 /* Finally register nodes */ 245 return srat_disabled() ? -EINVAL : 0;
451 for_each_node_mask(i, node_possible_map)
452 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
453 /* Try again in case setup_node_bootmem missed one due
454 to missing bootmem */
455 for_each_node_mask(i, node_possible_map)
456 if (!node_online(i))
457 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
458
459 for (i = 0; i < nr_cpu_ids; i++) {
460 int node = early_cpu_to_node(i);
461
462 if (node == NUMA_NO_NODE)
463 continue;
464 if (!node_online(node))
465 numa_clear_node(i);
466 }
467 numa_init_array();
468 return 0;
469}
470
471#ifdef CONFIG_NUMA_EMU
472static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
473 [0 ... MAX_NUMNODES-1] = PXM_INVAL
474};
475static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
476 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
477};
478static int __init find_node_by_addr(unsigned long addr)
479{
480 int ret = NUMA_NO_NODE;
481 int i;
482
483 for_each_node_mask(i, nodes_parsed) {
484 /*
485 * Find the real node that this emulated node appears on. For
486 * the sake of simplicity, we only use a real node's starting
487 * address to determine which emulated node it appears on.
488 */
489 if (addr >= nodes[i].start && addr < nodes[i].end) {
490 ret = i;
491 break;
492 }
493 }
494 return ret;
495} 246}
496 247
497/*
498 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
499 * mappings that respect the real ACPI topology but reflect our emulated
500 * environment. For each emulated node, we find which real node it appears on
501 * and create PXM to NID mappings for those fake nodes which mirror that
502 * locality. SLIT will now represent the correct distances between emulated
503 * nodes as a result of the real topology.
504 */
505void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
506{
507 int i, j;
508
509 for (i = 0; i < num_nodes; i++) {
510 int nid, pxm;
511
512 nid = find_node_by_addr(fake_nodes[i].start);
513 if (nid == NUMA_NO_NODE)
514 continue;
515 pxm = node_to_pxm(nid);
516 if (pxm == PXM_INVAL)
517 continue;
518 fake_node_to_pxm_map[i] = pxm;
519 /*
520 * For each apicid_to_node mapping that exists for this real
521 * node, it must now point to the fake node ID.
522 */
523 for (j = 0; j < MAX_LOCAL_APIC; j++)
524 if (apicid_to_node[j] == nid &&
525 fake_apicid_to_node[j] == NUMA_NO_NODE)
526 fake_apicid_to_node[j] = i;
527 }
528
529 /*
530 * If there are apicid-to-node mappings for physical nodes that do not
531 * have a corresponding emulated node, it should default to a guaranteed
532 * value.
533 */
534 for (i = 0; i < MAX_LOCAL_APIC; i++)
535 if (apicid_to_node[i] != NUMA_NO_NODE &&
536 fake_apicid_to_node[i] == NUMA_NO_NODE)
537 fake_apicid_to_node[i] = 0;
538
539 for (i = 0; i < num_nodes; i++)
540 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
541 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
542
543 nodes_clear(nodes_parsed);
544 for (i = 0; i < num_nodes; i++)
545 if (fake_nodes[i].start != fake_nodes[i].end)
546 node_set(i, nodes_parsed);
547}
548
549static int null_slit_node_compare(int a, int b)
550{
551 return node_to_pxm(a) == node_to_pxm(b);
552}
553#else
554static int null_slit_node_compare(int a, int b)
555{
556 return a == b;
557}
558#endif /* CONFIG_NUMA_EMU */
559
560int __node_distance(int a, int b)
561{
562 int index;
563
564 if (!acpi_slit)
565 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
566 REMOTE_DISTANCE;
567 index = acpi_slit->locality_count * node_to_pxm(a);
568 return acpi_slit->entry[index + node_to_pxm(b)];
569}
570
571EXPORT_SYMBOL(__node_distance);
572
573#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) 248#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
574int memory_add_physaddr_to_nid(u64 start) 249int memory_add_physaddr_to_nid(u64 start)
575{ 250{
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6acc724d5d8f..d6c0418c3e47 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
179 sender = this_cpu_read(tlb_vector_offset); 179 sender = this_cpu_read(tlb_vector_offset);
180 f = &flush_state[sender]; 180 f = &flush_state[sender];
181 181
182 /* 182 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
183 * Could avoid this lock when 183 raw_spin_lock(&f->tlbstate_lock);
184 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
185 * probably not worth checking this for a cache-hot lock.
186 */
187 raw_spin_lock(&f->tlbstate_lock);
188 184
189 f->flush_mm = mm; 185 f->flush_mm = mm;
190 f->flush_va = va; 186 f->flush_va = va;
@@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
202 198
203 f->flush_mm = NULL; 199 f->flush_mm = NULL;
204 f->flush_va = 0; 200 f->flush_va = 0;
205 raw_spin_unlock(&f->tlbstate_lock); 201 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
202 raw_spin_unlock(&f->tlbstate_lock);
206} 203}
207 204
208void native_flush_tlb_others(const struct cpumask *cpumask, 205void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -211,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
211 if (is_uv_system()) { 208 if (is_uv_system()) {
212 unsigned int cpu; 209 unsigned int cpu;
213 210
214 cpu = get_cpu(); 211 cpu = smp_processor_id();
215 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 212 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
216 if (cpumask) 213 if (cpumask)
217 flush_tlb_others_ipi(cpumask, mm, va); 214 flush_tlb_others_ipi(cpumask, mm, va);
218 put_cpu();
219 return; 215 return;
220 } 216 }
221 flush_tlb_others_ipi(cpumask, mm, va); 217 flush_tlb_others_ipi(cpumask, mm, va);
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 72cbec14d783..2d49d4e19a36 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
126 if (!user_mode_vm(regs)) { 126 if (!user_mode_vm(regs)) {
127 unsigned long stack = kernel_stack_pointer(regs); 127 unsigned long stack = kernel_stack_pointer(regs);
128 if (depth) 128 if (depth)
129 dump_trace(NULL, regs, (unsigned long *)stack, 129 dump_trace(NULL, regs, (unsigned long *)stack, 0,
130 &backtrace_ops, &depth); 130 &backtrace_ops, &depth);
131 return; 131 return;
132 } 132 }
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index e2b7b0c06cdf..cf9750004a08 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -15,7 +15,7 @@
15#include <linux/notifier.h> 15#include <linux/notifier.h>
16#include <linux/smp.h> 16#include <linux/smp.h>
17#include <linux/oprofile.h> 17#include <linux/oprofile.h>
18#include <linux/sysdev.h> 18#include <linux/syscore_ops.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/moduleparam.h> 20#include <linux/moduleparam.h>
21#include <linux/kdebug.h> 21#include <linux/kdebug.h>
@@ -49,6 +49,10 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
49 val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; 49 val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
50 val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; 50 val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
51 val |= (counter_config->unit_mask & 0xFF) << 8; 51 val |= (counter_config->unit_mask & 0xFF) << 8;
52 counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV |
53 ARCH_PERFMON_EVENTSEL_EDGE |
54 ARCH_PERFMON_EVENTSEL_CMASK);
55 val |= counter_config->extra;
52 event &= model->event_mask ? model->event_mask : 0xFF; 56 event &= model->event_mask ? model->event_mask : 0xFF;
53 val |= event & 0xFF; 57 val |= event & 0xFF;
54 val |= (event & 0x0F00) << 24; 58 val |= (event & 0x0F00) << 24;
@@ -440,6 +444,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
440 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); 444 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
441 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); 445 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
442 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); 446 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
447 oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra);
443 } 448 }
444 449
445 return 0; 450 return 0;
@@ -536,7 +541,7 @@ static void nmi_shutdown(void)
536 541
537#ifdef CONFIG_PM 542#ifdef CONFIG_PM
538 543
539static int nmi_suspend(struct sys_device *dev, pm_message_t state) 544static int nmi_suspend(void)
540{ 545{
541 /* Only one CPU left, just stop that one */ 546 /* Only one CPU left, just stop that one */
542 if (nmi_enabled == 1) 547 if (nmi_enabled == 1)
@@ -544,49 +549,31 @@ static int nmi_suspend(struct sys_device *dev, pm_message_t state)
544 return 0; 549 return 0;
545} 550}
546 551
547static int nmi_resume(struct sys_device *dev) 552static void nmi_resume(void)
548{ 553{
549 if (nmi_enabled == 1) 554 if (nmi_enabled == 1)
550 nmi_cpu_start(NULL); 555 nmi_cpu_start(NULL);
551 return 0;
552} 556}
553 557
554static struct sysdev_class oprofile_sysclass = { 558static struct syscore_ops oprofile_syscore_ops = {
555 .name = "oprofile",
556 .resume = nmi_resume, 559 .resume = nmi_resume,
557 .suspend = nmi_suspend, 560 .suspend = nmi_suspend,
558}; 561};
559 562
560static struct sys_device device_oprofile = { 563static void __init init_suspend_resume(void)
561 .id = 0,
562 .cls = &oprofile_sysclass,
563};
564
565static int __init init_sysfs(void)
566{ 564{
567 int error; 565 register_syscore_ops(&oprofile_syscore_ops);
568
569 error = sysdev_class_register(&oprofile_sysclass);
570 if (error)
571 return error;
572
573 error = sysdev_register(&device_oprofile);
574 if (error)
575 sysdev_class_unregister(&oprofile_sysclass);
576
577 return error;
578} 566}
579 567
580static void exit_sysfs(void) 568static void exit_suspend_resume(void)
581{ 569{
582 sysdev_unregister(&device_oprofile); 570 unregister_syscore_ops(&oprofile_syscore_ops);
583 sysdev_class_unregister(&oprofile_sysclass);
584} 571}
585 572
586#else 573#else
587 574
588static inline int init_sysfs(void) { return 0; } 575static inline void init_suspend_resume(void) { }
589static inline void exit_sysfs(void) { } 576static inline void exit_suspend_resume(void) { }
590 577
591#endif /* CONFIG_PM */ 578#endif /* CONFIG_PM */
592 579
@@ -789,9 +776,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
789 776
790 mux_init(ops); 777 mux_init(ops);
791 778
792 ret = init_sysfs(); 779 init_suspend_resume();
793 if (ret)
794 return ret;
795 780
796 printk(KERN_INFO "oprofile: using NMI interrupt.\n"); 781 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
797 return 0; 782 return 0;
@@ -799,5 +784,5 @@ int __init op_nmi_init(struct oprofile_operations *ops)
799 784
800void op_nmi_exit(void) 785void op_nmi_exit(void)
801{ 786{
802 exit_sysfs(); 787 exit_suspend_resume();
803} 788}
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index e28398df0df2..0b7b7b179cbe 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -22,6 +22,7 @@ struct op_counter_config {
22 unsigned long kernel; 22 unsigned long kernel;
23 unsigned long user; 23 unsigned long user;
24 unsigned long unit_mask; 24 unsigned long unit_mask;
25 unsigned long extra;
25}; 26};
26 27
27extern struct op_counter_config counter_config[]; 28extern struct op_counter_config counter_config[];
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 9fadec074142..98ab13058f89 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -50,7 +50,7 @@ static inline void setup_num_counters(void)
50#endif 50#endif
51} 51}
52 52
53static int inline addr_increment(void) 53static inline int addr_increment(void)
54{ 54{
55#ifdef CONFIG_SMP 55#ifdef CONFIG_SMP
56 return smp_num_siblings == 2 ? 2 : 1; 56 return smp_num_siblings == 2 ? 2 : 1;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index e27dffbbb1a7..026e4931d162 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -350,7 +350,7 @@ static int __init early_fill_mp_bus_info(void)
350 350
351#define ENABLE_CF8_EXT_CFG (1ULL << 46) 351#define ENABLE_CF8_EXT_CFG (1ULL << 46)
352 352
353static void enable_pci_io_ecs(void *unused) 353static void __cpuinit enable_pci_io_ecs(void *unused)
354{ 354{
355 u64 reg; 355 u64 reg;
356 rdmsrl(MSR_AMD64_NB_CFG, reg); 356 rdmsrl(MSR_AMD64_NB_CFG, reg);
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
index 85b68ef5e809..67858be4b52b 100644
--- a/arch/x86/pci/ce4100.c
+++ b/arch/x86/pci/ce4100.c
@@ -34,6 +34,7 @@
34#include <linux/pci.h> 34#include <linux/pci.h>
35#include <linux/init.h> 35#include <linux/init.h>
36 36
37#include <asm/ce4100.h>
37#include <asm/pci_x86.h> 38#include <asm/pci_x86.h>
38 39
39struct sim_reg { 40struct sim_reg {
@@ -254,7 +255,7 @@ int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
254static int ce4100_conf_read(unsigned int seg, unsigned int bus, 255static int ce4100_conf_read(unsigned int seg, unsigned int bus,
255 unsigned int devfn, int reg, int len, u32 *value) 256 unsigned int devfn, int reg, int len, u32 *value)
256{ 257{
257 int i, retval = 1; 258 int i;
258 259
259 if (bus == 1) { 260 if (bus == 1) {
260 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { 261 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
@@ -306,10 +307,10 @@ struct pci_raw_ops ce4100_pci_conf = {
306 .write = ce4100_conf_write, 307 .write = ce4100_conf_write,
307}; 308};
308 309
309static int __init ce4100_pci_init(void) 310int __init ce4100_pci_init(void)
310{ 311{
311 init_sim_regs(); 312 init_sim_regs();
312 raw_pci_ops = &ce4100_pci_conf; 313 raw_pci_ops = &ce4100_pci_conf;
313 return 0; 314 /* Indicate caller that it should invoke pci_legacy_init() */
315 return 1;
314} 316}
315subsys_initcall(ce4100_pci_init);
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index b1805b78842f..494f2e7ea2b4 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -241,7 +241,7 @@ void __init pcibios_resource_survey(void)
241 e820_reserve_resources_late(); 241 e820_reserve_resources_late();
242 /* 242 /*
243 * Insert the IO APIC resources after PCI initialization has 243 * Insert the IO APIC resources after PCI initialization has
244 * occured to handle IO APICS that are mapped in on a BAR in 244 * occurred to handle IO APICS that are mapped in on a BAR in
245 * PCI space, but before trying to assign unassigned pci res. 245 * PCI space, but before trying to assign unassigned pci res.
246 */ 246 */
247 ioapic_insert_resources(); 247 ioapic_insert_resources();
@@ -304,7 +304,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
304 /* 304 /*
305 * ioremap() and ioremap_nocache() defaults to UC MINUS for now. 305 * ioremap() and ioremap_nocache() defaults to UC MINUS for now.
306 * To avoid attribute conflicts, request UC MINUS here 306 * To avoid attribute conflicts, request UC MINUS here
307 * aswell. 307 * as well.
308 */ 308 */
309 prot |= _PAGE_CACHE_UC_MINUS; 309 prot |= _PAGE_CACHE_UC_MINUS;
310 310
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 87e6c8323117..8201165bae28 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -597,21 +597,18 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
597 return 1; 597 return 1;
598 } 598 }
599 599
600 if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) && 600 if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN &&
601 (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) { 601 device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)
602 || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN &&
603 device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)
604 || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN &&
605 device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)) {
602 r->name = "PIIX/ICH"; 606 r->name = "PIIX/ICH";
603 r->get = pirq_piix_get; 607 r->get = pirq_piix_get;
604 r->set = pirq_piix_set; 608 r->set = pirq_piix_set;
605 return 1; 609 return 1;
606 } 610 }
607 611
608 if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) &&
609 (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) {
610 r->name = "PIIX/ICH";
611 r->get = pirq_piix_get;
612 r->set = pirq_piix_set;
613 return 1;
614 }
615 return 0; 612 return 0;
616} 613}
617 614
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 25cd4a07d09f..e37b407a0ee8 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -20,7 +20,8 @@
20#include <asm/xen/pci.h> 20#include <asm/xen/pci.h>
21 21
22#ifdef CONFIG_ACPI 22#ifdef CONFIG_ACPI
23static int xen_hvm_register_pirq(u32 gsi, int triggering) 23static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
24 int trigger, int polarity)
24{ 25{
25 int rc, irq; 26 int rc, irq;
26 struct physdev_map_pirq map_irq; 27 struct physdev_map_pirq map_irq;
@@ -41,7 +42,7 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
41 return -1; 42 return -1;
42 } 43 }
43 44
44 if (triggering == ACPI_EDGE_SENSITIVE) { 45 if (trigger == ACPI_EDGE_SENSITIVE) {
45 shareable = 0; 46 shareable = 0;
46 name = "ioapic-edge"; 47 name = "ioapic-edge";
47 } else { 48 } else {
@@ -49,18 +50,12 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
49 name = "ioapic-level"; 50 name = "ioapic-level";
50 } 51 }
51 52
52 irq = xen_map_pirq_gsi(map_irq.pirq, gsi, shareable, name); 53 irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
53 54
54 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); 55 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
55 56
56 return irq; 57 return irq;
57} 58}
58
59static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
60 int trigger, int polarity)
61{
62 return xen_hvm_register_pirq(gsi, trigger);
63}
64#endif 59#endif
65 60
66#if defined(CONFIG_PCI_MSI) 61#if defined(CONFIG_PCI_MSI)
@@ -91,7 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
91 86
92static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 87static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
93{ 88{
94 int irq, pirq, ret = 0; 89 int irq, pirq;
95 struct msi_desc *msidesc; 90 struct msi_desc *msidesc;
96 struct msi_msg msg; 91 struct msi_msg msg;
97 92
@@ -99,39 +94,32 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
99 __read_msi_msg(msidesc, &msg); 94 __read_msi_msg(msidesc, &msg);
100 pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | 95 pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
101 ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); 96 ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
102 if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { 97 if (msg.data != XEN_PIRQ_MSI_DATA ||
103 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 98 xen_irq_from_pirq(pirq) < 0) {
104 "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ); 99 pirq = xen_allocate_pirq_msi(dev, msidesc);
105 if (irq < 0) 100 if (pirq < 0)
106 goto error; 101 goto error;
107 ret = set_irq_msi(irq, msidesc); 102 xen_msi_compose_msg(dev, pirq, &msg);
108 if (ret < 0) 103 __write_msi_msg(msidesc, &msg);
109 goto error_while; 104 dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq);
110 printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d" 105 } else {
111 " pirq=%d\n", irq, pirq); 106 dev_dbg(&dev->dev,
112 return 0; 107 "xen: msi already bound to pirq=%d\n", pirq);
113 } 108 }
114 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 109 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
115 "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ)); 110 (type == PCI_CAP_ID_MSIX) ?
116 if (irq < 0 || pirq < 0) 111 "msi-x" : "msi");
112 if (irq < 0)
117 goto error; 113 goto error;
118 printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); 114 dev_dbg(&dev->dev,
119 xen_msi_compose_msg(dev, pirq, &msg); 115 "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq);
120 ret = set_irq_msi(irq, msidesc);
121 if (ret < 0)
122 goto error_while;
123 write_msi_msg(irq, &msg);
124 } 116 }
125 return 0; 117 return 0;
126 118
127error_while:
128 unbind_from_irqhandler(irq, NULL);
129error: 119error:
130 if (ret == -ENODEV) 120 dev_err(&dev->dev,
131 dev_err(&dev->dev, "Xen PCI frontend has not registered" \ 121 "Xen PCI frontend has not registered MSI/MSI-X support!\n");
132 " MSI/MSI-X support!\n"); 122 return -ENODEV;
133
134 return ret;
135} 123}
136 124
137/* 125/*
@@ -150,35 +138,26 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
150 return -ENOMEM; 138 return -ENOMEM;
151 139
152 if (type == PCI_CAP_ID_MSIX) 140 if (type == PCI_CAP_ID_MSIX)
153 ret = xen_pci_frontend_enable_msix(dev, &v, nvec); 141 ret = xen_pci_frontend_enable_msix(dev, v, nvec);
154 else 142 else
155 ret = xen_pci_frontend_enable_msi(dev, &v); 143 ret = xen_pci_frontend_enable_msi(dev, v);
156 if (ret) 144 if (ret)
157 goto error; 145 goto error;
158 i = 0; 146 i = 0;
159 list_for_each_entry(msidesc, &dev->msi_list, list) { 147 list_for_each_entry(msidesc, &dev->msi_list, list) {
160 irq = xen_allocate_pirq(v[i], 0, /* not sharable */ 148 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
161 (type == PCI_CAP_ID_MSIX) ? 149 (type == PCI_CAP_ID_MSIX) ?
162 "pcifront-msi-x" : "pcifront-msi"); 150 "pcifront-msi-x" :
163 if (irq < 0) { 151 "pcifront-msi");
164 ret = -1; 152 if (irq < 0)
165 goto free; 153 goto free;
166 }
167
168 ret = set_irq_msi(irq, msidesc);
169 if (ret)
170 goto error_while;
171 i++; 154 i++;
172 } 155 }
173 kfree(v); 156 kfree(v);
174 return 0; 157 return 0;
175 158
176error_while:
177 unbind_from_irqhandler(irq, NULL);
178error: 159error:
179 if (ret == -ENODEV) 160 dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
180 dev_err(&dev->dev, "Xen PCI frontend has not registered" \
181 " MSI/MSI-X support!\n");
182free: 161free:
183 kfree(v); 162 kfree(v);
184 return ret; 163 return ret;
@@ -193,6 +172,9 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
193 xen_pci_frontend_disable_msix(dev); 172 xen_pci_frontend_disable_msix(dev);
194 else 173 else
195 xen_pci_frontend_disable_msi(dev); 174 xen_pci_frontend_disable_msi(dev);
175
176 /* Free the IRQ's and the msidesc using the generic code. */
177 default_teardown_msi_irqs(dev);
196} 178}
197 179
198static void xen_teardown_msi_irq(unsigned int irq) 180static void xen_teardown_msi_irq(unsigned int irq)
@@ -200,47 +182,91 @@ static void xen_teardown_msi_irq(unsigned int irq)
200 xen_destroy_irq(irq); 182 xen_destroy_irq(irq);
201} 183}
202 184
185#ifdef CONFIG_XEN_DOM0
203static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 186static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
204{ 187{
205 int irq, ret; 188 int ret = 0;
206 struct msi_desc *msidesc; 189 struct msi_desc *msidesc;
207 190
208 list_for_each_entry(msidesc, &dev->msi_list, list) { 191 list_for_each_entry(msidesc, &dev->msi_list, list) {
209 irq = xen_create_msi_irq(dev, msidesc, type); 192 struct physdev_map_pirq map_irq;
210 if (irq < 0)
211 return -1;
212 193
213 ret = set_irq_msi(irq, msidesc); 194 memset(&map_irq, 0, sizeof(map_irq));
214 if (ret) 195 map_irq.domid = DOMID_SELF;
215 goto error; 196 map_irq.type = MAP_PIRQ_TYPE_MSI;
216 } 197 map_irq.index = -1;
217 return 0; 198 map_irq.pirq = -1;
199 map_irq.bus = dev->bus->number;
200 map_irq.devfn = dev->devfn;
218 201
219error: 202 if (type == PCI_CAP_ID_MSIX) {
220 xen_destroy_irq(irq); 203 int pos;
204 u32 table_offset, bir;
205
206 pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
207
208 pci_read_config_dword(dev, pos + PCI_MSIX_TABLE,
209 &table_offset);
210 bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
211
212 map_irq.table_base = pci_resource_start(dev, bir);
213 map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
214 }
215
216 ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
217 if (ret) {
218 dev_warn(&dev->dev, "xen map irq failed %d\n", ret);
219 goto out;
220 }
221
222 ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
223 map_irq.pirq, map_irq.index,
224 (type == PCI_CAP_ID_MSIX) ?
225 "msi-x" : "msi");
226 if (ret < 0)
227 goto out;
228 }
229 ret = 0;
230out:
221 return ret; 231 return ret;
222} 232}
223#endif 233#endif
234#endif
224 235
225static int xen_pcifront_enable_irq(struct pci_dev *dev) 236static int xen_pcifront_enable_irq(struct pci_dev *dev)
226{ 237{
227 int rc; 238 int rc;
228 int share = 1; 239 int share = 1;
240 int pirq;
241 u8 gsi;
229 242
230 dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq); 243 rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
244 if (rc < 0) {
245 dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
246 rc);
247 return rc;
248 }
231 249
232 if (dev->irq < 0) 250 rc = xen_allocate_pirq_gsi(gsi);
233 return -EINVAL; 251 if (rc < 0) {
252 dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n",
253 gsi, rc);
254 return rc;
255 }
256 pirq = rc;
234 257
235 if (dev->irq < NR_IRQS_LEGACY) 258 if (gsi < NR_IRQS_LEGACY)
236 share = 0; 259 share = 0;
237 260
238 rc = xen_allocate_pirq(dev->irq, share, "pcifront"); 261 rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
239 if (rc < 0) { 262 if (rc < 0) {
240 dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n", 263 dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
241 dev->irq, rc); 264 gsi, pirq, rc);
242 return rc; 265 return rc;
243 } 266 }
267
268 dev->irq = rc;
269 dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
244 return 0; 270 return 0;
245} 271}
246 272
@@ -292,7 +318,7 @@ int __init pci_xen_hvm_init(void)
292#ifdef CONFIG_XEN_DOM0 318#ifdef CONFIG_XEN_DOM0
293static int xen_register_pirq(u32 gsi, int triggering) 319static int xen_register_pirq(u32 gsi, int triggering)
294{ 320{
295 int rc, irq; 321 int rc, pirq, irq = -1;
296 struct physdev_map_pirq map_irq; 322 struct physdev_map_pirq map_irq;
297 int shareable = 0; 323 int shareable = 0;
298 char *name; 324 char *name;
@@ -308,17 +334,20 @@ static int xen_register_pirq(u32 gsi, int triggering)
308 name = "ioapic-level"; 334 name = "ioapic-level";
309 } 335 }
310 336
311 irq = xen_allocate_pirq(gsi, shareable, name); 337 pirq = xen_allocate_pirq_gsi(gsi);
312 338 if (pirq < 0)
313 printk(KERN_DEBUG "xen: --> irq=%d\n", irq); 339 goto out;
314 340
341 irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
315 if (irq < 0) 342 if (irq < 0)
316 goto out; 343 goto out;
317 344
345 printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d\n", pirq, irq);
346
318 map_irq.domid = DOMID_SELF; 347 map_irq.domid = DOMID_SELF;
319 map_irq.type = MAP_PIRQ_TYPE_GSI; 348 map_irq.type = MAP_PIRQ_TYPE_GSI;
320 map_irq.index = gsi; 349 map_irq.index = gsi;
321 map_irq.pirq = irq; 350 map_irq.pirq = pirq;
322 351
323 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); 352 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
324 if (rc) { 353 if (rc) {
@@ -405,13 +434,18 @@ static int __init pci_xen_initial_domain(void)
405 434
406void __init xen_setup_pirqs(void) 435void __init xen_setup_pirqs(void)
407{ 436{
408 int irq; 437 int pirq, irq;
409 438
410 pci_xen_initial_domain(); 439 pci_xen_initial_domain();
411 440
412 if (0 == nr_ioapics) { 441 if (0 == nr_ioapics) {
413 for (irq = 0; irq < NR_IRQS_LEGACY; irq++) 442 for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
414 xen_allocate_pirq(irq, 0, "xt-pic"); 443 pirq = xen_allocate_pirq_gsi(irq);
444 if (WARN(pirq < 0,
445 "Could not allocate PIRQ for legacy interrupt\n"))
446 break;
447 irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic");
448 }
415 return; 449 return;
416 } 450 }
417 451
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index d2c0d51a7178..28071bb31db7 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -15,21 +15,20 @@
15#include <linux/serial_reg.h> 15#include <linux/serial_reg.h>
16#include <linux/serial_8250.h> 16#include <linux/serial_8250.h>
17 17
18#include <asm/ce4100.h>
19#include <asm/prom.h>
18#include <asm/setup.h> 20#include <asm/setup.h>
21#include <asm/i8259.h>
19#include <asm/io.h> 22#include <asm/io.h>
23#include <asm/io_apic.h>
20 24
21static int ce4100_i8042_detect(void) 25static int ce4100_i8042_detect(void)
22{ 26{
23 return 0; 27 return 0;
24} 28}
25 29
26static void __init sdv_find_smp_config(void)
27{
28}
29
30#ifdef CONFIG_SERIAL_8250 30#ifdef CONFIG_SERIAL_8250
31 31
32
33static unsigned int mem_serial_in(struct uart_port *p, int offset) 32static unsigned int mem_serial_in(struct uart_port *p, int offset)
34{ 33{
35 offset = offset << p->regshift; 34 offset = offset << p->regshift;
@@ -118,6 +117,15 @@ static void __init sdv_arch_setup(void)
118 sdv_serial_fixup(); 117 sdv_serial_fixup();
119} 118}
120 119
120#ifdef CONFIG_X86_IO_APIC
121static void __cpuinit sdv_pci_init(void)
122{
123 x86_of_pci_init();
124 /* We can't set this earlier, because we need to calibrate the timer */
125 legacy_pic = &null_legacy_pic;
126}
127#endif
128
121/* 129/*
122 * CE4100 specific x86_init function overrides and early setup 130 * CE4100 specific x86_init function overrides and early setup
123 * calls. 131 * calls.
@@ -128,5 +136,11 @@ void __init x86_ce4100_early_setup(void)
128 x86_platform.i8042_detect = ce4100_i8042_detect; 136 x86_platform.i8042_detect = ce4100_i8042_detect;
129 x86_init.resources.probe_roms = x86_init_noop; 137 x86_init.resources.probe_roms = x86_init_noop;
130 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 138 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
131 x86_init.mpparse.find_smp_config = sdv_find_smp_config; 139 x86_init.mpparse.find_smp_config = x86_init_noop;
140 x86_init.pci.init = ce4100_pci_init;
141
142#ifdef CONFIG_X86_IO_APIC
143 x86_init.pci.init_irq = sdv_pci_init;
144 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
145#endif
132} 146}
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
new file mode 100644
index 000000000000..e70be38ce039
--- /dev/null
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -0,0 +1,430 @@
1/*
2 * CE4100 on Falcon Falls
3 *
4 * (c) Copyright 2010 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; version 2 of the License.
9 */
10/dts-v1/;
11/ {
12 model = "intel,falconfalls";
13 compatible = "intel,falconfalls";
14 #address-cells = <1>;
15 #size-cells = <1>;
16
17 cpus {
18 #address-cells = <1>;
19 #size-cells = <0>;
20
21 cpu@0 {
22 device_type = "cpu";
23 compatible = "intel,ce4100";
24 reg = <0>;
25 lapic = <&lapic0>;
26 };
27 };
28
29 soc@0 {
30 #address-cells = <1>;
31 #size-cells = <1>;
32 compatible = "intel,ce4100-cp";
33 ranges;
34
35 ioapic1: interrupt-controller@fec00000 {
36 #interrupt-cells = <2>;
37 compatible = "intel,ce4100-ioapic";
38 interrupt-controller;
39 reg = <0xfec00000 0x1000>;
40 };
41
42 timer@fed00000 {
43 compatible = "intel,ce4100-hpet";
44 reg = <0xfed00000 0x200>;
45 };
46
47 lapic0: interrupt-controller@fee00000 {
48 compatible = "intel,ce4100-lapic";
49 reg = <0xfee00000 0x1000>;
50 };
51
52 pci@3fc {
53 #address-cells = <3>;
54 #size-cells = <2>;
55 compatible = "intel,ce4100-pci", "pci";
56 device_type = "pci";
57 bus-range = <0 0>;
58 ranges = <0x2000000 0 0xbffff000 0xbffff000 0 0x1000
59 0x2000000 0 0xdffe0000 0xdffe0000 0 0x1000
60 0x0000000 0 0x0 0x0 0 0x100>;
61
62 /* Secondary IO-APIC */
63 ioapic2: interrupt-controller@0,1 {
64 #interrupt-cells = <2>;
65 compatible = "intel,ce4100-ioapic";
66 interrupt-controller;
67 reg = <0x100 0x0 0x0 0x0 0x0>;
68 assigned-addresses = <0x02000000 0x0 0xbffff000 0x0 0x1000>;
69 };
70
71 pci@1,0 {
72 #address-cells = <3>;
73 #size-cells = <2>;
74 compatible = "intel,ce4100-pci", "pci";
75 device_type = "pci";
76 bus-range = <1 1>;
77 reg = <0x0800 0x0 0x0 0x0 0x0>;
78 ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>;
79
80 interrupt-parent = <&ioapic2>;
81
82 display@2,0 {
83 compatible = "pci8086,2e5b.2",
84 "pci8086,2e5b",
85 "pciclass038000",
86 "pciclass0380";
87
88 reg = <0x11000 0x0 0x0 0x0 0x0>;
89 interrupts = <0 1>;
90 };
91
92 multimedia@3,0 {
93 compatible = "pci8086,2e5c.2",
94 "pci8086,2e5c",
95 "pciclass048000",
96 "pciclass0480";
97
98 reg = <0x11800 0x0 0x0 0x0 0x0>;
99 interrupts = <2 1>;
100 };
101
102 multimedia@4,0 {
103 compatible = "pci8086,2e5d.2",
104 "pci8086,2e5d",
105 "pciclass048000",
106 "pciclass0480";
107
108 reg = <0x12000 0x0 0x0 0x0 0x0>;
109 interrupts = <4 1>;
110 };
111
112 multimedia@4,1 {
113 compatible = "pci8086,2e5e.2",
114 "pci8086,2e5e",
115 "pciclass048000",
116 "pciclass0480";
117
118 reg = <0x12100 0x0 0x0 0x0 0x0>;
119 interrupts = <5 1>;
120 };
121
122 sound@6,0 {
123 compatible = "pci8086,2e5f.2",
124 "pci8086,2e5f",
125 "pciclass040100",
126 "pciclass0401";
127
128 reg = <0x13000 0x0 0x0 0x0 0x0>;
129 interrupts = <6 1>;
130 };
131
132 sound@6,1 {
133 compatible = "pci8086,2e5f.2",
134 "pci8086,2e5f",
135 "pciclass040100",
136 "pciclass0401";
137
138 reg = <0x13100 0x0 0x0 0x0 0x0>;
139 interrupts = <7 1>;
140 };
141
142 sound@6,2 {
143 compatible = "pci8086,2e60.2",
144 "pci8086,2e60",
145 "pciclass040100",
146 "pciclass0401";
147
148 reg = <0x13200 0x0 0x0 0x0 0x0>;
149 interrupts = <8 1>;
150 };
151
152 display@8,0 {
153 compatible = "pci8086,2e61.2",
154 "pci8086,2e61",
155 "pciclass038000",
156 "pciclass0380";
157
158 reg = <0x14000 0x0 0x0 0x0 0x0>;
159 interrupts = <9 1>;
160 };
161
162 display@8,1 {
163 compatible = "pci8086,2e62.2",
164 "pci8086,2e62",
165 "pciclass038000",
166 "pciclass0380";
167
168 reg = <0x14100 0x0 0x0 0x0 0x0>;
169 interrupts = <10 1>;
170 };
171
172 multimedia@8,2 {
173 compatible = "pci8086,2e63.2",
174 "pci8086,2e63",
175 "pciclass048000",
176 "pciclass0480";
177
178 reg = <0x14200 0x0 0x0 0x0 0x0>;
179 interrupts = <11 1>;
180 };
181
182 entertainment-encryption@9,0 {
183 compatible = "pci8086,2e64.2",
184 "pci8086,2e64",
185 "pciclass101000",
186 "pciclass1010";
187
188 reg = <0x14800 0x0 0x0 0x0 0x0>;
189 interrupts = <12 1>;
190 };
191
192 localbus@a,0 {
193 compatible = "pci8086,2e65.2",
194 "pci8086,2e65",
195 "pciclassff0000",
196 "pciclassff00";
197
198 reg = <0x15000 0x0 0x0 0x0 0x0>;
199 };
200
201 serial@b,0 {
202 compatible = "pci8086,2e66.2",
203 "pci8086,2e66",
204 "pciclass070003",
205 "pciclass0700";
206
207 reg = <0x15800 0x0 0x0 0x0 0x0>;
208 interrupts = <14 1>;
209 };
210
211 gpio@b,1 {
212 compatible = "pci8086,2e67.2",
213 "pci8086,2e67",
214 "pciclassff0000",
215 "pciclassff00";
216
217 #gpio-cells = <2>;
218 reg = <0x15900 0x0 0x0 0x0 0x0>;
219 interrupts = <15 1>;
220 gpio-controller;
221 };
222
223 i2c-controller@b,2 {
224 #address-cells = <2>;
225 #size-cells = <1>;
226 compatible = "pci8086,2e68.2",
227 "pci8086,2e68",
228 "pciclass,ff0000",
229 "pciclass,ff00";
230
231 reg = <0x15a00 0x0 0x0 0x0 0x0>;
232 interrupts = <16 1>;
233 ranges = <0 0 0x02000000 0 0xdffe0500 0x100
234 1 0 0x02000000 0 0xdffe0600 0x100
235 2 0 0x02000000 0 0xdffe0700 0x100>;
236
237 i2c@0 {
238 #address-cells = <1>;
239 #size-cells = <0>;
240 compatible = "intel,ce4100-i2c-controller";
241 reg = <0 0 0x100>;
242 };
243
244 i2c@1 {
245 #address-cells = <1>;
246 #size-cells = <0>;
247 compatible = "intel,ce4100-i2c-controller";
248 reg = <1 0 0x100>;
249
250 gpio@26 {
251 #gpio-cells = <2>;
252 compatible = "ti,pcf8575";
253 reg = <0x26>;
254 gpio-controller;
255 };
256 };
257
258 i2c@2 {
259 #address-cells = <1>;
260 #size-cells = <0>;
261 compatible = "intel,ce4100-i2c-controller";
262 reg = <2 0 0x100>;
263
264 gpio@26 {
265 #gpio-cells = <2>;
266 compatible = "ti,pcf8575";
267 reg = <0x26>;
268 gpio-controller;
269 };
270 };
271 };
272
273 smard-card@b,3 {
274 compatible = "pci8086,2e69.2",
275 "pci8086,2e69",
276 "pciclass070500",
277 "pciclass0705";
278
279 reg = <0x15b00 0x0 0x0 0x0 0x0>;
280 interrupts = <15 1>;
281 };
282
283 spi-controller@b,4 {
284 #address-cells = <1>;
285 #size-cells = <0>;
286 compatible =
287 "pci8086,2e6a.2",
288 "pci8086,2e6a",
289 "pciclass,ff0000",
290 "pciclass,ff00";
291
292 reg = <0x15c00 0x0 0x0 0x0 0x0>;
293 interrupts = <15 1>;
294
295 dac@0 {
296 compatible = "ti,pcm1755";
297 reg = <0>;
298 spi-max-frequency = <115200>;
299 };
300
301 dac@1 {
302 compatible = "ti,pcm1609a";
303 reg = <1>;
304 spi-max-frequency = <115200>;
305 };
306
307 eeprom@2 {
308 compatible = "atmel,at93c46";
309 reg = <2>;
310 spi-max-frequency = <115200>;
311 };
312 };
313
314 multimedia@b,7 {
315 compatible = "pci8086,2e6d.2",
316 "pci8086,2e6d",
317 "pciclassff0000",
318 "pciclassff00";
319
320 reg = <0x15f00 0x0 0x0 0x0 0x0>;
321 };
322
323 ethernet@c,0 {
324 compatible = "pci8086,2e6e.2",
325 "pci8086,2e6e",
326 "pciclass020000",
327 "pciclass0200";
328
329 reg = <0x16000 0x0 0x0 0x0 0x0>;
330 interrupts = <21 1>;
331 };
332
333 clock@c,1 {
334 compatible = "pci8086,2e6f.2",
335 "pci8086,2e6f",
336 "pciclassff0000",
337 "pciclassff00";
338
339 reg = <0x16100 0x0 0x0 0x0 0x0>;
340 interrupts = <3 1>;
341 };
342
343 usb@d,0 {
344 compatible = "pci8086,2e70.2",
345 "pci8086,2e70",
346 "pciclass0c0320",
347 "pciclass0c03";
348
349 reg = <0x16800 0x0 0x0 0x0 0x0>;
350 interrupts = <22 1>;
351 };
352
353 usb@d,1 {
354 compatible = "pci8086,2e70.2",
355 "pci8086,2e70",
356 "pciclass0c0320",
357 "pciclass0c03";
358
359 reg = <0x16900 0x0 0x0 0x0 0x0>;
360 interrupts = <22 1>;
361 };
362
363 sata@e,0 {
364 compatible = "pci8086,2e71.0",
365 "pci8086,2e71",
366 "pciclass010601",
367 "pciclass0106";
368
369 reg = <0x17000 0x0 0x0 0x0 0x0>;
370 interrupts = <23 1>;
371 };
372
373 flash@f,0 {
374 compatible = "pci8086,701.1",
375 "pci8086,701",
376 "pciclass050100",
377 "pciclass0501";
378
379 reg = <0x17800 0x0 0x0 0x0 0x0>;
380 interrupts = <13 1>;
381 };
382
383 entertainment-encryption@10,0 {
384 compatible = "pci8086,702.1",
385 "pci8086,702",
386 "pciclass101000",
387 "pciclass1010";
388
389 reg = <0x18000 0x0 0x0 0x0 0x0>;
390 };
391
392 co-processor@11,0 {
393 compatible = "pci8086,703.1",
394 "pci8086,703",
395 "pciclass0b4000",
396 "pciclass0b40";
397
398 reg = <0x18800 0x0 0x0 0x0 0x0>;
399 interrupts = <1 1>;
400 };
401
402 multimedia@12,0 {
403 compatible = "pci8086,704.0",
404 "pci8086,704",
405 "pciclass048000",
406 "pciclass0480";
407
408 reg = <0x19000 0x0 0x0 0x0 0x0>;
409 };
410 };
411
412 isa@1f,0 {
413 #address-cells = <2>;
414 #size-cells = <1>;
415 compatible = "isa";
416 reg = <0xf800 0x0 0x0 0x0 0x0>;
417 ranges = <1 0 0 0 0 0x100>;
418
419 rtc@70 {
420 compatible = "intel,ce4100-rtc", "motorola,mc146818";
421 interrupts = <8 3>;
422 interrupt-parent = <&ioapic1>;
423 ctrl-reg = <2>;
424 freq-reg = <0x26>;
425 reg = <1 0x70 2>;
426 };
427 };
428 };
429 };
430};
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index ea6529e93c6f..275dbc19e2cf 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -31,6 +31,7 @@
31#include <asm/apic.h> 31#include <asm/apic.h>
32#include <asm/io_apic.h> 32#include <asm/io_apic.h>
33#include <asm/mrst.h> 33#include <asm/mrst.h>
34#include <asm/mrst-vrtc.h>
34#include <asm/io.h> 35#include <asm/io.h>
35#include <asm/i8259.h> 36#include <asm/i8259.h>
36#include <asm/intel_scu_ipc.h> 37#include <asm/intel_scu_ipc.h>
@@ -96,11 +97,11 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table)
96 pentry->freq_hz, pentry->irq); 97 pentry->freq_hz, pentry->irq);
97 if (!pentry->irq) 98 if (!pentry->irq)
98 continue; 99 continue;
99 mp_irq.type = MP_IOAPIC; 100 mp_irq.type = MP_INTSRC;
100 mp_irq.irqtype = mp_INT; 101 mp_irq.irqtype = mp_INT;
101/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ 102/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
102 mp_irq.irqflag = 5; 103 mp_irq.irqflag = 5;
103 mp_irq.srcbus = 0; 104 mp_irq.srcbus = MP_BUS_ISA;
104 mp_irq.srcbusirq = pentry->irq; /* IRQ */ 105 mp_irq.srcbusirq = pentry->irq; /* IRQ */
105 mp_irq.dstapic = MP_APIC_ALL; 106 mp_irq.dstapic = MP_APIC_ALL;
106 mp_irq.dstirq = pentry->irq; 107 mp_irq.dstirq = pentry->irq;
@@ -167,10 +168,10 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
167 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { 168 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
168 pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", 169 pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
169 totallen, (u32)pentry->phys_addr, pentry->irq); 170 totallen, (u32)pentry->phys_addr, pentry->irq);
170 mp_irq.type = MP_IOAPIC; 171 mp_irq.type = MP_INTSRC;
171 mp_irq.irqtype = mp_INT; 172 mp_irq.irqtype = mp_INT;
172 mp_irq.irqflag = 0xf; /* level trigger and active low */ 173 mp_irq.irqflag = 0xf; /* level trigger and active low */
173 mp_irq.srcbus = 0; 174 mp_irq.srcbus = MP_BUS_ISA;
174 mp_irq.srcbusirq = pentry->irq; /* IRQ */ 175 mp_irq.srcbusirq = pentry->irq; /* IRQ */
175 mp_irq.dstapic = MP_APIC_ALL; 176 mp_irq.dstapic = MP_APIC_ALL;
176 mp_irq.dstirq = pentry->irq; 177 mp_irq.dstirq = pentry->irq;
@@ -268,6 +269,7 @@ void __init x86_mrst_early_setup(void)
268 269
269 x86_platform.calibrate_tsc = mrst_calibrate_tsc; 270 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
270 x86_platform.i8042_detect = mrst_i8042_detect; 271 x86_platform.i8042_detect = mrst_i8042_detect;
272 x86_init.timers.wallclock_init = mrst_rtc_init;
271 x86_init.pci.init = pci_mrst_init; 273 x86_init.pci.init = pci_mrst_init;
272 x86_init.pci.fixup_irqs = x86_init_noop; 274 x86_init.pci.fixup_irqs = x86_init_noop;
273 275
@@ -280,7 +282,7 @@ void __init x86_mrst_early_setup(void)
280 /* Avoid searching for BIOS MP tables */ 282 /* Avoid searching for BIOS MP tables */
281 x86_init.mpparse.find_smp_config = x86_init_noop; 283 x86_init.mpparse.find_smp_config = x86_init_noop;
282 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 284 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
283 285 set_bit(MP_BUS_ISA, mp_bus_not_pci);
284} 286}
285 287
286/* 288/*
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
index 32cd7edd71a0..73d70d65e76e 100644
--- a/arch/x86/platform/mrst/vrtc.c
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -100,22 +100,16 @@ int vrtc_set_mmss(unsigned long nowtime)
100 100
101void __init mrst_rtc_init(void) 101void __init mrst_rtc_init(void)
102{ 102{
103 unsigned long rtc_paddr; 103 unsigned long vrtc_paddr;
104 void __iomem *virt_base;
105 104
106 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); 105 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
107 if (!sfi_mrtc_num)
108 return;
109
110 rtc_paddr = sfi_mrtc_array[0].phys_addr;
111 106
112 /* vRTC's register address may not be page aligned */ 107 vrtc_paddr = sfi_mrtc_array[0].phys_addr;
113 set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr); 108 if (!sfi_mrtc_num || !vrtc_paddr)
114 109 return;
115 virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC);
116 virt_base += rtc_paddr & ~PAGE_MASK;
117 vrtc_virt_base = virt_base;
118 110
111 vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC,
112 vrtc_paddr);
119 x86_platform.get_wallclock = vrtc_get_time; 113 x86_platform.get_wallclock = vrtc_get_time;
120 x86_platform.set_wallclock = vrtc_set_mmss; 114 x86_platform.set_wallclock = vrtc_set_mmss;
121} 115}
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index e797428b163b..c2a8cab65e5d 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_OLPC) += olpc.o 1obj-$(CONFIG_OLPC) += olpc.o
2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o 2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
3obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o 3obj-$(CONFIG_OLPC) += olpc_ofw.o
4obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o 4obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index 127775696d6c..ab81fb271760 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/platform_device.h> 16#include <linux/platform_device.h>
17#include <linux/pm.h> 17#include <linux/pm.h>
18#include <linux/mfd/core.h>
18 19
19#include <asm/io.h> 20#include <asm/io.h>
20#include <asm/olpc.h> 21#include <asm/olpc.h>
@@ -56,25 +57,24 @@ static void xo1_power_off(void)
56static int __devinit olpc_xo1_probe(struct platform_device *pdev) 57static int __devinit olpc_xo1_probe(struct platform_device *pdev)
57{ 58{
58 struct resource *res; 59 struct resource *res;
60 int err;
59 61
60 /* don't run on non-XOs */ 62 /* don't run on non-XOs */
61 if (!machine_is_olpc()) 63 if (!machine_is_olpc())
62 return -ENODEV; 64 return -ENODEV;
63 65
66 err = mfd_cell_enable(pdev);
67 if (err)
68 return err;
69
64 res = platform_get_resource(pdev, IORESOURCE_IO, 0); 70 res = platform_get_resource(pdev, IORESOURCE_IO, 0);
65 if (!res) { 71 if (!res) {
66 dev_err(&pdev->dev, "can't fetch device resource info\n"); 72 dev_err(&pdev->dev, "can't fetch device resource info\n");
67 return -EIO; 73 return -EIO;
68 } 74 }
69
70 if (!request_region(res->start, resource_size(res), DRV_NAME)) {
71 dev_err(&pdev->dev, "can't request region\n");
72 return -EIO;
73 }
74
75 if (strcmp(pdev->name, "cs5535-pms") == 0) 75 if (strcmp(pdev->name, "cs5535-pms") == 0)
76 pms_base = res->start; 76 pms_base = res->start;
77 else if (strcmp(pdev->name, "cs5535-acpi") == 0) 77 else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
78 acpi_base = res->start; 78 acpi_base = res->start;
79 79
80 /* If we have both addresses, we can override the poweroff hook */ 80 /* If we have both addresses, we can override the poweroff hook */
@@ -88,14 +88,11 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev)
88 88
89static int __devexit olpc_xo1_remove(struct platform_device *pdev) 89static int __devexit olpc_xo1_remove(struct platform_device *pdev)
90{ 90{
91 struct resource *r; 91 mfd_cell_disable(pdev);
92
93 r = platform_get_resource(pdev, IORESOURCE_IO, 0);
94 release_region(r->start, resource_size(r));
95 92
96 if (strcmp(pdev->name, "cs5535-pms") == 0) 93 if (strcmp(pdev->name, "cs5535-pms") == 0)
97 pms_base = 0; 94 pms_base = 0;
98 else if (strcmp(pdev->name, "cs5535-acpi") == 0) 95 else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
99 acpi_base = 0; 96 acpi_base = 0;
100 97
101 pm_power_off = NULL; 98 pm_power_off = NULL;
@@ -113,7 +110,7 @@ static struct platform_driver cs5535_pms_drv = {
113 110
114static struct platform_driver cs5535_acpi_drv = { 111static struct platform_driver cs5535_acpi_drv = {
115 .driver = { 112 .driver = {
116 .name = "cs5535-acpi", 113 .name = "olpc-xo1-pm-acpi",
117 .owner = THIS_MODULE, 114 .owner = THIS_MODULE,
118 }, 115 },
119 .probe = olpc_xo1_probe, 116 .probe = olpc_xo1_probe,
@@ -143,7 +140,7 @@ static void __exit olpc_xo1_exit(void)
143 140
144MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); 141MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
145MODULE_LICENSE("GPL"); 142MODULE_LICENSE("GPL");
146MODULE_ALIAS("platform:olpc-xo1"); 143MODULE_ALIAS("platform:cs5535-pms");
147 144
148module_init(olpc_xo1_init); 145module_init(olpc_xo1_init);
149module_exit(olpc_xo1_exit); 146module_exit(olpc_xo1_exit);
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index dab874647530..044bda5b3174 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -140,8 +140,7 @@ void * __init prom_early_alloc(unsigned long size)
140 * wasted bootmem) and hand off chunks of it to callers. 140 * wasted bootmem) and hand off chunks of it to callers.
141 */ 141 */
142 res = alloc_bootmem(chunk_size); 142 res = alloc_bootmem(chunk_size);
143 if (!res) 143 BUG_ON(!res);
144 return NULL;
145 prom_early_allocated += chunk_size; 144 prom_early_allocated += chunk_size;
146 memset(res, 0, chunk_size); 145 memset(res, 0, chunk_size);
147 free_mem = chunk_size; 146 free_mem = chunk_size;
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index df58e9cad96a..7cb6424317f6 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -11,6 +11,7 @@
11#include <linux/debugfs.h> 11#include <linux/debugfs.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/delay.h>
14 15
15#include <asm/mmu_context.h> 16#include <asm/mmu_context.h>
16#include <asm/uv/uv.h> 17#include <asm/uv/uv.h>
@@ -1364,11 +1365,11 @@ uv_activation_descriptor_init(int node, int pnode)
1364 memset(bd2, 0, sizeof(struct bau_desc)); 1365 memset(bd2, 0, sizeof(struct bau_desc));
1365 bd2->header.sw_ack_flag = 1; 1366 bd2->header.sw_ack_flag = 1;
1366 /* 1367 /*
1367 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub 1368 * base_dest_nodeid is the nasid of the first uvhub
1368 * in the partition. The bit map will indicate uvhub numbers, 1369 * in the partition. The bit map will indicate uvhub numbers,
1369 * which are 0-N in a partition. Pnodes are unique system-wide. 1370 * which are 0-N in a partition. Pnodes are unique system-wide.
1370 */ 1371 */
1371 bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; 1372 bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode);
1372 bd2->header.dest_subnodeid = 0x10; /* the LB */ 1373 bd2->header.dest_subnodeid = 0x10; /* the LB */
1373 bd2->header.command = UV_NET_ENDPOINT_INTD; 1374 bd2->header.command = UV_NET_ENDPOINT_INTD;
1374 bd2->header.int_both = 1; 1375 bd2->header.int_both = 1;
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 7b24460917d5..374a05d8ad22 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -131,7 +131,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
131 unsigned long mmr_offset, int limit) 131 unsigned long mmr_offset, int limit)
132{ 132{
133 const struct cpumask *eligible_cpu = cpumask_of(cpu); 133 const struct cpumask *eligible_cpu = cpumask_of(cpu);
134 struct irq_cfg *cfg = get_irq_chip_data(irq); 134 struct irq_cfg *cfg = irq_get_chip_data(irq);
135 unsigned long mmr_value; 135 unsigned long mmr_value;
136 struct uv_IO_APIC_route_entry *entry; 136 struct uv_IO_APIC_route_entry *entry;
137 int mmr_pnode, err; 137 int mmr_pnode, err;
@@ -148,7 +148,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
148 else 148 else
149 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 149 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
150 150
151 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, 151 irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
152 irq_name); 152 irq_name);
153 153
154 mmr_value = 0; 154 mmr_value = 0;
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index 632037671746..c7abf13a213f 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -471,15 +471,7 @@ static unsigned int startup_piix4_master_irq(struct irq_data *data)
471{ 471{
472 legacy_pic->init(0); 472 legacy_pic->init(0);
473 enable_cobalt_irq(data); 473 enable_cobalt_irq(data);
474} 474 return 0;
475
476static void end_piix4_master_irq(struct irq_data *data)
477{
478 unsigned long flags;
479
480 spin_lock_irqsave(&cobalt_lock, flags);
481 enable_cobalt_irq(data);
482 spin_unlock_irqrestore(&cobalt_lock, flags);
483} 475}
484 476
485static struct irq_chip piix4_master_irq_type = { 477static struct irq_chip piix4_master_irq_type = {
@@ -492,7 +484,7 @@ static void pii4_mask(struct irq_data *data) { }
492 484
493static struct irq_chip piix4_virtual_irq_type = { 485static struct irq_chip piix4_virtual_irq_type = {
494 .name = "PIIX4-virtual", 486 .name = "PIIX4-virtual",
495 .mask = pii4_mask, 487 .irq_mask = pii4_mask,
496}; 488};
497 489
498/* 490/*
@@ -569,18 +561,20 @@ out_unlock:
569static struct irqaction master_action = { 561static struct irqaction master_action = {
570 .handler = piix4_master_intr, 562 .handler = piix4_master_intr,
571 .name = "PIIX4-8259", 563 .name = "PIIX4-8259",
564 .flags = IRQF_NO_THREAD,
572}; 565};
573 566
574static struct irqaction cascade_action = { 567static struct irqaction cascade_action = {
575 .handler = no_action, 568 .handler = no_action,
576 .name = "cascade", 569 .name = "cascade",
570 .flags = IRQF_NO_THREAD,
577}; 571};
578 572
579static inline void set_piix4_virtual_irq_type(void) 573static inline void set_piix4_virtual_irq_type(void)
580{ 574{
581 piix4_virtual_irq_type.enable = i8259A_chip.unmask; 575 piix4_virtual_irq_type.irq_enable = i8259A_chip.irq_unmask;
582 piix4_virtual_irq_type.disable = i8259A_chip.mask; 576 piix4_virtual_irq_type.irq_disable = i8259A_chip.irq_mask;
583 piix4_virtual_irq_type.unmask = i8259A_chip.unmask; 577 piix4_virtual_irq_type.irq_unmask = i8259A_chip.irq_unmask;
584} 578}
585 579
586static void __init visws_pre_intr_init(void) 580static void __init visws_pre_intr_init(void)
@@ -597,7 +591,7 @@ static void __init visws_pre_intr_init(void)
597 else if (i == CO_IRQ_IDE0) 591 else if (i == CO_IRQ_IDE0)
598 chip = &cobalt_irq_type; 592 chip = &cobalt_irq_type;
599 else if (i == CO_IRQ_IDE1) 593 else if (i == CO_IRQ_IDE1)
600 >chip = &cobalt_irq_type; 594 chip = &cobalt_irq_type;
601 else if (i == CO_IRQ_8259) 595 else if (i == CO_IRQ_8259)
602 chip = &piix4_master_irq_type; 596 chip = &piix4_master_irq_type;
603 else if (i < CO_IRQ_APIC0) 597 else if (i < CO_IRQ_APIC0)
@@ -606,7 +600,7 @@ static void __init visws_pre_intr_init(void)
606 chip = &cobalt_irq_type; 600 chip = &cobalt_irq_type;
607 601
608 if (chip) 602 if (chip)
609 set_irq_chip(i, chip); 603 irq_set_chip(i, chip);
610 } 604 }
611 605
612 setup_irq(CO_IRQ_8259, &master_action); 606 setup_irq(CO_IRQ_8259, &master_action);
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 36df991985b2..468d591dde31 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -417,24 +417,25 @@ const char *arch_vma_name(struct vm_area_struct *vma)
417 return NULL; 417 return NULL;
418} 418}
419 419
420struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 420struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
421{ 421{
422 struct mm_struct *mm = tsk->mm; 422 /*
423 423 * Check to see if the corresponding task was created in compat vdso
424 /* Check to see if this task was created in compat vdso mode */ 424 * mode.
425 */
425 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) 426 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
426 return &gate_vma; 427 return &gate_vma;
427 return NULL; 428 return NULL;
428} 429}
429 430
430int in_gate_area(struct task_struct *task, unsigned long addr) 431int in_gate_area(struct mm_struct *mm, unsigned long addr)
431{ 432{
432 const struct vm_area_struct *vma = get_gate_vma(task); 433 const struct vm_area_struct *vma = get_gate_vma(mm);
433 434
434 return vma && addr >= vma->vm_start && addr < vma->vm_end; 435 return vma && addr >= vma->vm_start && addr < vma->vm_end;
435} 436}
436 437
437int in_gate_area_no_task(unsigned long addr) 438int in_gate_area_no_mm(unsigned long addr)
438{ 439{
439 return 0; 440 return 0;
440} 441}
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 5b54892e4bc3..5cc821cb2e09 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -38,7 +38,8 @@ config XEN_MAX_DOMAIN_MEMORY
38 38
39config XEN_SAVE_RESTORE 39config XEN_SAVE_RESTORE
40 bool 40 bool
41 depends on XEN && PM 41 depends on XEN
42 select HIBERNATE_CALLBACKS
42 default y 43 default y
43 44
44config XEN_DEBUG_FS 45config XEN_DEBUG_FS
@@ -48,3 +49,11 @@ config XEN_DEBUG_FS
48 help 49 help
49 Enable statistics output and various tuning options in debugfs. 50 Enable statistics output and various tuning options in debugfs.
50 Enabling this option may incur a significant performance overhead. 51 Enabling this option may incur a significant performance overhead.
52
53config XEN_DEBUG
54 bool "Enable Xen debug checks"
55 depends on XEN
56 default n
57 help
58 Enable various WARN_ON checks in the Xen MMU code.
59 Enabling this option WILL incur a significant performance overhead.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 50542efe45fb..e3c6a06cf725 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -238,6 +238,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
238static __init void xen_init_cpuid_mask(void) 238static __init void xen_init_cpuid_mask(void)
239{ 239{
240 unsigned int ax, bx, cx, dx; 240 unsigned int ax, bx, cx, dx;
241 unsigned int xsave_mask;
241 242
242 cpuid_leaf1_edx_mask = 243 cpuid_leaf1_edx_mask =
243 ~((1 << X86_FEATURE_MCE) | /* disable MCE */ 244 ~((1 << X86_FEATURE_MCE) | /* disable MCE */
@@ -249,24 +250,16 @@ static __init void xen_init_cpuid_mask(void)
249 cpuid_leaf1_edx_mask &= 250 cpuid_leaf1_edx_mask &=
250 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ 251 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
251 (1 << X86_FEATURE_ACPI)); /* disable ACPI */ 252 (1 << X86_FEATURE_ACPI)); /* disable ACPI */
252
253 ax = 1; 253 ax = 1;
254 cx = 0;
255 xen_cpuid(&ax, &bx, &cx, &dx); 254 xen_cpuid(&ax, &bx, &cx, &dx);
256 255
257 /* cpuid claims we support xsave; try enabling it to see what happens */ 256 xsave_mask =
258 if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { 257 (1 << (X86_FEATURE_XSAVE % 32)) |
259 unsigned long cr4; 258 (1 << (X86_FEATURE_OSXSAVE % 32));
260
261 set_in_cr4(X86_CR4_OSXSAVE);
262
263 cr4 = read_cr4();
264
265 if ((cr4 & X86_CR4_OSXSAVE) == 0)
266 cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
267 259
268 clear_in_cr4(X86_CR4_OSXSAVE); 260 /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
269 } 261 if ((cx & xsave_mask) != xsave_mask)
262 cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
270} 263}
271 264
272static void xen_set_debugreg(int reg, unsigned long val) 265static void xen_set_debugreg(int reg, unsigned long val)
@@ -1284,15 +1277,14 @@ static int init_hvm_pv_info(int *major, int *minor)
1284 1277
1285 xen_setup_features(); 1278 xen_setup_features();
1286 1279
1287 pv_info = xen_info; 1280 pv_info.name = "Xen HVM";
1288 pv_info.kernel_rpl = 0;
1289 1281
1290 xen_domain_type = XEN_HVM_DOMAIN; 1282 xen_domain_type = XEN_HVM_DOMAIN;
1291 1283
1292 return 0; 1284 return 0;
1293} 1285}
1294 1286
1295void xen_hvm_init_shared_info(void) 1287void __ref xen_hvm_init_shared_info(void)
1296{ 1288{
1297 int cpu; 1289 int cpu;
1298 struct xen_add_to_physmap xatp; 1290 struct xen_add_to_physmap xatp;
@@ -1331,6 +1323,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1331 switch (action) { 1323 switch (action) {
1332 case CPU_UP_PREPARE: 1324 case CPU_UP_PREPARE:
1333 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1325 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1326 if (xen_have_vector_callback)
1327 xen_init_lock_cpu(cpu);
1334 break; 1328 break;
1335 default: 1329 default:
1336 break; 1330 break;
@@ -1355,6 +1349,7 @@ static void __init xen_hvm_guest_init(void)
1355 1349
1356 if (xen_feature(XENFEAT_hvm_callback_vector)) 1350 if (xen_feature(XENFEAT_hvm_callback_vector))
1357 xen_have_vector_callback = 1; 1351 xen_have_vector_callback = 1;
1352 xen_hvm_smp_init();
1358 register_cpu_notifier(&xen_hvm_cpu_notifier); 1353 register_cpu_notifier(&xen_hvm_cpu_notifier);
1359 xen_unplug_emulated_devices(); 1354 xen_unplug_emulated_devices();
1360 have_vcpu_info_placement = 0; 1355 have_vcpu_info_placement = 0;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5e92b61ad574..55c965b38c27 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -46,6 +46,7 @@
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/gfp.h> 47#include <linux/gfp.h>
48#include <linux/memblock.h> 48#include <linux/memblock.h>
49#include <linux/seq_file.h>
49 50
50#include <asm/pgtable.h> 51#include <asm/pgtable.h>
51#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
@@ -78,8 +79,7 @@
78 79
79/* 80/*
80 * Protects atomic reservation decrease/increase against concurrent increases. 81 * Protects atomic reservation decrease/increase against concurrent increases.
81 * Also protects non-atomic updates of current_pages and driver_pages, and 82 * Also protects non-atomic updates of current_pages and balloon lists.
82 * balloon lists.
83 */ 83 */
84DEFINE_SPINLOCK(xen_reservation_lock); 84DEFINE_SPINLOCK(xen_reservation_lock);
85 85
@@ -416,8 +416,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
416 if (val & _PAGE_PRESENT) { 416 if (val & _PAGE_PRESENT) {
417 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 417 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
418 pteval_t flags = val & PTE_FLAGS_MASK; 418 pteval_t flags = val & PTE_FLAGS_MASK;
419 unsigned long mfn = pfn_to_mfn(pfn); 419 unsigned long mfn;
420 420
421 if (!xen_feature(XENFEAT_auto_translated_physmap))
422 mfn = get_phys_to_machine(pfn);
423 else
424 mfn = pfn;
421 /* 425 /*
422 * If there's no mfn for the pfn, then just create an 426 * If there's no mfn for the pfn, then just create an
423 * empty non-present pte. Unfortunately this loses 427 * empty non-present pte. Unfortunately this loses
@@ -427,8 +431,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
427 if (unlikely(mfn == INVALID_P2M_ENTRY)) { 431 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
428 mfn = 0; 432 mfn = 0;
429 flags = 0; 433 flags = 0;
434 } else {
435 /*
436 * Paramount to do this test _after_ the
437 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
438 * IDENTITY_FRAME_BIT resolves to true.
439 */
440 mfn &= ~FOREIGN_FRAME_BIT;
441 if (mfn & IDENTITY_FRAME_BIT) {
442 mfn &= ~IDENTITY_FRAME_BIT;
443 flags |= _PAGE_IOMAP;
444 }
430 } 445 }
431
432 val = ((pteval_t)mfn << PAGE_SHIFT) | flags; 446 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
433 } 447 }
434 448
@@ -532,6 +546,41 @@ pte_t xen_make_pte(pteval_t pte)
532} 546}
533PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 547PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
534 548
549#ifdef CONFIG_XEN_DEBUG
550pte_t xen_make_pte_debug(pteval_t pte)
551{
552 phys_addr_t addr = (pte & PTE_PFN_MASK);
553 phys_addr_t other_addr;
554 bool io_page = false;
555 pte_t _pte;
556
557 if (pte & _PAGE_IOMAP)
558 io_page = true;
559
560 _pte = xen_make_pte(pte);
561
562 if (!addr)
563 return _pte;
564
565 if (io_page &&
566 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
567 other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
568 WARN_ONCE(addr != other_addr,
569 "0x%lx is using VM_IO, but it is 0x%lx!\n",
570 (unsigned long)addr, (unsigned long)other_addr);
571 } else {
572 pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
573 other_addr = (_pte.pte & PTE_PFN_MASK);
574 WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
575 "0x%lx is missing VM_IO (and wasn't fixed)!\n",
576 (unsigned long)addr);
577 }
578
579 return _pte;
580}
581PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
582#endif
583
535pgd_t xen_make_pgd(pgdval_t pgd) 584pgd_t xen_make_pgd(pgdval_t pgd)
536{ 585{
537 pgd = pte_pfn_to_mfn(pgd); 586 pgd = pte_pfn_to_mfn(pgd);
@@ -986,10 +1035,9 @@ static void xen_pgd_pin(struct mm_struct *mm)
986 */ 1035 */
987void xen_mm_pin_all(void) 1036void xen_mm_pin_all(void)
988{ 1037{
989 unsigned long flags;
990 struct page *page; 1038 struct page *page;
991 1039
992 spin_lock_irqsave(&pgd_lock, flags); 1040 spin_lock(&pgd_lock);
993 1041
994 list_for_each_entry(page, &pgd_list, lru) { 1042 list_for_each_entry(page, &pgd_list, lru) {
995 if (!PagePinned(page)) { 1043 if (!PagePinned(page)) {
@@ -998,7 +1046,7 @@ void xen_mm_pin_all(void)
998 } 1046 }
999 } 1047 }
1000 1048
1001 spin_unlock_irqrestore(&pgd_lock, flags); 1049 spin_unlock(&pgd_lock);
1002} 1050}
1003 1051
1004/* 1052/*
@@ -1099,10 +1147,9 @@ static void xen_pgd_unpin(struct mm_struct *mm)
1099 */ 1147 */
1100void xen_mm_unpin_all(void) 1148void xen_mm_unpin_all(void)
1101{ 1149{
1102 unsigned long flags;
1103 struct page *page; 1150 struct page *page;
1104 1151
1105 spin_lock_irqsave(&pgd_lock, flags); 1152 spin_lock(&pgd_lock);
1106 1153
1107 list_for_each_entry(page, &pgd_list, lru) { 1154 list_for_each_entry(page, &pgd_list, lru) {
1108 if (PageSavePinned(page)) { 1155 if (PageSavePinned(page)) {
@@ -1112,7 +1159,7 @@ void xen_mm_unpin_all(void)
1112 } 1159 }
1113 } 1160 }
1114 1161
1115 spin_unlock_irqrestore(&pgd_lock, flags); 1162 spin_unlock(&pgd_lock);
1116} 1163}
1117 1164
1118void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1165void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
@@ -1416,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm)
1416 return ret; 1463 return ret;
1417} 1464}
1418 1465
1466#ifdef CONFIG_X86_64
1467static __initdata u64 __last_pgt_set_rw = 0;
1468static __initdata u64 __pgt_buf_start = 0;
1469static __initdata u64 __pgt_buf_end = 0;
1470static __initdata u64 __pgt_buf_top = 0;
1471/*
1472 * As a consequence of the commit:
1473 *
1474 * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
1475 * Author: Yinghai Lu <yinghai@kernel.org>
1476 * Date: Fri Dec 17 16:58:28 2010 -0800
1477 *
1478 * x86-64, mm: Put early page table high
1479 *
1480 * at some point init_memory_mapping is going to reach the pagetable pages
1481 * area and map those pages too (mapping them as normal memory that falls
1482 * in the range of addresses passed to init_memory_mapping as argument).
1483 * Some of those pages are already pagetable pages (they are in the range
1484 * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
1485 * everything is fine.
1486 * Some of these pages are not pagetable pages yet (they fall in the range
1487 * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
1488 * are going to be mapped RW. When these pages become pagetable pages and
1489 * are hooked into the pagetable, xen will find that the guest has already
1490 * a RW mapping of them somewhere and fail the operation.
1491 * The reason Xen requires pagetables to be RO is that the hypervisor needs
1492 * to verify that the pagetables are valid before using them. The validation
1493 * operations are called "pinning".
1494 *
1495 * In order to fix the issue we mark all the pages in the entire range
1496 * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
1497 * is completed only the range pgt_buf_start-pgt_buf_end is reserved by
1498 * init_memory_mapping. Hence the kernel is going to crash as soon as one
1499 * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
1500 * ranges are RO).
1501 *
1502 * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
1503 * the init_memory_mapping has completed (in a perfect world we would
1504 * call this function from init_memory_mapping, but lets ignore that).
1505 *
1506 * Because we are called _after_ init_memory_mapping the pgt_buf_[start,
1507 * end,top] have all changed to new values (b/c init_memory_mapping
1508 * is called and setting up another new page-table). Hence, the first time
1509 * we enter this function, we save away the pgt_buf_start value and update
1510 * the pgt_buf_[end,top].
1511 *
1512 * When we detect that the "old" pgt_buf_start through pgt_buf_end
1513 * PFNs have been reserved (so memblock_x86_reserve_range has been called),
1514 * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
1515 *
1516 * And then we update those "old" pgt_buf_[end|top] with the new ones
1517 * so that we can redo this on the next pagetable.
1518 */
1519static __init void mark_rw_past_pgt(void) {
1520
1521 if (pgt_buf_end > pgt_buf_start) {
1522 u64 addr, size;
1523
1524 /* Save it away. */
1525 if (!__pgt_buf_start) {
1526 __pgt_buf_start = pgt_buf_start;
1527 __pgt_buf_end = pgt_buf_end;
1528 __pgt_buf_top = pgt_buf_top;
1529 return;
1530 }
1531 /* If we get the range that starts at __pgt_buf_end that means
1532 * the range is reserved, and that in 'init_memory_mapping'
1533 * the 'memblock_x86_reserve_range' has been called with the
1534 * outdated __pgt_buf_start, __pgt_buf_end (the "new"
1535 * pgt_buf_[start|end|top] refer now to a new pagetable.
1536 * Note: we are called _after_ the pgt_buf_[..] have been
1537 * updated.*/
1538
1539 addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
1540 &size, PAGE_SIZE);
1541
1542 /* Still not reserved, meaning 'memblock_x86_reserve_range'
1543 * hasn't been called yet. Update the _end and _top.*/
1544 if (addr == PFN_PHYS(__pgt_buf_start)) {
1545 __pgt_buf_end = pgt_buf_end;
1546 __pgt_buf_top = pgt_buf_top;
1547 return;
1548 }
1549
1550 /* OK, the area is reserved, meaning it is time for us to
1551 * set RW for the old end->top PFNs. */
1552
1553 /* ..unless we had already done this. */
1554 if (__pgt_buf_end == __last_pgt_set_rw)
1555 return;
1556
1557 addr = PFN_PHYS(__pgt_buf_end);
1558
1559 /* set as RW the rest */
1560 printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
1561 PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
1562
1563 while (addr < PFN_PHYS(__pgt_buf_top)) {
1564 make_lowmem_page_readwrite(__va(addr));
1565 addr += PAGE_SIZE;
1566 }
1567 /* And update everything so that we are ready for the next
1568 * pagetable (the one created for regions past 4GB) */
1569 __last_pgt_set_rw = __pgt_buf_end;
1570 __pgt_buf_start = pgt_buf_start;
1571 __pgt_buf_end = pgt_buf_end;
1572 __pgt_buf_top = pgt_buf_top;
1573 }
1574 return;
1575}
1576#else
1577static __init void mark_rw_past_pgt(void) { }
1578#endif
1419static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) 1579static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1420{ 1580{
1421#ifdef CONFIG_X86_64 1581#ifdef CONFIG_X86_64
@@ -1426,28 +1586,43 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1426#endif 1586#endif
1427} 1587}
1428 1588
1589#ifdef CONFIG_X86_32
1429static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 1590static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1430{ 1591{
1431 unsigned long pfn = pte_pfn(pte);
1432
1433#ifdef CONFIG_X86_32
1434 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 1592 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1435 if (pte_val_ma(*ptep) & _PAGE_PRESENT) 1593 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1436 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & 1594 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1437 pte_val_ma(pte)); 1595 pte_val_ma(pte));
1438#endif 1596
1597 return pte;
1598}
1599#else /* CONFIG_X86_64 */
1600static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1601{
1602 unsigned long pfn = pte_pfn(pte);
1439 1603
1440 /* 1604 /*
1605 * A bit of optimization. We do not need to call the workaround
1606 * when xen_set_pte_init is called with a PTE with 0 as PFN.
1607 * That is b/c the pagetable at that point are just being populated
1608 * with empty values and we can save some cycles by not calling
1609 * the 'memblock' code.*/
1610 if (pfn)
1611 mark_rw_past_pgt();
1612 /*
1441 * If the new pfn is within the range of the newly allocated 1613 * If the new pfn is within the range of the newly allocated
1442 * kernel pagetable, and it isn't being mapped into an 1614 * kernel pagetable, and it isn't being mapped into an
1443 * early_ioremap fixmap slot, make sure it is RO. 1615 * early_ioremap fixmap slot as a freshly allocated page, make sure
1616 * it is RO.
1444 */ 1617 */
1445 if (!is_early_ioremap_ptep(ptep) && 1618 if (((!is_early_ioremap_ptep(ptep) &&
1446 pfn >= e820_table_start && pfn < e820_table_end) 1619 pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
1620 (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
1447 pte = pte_wrprotect(pte); 1621 pte = pte_wrprotect(pte);
1448 1622
1449 return pte; 1623 return pte;
1450} 1624}
1625#endif /* CONFIG_X86_64 */
1451 1626
1452/* Init-time set_pte while constructing initial pagetables, which 1627/* Init-time set_pte while constructing initial pagetables, which
1453 doesn't allow RO pagetable pages to be remapped RW */ 1628 doesn't allow RO pagetable pages to be remapped RW */
@@ -1653,9 +1828,6 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1653 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { 1828 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1654 pte_t pte; 1829 pte_t pte;
1655 1830
1656 if (pfn > max_pfn_mapped)
1657 max_pfn_mapped = pfn;
1658
1659 if (!pte_none(pte_page[pteidx])) 1831 if (!pte_none(pte_page[pteidx]))
1660 continue; 1832 continue;
1661 1833
@@ -1697,7 +1869,7 @@ static void convert_pfn_mfn(void *v)
1697} 1869}
1698 1870
1699/* 1871/*
1700 * Set up the inital kernel pagetable. 1872 * Set up the initial kernel pagetable.
1701 * 1873 *
1702 * We can construct this by grafting the Xen provided pagetable into 1874 * We can construct this by grafting the Xen provided pagetable into
1703 * head_64.S's preconstructed pagetables. We copy the Xen L2's into 1875 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
@@ -1713,6 +1885,12 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1713 pud_t *l3; 1885 pud_t *l3;
1714 pmd_t *l2; 1886 pmd_t *l2;
1715 1887
1888 /* max_pfn_mapped is the last pfn mapped in the initial memory
1889 * mappings. Considering that on Xen after the kernel mappings we
1890 * have the mappings of some pages that don't exist in pfn space, we
1891 * set max_pfn_mapped to the last real pfn mapped. */
1892 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1893
1716 /* Zap identity mapping */ 1894 /* Zap identity mapping */
1717 init_level4_pgt[0] = __pgd(0); 1895 init_level4_pgt[0] = __pgd(0);
1718 1896
@@ -1817,9 +1995,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1817 initial_kernel_pmd = 1995 initial_kernel_pmd =
1818 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 1996 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1819 1997
1820 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 1998 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1821 xen_start_info->nr_pt_frames * PAGE_SIZE +
1822 512*1024);
1823 1999
1824 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 2000 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1825 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); 2001 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
@@ -1942,6 +2118,11 @@ __init void xen_ident_map_ISA(void)
1942 2118
1943static __init void xen_post_allocator_init(void) 2119static __init void xen_post_allocator_init(void)
1944{ 2120{
2121 mark_rw_past_pgt();
2122
2123#ifdef CONFIG_XEN_DEBUG
2124 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
2125#endif
1945 pv_mmu_ops.set_pte = xen_set_pte; 2126 pv_mmu_ops.set_pte = xen_set_pte;
1946 pv_mmu_ops.set_pmd = xen_set_pmd; 2127 pv_mmu_ops.set_pmd = xen_set_pmd;
1947 pv_mmu_ops.set_pud = xen_set_pud; 2128 pv_mmu_ops.set_pud = xen_set_pud;
@@ -2074,7 +2255,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2074 in_frames[i] = virt_to_mfn(vaddr); 2255 in_frames[i] = virt_to_mfn(vaddr);
2075 2256
2076 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); 2257 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2077 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); 2258 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2078 2259
2079 if (out_frames) 2260 if (out_frames)
2080 out_frames[i] = virt_to_pfn(vaddr); 2261 out_frames[i] = virt_to_pfn(vaddr);
@@ -2353,6 +2534,18 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2353 2534
2354#ifdef CONFIG_XEN_DEBUG_FS 2535#ifdef CONFIG_XEN_DEBUG_FS
2355 2536
2537static int p2m_dump_open(struct inode *inode, struct file *filp)
2538{
2539 return single_open(filp, p2m_dump_show, NULL);
2540}
2541
2542static const struct file_operations p2m_dump_fops = {
2543 .open = p2m_dump_open,
2544 .read = seq_read,
2545 .llseek = seq_lseek,
2546 .release = single_release,
2547};
2548
2356static struct dentry *d_mmu_debug; 2549static struct dentry *d_mmu_debug;
2357 2550
2358static int __init xen_mmu_debugfs(void) 2551static int __init xen_mmu_debugfs(void)
@@ -2408,6 +2601,7 @@ static int __init xen_mmu_debugfs(void)
2408 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, 2601 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2409 &mmu_stats.prot_commit_batched); 2602 &mmu_stats.prot_commit_batched);
2410 2603
2604 debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
2411 return 0; 2605 return 0;
2412} 2606}
2413fs_initcall(xen_mmu_debugfs); 2607fs_initcall(xen_mmu_debugfs);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index fd12d7ce7ff9..141eb0de8b06 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -23,6 +23,129 @@
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always 23 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to 24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively. 25 * 512 and 1024 entries respectively.
26 *
27 * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
28 *
29 * However not all entries are filled with MFNs. Specifically for all other
30 * leaf entries, or for the top root, or middle one, for which there is a void
31 * entry, we assume it is "missing". So (for example)
32 * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
33 *
34 * We also have the possibility of setting 1-1 mappings on certain regions, so
35 * that:
36 * pfn_to_mfn(0xc0000)=0xc0000
37 *
38 * The benefit of this is, that we can assume for non-RAM regions (think
39 * PCI BARs, or ACPI spaces), we can create mappings easily b/c we
40 * get the PFN value to match the MFN.
41 *
42 * For this to work efficiently we have one new page p2m_identity and
43 * allocate (via reserved_brk) any other pages we need to cover the sides
44 * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
45 * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
46 * no other fancy value).
47 *
48 * On lookup we spot that the entry points to p2m_identity and return the
49 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
50 * If the entry points to an allocated page, we just proceed as before and
51 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
52 * appropriate functions (pfn_to_mfn).
53 *
54 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
55 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
56 * non-identity pfn. To protect ourselves against we elect to set (and get) the
57 * IDENTITY_FRAME_BIT on all identity mapped PFNs.
58 *
59 * This simplistic diagram is used to explain the more subtle piece of code.
60 * There is also a digram of the P2M at the end that can help.
61 * Imagine your E820 looking as so:
62 *
63 * 1GB 2GB
64 * /-------------------+---------\/----\ /----------\ /---+-----\
65 * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
66 * \-------------------+---------/\----/ \----------/ \---+-----/
67 * ^- 1029MB ^- 2001MB
68 *
69 * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
70 * 2048MB = 524288 (0x80000)]
71 *
72 * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
73 * is actually not present (would have to kick the balloon driver to put it in).
74 *
75 * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
76 * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
77 * of the PFN and the end PFN (263424 and 512256 respectively). The first step
78 * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
79 * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
80 * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
81 * to end pfn. We reserve_brk top leaf pages if they are missing (means they
82 * point to p2m_mid_missing).
83 *
84 * With the E820 example above, 263424 is not 1GB aligned so we allocate a
85 * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
86 * Each entry in the allocate page is "missing" (points to p2m_missing).
87 *
88 * Next stage is to determine if we need to do a more granular boundary check
89 * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
90 * We check if the start pfn and end pfn violate that boundary check, and if
91 * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
92 * granularity of setting which PFNs are missing and which ones are identity.
93 * In our example 263424 and 512256 both fail the check so we reserve_brk two
94 * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
95 * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
96 *
97 * At this point we would at minimum reserve_brk one page, but could be up to
98 * three. Each call to set_phys_range_identity has at maximum a three page
99 * cost. If we were to query the P2M at this stage, all those entries from
100 * start PFN through end PFN (so 1029MB -> 2001MB) would return
101 * INVALID_P2M_ENTRY ("missing").
102 *
103 * The next step is to walk from the start pfn to the end pfn setting
104 * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
105 * If we find that the middle leaf is pointing to p2m_missing we can swap it
106 * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this
107 * point we do not need to worry about boundary aligment (so no need to
108 * reserve_brk a middle page, figure out which PFNs are "missing" and which
109 * ones are identity), as that has been done earlier. If we find that the
110 * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
111 * that page (which covers 512 PFNs) and set the appropriate PFN with
112 * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
113 * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
114 * IDENTITY_FRAME_BIT set.
115 *
116 * All other regions that are void (or not filled) either point to p2m_missing
117 * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
118 * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
119 * contain the INVALID_P2M_ENTRY value and are considered "missing."
120 *
121 * This is what the p2m ends up looking (for the E820 above) with this
122 * fabulous drawing:
123 *
124 * p2m /--------------\
125 * /-----\ | &mfn_list[0],| /-----------------\
126 * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
127 * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
128 * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
129 * |-----| \ | [p2m_identity]+\\ | .... |
130 * | 2 |--\ \-------------------->| ... | \\ \----------------/
131 * |-----| \ \---------------/ \\
132 * | 3 |\ \ \\ p2m_identity
133 * |-----| \ \-------------------->/---------------\ /-----------------\
134 * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... |
135 * \-----/ / | [p2m_identity]+-->| ..., ~0 |
136 * / /---------------\ | .... | \-----------------/
137 * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. |
138 * / | IDENTITY[@256]|<----/ \---------------/
139 * / | ~0, ~0, .... |
140 * | \---------------/
141 * |
142 * p2m_missing p2m_missing
143 * /------------------\ /------------\
144 * | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
145 * | [p2m_mid_missing]+---->| ..., ~0 |
146 * \------------------/ \------------/
147 *
148 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
26 */ 149 */
27 150
28#include <linux/init.h> 151#include <linux/init.h>
@@ -30,6 +153,7 @@
30#include <linux/list.h> 153#include <linux/list.h>
31#include <linux/hash.h> 154#include <linux/hash.h>
32#include <linux/sched.h> 155#include <linux/sched.h>
156#include <linux/seq_file.h>
33 157
34#include <asm/cache.h> 158#include <asm/cache.h>
35#include <asm/setup.h> 159#include <asm/setup.h>
@@ -59,9 +183,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
59static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); 183static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
60static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); 184static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
61 185
186static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
187
62RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); 188RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
63RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); 189RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
64 190
191/* We might hit two boundary violations at the start and end, at max each
192 * boundary violation will require three middle nodes. */
193RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
194
65static inline unsigned p2m_top_index(unsigned long pfn) 195static inline unsigned p2m_top_index(unsigned long pfn)
66{ 196{
67 BUG_ON(pfn >= MAX_P2M_PFN); 197 BUG_ON(pfn >= MAX_P2M_PFN);
@@ -136,7 +266,7 @@ static void p2m_init(unsigned long *p2m)
136 * - After resume we're called from within stop_machine, but the mfn 266 * - After resume we're called from within stop_machine, but the mfn
137 * tree should alreay be completely allocated. 267 * tree should alreay be completely allocated.
138 */ 268 */
139void xen_build_mfn_list_list(void) 269void __ref xen_build_mfn_list_list(void)
140{ 270{
141 unsigned long pfn; 271 unsigned long pfn;
142 272
@@ -221,6 +351,9 @@ void __init xen_build_dynamic_phys_to_machine(void)
221 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); 351 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
222 p2m_top_init(p2m_top); 352 p2m_top_init(p2m_top);
223 353
354 p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
355 p2m_init(p2m_identity);
356
224 /* 357 /*
225 * The domain builder gives us a pre-constructed p2m array in 358 * The domain builder gives us a pre-constructed p2m array in
226 * mfn_list for all the pages initially given to us, so we just 359 * mfn_list for all the pages initially given to us, so we just
@@ -266,6 +399,14 @@ unsigned long get_phys_to_machine(unsigned long pfn)
266 mididx = p2m_mid_index(pfn); 399 mididx = p2m_mid_index(pfn);
267 idx = p2m_index(pfn); 400 idx = p2m_index(pfn);
268 401
402 /*
403 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
404 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
405 * would be wrong.
406 */
407 if (p2m_top[topidx][mididx] == p2m_identity)
408 return IDENTITY_FRAME(pfn);
409
269 return p2m_top[topidx][mididx][idx]; 410 return p2m_top[topidx][mididx][idx];
270} 411}
271EXPORT_SYMBOL_GPL(get_phys_to_machine); 412EXPORT_SYMBOL_GPL(get_phys_to_machine);
@@ -335,9 +476,11 @@ static bool alloc_p2m(unsigned long pfn)
335 p2m_top_mfn_p[topidx] = mid_mfn; 476 p2m_top_mfn_p[topidx] = mid_mfn;
336 } 477 }
337 478
338 if (p2m_top[topidx][mididx] == p2m_missing) { 479 if (p2m_top[topidx][mididx] == p2m_identity ||
480 p2m_top[topidx][mididx] == p2m_missing) {
339 /* p2m leaf page is missing */ 481 /* p2m leaf page is missing */
340 unsigned long *p2m; 482 unsigned long *p2m;
483 unsigned long *p2m_orig = p2m_top[topidx][mididx];
341 484
342 p2m = alloc_p2m_page(); 485 p2m = alloc_p2m_page();
343 if (!p2m) 486 if (!p2m)
@@ -345,7 +488,7 @@ static bool alloc_p2m(unsigned long pfn)
345 488
346 p2m_init(p2m); 489 p2m_init(p2m);
347 490
348 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) 491 if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
349 free_p2m_page(p2m); 492 free_p2m_page(p2m);
350 else 493 else
351 mid_mfn[mididx] = virt_to_mfn(p2m); 494 mid_mfn[mididx] = virt_to_mfn(p2m);
@@ -354,11 +497,91 @@ static bool alloc_p2m(unsigned long pfn)
354 return true; 497 return true;
355} 498}
356 499
500static bool __init __early_alloc_p2m(unsigned long pfn)
501{
502 unsigned topidx, mididx, idx;
503
504 topidx = p2m_top_index(pfn);
505 mididx = p2m_mid_index(pfn);
506 idx = p2m_index(pfn);
507
508 /* Pfff.. No boundary cross-over, lets get out. */
509 if (!idx)
510 return false;
511
512 WARN(p2m_top[topidx][mididx] == p2m_identity,
513 "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
514 topidx, mididx);
515
516 /*
517 * Could be done by xen_build_dynamic_phys_to_machine..
518 */
519 if (p2m_top[topidx][mididx] != p2m_missing)
520 return false;
521
522 /* Boundary cross-over for the edges: */
523 if (idx) {
524 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
525
526 p2m_init(p2m);
527
528 p2m_top[topidx][mididx] = p2m;
529
530 }
531 return idx != 0;
532}
533unsigned long __init set_phys_range_identity(unsigned long pfn_s,
534 unsigned long pfn_e)
535{
536 unsigned long pfn;
537
538 if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
539 return 0;
540
541 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
542 return pfn_e - pfn_s;
543
544 if (pfn_s > pfn_e)
545 return 0;
546
547 for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
548 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
549 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
550 {
551 unsigned topidx = p2m_top_index(pfn);
552 if (p2m_top[topidx] == p2m_mid_missing) {
553 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
554
555 p2m_mid_init(mid);
556
557 p2m_top[topidx] = mid;
558 }
559 }
560
561 __early_alloc_p2m(pfn_s);
562 __early_alloc_p2m(pfn_e);
563
564 for (pfn = pfn_s; pfn < pfn_e; pfn++)
565 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
566 break;
567
568 if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
569 "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
570 (pfn_e - pfn_s) - (pfn - pfn_s)))
571 printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
572
573 return pfn - pfn_s;
574}
575
357/* Try to install p2m mapping; fail if intermediate bits missing */ 576/* Try to install p2m mapping; fail if intermediate bits missing */
358bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) 577bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
359{ 578{
360 unsigned topidx, mididx, idx; 579 unsigned topidx, mididx, idx;
361 580
581 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
582 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
583 return true;
584 }
362 if (unlikely(pfn >= MAX_P2M_PFN)) { 585 if (unlikely(pfn >= MAX_P2M_PFN)) {
363 BUG_ON(mfn != INVALID_P2M_ENTRY); 586 BUG_ON(mfn != INVALID_P2M_ENTRY);
364 return true; 587 return true;
@@ -368,6 +591,21 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
368 mididx = p2m_mid_index(pfn); 591 mididx = p2m_mid_index(pfn);
369 idx = p2m_index(pfn); 592 idx = p2m_index(pfn);
370 593
594 /* For sparse holes were the p2m leaf has real PFN along with
595 * PCI holes, stick in the PFN as the MFN value.
596 */
597 if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
598 if (p2m_top[topidx][mididx] == p2m_identity)
599 return true;
600
601 /* Swap over from MISSING to IDENTITY if needed. */
602 if (p2m_top[topidx][mididx] == p2m_missing) {
603 WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
604 p2m_identity) != p2m_missing);
605 return true;
606 }
607 }
608
371 if (p2m_top[topidx][mididx] == p2m_missing) 609 if (p2m_top[topidx][mididx] == p2m_missing)
372 return mfn == INVALID_P2M_ENTRY; 610 return mfn == INVALID_P2M_ENTRY;
373 611
@@ -378,11 +616,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
378 616
379bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) 617bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
380{ 618{
381 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
382 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
383 return true;
384 }
385
386 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 619 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
387 if (!alloc_p2m(pfn)) 620 if (!alloc_p2m(pfn))
388 return false; 621 return false;
@@ -421,7 +654,7 @@ int m2p_add_override(unsigned long mfn, struct page *page)
421{ 654{
422 unsigned long flags; 655 unsigned long flags;
423 unsigned long pfn; 656 unsigned long pfn;
424 unsigned long address; 657 unsigned long uninitialized_var(address);
425 unsigned level; 658 unsigned level;
426 pte_t *ptep = NULL; 659 pte_t *ptep = NULL;
427 660
@@ -438,7 +671,9 @@ int m2p_add_override(unsigned long mfn, struct page *page)
438 page->private = mfn; 671 page->private = mfn;
439 page->index = pfn_to_mfn(pfn); 672 page->index = pfn_to_mfn(pfn);
440 673
441 __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); 674 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
675 return -ENOMEM;
676
442 if (!PageHighMem(page)) 677 if (!PageHighMem(page))
443 /* Just zap old mapping for now */ 678 /* Just zap old mapping for now */
444 pte_clear(&init_mm, address, ptep); 679 pte_clear(&init_mm, address, ptep);
@@ -455,7 +690,7 @@ int m2p_remove_override(struct page *page)
455 unsigned long flags; 690 unsigned long flags;
456 unsigned long mfn; 691 unsigned long mfn;
457 unsigned long pfn; 692 unsigned long pfn;
458 unsigned long address; 693 unsigned long uninitialized_var(address);
459 unsigned level; 694 unsigned level;
460 pte_t *ptep = NULL; 695 pte_t *ptep = NULL;
461 696
@@ -476,7 +711,7 @@ int m2p_remove_override(struct page *page)
476 spin_lock_irqsave(&m2p_override_lock, flags); 711 spin_lock_irqsave(&m2p_override_lock, flags);
477 list_del(&page->lru); 712 list_del(&page->lru);
478 spin_unlock_irqrestore(&m2p_override_lock, flags); 713 spin_unlock_irqrestore(&m2p_override_lock, flags);
479 __set_phys_to_machine(pfn, page->index); 714 set_phys_to_machine(pfn, page->index);
480 715
481 if (!PageHighMem(page)) 716 if (!PageHighMem(page))
482 set_pte_at(&init_mm, address, ptep, 717 set_pte_at(&init_mm, address, ptep,
@@ -520,3 +755,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
520 return ret; 755 return ret;
521} 756}
522EXPORT_SYMBOL_GPL(m2p_find_override_pfn); 757EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
758
759#ifdef CONFIG_XEN_DEBUG_FS
760
761int p2m_dump_show(struct seq_file *m, void *v)
762{
763 static const char * const level_name[] = { "top", "middle",
764 "entry", "abnormal" };
765 static const char * const type_name[] = { "identity", "missing",
766 "pfn", "abnormal"};
767#define TYPE_IDENTITY 0
768#define TYPE_MISSING 1
769#define TYPE_PFN 2
770#define TYPE_UNKNOWN 3
771 unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
772 unsigned int uninitialized_var(prev_level);
773 unsigned int uninitialized_var(prev_type);
774
775 if (!p2m_top)
776 return 0;
777
778 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
779 unsigned topidx = p2m_top_index(pfn);
780 unsigned mididx = p2m_mid_index(pfn);
781 unsigned idx = p2m_index(pfn);
782 unsigned lvl, type;
783
784 lvl = 4;
785 type = TYPE_UNKNOWN;
786 if (p2m_top[topidx] == p2m_mid_missing) {
787 lvl = 0; type = TYPE_MISSING;
788 } else if (p2m_top[topidx] == NULL) {
789 lvl = 0; type = TYPE_UNKNOWN;
790 } else if (p2m_top[topidx][mididx] == NULL) {
791 lvl = 1; type = TYPE_UNKNOWN;
792 } else if (p2m_top[topidx][mididx] == p2m_identity) {
793 lvl = 1; type = TYPE_IDENTITY;
794 } else if (p2m_top[topidx][mididx] == p2m_missing) {
795 lvl = 1; type = TYPE_MISSING;
796 } else if (p2m_top[topidx][mididx][idx] == 0) {
797 lvl = 2; type = TYPE_UNKNOWN;
798 } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
799 lvl = 2; type = TYPE_IDENTITY;
800 } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
801 lvl = 2; type = TYPE_MISSING;
802 } else if (p2m_top[topidx][mididx][idx] == pfn) {
803 lvl = 2; type = TYPE_PFN;
804 } else if (p2m_top[topidx][mididx][idx] != pfn) {
805 lvl = 2; type = TYPE_PFN;
806 }
807 if (pfn == 0) {
808 prev_level = lvl;
809 prev_type = type;
810 }
811 if (pfn == MAX_DOMAIN_PAGES-1) {
812 lvl = 3;
813 type = TYPE_UNKNOWN;
814 }
815 if (prev_type != type) {
816 seq_printf(m, " [0x%lx->0x%lx] %s\n",
817 prev_pfn_type, pfn, type_name[prev_type]);
818 prev_pfn_type = pfn;
819 prev_type = type;
820 }
821 if (prev_level != lvl) {
822 seq_printf(m, " [0x%lx->0x%lx] level %s\n",
823 prev_pfn_level, pfn, level_name[prev_level]);
824 prev_pfn_level = pfn;
825 prev_level = lvl;
826 }
827 }
828 return 0;
829#undef TYPE_IDENTITY
830#undef TYPE_MISSING
831#undef TYPE_PFN
832#undef TYPE_UNKNOWN
833}
834#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a8a66a50d446..90bac0aac3a5 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -52,6 +52,8 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
52 52
53static __init void xen_add_extra_mem(unsigned long pages) 53static __init void xen_add_extra_mem(unsigned long pages)
54{ 54{
55 unsigned long pfn;
56
55 u64 size = (u64)pages * PAGE_SIZE; 57 u64 size = (u64)pages * PAGE_SIZE;
56 u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; 58 u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
57 59
@@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages)
66 xen_extra_mem_size += size; 68 xen_extra_mem_size += size;
67 69
68 xen_max_p2m_pfn = PFN_DOWN(extra_start + size); 70 xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
71
72 for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
73 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
69} 74}
70 75
71static unsigned long __init xen_release_chunk(phys_addr_t start_addr, 76static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
@@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
104 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", 109 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
105 start, end, ret); 110 start, end, ret);
106 if (ret == 1) { 111 if (ret == 1) {
107 set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 112 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
108 len++; 113 len++;
109 } 114 }
110 } 115 }
@@ -138,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
138 return released; 143 return released;
139} 144}
140 145
146static unsigned long __init xen_set_identity(const struct e820entry *list,
147 ssize_t map_size)
148{
149 phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
150 phys_addr_t start_pci = last;
151 const struct e820entry *entry;
152 unsigned long identity = 0;
153 int i;
154
155 for (i = 0, entry = list; i < map_size; i++, entry++) {
156 phys_addr_t start = entry->addr;
157 phys_addr_t end = start + entry->size;
158
159 if (start < last)
160 start = last;
161
162 if (end <= start)
163 continue;
164
165 /* Skip over the 1MB region. */
166 if (last > end)
167 continue;
168
169 if (entry->type == E820_RAM) {
170 if (start > start_pci)
171 identity += set_phys_range_identity(
172 PFN_UP(start_pci), PFN_DOWN(start));
173
174 /* Without saving 'last' we would gooble RAM too
175 * at the end of the loop. */
176 last = end;
177 start_pci = end;
178 continue;
179 }
180 start_pci = min(start, start_pci);
181 last = end;
182 }
183 if (last > start_pci)
184 identity += set_phys_range_identity(
185 PFN_UP(start_pci), PFN_DOWN(last));
186 return identity;
187}
141/** 188/**
142 * machine_specific_memory_setup - Hook for machine specific memory setup. 189 * machine_specific_memory_setup - Hook for machine specific memory setup.
143 **/ 190 **/
144char * __init xen_memory_setup(void) 191char * __init xen_memory_setup(void)
145{ 192{
146 static struct e820entry map[E820MAX] __initdata; 193 static struct e820entry map[E820MAX] __initdata;
194 static struct e820entry map_raw[E820MAX] __initdata;
147 195
148 unsigned long max_pfn = xen_start_info->nr_pages; 196 unsigned long max_pfn = xen_start_info->nr_pages;
149 unsigned long long mem_end; 197 unsigned long long mem_end;
@@ -151,6 +199,7 @@ char * __init xen_memory_setup(void)
151 struct xen_memory_map memmap; 199 struct xen_memory_map memmap;
152 unsigned long extra_pages = 0; 200 unsigned long extra_pages = 0;
153 unsigned long extra_limit; 201 unsigned long extra_limit;
202 unsigned long identity_pages = 0;
154 int i; 203 int i;
155 int op; 204 int op;
156 205
@@ -176,8 +225,9 @@ char * __init xen_memory_setup(void)
176 } 225 }
177 BUG_ON(rc); 226 BUG_ON(rc);
178 227
228 memcpy(map_raw, map, sizeof(map));
179 e820.nr_map = 0; 229 e820.nr_map = 0;
180 xen_extra_mem_start = mem_end; 230 xen_extra_mem_start = max((1ULL << 32), mem_end);
181 for (i = 0; i < memmap.nr_entries; i++) { 231 for (i = 0; i < memmap.nr_entries; i++) {
182 unsigned long long end; 232 unsigned long long end;
183 233
@@ -194,6 +244,15 @@ char * __init xen_memory_setup(void)
194 end -= delta; 244 end -= delta;
195 245
196 extra_pages += PFN_DOWN(delta); 246 extra_pages += PFN_DOWN(delta);
247 /*
248 * Set RAM below 4GB that is not for us to be unusable.
249 * This prevents "System RAM" address space from being
250 * used as potential resource for I/O address (happens
251 * when 'allocate_resource' is called).
252 */
253 if (delta &&
254 (xen_initial_domain() && end < 0x100000000ULL))
255 e820_add_region(end, delta, E820_UNUSABLE);
197 } 256 }
198 257
199 if (map[i].size > 0 && end > xen_extra_mem_start) 258 if (map[i].size > 0 && end > xen_extra_mem_start)
@@ -251,6 +310,13 @@ char * __init xen_memory_setup(void)
251 310
252 xen_add_extra_mem(extra_pages); 311 xen_add_extra_mem(extra_pages);
253 312
313 /*
314 * Set P2M for all non-RAM pages and E820 gaps to be identity
315 * type PFNs. We supply it with the non-sanitized version
316 * of the E820.
317 */
318 identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
319 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
254 return "Xen"; 320 return "Xen";
255} 321}
256 322
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 72a4c7959045..30612441ed99 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -509,3 +509,41 @@ void __init xen_smp_init(void)
509 xen_fill_possible_map(); 509 xen_fill_possible_map();
510 xen_init_spinlocks(); 510 xen_init_spinlocks();
511} 511}
512
513static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
514{
515 native_smp_prepare_cpus(max_cpus);
516 WARN_ON(xen_smp_intr_init(0));
517
518 if (!xen_have_vector_callback)
519 return;
520 xen_init_lock_cpu(0);
521 xen_init_spinlocks();
522}
523
524static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
525{
526 int rc;
527 rc = native_cpu_up(cpu);
528 WARN_ON (xen_smp_intr_init(cpu));
529 return rc;
530}
531
532static void xen_hvm_cpu_die(unsigned int cpu)
533{
534 unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
535 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
536 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
537 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
538 native_cpu_die(cpu);
539}
540
541void __init xen_hvm_smp_init(void)
542{
543 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
544 smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
545 smp_ops.cpu_up = xen_hvm_cpu_up;
546 smp_ops.cpu_die = xen_hvm_cpu_die;
547 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
548 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
549}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 9bbd63a129b5..45329c8c226e 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
12#include "xen-ops.h" 12#include "xen-ops.h"
13#include "mmu.h" 13#include "mmu.h"
14 14
15void xen_pre_suspend(void) 15void xen_arch_pre_suspend(void)
16{ 16{
17 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); 17 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
18 xen_start_info->console.domU.mfn = 18 xen_start_info->console.domU.mfn =
@@ -26,8 +26,9 @@ void xen_pre_suspend(void)
26 BUG(); 26 BUG();
27} 27}
28 28
29void xen_hvm_post_suspend(int suspend_cancelled) 29void xen_arch_hvm_post_suspend(int suspend_cancelled)
30{ 30{
31#ifdef CONFIG_XEN_PVHVM
31 int cpu; 32 int cpu;
32 xen_hvm_init_shared_info(); 33 xen_hvm_init_shared_info();
33 xen_callback_vector(); 34 xen_callback_vector();
@@ -37,9 +38,10 @@ void xen_hvm_post_suspend(int suspend_cancelled)
37 xen_setup_runstate_info(cpu); 38 xen_setup_runstate_info(cpu);
38 } 39 }
39 } 40 }
41#endif
40} 42}
41 43
42void xen_post_suspend(int suspend_cancelled) 44void xen_arch_post_suspend(int suspend_cancelled)
43{ 45{
44 xen_build_mfn_list_list(); 46 xen_build_mfn_list_list();
45 47
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 04e11597a8c5..c532d280ce36 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -393,7 +393,9 @@ void xen_setup_timer(int cpu)
393 name = "<timer kasprintf failed>"; 393 name = "<timer kasprintf failed>";
394 394
395 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, 395 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
396 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, 396 IRQF_DISABLED|IRQF_PERCPU|
397 IRQF_NOBALANCING|IRQF_TIMER|
398 IRQF_FORCE_RESUME,
397 name, NULL); 399 name, NULL);
398 400
399 evt = &per_cpu(xen_clock_events, cpu); 401 evt = &per_cpu(xen_clock_events, cpu);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 1a5ff24e29c0..aaa7291c9259 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -28,9 +28,9 @@ ENTRY(startup_xen)
28 __FINIT 28 __FINIT
29 29
30.pushsection .text 30.pushsection .text
31 .align PAGE_SIZE_asm 31 .align PAGE_SIZE
32ENTRY(hypercall_page) 32ENTRY(hypercall_page)
33 .skip PAGE_SIZE_asm 33 .skip PAGE_SIZE
34.popsection 34.popsection
35 35
36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9d41bf985757..3112f55638c4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -64,10 +64,12 @@ void xen_setup_vcpu_info_placement(void);
64 64
65#ifdef CONFIG_SMP 65#ifdef CONFIG_SMP
66void xen_smp_init(void); 66void xen_smp_init(void);
67void __init xen_hvm_smp_init(void);
67 68
68extern cpumask_var_t xen_cpu_initialized_map; 69extern cpumask_var_t xen_cpu_initialized_map;
69#else 70#else
70static inline void xen_smp_init(void) {} 71static inline void xen_smp_init(void) {}
72static inline void xen_hvm_smp_init(void) {}
71#endif 73#endif
72 74
73#ifdef CONFIG_PARAVIRT_SPINLOCKS 75#ifdef CONFIG_PARAVIRT_SPINLOCKS