Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig82
-rw-r--r--arch/x86/Kconfig.cpu11
-rw-r--r--arch/x86/Kconfig.debug4
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c7
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S11
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c38
-rw-r--r--arch/x86/ia32/ia32_aout.c1
-rw-r--r--arch/x86/ia32/ia32entry.S33
-rw-r--r--arch/x86/include/asm/acpi.h20
-rw-r--r--arch/x86/include/asm/amd_nb.h24
-rw-r--r--arch/x86/include/asm/apic.h44
-rw-r--r--arch/x86/include/asm/apicdef.h12
-rw-r--r--arch/x86/include/asm/bitops.h4
-rw-r--r--arch/x86/include/asm/bootparam.h1
-rw-r--r--arch/x86/include/asm/cacheflush.h44
-rw-r--r--arch/x86/include/asm/ce4100.h6
-rw-r--r--arch/x86/include/asm/cpu.h1
-rw-r--r--arch/x86/include/asm/cpufeature.h2
-rw-r--r--arch/x86/include/asm/dma.h7
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h5
-rw-r--r--arch/x86/include/asm/frame.h6
-rw-r--r--arch/x86/include/asm/futex.h22
-rw-r--r--arch/x86/include/asm/gart.h24
-rw-r--r--arch/x86/include/asm/hw_irq.h24
-rw-r--r--arch/x86/include/asm/i387.h2
-rw-r--r--arch/x86/include/asm/init.h6
-rw-r--r--arch/x86/include/asm/io_apic.h44
-rw-r--r--arch/x86/include/asm/ipi.h8
-rw-r--r--arch/x86/include/asm/irq.h3
-rw-r--r--arch/x86/include/asm/irq_controller.h12
-rw-r--r--arch/x86/include/asm/irq_vectors.h45
-rw-r--r--arch/x86/include/asm/jump_label.h2
-rw-r--r--arch/x86/include/asm/kdebug.h3
-rw-r--r--arch/x86/include/asm/kvm_emulate.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h12
-rw-r--r--arch/x86/include/asm/mmu.h6
-rw-r--r--arch/x86/include/asm/mmu_context.h5
-rw-r--r--arch/x86/include/asm/mpspec.h3
-rw-r--r--arch/x86/include/asm/msr-index.h13
-rw-r--r--arch/x86/include/asm/nmi.h5
-rw-r--r--arch/x86/include/asm/nops.h2
-rw-r--r--arch/x86/include/asm/numa.h52
-rw-r--r--arch/x86/include/asm/numa_32.h9
-rw-r--r--arch/x86/include/asm/numa_64.h24
-rw-r--r--arch/x86/include/asm/olpc.h2
-rw-r--r--arch/x86/include/asm/olpc_ofw.h14
-rw-r--r--arch/x86/include/asm/page_types.h9
-rw-r--r--arch/x86/include/asm/paravirt.h5
-rw-r--r--arch/x86/include/asm/percpu.h86
-rw-r--r--arch/x86/include/asm/perf_event_p4.h5
-rw-r--r--arch/x86/include/asm/pgtable-3level.h11
-rw-r--r--arch/x86/include/asm/processor-flags.h2
-rw-r--r--arch/x86/include/asm/processor.h4
-rw-r--r--arch/x86/include/asm/prom.h70
-rw-r--r--arch/x86/include/asm/ptrace-abi.h2
-rw-r--r--arch/x86/include/asm/ptrace.h4
-rw-r--r--arch/x86/include/asm/reboot.h5
-rw-r--r--arch/x86/include/asm/rwsem.h80
-rw-r--r--arch/x86/include/asm/segment.h12
-rw-r--r--arch/x86/include/asm/smp.h25
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h2
-rw-r--r--arch/x86/include/asm/stacktrace.h6
-rw-r--r--arch/x86/include/asm/system.h2
-rw-r--r--arch/x86/include/asm/system_64.h22
-rw-r--r--arch/x86/include/asm/thread_info.h10
-rw-r--r--arch/x86/include/asm/topology.h19
-rw-r--r--arch/x86/include/asm/trampoline.h33
-rw-r--r--arch/x86/include/asm/tsc.h2
-rw-r--r--arch/x86/include/asm/types.h16
-rw-r--r--arch/x86/include/asm/unistd_32.h6
-rw-r--r--arch/x86/include/asm/unistd_64.h8
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h2
-rw-r--r--arch/x86/include/asm/x86_init.h2
-rw-r--r--arch/x86/include/asm/xen/hypercall.h15
-rw-r--r--arch/x86/include/asm/xen/interface.h2
-rw-r--r--arch/x86/include/asm/xen/page.h47
-rw-r--r--arch/x86/include/asm/xen/pci.h8
-rw-r--r--arch/x86/kernel/Makefile12
-rw-r--r--arch/x86/kernel/acpi/boot.c22
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S21
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h5
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S28
-rw-r--r--arch/x86/kernel/acpi/sleep.c74
-rw-r--r--arch/x86/kernel/acpi/sleep.h5
-rw-r--r--arch/x86/kernel/acpi/wakeup_rm.S12
-rw-r--r--arch/x86/kernel/alternative.c11
-rw-r--r--arch/x86/kernel/amd_iommu_init.c26
-rw-r--r--arch/x86/kernel/amd_nb.c102
-rw-r--r--arch/x86/kernel/apb_timer.c64
-rw-r--r--arch/x86/kernel/aperture_64.c37
-rw-r--r--arch/x86/kernel/apic/apic.c192
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/apic_noop.c26
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c34
-rw-r--r--arch/x86/kernel/apic/es7000_32.c35
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c494
-rw-r--r--arch/x86/kernel/apic/ipi.c12
-rw-r--r--arch/x86/kernel/apic/numaq_32.c21
-rw-r--r--arch/x86/kernel/apic/probe_32.c10
-rw-r--r--arch/x86/kernel/apic/summit_32.c47
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c12
-rw-r--r--arch/x86/kernel/apm_32.c19
-rw-r--r--arch/x86/kernel/asm-offsets.c65
-rw-r--r--arch/x86/kernel/asm-offsets_32.c69
-rw-r--r--arch/x86/kernel/asm-offsets_64.c90
-rw-r--r--arch/x86/kernel/check.c8
-rw-r--r--arch/x86/kernel/cpu/amd.c84
-rw-r--r--arch/x86/kernel/cpu/common.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c18
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c4
-rw-r--r--arch/x86/kernel/cpu/intel.c5
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c83
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c42
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c25
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c7
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c40
-rw-r--r--arch/x86/kernel/cpu/perf_event.c181
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c191
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c417
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c83
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c40
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c4
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/crash_dump_32.c3
-rw-r--r--arch/x86/kernel/crash_dump_64.c3
-rw-r--r--arch/x86/kernel/devicetree.c439
-rw-r--r--arch/x86/kernel/dumpstack.c51
-rw-r--r--arch/x86/kernel/dumpstack_32.c15
-rw-r--r--arch/x86/kernel/dumpstack_64.c16
-rw-r--r--arch/x86/kernel/e820.c19
-rw-r--r--arch/x86/kernel/early-quirks.c21
-rw-r--r--arch/x86/kernel/entry_32.S13
-rw-r--r--arch/x86/kernel/entry_64.S17
-rw-r--r--arch/x86/kernel/ftrace.c15
-rw-r--r--arch/x86/kernel/head32.c9
-rw-r--r--arch/x86/kernel/head64.c3
-rw-r--r--arch/x86/kernel/head_32.S40
-rw-r--r--arch/x86/kernel/head_64.S3
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/i8237.c30
-rw-r--r--arch/x86/kernel/i8259.c35
-rw-r--r--arch/x86/kernel/ioport.c20
-rw-r--r--arch/x86/kernel/irq.c93
-rw-r--r--arch/x86/kernel/irq_32.c9
-rw-r--r--arch/x86/kernel/irqinit.c92
-rw-r--r--arch/x86/kernel/kgdb.c15
-rw-r--r--arch/x86/kernel/kprobes.c8
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/mca_32.c2
-rw-r--r--arch/x86/kernel/microcode_amd.c188
-rw-r--r--arch/x86/kernel/microcode_core.c41
-rw-r--r--arch/x86/kernel/mpparse.c12
-rw-r--r--arch/x86/kernel/pci-calgary_64.c4
-rw-r--r--arch/x86/kernel/pci-gart_64.c41
-rw-r--r--arch/x86/kernel/process.c24
-rw-r--r--arch/x86/kernel/process_64.c8
-rw-r--r--arch/x86/kernel/reboot.c129
-rw-r--r--arch/x86/kernel/reboot_32.S135
-rw-r--r--arch/x86/kernel/rtc.c3
-rw-r--r--arch/x86/kernel/setup.c93
-rw-r--r--arch/x86/kernel/setup_percpu.c11
-rw-r--r--arch/x86/kernel/smpboot.c168
-rw-r--r--arch/x86/kernel/stacktrace.c6
-rw-r--r--arch/x86/kernel/step.c2
-rw-r--r--arch/x86/kernel/syscall_table_32.S4
-rw-r--r--arch/x86/kernel/topology.c2
-rw-r--r--arch/x86/kernel/trampoline.c42
-rw-r--r--arch/x86/kernel/trampoline_32.S15
-rw-r--r--arch/x86/kernel/trampoline_64.S28
-rw-r--r--arch/x86/kernel/tsc.c4
-rw-r--r--arch/x86/kernel/verify_cpu.S2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S18
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c1
-rw-r--r--arch/x86/kernel/x86_init.c1
-rw-r--r--arch/x86/kernel/xsave.c2
-rw-r--r--arch/x86/kvm/emulate.c52
-rw-r--r--arch/x86/kvm/i8259.c25
-rw-r--r--arch/x86/kvm/lapic.c13
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c150
-rw-r--r--arch/x86/kvm/paging_tmpl.h19
-rw-r--r--arch/x86/kvm/svm.c31
-rw-r--r--arch/x86/kvm/timer.c2
-rw-r--r--arch/x86/kvm/trace.h8
-rw-r--r--arch/x86/kvm/vmx.c128
-rw-r--r--arch/x86/kvm/x86.c192
-rw-r--r--arch/x86/lguest/Kconfig1
-rw-r--r--arch/x86/lguest/boot.c8
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_386_32.S6
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S6
-rw-r--r--arch/x86/lib/checksum_32.S63
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S65
-rw-r--r--arch/x86/lib/copy_user_64.S2
-rw-r--r--arch/x86/lib/csum-copy_64.S242
-rw-r--r--arch/x86/lib/csum-partial_64.c2
-rw-r--r--arch/x86/lib/memmove_64.S197
-rw-r--r--arch/x86/lib/memmove_64.c192
-rw-r--r--arch/x86/lib/rwsem_64.S56
-rw-r--r--arch/x86/lib/semaphore_32.S38
-rw-r--r--arch/x86/lib/thunk_32.S18
-rw-r--r--arch/x86/lib/thunk_64.S27
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/amdtopology_64.c142
-rw-r--r--arch/x86/mm/fault.c14
-rw-r--r--arch/x86/mm/hugetlbpage.c2
-rw-r--r--arch/x86/mm/init.c56
-rw-r--r--arch/x86/mm/init_32.c13
-rw-r--r--arch/x86/mm/init_64.c119
-rw-r--r--arch/x86/mm/numa.c234
-rw-r--r--arch/x86/mm/numa_32.c10
-rw-r--r--arch/x86/mm/numa_64.c996
-rw-r--r--arch/x86/mm/numa_emulation.c494
-rw-r--r--arch/x86/mm/numa_internal.h31
-rw-r--r--arch/x86/mm/pageattr.c28
-rw-r--r--arch/x86/mm/pgtable.c14
-rw-r--r--arch/x86/mm/srat_32.c11
-rw-r--r--arch/x86/mm/srat_64.c367
-rw-r--r--arch/x86/mm/tlb.c14
-rw-r--r--arch/x86/oprofile/backtrace.c2
-rw-r--r--arch/x86/oprofile/nmi_int.c49
-rw-r--r--arch/x86/oprofile/op_counter.h1
-rw-r--r--arch/x86/oprofile/op_model_p4.c2
-rw-r--r--arch/x86/pci/amd_bus.c2
-rw-r--r--arch/x86/pci/ce4100.c9
-rw-r--r--arch/x86/pci/i386.c4
-rw-r--r--arch/x86/pci/irq.c15
-rw-r--r--arch/x86/pci/xen.c192
-rw-r--r--arch/x86/platform/ce4100/ce4100.c26
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts430
-rw-r--r--arch/x86/platform/mrst/mrst.c12
-rw-r--r--arch/x86/platform/mrst/vrtc.c18
-rw-r--r--arch/x86/platform/olpc/Makefile4
-rw-r--r--arch/x86/platform/olpc/olpc-xo1.c25
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c3
-rw-r--r--arch/x86/platform/uv/tlb_uv.c5
-rw-r--r--arch/x86/platform/uv/uv_irq.c4
-rw-r--r--arch/x86/platform/visws/visws_quirks.c24
-rw-r--r--arch/x86/vdso/vdso32-setup.c15
-rw-r--r--arch/x86/xen/Kconfig11
-rw-r--r--arch/x86/xen/enlighten.c31
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/mmu.c121
-rw-r--r--arch/x86/xen/p2m.c348
-rw-r--r--arch/x86/xen/setup.c78
-rw-r--r--arch/x86/xen/smp.c38
-rw-r--r--arch/x86/xen/suspend.c8
-rw-r--r--arch/x86/xen/time.c4
-rw-r--r--arch/x86/xen/xen-head.S4
-rw-r--r--arch/x86/xen/xen-ops.h2
262 files changed, 6966 insertions, 4627 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3ed5ad92b029..cc6c53a95bfd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -64,9 +64,14 @@ config X86
64 select HAVE_TEXT_POKE_SMP 64 select HAVE_TEXT_POKE_SMP
65 select HAVE_GENERIC_HARDIRQS 65 select HAVE_GENERIC_HARDIRQS
66 select HAVE_SPARSE_IRQ 66 select HAVE_SPARSE_IRQ
67 select GENERIC_FIND_FIRST_BIT
68 select GENERIC_FIND_NEXT_BIT
67 select GENERIC_IRQ_PROBE 69 select GENERIC_IRQ_PROBE
68 select GENERIC_PENDING_IRQ if SMP 70 select GENERIC_PENDING_IRQ if SMP
71 select GENERIC_IRQ_SHOW
72 select IRQ_FORCED_THREADING
69 select USE_GENERIC_SMP_HELPERS if SMP 73 select USE_GENERIC_SMP_HELPERS if SMP
74 select ARCH_NO_SYSDEV_OPS
70 75
71config INSTRUCTION_DECODER 76config INSTRUCTION_DECODER
72 def_bool (KPROBES || PERF_EVENTS) 77 def_bool (KPROBES || PERF_EVENTS)
@@ -119,7 +124,7 @@ config NEED_SG_DMA_LENGTH
119 def_bool y 124 def_bool y
120 125
121config GENERIC_ISA_DMA 126config GENERIC_ISA_DMA
122 def_bool y 127 def_bool ISA_DMA_API
123 128
124config GENERIC_IOMAP 129config GENERIC_IOMAP
125 def_bool y 130 def_bool y
@@ -139,7 +144,7 @@ config GENERIC_GPIO
139 bool 144 bool
140 145
141config ARCH_MAY_HAVE_PC_FDC 146config ARCH_MAY_HAVE_PC_FDC
142 def_bool y 147 def_bool ISA_DMA_API
143 148
144config RWSEM_GENERIC_SPINLOCK 149config RWSEM_GENERIC_SPINLOCK
145 def_bool !X86_XADD 150 def_bool !X86_XADD
@@ -217,10 +222,6 @@ config X86_HT
217 def_bool y 222 def_bool y
218 depends on SMP 223 depends on SMP
219 224
220config X86_TRAMPOLINE
221 def_bool y
222 depends on SMP || (64BIT && ACPI_SLEEP)
223
224config X86_32_LAZY_GS 225config X86_32_LAZY_GS
225 def_bool y 226 def_bool y
226 depends on X86_32 && !CC_STACKPROTECTOR 227 depends on X86_32 && !CC_STACKPROTECTOR
@@ -382,6 +383,8 @@ config X86_INTEL_CE
382 depends on X86_32 383 depends on X86_32
383 depends on X86_EXTENDED_PLATFORM 384 depends on X86_EXTENDED_PLATFORM
384 select X86_REBOOTFIXUPS 385 select X86_REBOOTFIXUPS
386 select OF
387 select OF_EARLY_FLATTREE
385 ---help--- 388 ---help---
386 Select for the Intel CE media processor (CE4100) SOC. 389 Select for the Intel CE media processor (CE4100) SOC.
387 This option compiles in support for the CE4100 SOC for settop 390 This option compiles in support for the CE4100 SOC for settop
@@ -627,11 +630,11 @@ config APB_TIMER
627 as it is off-chip. APB timers are always running regardless of CPU 630 as it is off-chip. APB timers are always running regardless of CPU
628 C states, they are used as per CPU clockevent device when possible. 631 C states, they are used as per CPU clockevent device when possible.
629 632
630# Mark as embedded because too many people got it wrong. 633# Mark as expert because too many people got it wrong.
631# The code disables itself when not needed. 634# The code disables itself when not needed.
632config DMI 635config DMI
633 default y 636 default y
634 bool "Enable DMI scanning" if EMBEDDED 637 bool "Enable DMI scanning" if EXPERT
635 ---help--- 638 ---help---
636 Enabled scanning of DMI to identify machine quirks. Say Y 639 Enabled scanning of DMI to identify machine quirks. Say Y
637 here unless you have verified that your setup is not 640 here unless you have verified that your setup is not
@@ -639,7 +642,7 @@ config DMI
639 BIOS code. 642 BIOS code.
640 643
641config GART_IOMMU 644config GART_IOMMU
642 bool "GART IOMMU support" if EMBEDDED 645 bool "GART IOMMU support" if EXPERT
643 default y 646 default y
644 select SWIOTLB 647 select SWIOTLB
645 depends on X86_64 && PCI && AMD_NB 648 depends on X86_64 && PCI && AMD_NB
@@ -811,7 +814,7 @@ config X86_LOCAL_APIC
811 814
812config X86_IO_APIC 815config X86_IO_APIC
813 def_bool y 816 def_bool y
814 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 817 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
815 818
816config X86_VISWS_APIC 819config X86_VISWS_APIC
817 def_bool y 820 def_bool y
@@ -889,7 +892,7 @@ config X86_THERMAL_VECTOR
889 depends on X86_MCE_INTEL 892 depends on X86_MCE_INTEL
890 893
891config VM86 894config VM86
892 bool "Enable VM86 support" if EMBEDDED 895 bool "Enable VM86 support" if EXPERT
893 default y 896 default y
894 depends on X86_32 897 depends on X86_32
895 ---help--- 898 ---help---
@@ -1073,7 +1076,7 @@ endchoice
1073 1076
1074choice 1077choice
1075 depends on EXPERIMENTAL 1078 depends on EXPERIMENTAL
1076 prompt "Memory split" if EMBEDDED 1079 prompt "Memory split" if EXPERT
1077 default VMSPLIT_3G 1080 default VMSPLIT_3G
1078 depends on X86_32 1081 depends on X86_32
1079 ---help--- 1082 ---help---
@@ -1135,7 +1138,7 @@ config ARCH_DMA_ADDR_T_64BIT
1135 def_bool X86_64 || HIGHMEM64G 1138 def_bool X86_64 || HIGHMEM64G
1136 1139
1137config DIRECT_GBPAGES 1140config DIRECT_GBPAGES
1138 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED 1141 bool "Enable 1GB pages for kernel pagetables" if EXPERT
1139 default y 1142 default y
1140 depends on X86_64 1143 depends on X86_64
1141 ---help--- 1144 ---help---
@@ -1369,7 +1372,7 @@ config MATH_EMULATION
1369 1372
1370config MTRR 1373config MTRR
1371 def_bool y 1374 def_bool y
1372 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED 1375 prompt "MTRR (Memory Type Range Register) support" if EXPERT
1373 ---help--- 1376 ---help---
1374 On Intel P6 family processors (Pentium Pro, Pentium II and later) 1377 On Intel P6 family processors (Pentium Pro, Pentium II and later)
1375 the Memory Type Range Registers (MTRRs) may be used to control 1378 the Memory Type Range Registers (MTRRs) may be used to control
@@ -1435,7 +1438,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1435 1438
1436config X86_PAT 1439config X86_PAT
1437 def_bool y 1440 def_bool y
1438 prompt "x86 PAT support" if EMBEDDED 1441 prompt "x86 PAT support" if EXPERT
1439 depends on MTRR 1442 depends on MTRR
1440 ---help--- 1443 ---help---
1441 Use PAT attributes to setup page level cache control. 1444 Use PAT attributes to setup page level cache control.
@@ -1539,7 +1542,7 @@ config KEXEC_JUMP
1539 code in physical address mode via KEXEC 1542 code in physical address mode via KEXEC
1540 1543
1541config PHYSICAL_START 1544config PHYSICAL_START
1542 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1545 hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
1543 default "0x1000000" 1546 default "0x1000000"
1544 ---help--- 1547 ---help---
1545 This gives the physical address where the kernel is loaded. 1548 This gives the physical address where the kernel is loaded.
@@ -1705,7 +1708,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
1705 depends on NUMA 1708 depends on NUMA
1706 1709
1707config USE_PERCPU_NUMA_NODE_ID 1710config USE_PERCPU_NUMA_NODE_ID
1708 def_bool X86_64 1711 def_bool y
1709 depends on NUMA 1712 depends on NUMA
1710 1713
1711menu "Power management and ACPI options" 1714menu "Power management and ACPI options"
@@ -1934,7 +1937,7 @@ config PCI_MMCONFIG
1934 depends on X86_64 && PCI && ACPI 1937 depends on X86_64 && PCI && ACPI
1935 1938
1936config PCI_CNB20LE_QUIRK 1939config PCI_CNB20LE_QUIRK
1937 bool "Read CNB20LE Host Bridge Windows" if EMBEDDED 1940 bool "Read CNB20LE Host Bridge Windows" if EXPERT
1938 default n 1941 default n
1939 depends on PCI && EXPERIMENTAL 1942 depends on PCI && EXPERIMENTAL
1940 help 1943 help
@@ -2000,9 +2003,13 @@ source "drivers/pci/pcie/Kconfig"
2000 2003
2001source "drivers/pci/Kconfig" 2004source "drivers/pci/Kconfig"
2002 2005
2003# x86_64 have no ISA slots, but do have ISA-style DMA. 2006# x86_64 have no ISA slots, but can have ISA-style DMA.
2004config ISA_DMA_API 2007config ISA_DMA_API
2005 def_bool y 2008 bool "ISA-style DMA support" if (X86_64 && EXPERT)
2009 default y
2010 help
2011 Enables ISA-style DMA support for devices requiring such controllers.
2012 If unsure, say Y.
2006 2013
2007if X86_32 2014if X86_32
2008 2015
@@ -2066,9 +2073,10 @@ config SCx200HR_TIMER
2066 2073
2067config OLPC 2074config OLPC
2068 bool "One Laptop Per Child support" 2075 bool "One Laptop Per Child support"
2076 depends on !X86_PAE
2069 select GPIOLIB 2077 select GPIOLIB
2070 select OLPC_OPENFIRMWARE 2078 select OF
2071 depends on !X86_64 && !X86_PAE 2079 select OF_PROMTREE if PROC_DEVICETREE
2072 ---help--- 2080 ---help---
2073 Add support for detecting the unique features of the OLPC 2081 Add support for detecting the unique features of the OLPC
2074 XO hardware. 2082 XO hardware.
@@ -2079,21 +2087,6 @@ config OLPC_XO1
2079 ---help--- 2087 ---help---
2080 Add support for non-essential features of the OLPC XO-1 laptop. 2088 Add support for non-essential features of the OLPC XO-1 laptop.
2081 2089
2082config OLPC_OPENFIRMWARE
2083 bool "Support for OLPC's Open Firmware"
2084 depends on !X86_64 && !X86_PAE
2085 default n
2086 select OF
2087 help
2088 This option adds support for the implementation of Open Firmware
2089 that is used on the OLPC XO-1 Children's Machine.
2090 If unsure, say N here.
2091
2092config OLPC_OPENFIRMWARE_DT
2093 bool
2094 default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE
2095 select OF_PROMTREE
2096
2097endif # X86_32 2090endif # X86_32
2098 2091
2099config AMD_NB 2092config AMD_NB
@@ -2104,6 +2097,16 @@ source "drivers/pcmcia/Kconfig"
2104 2097
2105source "drivers/pci/hotplug/Kconfig" 2098source "drivers/pci/hotplug/Kconfig"
2106 2099
2100config RAPIDIO
2101 bool "RapidIO support"
2102 depends on PCI
2103 default n
2104 help
2105 If you say Y here, the kernel will include drivers and
2106 infrastructure code to support RapidIO interconnect devices.
2107
2108source "drivers/rapidio/Kconfig"
2109
2107endmenu 2110endmenu
2108 2111
2109 2112
@@ -2138,6 +2141,11 @@ config SYSVIPC_COMPAT
2138 def_bool y 2141 def_bool y
2139 depends on COMPAT && SYSVIPC 2142 depends on COMPAT && SYSVIPC
2140 2143
2144config KEYS_COMPAT
2145 bool
2146 depends on COMPAT && KEYS
2147 default y
2148
2141endmenu 2149endmenu
2142 2150
2143 2151
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 15588a0ef466..d161e939df62 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -294,11 +294,6 @@ config X86_GENERIC
294 294
295endif 295endif
296 296
297config X86_CPU
298 def_bool y
299 select GENERIC_FIND_FIRST_BIT
300 select GENERIC_FIND_NEXT_BIT
301
302# 297#
303# Define implied options from the CPU selection here 298# Define implied options from the CPU selection here
304config X86_INTERNODE_CACHE_SHIFT 299config X86_INTERNODE_CACHE_SHIFT
@@ -331,7 +326,7 @@ config X86_PPRO_FENCE
331 Old PentiumPro multiprocessor systems had errata that could cause 326 Old PentiumPro multiprocessor systems had errata that could cause
332 memory operations to violate the x86 ordering standard in rare cases. 327 memory operations to violate the x86 ordering standard in rare cases.
333 Enabling this option will attempt to work around some (but not all) 328 Enabling this option will attempt to work around some (but not all)
334 occurances of this problem, at the cost of much heavier spinlock and 329 occurrences of this problem, at the cost of much heavier spinlock and
335 memory barrier operations. 330 memory barrier operations.
336 331
337 If unsure, say n here. Even distro kernels should think twice before 332 If unsure, say n here. Even distro kernels should think twice before
@@ -371,7 +366,7 @@ config X86_INTEL_USERCOPY
371 366
372config X86_USE_PPRO_CHECKSUM 367config X86_USE_PPRO_CHECKSUM
373 def_bool y 368 def_bool y
374 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM 369 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
375 370
376config X86_USE_3DNOW 371config X86_USE_3DNOW
377 def_bool y 372 def_bool y
@@ -424,7 +419,7 @@ config X86_DEBUGCTLMSR
424 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML 419 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML
425 420
426menuconfig PROCESSOR_SELECT 421menuconfig PROCESSOR_SELECT
427 bool "Supported processor vendors" if EMBEDDED 422 bool "Supported processor vendors" if EXPERT
428 ---help--- 423 ---help---
429 This lets you choose what x86 vendor support code your kernel 424 This lets you choose what x86 vendor support code your kernel
430 will include. 425 will include.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 45143bbcfe5e..615e18810f48 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -31,7 +31,7 @@ config X86_VERBOSE_BOOTUP
31 see errors. Disable this if you want silent bootup. 31 see errors. Disable this if you want silent bootup.
32 32
33config EARLY_PRINTK 33config EARLY_PRINTK
34 bool "Early printk" if EMBEDDED 34 bool "Early printk" if EXPERT
35 default y 35 default y
36 ---help--- 36 ---help---
37 Write kernel log output directly into the VGA buffer or to a serial 37 Write kernel log output directly into the VGA buffer or to a serial
@@ -138,7 +138,7 @@ config DEBUG_NX_TEST
138 138
139config DOUBLEFAULT 139config DOUBLEFAULT
140 default y 140 default y
141 bool "Enable doublefault exception handler" if EMBEDDED 141 bool "Enable doublefault exception handler" if EXPERT
142 depends on X86_32 142 depends on X86_32
143 ---help--- 143 ---help---
144 This option allows trapping of rare doublefault exceptions that 144 This option allows trapping of rare doublefault exceptions that
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 646aa78ba5fd..46a823882437 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -62,7 +62,12 @@ int main(int argc, char *argv[])
62 if (fseek(f, -4L, SEEK_END)) { 62 if (fseek(f, -4L, SEEK_END)) {
63 perror(argv[1]); 63 perror(argv[1]);
64 } 64 }
65 fread(&olen, sizeof olen, 1, f); 65
66 if (fread(&olen, sizeof(olen), 1, f) != 1) {
67 perror(argv[1]);
68 return 1;
69 }
70
66 ilen = ftell(f); 71 ilen = ftell(f);
67 olen = getle32(&olen); 72 olen = getle32(&olen);
68 fclose(f); 73 fclose(f);
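
A minimal user-space sketch of the pattern the mkpiggy.c hunk above adopts: treat a short read as an error instead of discarding fread()'s return value. The surrounding file handling is invented for illustration and is not the mkpiggy.c code itself.

	/* Illustrative only: read a 32-bit trailer, checking the fread()
	 * return value the way the patched mkpiggy.c now does. */
	#include <stdio.h>
	#include <stdint.h>

	static int read_trailer(const char *path, uint32_t *olen)
	{
		FILE *f = fopen(path, "rb");

		if (!f) {
			perror(path);
			return -1;
		}
		if (fseek(f, -4L, SEEK_END)) {
			perror(path);
			fclose(f);
			return -1;
		}
		if (fread(olen, sizeof(*olen), 1, f) != 1) {	/* short read is an error */
			perror(path);
			fclose(f);
			return -1;
		}
		fclose(f);
		return 0;
	}
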
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 8fe2a4966b7a..be6d9e365a80 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1346,7 +1346,7 @@ _zero_cipher_left_decrypt:
1346 and $15, %r13 # %r13 = arg4 (mod 16) 1346 and $15, %r13 # %r13 = arg4 (mod 16)
1347 je _multiple_of_16_bytes_decrypt 1347 je _multiple_of_16_bytes_decrypt
1348 1348
1349 # Handle the last <16 byte block seperately 1349 # Handle the last <16 byte block separately
1350 1350
1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn 1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352 movdqa SHUF_MASK(%rip), %xmm10 1352 movdqa SHUF_MASK(%rip), %xmm10
@@ -1355,7 +1355,7 @@ _zero_cipher_left_decrypt:
1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) 1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356 sub $16, %r11 1356 sub $16, %r11
1357 add %r13, %r11 1357 add %r13, %r11
1358 movdqu (%arg3,%r11,1), %xmm1 # recieve the last <16 byte block 1358 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1359 lea SHIFT_MASK+16(%rip), %r12 1359 lea SHIFT_MASK+16(%rip), %r12
1360 sub %r13, %r12 1360 sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes 1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
@@ -1607,11 +1607,12 @@ _zero_cipher_left_encrypt:
1607 and $15, %r13 # %r13 = arg4 (mod 16) 1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt 1608 je _multiple_of_16_bytes_encrypt
1609 1609
1610 # Handle the last <16 Byte block seperately 1610 # Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10 1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0 1613 PSHUFB_XMM %xmm10, %xmm0
1614 1614
1615
1615 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 1616 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1616 sub $16, %r11 1617 sub $16, %r11
1617 add %r13, %r11 1618 add %r13, %r11
@@ -1634,7 +1635,9 @@ _zero_cipher_left_encrypt:
1634 # GHASH computation for the last <16 byte block 1635 # GHASH computation for the last <16 byte block
1635 sub %r13, %r11 1636 sub %r13, %r11
1636 add $16, %r11 1637 add $16, %r11
1637 PSHUFB_XMM %xmm10, %xmm1 1638
1639 movdqa SHUF_MASK(%rip), %xmm10
1640 PSHUFB_XMM %xmm10, %xmm0
1638 1641
1639 # shuffle xmm0 back to output as ciphertext 1642 # shuffle xmm0 back to output as ciphertext
1640 1643
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index e1e60c7d5813..2577613fb32b 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -828,9 +828,15 @@ static int rfc4106_init(struct crypto_tfm *tfm)
828 struct cryptd_aead *cryptd_tfm; 828 struct cryptd_aead *cryptd_tfm;
829 struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *) 829 struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
830 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); 830 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
831 struct crypto_aead *cryptd_child;
832 struct aesni_rfc4106_gcm_ctx *child_ctx;
831 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); 833 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
832 if (IS_ERR(cryptd_tfm)) 834 if (IS_ERR(cryptd_tfm))
833 return PTR_ERR(cryptd_tfm); 835 return PTR_ERR(cryptd_tfm);
836
837 cryptd_child = cryptd_aead_child(cryptd_tfm);
838 child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child);
839 memcpy(child_ctx, ctx, sizeof(*ctx));
834 ctx->cryptd_tfm = cryptd_tfm; 840 ctx->cryptd_tfm = cryptd_tfm;
835 tfm->crt_aead.reqsize = sizeof(struct aead_request) 841 tfm->crt_aead.reqsize = sizeof(struct aead_request)
836 + crypto_aead_reqsize(&cryptd_tfm->base); 842 + crypto_aead_reqsize(&cryptd_tfm->base);
@@ -873,22 +879,18 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
873 crypto_ablkcipher_clear_flags(ctr_tfm, ~0); 879 crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
874 880
875 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); 881 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
876 if (ret) { 882 if (ret)
877 crypto_free_ablkcipher(ctr_tfm); 883 goto out_free_ablkcipher;
878 return ret;
879 }
880 884
885 ret = -ENOMEM;
881 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); 886 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
882 if (!req) { 887 if (!req)
883 crypto_free_ablkcipher(ctr_tfm); 888 goto out_free_ablkcipher;
884 return -EINVAL;
885 }
886 889
887 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); 890 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
888 if (!req_data) { 891 if (!req_data)
889 crypto_free_ablkcipher(ctr_tfm); 892 goto out_free_request;
890 return -ENOMEM; 893
891 }
892 memset(req_data->iv, 0, sizeof(req_data->iv)); 894 memset(req_data->iv, 0, sizeof(req_data->iv));
893 895
894 /* Clear the data in the hash sub key container to zero.*/ 896 /* Clear the data in the hash sub key container to zero.*/
@@ -913,8 +915,10 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
913 if (!ret) 915 if (!ret)
914 ret = req_data->result.err; 916 ret = req_data->result.err;
915 } 917 }
916 ablkcipher_request_free(req);
917 kfree(req_data); 918 kfree(req_data);
919out_free_request:
920 ablkcipher_request_free(req);
921out_free_ablkcipher:
918 crypto_free_ablkcipher(ctr_tfm); 922 crypto_free_ablkcipher(ctr_tfm);
919 return ret; 923 return ret;
920} 924}
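
The rfc4106_set_hash_subkey() rework above consolidates its error paths into the usual goto-unwind idiom, so each failure releases only what was already allocated. A stand-alone sketch of that idiom follows; the buffers and labels are made up for illustration, not taken from the driver.

	/* Goto-unwind error handling: acquire in order, release in reverse.
	 * Everything here is hypothetical and only mirrors the structure of
	 * the aesni-intel change above. */
	#include <stdlib.h>
	#include <errno.h>

	static int setup_three_buffers(void)
	{
		void *first, *second, *third;
		int ret = -ENOMEM;

		first = malloc(64);
		if (!first)
			goto out;

		second = malloc(64);
		if (!second)
			goto out_free_first;

		third = malloc(64);
		if (!third)
			goto out_free_second;

		/* ... real work would happen here ... */
		ret = 0;

		free(third);
	out_free_second:
		free(second);
	out_free_first:
		free(first);
	out:
		return ret;
	}
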
@@ -925,6 +929,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
925 int ret = 0; 929 int ret = 0;
926 struct crypto_tfm *tfm = crypto_aead_tfm(parent); 930 struct crypto_tfm *tfm = crypto_aead_tfm(parent);
927 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); 931 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
932 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
933 struct aesni_rfc4106_gcm_ctx *child_ctx =
934 aesni_rfc4106_gcm_ctx_get(cryptd_child);
928 u8 *new_key_mem = NULL; 935 u8 *new_key_mem = NULL;
929 936
930 if (key_len < 4) { 937 if (key_len < 4) {
@@ -968,6 +975,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
968 goto exit; 975 goto exit;
969 } 976 }
970 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); 977 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
978 memcpy(child_ctx, ctx, sizeof(*ctx));
971exit: 979exit:
972 kfree(new_key_mem); 980 kfree(new_key_mem);
973 return ret; 981 return ret;
@@ -999,7 +1007,6 @@ static int rfc4106_encrypt(struct aead_request *req)
999 int ret; 1007 int ret;
1000 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 1008 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1001 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); 1009 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1002 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1003 1010
1004 if (!irq_fpu_usable()) { 1011 if (!irq_fpu_usable()) {
1005 struct aead_request *cryptd_req = 1012 struct aead_request *cryptd_req =
@@ -1008,6 +1015,7 @@ static int rfc4106_encrypt(struct aead_request *req)
1008 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); 1015 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1009 return crypto_aead_encrypt(cryptd_req); 1016 return crypto_aead_encrypt(cryptd_req);
1010 } else { 1017 } else {
1018 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1011 kernel_fpu_begin(); 1019 kernel_fpu_begin();
1012 ret = cryptd_child->base.crt_aead.encrypt(req); 1020 ret = cryptd_child->base.crt_aead.encrypt(req);
1013 kernel_fpu_end(); 1021 kernel_fpu_end();
@@ -1020,7 +1028,6 @@ static int rfc4106_decrypt(struct aead_request *req)
1020 int ret; 1028 int ret;
1021 struct crypto_aead *tfm = crypto_aead_reqtfm(req); 1029 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1022 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); 1030 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1023 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1024 1031
1025 if (!irq_fpu_usable()) { 1032 if (!irq_fpu_usable()) {
1026 struct aead_request *cryptd_req = 1033 struct aead_request *cryptd_req =
@@ -1029,6 +1036,7 @@ static int rfc4106_decrypt(struct aead_request *req)
1029 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); 1036 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1030 return crypto_aead_decrypt(cryptd_req); 1037 return crypto_aead_decrypt(cryptd_req);
1031 } else { 1038 } else {
1039 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1032 kernel_fpu_begin(); 1040 kernel_fpu_begin();
1033 ret = cryptd_child->base.crt_aead.decrypt(req); 1041 ret = cryptd_child->base.crt_aead.decrypt(req);
1034 kernel_fpu_end(); 1042 kernel_fpu_end();
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 2d93bdbc9ac0..fd843877e841 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -298,6 +298,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
298 /* OK, This is the point of no return */ 298 /* OK, This is the point of no return */
299 set_personality(PER_LINUX); 299 set_personality(PER_LINUX);
300 set_thread_flag(TIF_IA32); 300 set_thread_flag(TIF_IA32);
301 current->mm->context.ia32_compat = 1;
301 302
302 setup_new_exec(bprm); 303 setup_new_exec(bprm);
303 304
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 518bb99c3394..849a9d23c71d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
25#define sysretl_audit ia32_ret_from_sys_call 25#define sysretl_audit ia32_ret_from_sys_call
26#endif 26#endif
27 27
28 .section .entry.text, "ax"
29
28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 30#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
29 31
30 .macro IA32_ARG_FIXUP noebp=0 32 .macro IA32_ARG_FIXUP noebp=0
@@ -126,26 +128,20 @@ ENTRY(ia32_sysenter_target)
126 */ 128 */
127 ENABLE_INTERRUPTS(CLBR_NONE) 129 ENABLE_INTERRUPTS(CLBR_NONE)
128 movl %ebp,%ebp /* zero extension */ 130 movl %ebp,%ebp /* zero extension */
129 pushq $__USER32_DS 131 pushq_cfi $__USER32_DS
130 CFI_ADJUST_CFA_OFFSET 8
131 /*CFI_REL_OFFSET ss,0*/ 132 /*CFI_REL_OFFSET ss,0*/
132 pushq %rbp 133 pushq_cfi %rbp
133 CFI_ADJUST_CFA_OFFSET 8
134 CFI_REL_OFFSET rsp,0 134 CFI_REL_OFFSET rsp,0
135 pushfq 135 pushfq_cfi
136 CFI_ADJUST_CFA_OFFSET 8
137 /*CFI_REL_OFFSET rflags,0*/ 136 /*CFI_REL_OFFSET rflags,0*/
138 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d 137 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
139 CFI_REGISTER rip,r10 138 CFI_REGISTER rip,r10
140 pushq $__USER32_CS 139 pushq_cfi $__USER32_CS
141 CFI_ADJUST_CFA_OFFSET 8
142 /*CFI_REL_OFFSET cs,0*/ 140 /*CFI_REL_OFFSET cs,0*/
143 movl %eax, %eax 141 movl %eax, %eax
144 pushq %r10 142 pushq_cfi %r10
145 CFI_ADJUST_CFA_OFFSET 8
146 CFI_REL_OFFSET rip,0 143 CFI_REL_OFFSET rip,0
147 pushq %rax 144 pushq_cfi %rax
148 CFI_ADJUST_CFA_OFFSET 8
149 cld 145 cld
150 SAVE_ARGS 0,0,1 146 SAVE_ARGS 0,0,1
151 /* no need to do an access_ok check here because rbp has been 147 /* no need to do an access_ok check here because rbp has been
@@ -182,11 +178,9 @@ sysexit_from_sys_call:
182 xorq %r9,%r9 178 xorq %r9,%r9
183 xorq %r10,%r10 179 xorq %r10,%r10
184 xorq %r11,%r11 180 xorq %r11,%r11
185 popfq 181 popfq_cfi
186 CFI_ADJUST_CFA_OFFSET -8
187 /*CFI_RESTORE rflags*/ 182 /*CFI_RESTORE rflags*/
188 popq %rcx /* User %esp */ 183 popq_cfi %rcx /* User %esp */
189 CFI_ADJUST_CFA_OFFSET -8
190 CFI_REGISTER rsp,rcx 184 CFI_REGISTER rsp,rcx
191 TRACE_IRQS_ON 185 TRACE_IRQS_ON
192 ENABLE_INTERRUPTS_SYSEXIT32 186 ENABLE_INTERRUPTS_SYSEXIT32
@@ -421,8 +415,7 @@ ENTRY(ia32_syscall)
421 */ 415 */
422 ENABLE_INTERRUPTS(CLBR_NONE) 416 ENABLE_INTERRUPTS(CLBR_NONE)
423 movl %eax,%eax 417 movl %eax,%eax
424 pushq %rax 418 pushq_cfi %rax
425 CFI_ADJUST_CFA_OFFSET 8
426 cld 419 cld
427 /* note the registers are not zero extended to the sf. 420 /* note the registers are not zero extended to the sf.
428 this could be a problem. */ 421 this could be a problem. */
@@ -851,4 +844,8 @@ ia32_sys_call_table:
851 .quad sys_fanotify_init 844 .quad sys_fanotify_init
852 .quad sys32_fanotify_mark 845 .quad sys32_fanotify_mark
853 .quad sys_prlimit64 /* 340 */ 846 .quad sys_prlimit64 /* 340 */
847 .quad sys_name_to_handle_at
848 .quad compat_sys_open_by_handle_at
849 .quad compat_sys_clock_adjtime
850 .quad sys_syncfs
854ia32_syscall_end: 851ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 211ca3f7fd16..12e0e7dd869c 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -29,6 +29,7 @@
29#include <asm/processor.h> 29#include <asm/processor.h>
30#include <asm/mmu.h> 30#include <asm/mmu.h>
31#include <asm/mpspec.h> 31#include <asm/mpspec.h>
32#include <asm/trampoline.h>
32 33
33#define COMPILER_DEPENDENT_INT64 long long 34#define COMPILER_DEPENDENT_INT64 long long
34#define COMPILER_DEPENDENT_UINT64 unsigned long long 35#define COMPILER_DEPENDENT_UINT64 unsigned long long
@@ -88,6 +89,7 @@ extern int acpi_disabled;
88extern int acpi_pci_disabled; 89extern int acpi_pci_disabled;
89extern int acpi_skip_timer_override; 90extern int acpi_skip_timer_override;
90extern int acpi_use_timer_override; 91extern int acpi_use_timer_override;
92extern int acpi_fix_pin2_polarity;
91 93
92extern u8 acpi_sci_flags; 94extern u8 acpi_sci_flags;
93extern int acpi_sci_override_gsi; 95extern int acpi_sci_override_gsi;
@@ -112,11 +114,11 @@ static inline void acpi_disable_pci(void)
112 acpi_noirq_set(); 114 acpi_noirq_set();
113} 115}
114 116
115/* routines for saving/restoring kernel state */ 117/* Low-level suspend routine. */
116extern int acpi_save_state_mem(void); 118extern int acpi_suspend_lowlevel(void);
117extern void acpi_restore_state_mem(void);
118 119
119extern unsigned long acpi_wakeup_address; 120extern const unsigned char acpi_wakeup_code[];
121#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))
120 122
121/* early initialization routine */ 123/* early initialization routine */
122extern void acpi_reserve_wakeup_memory(void); 124extern void acpi_reserve_wakeup_memory(void);
@@ -185,15 +187,7 @@ struct bootnode;
185 187
186#ifdef CONFIG_ACPI_NUMA 188#ifdef CONFIG_ACPI_NUMA
187extern int acpi_numa; 189extern int acpi_numa;
188extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start, 190extern int x86_acpi_numa_init(void);
189 unsigned long end);
190extern int acpi_scan_nodes(unsigned long start, unsigned long end);
191#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
192
193#ifdef CONFIG_NUMA_EMU
194extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
195 int num_nodes);
196#endif
197#endif /* CONFIG_ACPI_NUMA */ 191#endif /* CONFIG_ACPI_NUMA */
198 192
199#define acpi_unlazy_tlb(x) leave_mm(x) 193#define acpi_unlazy_tlb(x) leave_mm(x)
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 64dc82ee19f0..331682231bb4 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -9,23 +9,20 @@ struct amd_nb_bus_dev_range {
9 u8 dev_limit; 9 u8 dev_limit;
10}; 10};
11 11
12extern struct pci_device_id amd_nb_misc_ids[]; 12extern const struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; 13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
14struct bootnode; 14struct bootnode;
15 15
16extern int early_is_amd_nb(u32 value); 16extern bool early_is_amd_nb(u32 value);
17extern int amd_cache_northbridges(void); 17extern int amd_cache_northbridges(void);
18extern void amd_flush_garts(void); 18extern void amd_flush_garts(void);
19extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); 19extern int amd_numa_init(void);
20extern int amd_scan_nodes(void); 20extern int amd_get_subcaches(int);
21 21extern int amd_set_subcaches(int, int);
22#ifdef CONFIG_NUMA_EMU
23extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
24extern void amd_get_nodes(struct bootnode *nodes);
25#endif
26 22
27struct amd_northbridge { 23struct amd_northbridge {
28 struct pci_dev *misc; 24 struct pci_dev *misc;
25 struct pci_dev *link;
29}; 26};
30 27
31struct amd_northbridge_info { 28struct amd_northbridge_info {
@@ -35,17 +32,18 @@ struct amd_northbridge_info {
35}; 32};
36extern struct amd_northbridge_info amd_northbridges; 33extern struct amd_northbridge_info amd_northbridges;
37 34
38#define AMD_NB_GART 0x1 35#define AMD_NB_GART BIT(0)
39#define AMD_NB_L3_INDEX_DISABLE 0x2 36#define AMD_NB_L3_INDEX_DISABLE BIT(1)
37#define AMD_NB_L3_PARTITIONING BIT(2)
40 38
41#ifdef CONFIG_AMD_NB 39#ifdef CONFIG_AMD_NB
42 40
43static inline int amd_nb_num(void) 41static inline u16 amd_nb_num(void)
44{ 42{
45 return amd_northbridges.num; 43 return amd_northbridges.num;
46} 44}
47 45
48static inline int amd_nb_has_feature(int feature) 46static inline bool amd_nb_has_feature(unsigned feature)
49{ 47{
50 return ((amd_northbridges.flags & feature) == feature); 48 return ((amd_northbridges.flags & feature) == feature);
51} 49}
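
The amd_nb.h hunk above switches the northbridge feature flags to the BIT() helper and gives amd_nb_has_feature() a bool return. The following self-contained C sketch mirrors that flag style; the struct and function names are invented and only the flag names echo the header.

	/* BIT()-style feature flags with an all-bits-set feature test, modelled
	 * on amd_nb_has_feature(); not the kernel header itself. */
	#include <stdbool.h>

	#define BIT(n)			(1UL << (n))

	#define NB_GART			BIT(0)
	#define NB_L3_INDEX_DISABLE	BIT(1)
	#define NB_L3_PARTITIONING	BIT(2)

	struct nb_info {
		unsigned long flags;
	};

	static bool nb_has_feature(const struct nb_info *nb, unsigned long feature)
	{
		/* every requested bit must be present */
		return (nb->flags & feature) == feature;
	}
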
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 5e3969c36d7f..2b7d573be549 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -2,7 +2,6 @@
2#define _ASM_X86_APIC_H 2#define _ASM_X86_APIC_H
3 3
4#include <linux/cpumask.h> 4#include <linux/cpumask.h>
5#include <linux/delay.h>
6#include <linux/pm.h> 5#include <linux/pm.h>
7 6
8#include <asm/alternative.h> 7#include <asm/alternative.h>
@@ -220,7 +219,6 @@ extern void enable_IR_x2apic(void);
220 219
221extern int get_physical_broadcast(void); 220extern int get_physical_broadcast(void);
222 221
223extern void apic_disable(void);
224extern int lapic_get_maxlvt(void); 222extern int lapic_get_maxlvt(void);
225extern void clear_local_APIC(void); 223extern void clear_local_APIC(void);
226extern void connect_bsp_APIC(void); 224extern void connect_bsp_APIC(void);
@@ -228,18 +226,17 @@ extern void disconnect_bsp_APIC(int virt_wire_setup);
228extern void disable_local_APIC(void); 226extern void disable_local_APIC(void);
229extern void lapic_shutdown(void); 227extern void lapic_shutdown(void);
230extern int verify_local_APIC(void); 228extern int verify_local_APIC(void);
231extern void cache_APIC_registers(void);
232extern void sync_Arb_IDs(void); 229extern void sync_Arb_IDs(void);
233extern void init_bsp_APIC(void); 230extern void init_bsp_APIC(void);
234extern void setup_local_APIC(void); 231extern void setup_local_APIC(void);
235extern void end_local_APIC_setup(void); 232extern void end_local_APIC_setup(void);
233extern void bsp_end_local_APIC_setup(void);
236extern void init_apic_mappings(void); 234extern void init_apic_mappings(void);
237void register_lapic_address(unsigned long address); 235void register_lapic_address(unsigned long address);
238extern void setup_boot_APIC_clock(void); 236extern void setup_boot_APIC_clock(void);
239extern void setup_secondary_APIC_clock(void); 237extern void setup_secondary_APIC_clock(void);
240extern int APIC_init_uniprocessor(void); 238extern int APIC_init_uniprocessor(void);
241extern void enable_NMI_through_LVT0(void); 239extern int apic_force_enable(unsigned long addr);
242extern int apic_force_enable(void);
243 240
244/* 241/*
245 * On 32bit this is mach-xxx local 242 * On 32bit this is mach-xxx local
@@ -260,7 +257,6 @@ static inline void lapic_shutdown(void) { }
260#define local_apic_timer_c2_ok 1 257#define local_apic_timer_c2_ok 1
261static inline void init_apic_mappings(void) { } 258static inline void init_apic_mappings(void) { }
262static inline void disable_local_APIC(void) { } 259static inline void disable_local_APIC(void) { }
263static inline void apic_disable(void) { }
264# define setup_boot_APIC_clock x86_init_noop 260# define setup_boot_APIC_clock x86_init_noop
265# define setup_secondary_APIC_clock x86_init_noop 261# define setup_secondary_APIC_clock x86_init_noop
266#endif /* !CONFIG_X86_LOCAL_APIC */ 262#endif /* !CONFIG_X86_LOCAL_APIC */
@@ -306,8 +302,6 @@ struct apic {
306 302
307 void (*setup_apic_routing)(void); 303 void (*setup_apic_routing)(void);
308 int (*multi_timer_check)(int apic, int irq); 304 int (*multi_timer_check)(int apic, int irq);
309 int (*apicid_to_node)(int logical_apicid);
310 int (*cpu_to_logical_apicid)(int cpu);
311 int (*cpu_present_to_apicid)(int mps_cpu); 305 int (*cpu_present_to_apicid)(int mps_cpu);
312 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); 306 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
313 void (*setup_portio_remap)(void); 307 void (*setup_portio_remap)(void);
@@ -355,6 +349,23 @@ struct apic {
355 void (*icr_write)(u32 low, u32 high); 349 void (*icr_write)(u32 low, u32 high);
356 void (*wait_icr_idle)(void); 350 void (*wait_icr_idle)(void);
357 u32 (*safe_wait_icr_idle)(void); 351 u32 (*safe_wait_icr_idle)(void);
352
353#ifdef CONFIG_X86_32
354 /*
355 * Called very early during boot from get_smp_config(). It should
356 * return the logical apicid. x86_[bios]_cpu_to_apicid is
357 * initialized before this function is called.
358 *
359 * If logical apicid can't be determined that early, the function
360 * may return BAD_APICID. Logical apicid will be configured after
361 * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity
362 * won't be applied properly during early boot in this case.
363 */
364 int (*x86_32_early_logical_apicid)(int cpu);
365
366 /* determine CPU -> NUMA node mapping */
367 int (*x86_32_numa_cpu_node)(int cpu);
368#endif
358}; 369};
359 370
360/* 371/*
@@ -502,6 +513,11 @@ extern struct apic apic_noop;
502 513
503extern struct apic apic_default; 514extern struct apic apic_default;
504 515
516static inline int noop_x86_32_early_logical_apicid(int cpu)
517{
518 return BAD_APICID;
519}
520
505/* 521/*
506 * Set up the logical destination ID. 522 * Set up the logical destination ID.
507 * 523 *
@@ -521,7 +537,7 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
521 return cpuid_apic >> index_msb; 537 return cpuid_apic >> index_msb;
522} 538}
523 539
524extern int default_apicid_to_node(int logical_apicid); 540extern int default_x86_32_numa_cpu_node(int cpu);
525 541
526#endif 542#endif
527 543
@@ -557,12 +573,6 @@ static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_ma
557 *retmap = *phys_map; 573 *retmap = *phys_map;
558} 574}
559 575
560/* Mapping from cpu number to logical apicid */
561static inline int default_cpu_to_logical_apicid(int cpu)
562{
563 return 1 << cpu;
564}
565
566static inline int __default_cpu_present_to_apicid(int mps_cpu) 576static inline int __default_cpu_present_to_apicid(int mps_cpu)
567{ 577{
568 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) 578 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
@@ -595,8 +605,4 @@ extern int default_check_phys_apicid_present(int phys_apicid);
595 605
596#endif /* CONFIG_X86_LOCAL_APIC */ 606#endif /* CONFIG_X86_LOCAL_APIC */
597 607
598#ifdef CONFIG_X86_32
599extern u8 cpu_2_logical_apicid[NR_CPUS];
600#endif
601
602#endif /* _ASM_X86_APIC_H */ 608#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 47a30ff8e517..d87988bacf3e 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -426,4 +426,16 @@ struct local_apic {
426#else 426#else
427 #define BAD_APICID 0xFFFFu 427 #define BAD_APICID 0xFFFFu
428#endif 428#endif
429
430enum ioapic_irq_destination_types {
431 dest_Fixed = 0,
432 dest_LowestPrio = 1,
433 dest_SMI = 2,
434 dest__reserved_1 = 3,
435 dest_NMI = 4,
436 dest_INIT = 5,
437 dest__reserved_2 = 6,
438 dest_ExtINT = 7
439};
440
429#endif /* _ASM_X86_APICDEF_H */ 441#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 903683b07e42..69d58131bc8e 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -456,14 +456,12 @@ static inline int fls(int x)
456 456
457#ifdef __KERNEL__ 457#ifdef __KERNEL__
458 458
459#include <asm-generic/bitops/ext2-non-atomic.h> 459#include <asm-generic/bitops/le.h>
460 460
461#define ext2_set_bit_atomic(lock, nr, addr) \ 461#define ext2_set_bit_atomic(lock, nr, addr) \
462 test_and_set_bit((nr), (unsigned long *)(addr)) 462 test_and_set_bit((nr), (unsigned long *)(addr))
463#define ext2_clear_bit_atomic(lock, nr, addr) \ 463#define ext2_clear_bit_atomic(lock, nr, addr) \
464 test_and_clear_bit((nr), (unsigned long *)(addr)) 464 test_and_clear_bit((nr), (unsigned long *)(addr))
465 465
466#include <asm-generic/bitops/minix.h>
467
468#endif /* __KERNEL__ */ 466#endif /* __KERNEL__ */
469#endif /* _ASM_X86_BITOPS_H */ 467#endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index c8bfe63a06de..e020d88ec02d 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -12,6 +12,7 @@
12/* setup data types */ 12/* setup data types */
13#define SETUP_NONE 0 13#define SETUP_NONE 0
14#define SETUP_E820_EXT 1 14#define SETUP_E820_EXT 1
15#define SETUP_DTB 2
15 16
16/* extensible setup data list node */ 17/* extensible setup data list node */
17struct setup_data { 18struct setup_data {
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 63e35ec9075c..4e12668711e5 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -1,48 +1,8 @@
1#ifndef _ASM_X86_CACHEFLUSH_H 1#ifndef _ASM_X86_CACHEFLUSH_H
2#define _ASM_X86_CACHEFLUSH_H 2#define _ASM_X86_CACHEFLUSH_H
3 3
4/* Keep includes the same across arches. */
5#include <linux/mm.h>
6
7/* Caches aren't brain-dead on the intel. */ 4/* Caches aren't brain-dead on the intel. */
8static inline void flush_cache_all(void) { } 5#include <asm-generic/cacheflush.h>
9static inline void flush_cache_mm(struct mm_struct *mm) { }
10static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
11static inline void flush_cache_range(struct vm_area_struct *vma,
12 unsigned long start, unsigned long end) { }
13static inline void flush_cache_page(struct vm_area_struct *vma,
14 unsigned long vmaddr, unsigned long pfn) { }
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
16static inline void flush_dcache_page(struct page *page) { }
17static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
18static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
19static inline void flush_icache_range(unsigned long start,
20 unsigned long end) { }
21static inline void flush_icache_page(struct vm_area_struct *vma,
22 struct page *page) { }
23static inline void flush_icache_user_range(struct vm_area_struct *vma,
24 struct page *page,
25 unsigned long addr,
26 unsigned long len) { }
27static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
28static inline void flush_cache_vunmap(unsigned long start,
29 unsigned long end) { }
30
31static inline void copy_to_user_page(struct vm_area_struct *vma,
32 struct page *page, unsigned long vaddr,
33 void *dst, const void *src,
34 unsigned long len)
35{
36 memcpy(dst, src, len);
37}
38
39static inline void copy_from_user_page(struct vm_area_struct *vma,
40 struct page *page, unsigned long vaddr,
41 void *dst, const void *src,
42 unsigned long len)
43{
44 memcpy(dst, src, len);
45}
46 6
47#ifdef CONFIG_X86_PAT 7#ifdef CONFIG_X86_PAT
48/* 8/*
@@ -111,7 +71,7 @@ static inline void set_page_memtype(struct page *pg, unsigned long memtype) { }
111 * Read/Write : ReadOnly, ReadWrite 71 * Read/Write : ReadOnly, ReadWrite
112 * Presence : NotPresent 72 * Presence : NotPresent
113 * 73 *
114 * Within a catagory, the attributes are mutually exclusive. 74 * Within a category, the attributes are mutually exclusive.
115 * 75 *
116 * The implementation of this API will take care of various aspects that 76 * The implementation of this API will take care of various aspects that
117 * are associated with changing such attributes, such as: 77 * are associated with changing such attributes, such as:
diff --git a/arch/x86/include/asm/ce4100.h b/arch/x86/include/asm/ce4100.h
new file mode 100644
index 000000000000..e656ad8c0a2e
--- /dev/null
+++ b/arch/x86/include/asm/ce4100.h
@@ -0,0 +1,6 @@
1#ifndef _ASM_CE4100_H_
2#define _ASM_CE4100_H_
3
4int ce4100_pci_init(void);
5
6#endif
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 4fab24de26b1..4564c8e28a33 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,5 +32,6 @@ extern void arch_unregister_cpu(int);
32 32
33DECLARE_PER_CPU(int, cpu_state); 33DECLARE_PER_CPU(int, cpu_state);
34 34
35int mwait_usable(const struct cpuinfo_x86 *);
35 36
36#endif /* _ASM_X86_CPU_H */ 37#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 220e2ea08e80..91f3e087cf21 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -160,6 +160,7 @@
160#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ 160#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
161#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ 161#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
162#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ 162#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
163#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */
163 164
164/* 165/*
165 * Auxiliary flags: Linux defined - For features scattered in various 166 * Auxiliary flags: Linux defined - For features scattered in various
@@ -279,6 +280,7 @@ extern const char * const x86_power_flags[32];
279#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 280#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
280#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 281#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
281#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) 282#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
283#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
282 284
283#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 285#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
284# define cpu_has_invlpg 1 286# define cpu_has_invlpg 1
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index ca1098a7e580..057099e5faba 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -10,7 +10,6 @@
10 10
11#include <linux/spinlock.h> /* And spinlocks */ 11#include <linux/spinlock.h> /* And spinlocks */
12#include <asm/io.h> /* need byte IO */ 12#include <asm/io.h> /* need byte IO */
13#include <linux/delay.h>
14 13
15#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER 14#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
16#define dma_outb outb_p 15#define dma_outb outb_p
@@ -151,6 +150,7 @@
151#define DMA_AUTOINIT 0x10 150#define DMA_AUTOINIT 0x10
152 151
153 152
153#ifdef CONFIG_ISA_DMA_API
154extern spinlock_t dma_spin_lock; 154extern spinlock_t dma_spin_lock;
155 155
156static inline unsigned long claim_dma_lock(void) 156static inline unsigned long claim_dma_lock(void)
@@ -164,6 +164,7 @@ static inline void release_dma_lock(unsigned long flags)
164{ 164{
165 spin_unlock_irqrestore(&dma_spin_lock, flags); 165 spin_unlock_irqrestore(&dma_spin_lock, flags);
166} 166}
167#endif /* CONFIG_ISA_DMA_API */
167 168
168/* enable/disable a specific DMA channel */ 169/* enable/disable a specific DMA channel */
169static inline void enable_dma(unsigned int dmanr) 170static inline void enable_dma(unsigned int dmanr)
@@ -303,9 +304,11 @@ static inline int get_dma_residue(unsigned int dmanr)
303} 304}
304 305
305 306
306/* These are in kernel/dma.c: */ 307/* These are in kernel/dma.c because x86 uses CONFIG_GENERIC_ISA_DMA */
308#ifdef CONFIG_ISA_DMA_API
307extern int request_dma(unsigned int dmanr, const char *device_id); 309extern int request_dma(unsigned int dmanr, const char *device_id);
308extern void free_dma(unsigned int dmanr); 310extern void free_dma(unsigned int dmanr);
311#endif
309 312
310/* From PCI */ 313/* From PCI */
311 314
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index e99d55d74df5..908b96957d88 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -96,7 +96,7 @@ extern void e820_setup_gap(void);
96extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, 96extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
97 unsigned long start_addr, unsigned long long end_addr); 97 unsigned long start_addr, unsigned long long end_addr);
98struct setup_data; 98struct setup_data;
99extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); 99extern void parse_e820_ext(struct setup_data *data);
100 100
101#if defined(CONFIG_X86_64) || \ 101#if defined(CONFIG_X86_64) || \
102 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) 102 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 57650ab4a5f5..1cd6d26a0a8d 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -16,10 +16,13 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) 17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
18 18
19.irpc idx, "01234567" 19.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
20 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
21.if NUM_INVALIDATE_TLB_VECTORS > \idx
20BUILD_INTERRUPT3(invalidate_interrupt\idx, 22BUILD_INTERRUPT3(invalidate_interrupt\idx,
21 (INVALIDATE_TLB_VECTOR_START)+\idx, 23 (INVALIDATE_TLB_VECTOR_START)+\idx,
22 smp_invalidate_interrupt) 24 smp_invalidate_interrupt)
25.endif
23.endr 26.endr
24#endif 27#endif
25 28
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 06850a7194e1..2c6fc9e62812 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -7,14 +7,12 @@
7 frame pointer later */ 7 frame pointer later */
8#ifdef CONFIG_FRAME_POINTER 8#ifdef CONFIG_FRAME_POINTER
9 .macro FRAME 9 .macro FRAME
10 pushl %ebp 10 pushl_cfi %ebp
11 CFI_ADJUST_CFA_OFFSET 4
12 CFI_REL_OFFSET ebp,0 11 CFI_REL_OFFSET ebp,0
13 movl %esp,%ebp 12 movl %esp,%ebp
14 .endm 13 .endm
15 .macro ENDFRAME 14 .macro ENDFRAME
16 popl %ebp 15 popl_cfi %ebp
17 CFI_ADJUST_CFA_OFFSET -4
18 CFI_RESTORE ebp 16 CFI_RESTORE ebp
19 .endm 17 .endm
20#else 18#else
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 1f11ce44e956..d09bb03653f0 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -37,7 +37,7 @@
37 "+m" (*uaddr), "=&r" (tem) \ 37 "+m" (*uaddr), "=&r" (tem) \
38 : "r" (oparg), "i" (-EFAULT), "1" (0)) 38 : "r" (oparg), "i" (-EFAULT), "1" (0))
39 39
40static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 40static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
41{ 41{
42 int op = (encoded_op >> 28) & 7; 42 int op = (encoded_op >> 28) & 7;
43 int cmp = (encoded_op >> 24) & 15; 43 int cmp = (encoded_op >> 24) & 15;
@@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
49 oparg = 1 << oparg; 49 oparg = 1 << oparg;
50 50
51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
52 return -EFAULT; 52 return -EFAULT;
53 53
54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) 54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
@@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
109 return ret; 109 return ret;
110} 110}
111 111
112static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, 112static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
113 int newval) 113 u32 oldval, u32 newval)
114{ 114{
115 int ret = 0;
115 116
116#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) 117#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
117 /* Real i386 machines have no cmpxchg instruction */ 118 /* Real i386 machines have no cmpxchg instruction */
@@ -119,21 +120,22 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
119 return -ENOSYS; 120 return -ENOSYS;
120#endif 121#endif
121 122
122 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 123 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
123 return -EFAULT; 124 return -EFAULT;
124 125
125 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" 126 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
126 "2:\t.section .fixup, \"ax\"\n" 127 "2:\t.section .fixup, \"ax\"\n"
127 "3:\tmov %2, %0\n" 128 "3:\tmov %3, %0\n"
128 "\tjmp 2b\n" 129 "\tjmp 2b\n"
129 "\t.previous\n" 130 "\t.previous\n"
130 _ASM_EXTABLE(1b, 3b) 131 _ASM_EXTABLE(1b, 3b)
131 : "=a" (oldval), "+m" (*uaddr) 132 : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
132 : "i" (-EFAULT), "r" (newval), "0" (oldval) 133 : "i" (-EFAULT), "r" (newval), "1" (oldval)
133 : "memory" 134 : "memory"
134 ); 135 );
135 136
136 return oldval; 137 *uval = oldval;
138 return ret;
137} 139}
138 140
139#endif 141#endif
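
The reworked futex_atomic_cmpxchg_inatomic() above now reports success or -EFAULT through its return value and hands the word it read back through *uval, instead of overloading a single return value for both jobs. A minimal caller sketch under that assumption; the function and variable names are illustrative and not taken from this patch:

    #include <linux/errno.h>
    #include <asm/futex.h>

    /* Sketch: distinguish "faulted" from "lost the race" with the new prototype. */
    static int try_replace_futex_word(u32 __user *uaddr, u32 expected, u32 desired)
    {
            u32 curval;
            int ret;

            ret = futex_atomic_cmpxchg_inatomic(&curval, uaddr, expected, desired);
            if (ret)
                    return ret;             /* -EFAULT (or -ENOSYS on a real i386) */
            if (curval != expected)
                    return -EAGAIN;         /* someone else changed the word first */
            return 0;                       /* exchange performed */
    }
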
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 43085bfc99c3..156cd5d18d2a 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -66,7 +66,7 @@ static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
66 * Don't enable translation but enable GART IO and CPU accesses. 66 * Don't enable translation but enable GART IO and CPU accesses.
67 * Also, set DISTLBWALKPRB since GART tables memory is UC. 67 * Also, set DISTLBWALKPRB since GART tables memory is UC.
68 */ 68 */
69 ctl = DISTLBWALKPRB | order << 1; 69 ctl = order << 1;
70 70
71 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); 71 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
72} 72}
@@ -75,17 +75,17 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
75{ 75{
76 u32 tmp, ctl; 76 u32 tmp, ctl;
77 77
78 /* address of the mappings table */ 78 /* address of the mappings table */
79 addr >>= 12; 79 addr >>= 12;
80 tmp = (u32) addr<<4; 80 tmp = (u32) addr<<4;
81 tmp &= ~0xf; 81 tmp &= ~0xf;
82 pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); 82 pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
83 83
84 /* Enable GART translation for this hammer. */ 84 /* Enable GART translation for this hammer. */
85 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 85 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
86 ctl |= GARTEN; 86 ctl |= GARTEN | DISTLBWALKPRB;
87 ctl &= ~(DISGARTCPU | DISGARTIO); 87 ctl &= ~(DISGARTCPU | DISGARTIO);
88 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); 88 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
89} 89}
90 90
91static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) 91static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 0274ec5a7e62..bb9efe8706e2 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -45,6 +45,30 @@ extern void invalidate_interrupt4(void);
45extern void invalidate_interrupt5(void); 45extern void invalidate_interrupt5(void);
46extern void invalidate_interrupt6(void); 46extern void invalidate_interrupt6(void);
47extern void invalidate_interrupt7(void); 47extern void invalidate_interrupt7(void);
48extern void invalidate_interrupt8(void);
49extern void invalidate_interrupt9(void);
50extern void invalidate_interrupt10(void);
51extern void invalidate_interrupt11(void);
52extern void invalidate_interrupt12(void);
53extern void invalidate_interrupt13(void);
54extern void invalidate_interrupt14(void);
55extern void invalidate_interrupt15(void);
56extern void invalidate_interrupt16(void);
57extern void invalidate_interrupt17(void);
58extern void invalidate_interrupt18(void);
59extern void invalidate_interrupt19(void);
60extern void invalidate_interrupt20(void);
61extern void invalidate_interrupt21(void);
62extern void invalidate_interrupt22(void);
63extern void invalidate_interrupt23(void);
64extern void invalidate_interrupt24(void);
65extern void invalidate_interrupt25(void);
66extern void invalidate_interrupt26(void);
67extern void invalidate_interrupt27(void);
68extern void invalidate_interrupt28(void);
69extern void invalidate_interrupt29(void);
70extern void invalidate_interrupt30(void);
71extern void invalidate_interrupt31(void);
48 72
49extern void irq_move_cleanup_interrupt(void); 73extern void irq_move_cleanup_interrupt(void);
50extern void reboot_interrupt(void); 74extern void reboot_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index ef328901c802..c9e09ea05644 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -237,7 +237,7 @@ static inline void fpu_save_init(struct fpu *fpu)
237 } else if (use_fxsr()) { 237 } else if (use_fxsr()) {
238 fpu_fxsave(fpu); 238 fpu_fxsave(fpu);
239 } else { 239 } else {
240 asm volatile("fsave %[fx]; fwait" 240 asm volatile("fnsave %[fx]; fwait"
241 : [fx] "=m" (fpu->state->fsave)); 241 : [fx] "=m" (fpu->state->fsave));
242 return; 242 return;
243 } 243 }
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 36fb1a6a5109..8dbe353e41e1 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -11,8 +11,8 @@ kernel_physical_mapping_init(unsigned long start,
11 unsigned long page_size_mask); 11 unsigned long page_size_mask);
12 12
13 13
14extern unsigned long __initdata e820_table_start; 14extern unsigned long __initdata pgt_buf_start;
15extern unsigned long __meminitdata e820_table_end; 15extern unsigned long __meminitdata pgt_buf_end;
16extern unsigned long __meminitdata e820_table_top; 16extern unsigned long __meminitdata pgt_buf_top;
17 17
18#endif /* _ASM_X86_INIT_32_H */ 18#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index f327d386d6cc..c4bd267dfc50 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -63,17 +63,6 @@ union IO_APIC_reg_03 {
63 } __attribute__ ((packed)) bits; 63 } __attribute__ ((packed)) bits;
64}; 64};
65 65
66enum ioapic_irq_destination_types {
67 dest_Fixed = 0,
68 dest_LowestPrio = 1,
69 dest_SMI = 2,
70 dest__reserved_1 = 3,
71 dest_NMI = 4,
72 dest_INIT = 5,
73 dest__reserved_2 = 6,
74 dest_ExtINT = 7
75};
76
77struct IO_APIC_route_entry { 66struct IO_APIC_route_entry {
78 __u32 vector : 8, 67 __u32 vector : 8,
79 delivery_mode : 3, /* 000: FIXED 68 delivery_mode : 3, /* 000: FIXED
@@ -106,6 +95,10 @@ struct IR_IO_APIC_route_entry {
106 index : 15; 95 index : 15;
107} __attribute__ ((packed)); 96} __attribute__ ((packed));
108 97
98#define IOAPIC_AUTO -1
99#define IOAPIC_EDGE 0
100#define IOAPIC_LEVEL 1
101
109#ifdef CONFIG_X86_IO_APIC 102#ifdef CONFIG_X86_IO_APIC
110 103
111/* 104/*
@@ -150,11 +143,6 @@ extern int timer_through_8259;
150#define io_apic_assign_pci_irqs \ 143#define io_apic_assign_pci_irqs \
151 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) 144 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
152 145
153extern u8 io_apic_unique_id(u8 id);
154extern int io_apic_get_unique_id(int ioapic, int apic_id);
155extern int io_apic_get_version(int ioapic);
156extern int io_apic_get_redir_entries(int ioapic);
157
158struct io_apic_irq_attr; 146struct io_apic_irq_attr;
159extern int io_apic_set_pci_routing(struct device *dev, int irq, 147extern int io_apic_set_pci_routing(struct device *dev, int irq,
160 struct io_apic_irq_attr *irq_attr); 148 struct io_apic_irq_attr *irq_attr);
@@ -162,6 +150,8 @@ void setup_IO_APIC_irq_extra(u32 gsi);
162extern void ioapic_and_gsi_init(void); 150extern void ioapic_and_gsi_init(void);
163extern void ioapic_insert_resources(void); 151extern void ioapic_insert_resources(void);
164 152
153int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr);
154
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 155extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 156extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
167extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 157extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
@@ -186,6 +176,8 @@ extern void __init pre_init_apic_IRQ0(void);
186 176
187extern void mp_save_irq(struct mpc_intsrc *m); 177extern void mp_save_irq(struct mpc_intsrc *m);
188 178
179extern void disable_ioapic_support(void);
180
189#else /* !CONFIG_X86_IO_APIC */ 181#else /* !CONFIG_X86_IO_APIC */
190 182
191#define io_apic_assign_pci_irqs 0 183#define io_apic_assign_pci_irqs 0
@@ -199,6 +191,26 @@ static inline int mp_find_ioapic(u32 gsi) { return 0; }
199struct io_apic_irq_attr; 191struct io_apic_irq_attr;
200static inline int io_apic_set_pci_routing(struct device *dev, int irq, 192static inline int io_apic_set_pci_routing(struct device *dev, int irq,
201 struct io_apic_irq_attr *irq_attr) { return 0; } 193 struct io_apic_irq_attr *irq_attr) { return 0; }
194
195static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void)
196{
197 return NULL;
198}
199
200static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { }
201static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent)
202{
203 return -ENOMEM;
204}
205
206static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { }
207static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent)
208{
209 return -ENOMEM;
210}
211
212static inline void mp_save_irq(struct mpc_intsrc *m) { };
213static inline void disable_ioapic_support(void) { }
202#endif 214#endif
203 215
204#endif /* _ASM_X86_IO_APIC_H */ 216#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index 0b7228268a63..615fa9061b57 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -123,10 +123,6 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
123 int vector); 123 int vector);
124extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, 124extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
125 int vector); 125 int vector);
126extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
127 int vector);
128extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
129 int vector);
130 126
131/* Avoid include hell */ 127/* Avoid include hell */
132#define NMI_VECTOR 0x02 128#define NMI_VECTOR 0x02
@@ -150,6 +146,10 @@ static inline void __default_local_send_IPI_all(int vector)
150} 146}
151 147
152#ifdef CONFIG_X86_32 148#ifdef CONFIG_X86_32
149extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
150 int vector);
151extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
152 int vector);
153extern void default_send_IPI_mask_logical(const struct cpumask *mask, 153extern void default_send_IPI_mask_logical(const struct cpumask *mask,
154 int vector); 154 int vector);
155extern void default_send_IPI_allbutself(int vector); 155extern void default_send_IPI_allbutself(int vector);
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index c704b38c57a2..ba870bb6dd8e 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -10,9 +10,6 @@
10#include <asm/apicdef.h> 10#include <asm/apicdef.h>
11#include <asm/irq_vectors.h> 11#include <asm/irq_vectors.h>
12 12
13/* Even though we don't support this, supply it to appease OF */
14static inline void irq_dispose_mapping(unsigned int virq) { }
15
16static inline int irq_canonicalize(int irq) 13static inline int irq_canonicalize(int irq)
17{ 14{
18 return ((irq == 2) ? 9 : irq); 15 return ((irq == 2) ? 9 : irq);
diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h
new file mode 100644
index 000000000000..423bbbddf36d
--- /dev/null
+++ b/arch/x86/include/asm/irq_controller.h
@@ -0,0 +1,12 @@
1#ifndef __IRQ_CONTROLLER__
2#define __IRQ_CONTROLLER__
3
4struct irq_domain {
5 int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize,
6 u32 *out_hwirq, u32 *out_type);
7 void *priv;
8 struct device_node *controller;
9 struct list_head l;
10};
11
12#endif
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6af0894dafb4..6e976ee3b3ef 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_IRQ_VECTORS_H 1#ifndef _ASM_X86_IRQ_VECTORS_H
2#define _ASM_X86_IRQ_VECTORS_H 2#define _ASM_X86_IRQ_VECTORS_H
3 3
4#include <linux/threads.h>
4/* 5/*
5 * Linux IRQ vector layout. 6 * Linux IRQ vector layout.
6 * 7 *
@@ -16,8 +17,8 @@
16 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events 17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
17 * Vectors 32 ... 127 : device interrupts 18 * Vectors 32 ... 127 : device interrupts
18 * Vector 128 : legacy int80 syscall interface 19 * Vector 128 : legacy int80 syscall interface
19 * Vectors 129 ... 237 : device interrupts 20 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
20 * Vectors 238 ... 255 : special interrupts 21 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
21 * 22 *
22 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. 23 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
23 * 24 *
@@ -96,37 +97,43 @@
96#define THRESHOLD_APIC_VECTOR 0xf9 97#define THRESHOLD_APIC_VECTOR 0xf9
97#define REBOOT_VECTOR 0xf8 98#define REBOOT_VECTOR 0xf8
98 99
99/* f0-f7 used for spreading out TLB flushes: */
100#define INVALIDATE_TLB_VECTOR_END 0xf7
101#define INVALIDATE_TLB_VECTOR_START 0xf0
102#define NUM_INVALIDATE_TLB_VECTORS 8
103
104/*
105 * Local APIC timer IRQ vector is on a different priority level,
106 * to work around the 'lost local interrupt if more than 2 IRQ
107 * sources per level' errata.
108 */
109#define LOCAL_TIMER_VECTOR 0xef
110
111/* 100/*
112 * Generic system vector for platform specific use 101 * Generic system vector for platform specific use
113 */ 102 */
114#define X86_PLATFORM_IPI_VECTOR 0xed 103#define X86_PLATFORM_IPI_VECTOR 0xf7
115 104
116/* 105/*
117 * IRQ work vector: 106 * IRQ work vector:
118 */ 107 */
119#define IRQ_WORK_VECTOR 0xec 108#define IRQ_WORK_VECTOR 0xf6
120 109
121#define UV_BAU_MESSAGE 0xea 110#define UV_BAU_MESSAGE 0xf5
122 111
123/* 112/*
124 * Self IPI vector for machine checks 113 * Self IPI vector for machine checks
125 */ 114 */
126#define MCE_SELF_VECTOR 0xeb 115#define MCE_SELF_VECTOR 0xf4
127 116
128/* Xen vector callback to receive events in a HVM domain */ 117/* Xen vector callback to receive events in a HVM domain */
129#define XEN_HVM_EVTCHN_CALLBACK 0xe9 118#define XEN_HVM_EVTCHN_CALLBACK 0xf3
119
120/*
121 * Local APIC timer IRQ vector is on a different priority level,
122 * to work around the 'lost local interrupt if more than 2 IRQ
123 * sources per level' errata.
124 */
125#define LOCAL_TIMER_VECTOR 0xef
126
127/* up to 32 vectors used for spreading out TLB flushes: */
128#if NR_CPUS <= 32
129# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS)
130#else
131# define NUM_INVALIDATE_TLB_VECTORS (32)
132#endif
133
134#define INVALIDATE_TLB_VECTOR_END (0xee)
135#define INVALIDATE_TLB_VECTOR_START \
136 (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
130 137
131#define NR_VECTORS 256 138#define NR_VECTORS 256
132 139
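
With the reshuffled layout the TLB-flush IPI vectors grow downward from 0xee, one per CPU and capped at 32, while the platform/IRQ-work/UV/MCE/Xen vectors move up into 0xf3-0xf7. A small worked example of the arithmetic, assuming NR_CPUS is 8 (an assumed value, not something this diff fixes):

    #include <asm/irq_vectors.h>

    /* With NR_CPUS == 8:
     *   NUM_INVALIDATE_TLB_VECTORS  == 8
     *   INVALIDATE_TLB_VECTOR_START == 0xee - 8 + 1 == 0xe7
     * so vectors 0xe7..0xee carry the per-CPU TLB-flush IPIs, and the relocated
     * X86_PLATFORM_IPI/IRQ_WORK/UV_BAU/MCE_SELF/Xen vectors (0xf3..0xf7) sit just
     * below the reschedule/error/spurious block, leaving everything under
     * INVALIDATE_TLB_VECTOR_START free for device interrupts.
     */
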
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index f52d42e80585..574dbc22893a 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -14,7 +14,7 @@
14 do { \ 14 do { \
15 asm goto("1:" \ 15 asm goto("1:" \
16 JUMP_LABEL_INITIAL_NOP \ 16 JUMP_LABEL_INITIAL_NOP \
17 ".pushsection __jump_table, \"a\" \n\t"\ 17 ".pushsection __jump_table, \"aw\" \n\t"\
18 _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ 18 _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
19 ".popsection \n\t" \ 19 ".popsection \n\t" \
20 : : "i" (key) : : label); \ 20 : : "i" (key) : : label); \
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index ca242d35e873..fe2cc6e105fa 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -13,7 +13,6 @@ enum die_val {
13 DIE_PANIC, 13 DIE_PANIC,
14 DIE_NMI, 14 DIE_NMI,
15 DIE_DIE, 15 DIE_DIE,
16 DIE_NMIWATCHDOG,
17 DIE_KERNELDEBUG, 16 DIE_KERNELDEBUG,
18 DIE_TRAP, 17 DIE_TRAP,
19 DIE_GPF, 18 DIE_GPF,
@@ -27,7 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
27extern int __must_check __die(const char *, struct pt_regs *, long); 26extern int __must_check __die(const char *, struct pt_regs *, long);
28extern void show_registers(struct pt_regs *regs); 27extern void show_registers(struct pt_regs *regs);
29extern void show_trace(struct task_struct *t, struct pt_regs *regs, 28extern void show_trace(struct task_struct *t, struct pt_regs *regs,
30 unsigned long *sp); 29 unsigned long *sp, unsigned long bp);
31extern void __show_regs(struct pt_regs *regs, int all); 30extern void __show_regs(struct pt_regs *regs, int all);
32extern void show_regs(struct pt_regs *regs); 31extern void show_regs(struct pt_regs *regs);
33extern unsigned long oops_begin(void); 32extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 8e37deb1eb38..0f5213564326 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -142,9 +142,9 @@ struct x86_emulate_ops {
142 int (*pio_out_emulated)(int size, unsigned short port, const void *val, 142 int (*pio_out_emulated)(int size, unsigned short port, const void *val,
143 unsigned int count, struct kvm_vcpu *vcpu); 143 unsigned int count, struct kvm_vcpu *vcpu);
144 144
145 bool (*get_cached_descriptor)(struct desc_struct *desc, 145 bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3,
146 int seg, struct kvm_vcpu *vcpu); 146 int seg, struct kvm_vcpu *vcpu);
147 void (*set_cached_descriptor)(struct desc_struct *desc, 147 void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3,
148 int seg, struct kvm_vcpu *vcpu); 148 int seg, struct kvm_vcpu *vcpu);
149 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 149 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
150 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 150 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
@@ -239,6 +239,7 @@ struct x86_emulate_ctxt {
239 int interruptibility; 239 int interruptibility;
240 240
241 bool perm_ok; /* do not check permissions if true */ 241 bool perm_ok; /* do not check permissions if true */
242 bool only_vendor_specific_insn;
242 243
243 bool have_exception; 244 bool have_exception;
244 struct x86_exception exception; 245 struct x86_exception exception;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ffd7f8d29187..c8af0991fdf0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -85,7 +85,7 @@
85 85
86#define ASYNC_PF_PER_VCPU 64 86#define ASYNC_PF_PER_VCPU 64
87 87
88extern spinlock_t kvm_lock; 88extern raw_spinlock_t kvm_lock;
89extern struct list_head vm_list; 89extern struct list_head vm_list;
90 90
91struct kvm_vcpu; 91struct kvm_vcpu;
@@ -255,6 +255,8 @@ struct kvm_mmu {
255 int (*sync_page)(struct kvm_vcpu *vcpu, 255 int (*sync_page)(struct kvm_vcpu *vcpu,
256 struct kvm_mmu_page *sp); 256 struct kvm_mmu_page *sp);
257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
258 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
259 u64 *spte, const void *pte, unsigned long mmu_seq);
258 hpa_t root_hpa; 260 hpa_t root_hpa;
259 int root_level; 261 int root_level;
260 int shadow_root_level; 262 int shadow_root_level;
@@ -335,12 +337,6 @@ struct kvm_vcpu_arch {
335 u64 *last_pte_updated; 337 u64 *last_pte_updated;
336 gfn_t last_pte_gfn; 338 gfn_t last_pte_gfn;
337 339
338 struct {
339 gfn_t gfn; /* presumed gfn during guest pte update */
340 pfn_t pfn; /* pfn corresponding to that gfn */
341 unsigned long mmu_seq;
342 } update_pte;
343
344 struct fpu guest_fpu; 340 struct fpu guest_fpu;
345 u64 xcr0; 341 u64 xcr0;
346 342
@@ -448,7 +444,7 @@ struct kvm_arch {
448 444
449 unsigned long irq_sources_bitmap; 445 unsigned long irq_sources_bitmap;
450 s64 kvmclock_offset; 446 s64 kvmclock_offset;
451 spinlock_t tsc_write_lock; 447 raw_spinlock_t tsc_write_lock;
452 u64 last_tsc_nsec; 448 u64 last_tsc_nsec;
453 u64 last_tsc_offset; 449 u64 last_tsc_offset;
454 u64 last_tsc_write; 450 u64 last_tsc_write;
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 80a1dee5bea5..aeff3e89b222 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -13,6 +13,12 @@ typedef struct {
13 int size; 13 int size;
14 struct mutex lock; 14 struct mutex lock;
15 void *vdso; 15 void *vdso;
16
17#ifdef CONFIG_X86_64
18 /* True if mm supports a task running in 32 bit compatibility mode. */
19 unsigned short ia32_compat;
20#endif
21
16} mm_context_t; 22} mm_context_t;
17 23
18#ifdef CONFIG_SMP 24#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 4a2d4e0c18d9..8b5393ec1080 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -36,8 +36,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
36 unsigned cpu = smp_processor_id(); 36 unsigned cpu = smp_processor_id();
37 37
38 if (likely(prev != next)) { 38 if (likely(prev != next)) {
39 /* stop flush ipis for the previous mm */
40 cpumask_clear_cpu(cpu, mm_cpumask(prev));
41#ifdef CONFIG_SMP 39#ifdef CONFIG_SMP
42 percpu_write(cpu_tlbstate.state, TLBSTATE_OK); 40 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
43 percpu_write(cpu_tlbstate.active_mm, next); 41 percpu_write(cpu_tlbstate.active_mm, next);
@@ -47,6 +45,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
47 /* Re-load page tables */ 45 /* Re-load page tables */
48 load_cr3(next->pgd); 46 load_cr3(next->pgd);
49 47
48 /* stop flush ipis for the previous mm */
49 cpumask_clear_cpu(cpu, mm_cpumask(prev));
50
50 /* 51 /*
51 * load the LDT, if the LDT is different: 52 * load the LDT, if the LDT is different:
52 */ 53 */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 0c90dd9f0505..9c7d95f6174b 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -25,7 +25,6 @@ extern int pic_mode;
25#define MAX_IRQ_SOURCES 256 25#define MAX_IRQ_SOURCES 256
26 26
27extern unsigned int def_to_bigsmp; 27extern unsigned int def_to_bigsmp;
28extern u8 apicid_2_node[];
29 28
30#ifdef CONFIG_X86_NUMAQ 29#ifdef CONFIG_X86_NUMAQ
31extern int mp_bus_id_to_node[MAX_MP_BUSSES]; 30extern int mp_bus_id_to_node[MAX_MP_BUSSES];
@@ -33,8 +32,6 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES];
33extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; 32extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
34#endif 33#endif
35 34
36#define MAX_APICID 256
37
38#else /* CONFIG_X86_64: */ 35#else /* CONFIG_X86_64: */
39 36
40#define MAX_MP_BUSSES 256 37#define MAX_MP_BUSSES 256
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4d0dfa0d998e..3cce71413d0b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -36,8 +36,14 @@
36#define MSR_IA32_PERFCTR1 0x000000c2 36#define MSR_IA32_PERFCTR1 0x000000c2
37#define MSR_FSB_FREQ 0x000000cd 37#define MSR_FSB_FREQ 0x000000cd
38 38
39#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2
40#define NHM_C3_AUTO_DEMOTE (1UL << 25)
41#define NHM_C1_AUTO_DEMOTE (1UL << 26)
42#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25)
43
39#define MSR_MTRRcap 0x000000fe 44#define MSR_MTRRcap 0x000000fe
40#define MSR_IA32_BBL_CR_CTL 0x00000119 45#define MSR_IA32_BBL_CR_CTL 0x00000119
46#define MSR_IA32_BBL_CR_CTL3 0x0000011e
41 47
42#define MSR_IA32_SYSENTER_CS 0x00000174 48#define MSR_IA32_SYSENTER_CS 0x00000174
43#define MSR_IA32_SYSENTER_ESP 0x00000175 49#define MSR_IA32_SYSENTER_ESP 0x00000175
@@ -47,6 +53,9 @@
47#define MSR_IA32_MCG_STATUS 0x0000017a 53#define MSR_IA32_MCG_STATUS 0x0000017a
48#define MSR_IA32_MCG_CTL 0x0000017b 54#define MSR_IA32_MCG_CTL 0x0000017b
49 55
56#define MSR_OFFCORE_RSP_0 0x000001a6
57#define MSR_OFFCORE_RSP_1 0x000001a7
58
50#define MSR_IA32_PEBS_ENABLE 0x000003f1 59#define MSR_IA32_PEBS_ENABLE 0x000003f1
51#define MSR_IA32_DS_AREA 0x00000600 60#define MSR_IA32_DS_AREA 0x00000600
52#define MSR_IA32_PERF_CAPABILITIES 0x00000345 61#define MSR_IA32_PERF_CAPABILITIES 0x00000345
@@ -87,11 +96,15 @@
87#define MSR_IA32_MC0_ADDR 0x00000402 96#define MSR_IA32_MC0_ADDR 0x00000402
88#define MSR_IA32_MC0_MISC 0x00000403 97#define MSR_IA32_MC0_MISC 0x00000403
89 98
99#define MSR_AMD64_MC0_MASK 0xc0010044
100
90#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) 101#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
91#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) 102#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
92#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) 103#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
93#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) 104#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
94 105
106#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x))
107
95/* These are consecutive and not in the normal 4er MCE bank block */ 108/* These are consecutive and not in the normal 4er MCE bank block */
96#define MSR_IA32_MC0_CTL2 0x00000280 109#define MSR_IA32_MC0_CTL2 0x00000280
97#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) 110#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
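
MSR_NHM_SNB_PKG_CST_CFG_CTL and the *_AUTO_DEMOTE bits added above are the knobs a cpuidle-style driver pokes to turn off C-state auto-demotion. A hedged sketch of that use; the function itself is illustrative and not code from this patch:

    #include <asm/msr.h>

    /* Clear the C1/C3 auto-demotion bits on the current CPU (sketch only). */
    static void disable_auto_demotion(void *unused)
    {
            unsigned long long msr_bits;

            rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
            msr_bits &= ~(NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE);
            wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
    }
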
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c76f5b92b840..4886a68f267e 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -7,7 +7,6 @@
7 7
8#ifdef CONFIG_X86_LOCAL_APIC 8#ifdef CONFIG_X86_LOCAL_APIC
9 9
10extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
11extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 10extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
12extern int reserve_perfctr_nmi(unsigned int); 11extern int reserve_perfctr_nmi(unsigned int);
13extern void release_perfctr_nmi(unsigned int); 12extern void release_perfctr_nmi(unsigned int);
@@ -30,8 +29,8 @@ void arch_trigger_all_cpu_backtrace(void);
30 * external nmis, because the local ones are more frequent. 29 * external nmis, because the local ones are more frequent.
31 * 30 *
32 * Also setup some default high/normal/low settings for 31 * Also setup some default high/normal/low settings for
33 * subsystems to registers with. Using 4 bits to seperate 32 * subsystems to registers with. Using 4 bits to separate
34 * the priorities. This can go alot higher if needed be. 33 * the priorities. This can go a lot higher if needed be.
35 */ 34 */
36 35
37#define NMI_LOCAL_SHIFT 16 /* randomly picked */ 36#define NMI_LOCAL_SHIFT 16 /* randomly picked */
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index 6d8723a766cc..af788496020b 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -38,7 +38,7 @@
38#define K8_NOP8 K8_NOP4 K8_NOP4 38#define K8_NOP8 K8_NOP4 K8_NOP4
39 39
40/* K7 nops 40/* K7 nops
41 uses eax dependencies (arbitary choice) 41 uses eax dependencies (arbitrary choice)
42 1: nop 42 1: nop
43 2: movl %eax,%eax 43 2: movl %eax,%eax
44 3: leal (,%eax,1),%eax 44 3: leal (,%eax,1),%eax
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 27da400d3138..3d4dab43c994 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,5 +1,57 @@
1#ifndef _ASM_X86_NUMA_H
2#define _ASM_X86_NUMA_H
3
4#include <asm/topology.h>
5#include <asm/apicdef.h>
6
7#ifdef CONFIG_NUMA
8
9#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
10
11/*
12 * __apicid_to_node[] stores the raw mapping between physical apicid and
13 * node and is used to initialize cpu_to_node mapping.
14 *
15 * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
16 * should be accessed by the accessors - set_apicid_to_node() and
17 * numa_cpu_node().
18 */
19extern s16 __apicid_to_node[MAX_LOCAL_APIC];
20
21static inline void set_apicid_to_node(int apicid, s16 node)
22{
23 __apicid_to_node[apicid] = node;
24}
25#else /* CONFIG_NUMA */
26static inline void set_apicid_to_node(int apicid, s16 node)
27{
28}
29#endif /* CONFIG_NUMA */
30
1#ifdef CONFIG_X86_32 31#ifdef CONFIG_X86_32
2# include "numa_32.h" 32# include "numa_32.h"
3#else 33#else
4# include "numa_64.h" 34# include "numa_64.h"
5#endif 35#endif
36
37#ifdef CONFIG_NUMA
38extern void __cpuinit numa_set_node(int cpu, int node);
39extern void __cpuinit numa_clear_node(int cpu);
40extern void __init numa_init_array(void);
41extern void __init init_cpu_to_node(void);
42extern void __cpuinit numa_add_cpu(int cpu);
43extern void __cpuinit numa_remove_cpu(int cpu);
44#else /* CONFIG_NUMA */
45static inline void numa_set_node(int cpu, int node) { }
46static inline void numa_clear_node(int cpu) { }
47static inline void numa_init_array(void) { }
48static inline void init_cpu_to_node(void) { }
49static inline void numa_add_cpu(int cpu) { }
50static inline void numa_remove_cpu(int cpu) { }
51#endif /* CONFIG_NUMA */
52
53#ifdef CONFIG_DEBUG_PER_CPU_MAPS
54struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable);
55#endif
56
57#endif /* _ASM_X86_NUMA_H */
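
The intended flow for the new table: early firmware parsing (SRAT, MP tables) records each physical APIC ID's node with set_apicid_to_node(), and per-CPU bring-up later resolves a node through numa_cpu_node(). A rough sketch of what such a resolver might look like; the early_per_cpu() lookup of x86_cpu_to_apicid is an assumption about the implementation, not something this header shows:

    #include <linux/numa.h>
    #include <asm/numa.h>
    #include <asm/smp.h>

    static int __cpuinit example_numa_cpu_node(int cpu)
    {
            int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

            return apicid == BAD_APICID ? NUMA_NO_NODE : __apicid_to_node[apicid];
    }
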
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index a37229011b56..c6beed1ef103 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,8 +1,15 @@
1#ifndef _ASM_X86_NUMA_32_H 1#ifndef _ASM_X86_NUMA_32_H
2#define _ASM_X86_NUMA_32_H 2#define _ASM_X86_NUMA_32_H
3 3
4extern int numa_off;
5
4extern int pxm_to_nid(int pxm); 6extern int pxm_to_nid(int pxm);
5extern void numa_remove_cpu(int cpu); 7
8#ifdef CONFIG_NUMA
9extern int __cpuinit numa_cpu_node(int cpu);
10#else /* CONFIG_NUMA */
11static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
12#endif /* CONFIG_NUMA */
6 13
7#ifdef CONFIG_HIGHMEM 14#ifdef CONFIG_HIGHMEM
8extern void set_highmem_pages_init(void); 15extern void set_highmem_pages_init(void);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 5ae87285a502..344eb1790b46 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -2,23 +2,16 @@
2#define _ASM_X86_NUMA_64_H 2#define _ASM_X86_NUMA_64_H
3 3
4#include <linux/nodemask.h> 4#include <linux/nodemask.h>
5#include <asm/apicdef.h>
6 5
7struct bootnode { 6struct bootnode {
8 u64 start; 7 u64 start;
9 u64 end; 8 u64 end;
10}; 9};
11 10
12extern int compute_hash_shift(struct bootnode *nodes, int numblks,
13 int *nodeids);
14
15#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) 11#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
16 12
17extern void numa_init_array(void);
18extern int numa_off; 13extern int numa_off;
19 14
20extern s16 apicid_to_node[MAX_LOCAL_APIC];
21
22extern unsigned long numa_free_all_bootmem(void); 15extern unsigned long numa_free_all_bootmem(void);
23extern void setup_node_bootmem(int nodeid, unsigned long start, 16extern void setup_node_bootmem(int nodeid, unsigned long start,
24 unsigned long end); 17 unsigned long end);
@@ -31,22 +24,19 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
31 */ 24 */
32#define NODE_MIN_SIZE (4*1024*1024) 25#define NODE_MIN_SIZE (4*1024*1024)
33 26
34extern void __init init_cpu_to_node(void); 27extern nodemask_t numa_nodes_parsed __initdata;
35extern void __cpuinit numa_set_node(int cpu, int node); 28
36extern void __cpuinit numa_clear_node(int cpu); 29extern int __cpuinit numa_cpu_node(int cpu);
37extern void __cpuinit numa_add_cpu(int cpu); 30extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
38extern void __cpuinit numa_remove_cpu(int cpu); 31extern void __init numa_set_distance(int from, int to, int distance);
39 32
40#ifdef CONFIG_NUMA_EMU 33#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) 34#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) 35#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
36void numa_emu_cmdline(char *);
43#endif /* CONFIG_NUMA_EMU */ 37#endif /* CONFIG_NUMA_EMU */
44#else 38#else
45static inline void init_cpu_to_node(void) { } 39static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
46static inline void numa_set_node(int cpu, int node) { }
47static inline void numa_clear_node(int cpu) { }
48static inline void numa_add_cpu(int cpu, int node) { }
49static inline void numa_remove_cpu(int cpu) { }
50#endif 40#endif
51 41
52#endif /* _ASM_X86_NUMA_64_H */ 42#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index f482010350fb..5ca6801b75f3 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -20,7 +20,7 @@ extern struct olpc_platform_t olpc_platform_info;
20 20
21/* 21/*
22 * OLPC board IDs contain the major build number within the mask 0x0ff0, 22 * OLPC board IDs contain the major build number within the mask 0x0ff0,
23 * and the minor build number withing 0x000f. Pre-builds have a minor 23 * and the minor build number within 0x000f. Pre-builds have a minor
24 * number less than 8, and normal builds start at 8. For example, 0x0B10 24 * number less than 8, and normal builds start at 8. For example, 0x0B10
25 * is a PreB1, and 0x0C18 is a C1. 25 * is a PreB1, and 0x0C18 is a C1.
26 */ 26 */
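
A quick decode of the board-ID scheme described in that comment (helper names invented for illustration): 0x0B10 has major build 0xB1 and minor 0x0, which is below 8 and therefore a PreB1, while 0x0C18 has major 0xC1 and minor 0x8, a normal C1 build.

    static inline unsigned int board_major(unsigned int id) { return (id & 0x0ff0) >> 4; }
    static inline unsigned int board_minor(unsigned int id) { return id & 0x000f; }
    static inline int board_is_prebuild(unsigned int id)    { return board_minor(id) < 8; }
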
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 641988efe063..c5d3a5abbb9f 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -6,7 +6,7 @@
6 6
7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ 7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */
8 8
9#ifdef CONFIG_OLPC_OPENFIRMWARE 9#ifdef CONFIG_OLPC
10 10
11extern bool olpc_ofw_is_installed(void); 11extern bool olpc_ofw_is_installed(void);
12 12
@@ -26,19 +26,15 @@ extern void setup_olpc_ofw_pgd(void);
26/* check if OFW was detected during boot */ 26/* check if OFW was detected during boot */
27extern bool olpc_ofw_present(void); 27extern bool olpc_ofw_present(void);
28 28
29#else /* !CONFIG_OLPC_OPENFIRMWARE */ 29#else /* !CONFIG_OLPC */
30
31static inline bool olpc_ofw_is_installed(void) { return false; }
32static inline void olpc_ofw_detect(void) { } 30static inline void olpc_ofw_detect(void) { }
33static inline void setup_olpc_ofw_pgd(void) { } 31static inline void setup_olpc_ofw_pgd(void) { }
34static inline bool olpc_ofw_present(void) { return false; } 32#endif /* !CONFIG_OLPC */
35
36#endif /* !CONFIG_OLPC_OPENFIRMWARE */
37 33
38#ifdef CONFIG_OLPC_OPENFIRMWARE_DT 34#ifdef CONFIG_OF_PROMTREE
39extern void olpc_dt_build_devicetree(void); 35extern void olpc_dt_build_devicetree(void);
40#else 36#else
41static inline void olpc_dt_build_devicetree(void) { } 37static inline void olpc_dt_build_devicetree(void) { }
42#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */ 38#endif
43 39
44#endif /* _ASM_X86_OLPC_OFW_H */ 40#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 1df66211fd1b..bce688d54c12 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_PAGE_DEFS_H 2#define _ASM_X86_PAGE_DEFS_H
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5#include <linux/types.h>
5 6
6/* PAGE_SHIFT determines the page size */ 7/* PAGE_SHIFT determines the page size */
7#define PAGE_SHIFT 12 8#define PAGE_SHIFT 12
@@ -45,11 +46,15 @@ extern int devmem_is_allowed(unsigned long pagenr);
45extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
46extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
47 48
49static inline phys_addr_t get_max_mapped(void)
50{
51 return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
52}
53
48extern unsigned long init_memory_mapping(unsigned long start, 54extern unsigned long init_memory_mapping(unsigned long start,
49 unsigned long end); 55 unsigned long end);
50 56
51extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn, 57extern void initmem_init(void);
52 int acpi, int k8);
53extern void free_initmem(void); 58extern void free_initmem(void);
54 59
55#endif /* !__ASSEMBLY__ */ 60#endif /* !__ASSEMBLY__ */
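
get_max_mapped() just scales max_pfn_mapped into a byte address. One plausible consumer, sketched here as an assumption rather than something this diff shows, is early setup code capping allocations to memory that already has page-table coverage:

    #include <linux/init.h>
    #include <linux/memblock.h>
    #include <asm/page_types.h>

    static void __init example_cap_early_allocations(void)
    {
            /* Assumes memblock exposes a current_limit field; illustrative only. */
            memblock.current_limit = get_max_mapped();
    }
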
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 2071a8b2b32f..ebbc4d8ab170 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -558,13 +558,12 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
558static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, 558static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
559 pmd_t *pmdp, pmd_t pmd) 559 pmd_t *pmdp, pmd_t pmd)
560{ 560{
561#if PAGETABLE_LEVELS >= 3
562 if (sizeof(pmdval_t) > sizeof(long)) 561 if (sizeof(pmdval_t) > sizeof(long))
563 /* 5 arg words */ 562 /* 5 arg words */
564 pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd); 563 pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
565 else 564 else
566 PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd); 565 PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp,
567#endif 566 native_pmd_val(pmd));
568} 567}
569#endif 568#endif
570 569
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 8ee45167e817..d475b4398d8b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -45,7 +45,7 @@
45#include <linux/stringify.h> 45#include <linux/stringify.h>
46 46
47#ifdef CONFIG_SMP 47#ifdef CONFIG_SMP
48#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x 48#define __percpu_prefix "%%"__stringify(__percpu_seg)":"
49#define __my_cpu_offset percpu_read(this_cpu_off) 49#define __my_cpu_offset percpu_read(this_cpu_off)
50 50
51/* 51/*
@@ -62,9 +62,11 @@
62 (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ 62 (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
63}) 63})
64#else 64#else
65#define __percpu_arg(x) "%P" #x 65#define __percpu_prefix ""
66#endif 66#endif
67 67
68#define __percpu_arg(x) __percpu_prefix "%P" #x
69
68/* 70/*
69 * Initialized pointers to per-cpu variables needed for the boot 71 * Initialized pointers to per-cpu variables needed for the boot
70 * processor need to use these macros to get the proper address 72 * processor need to use these macros to get the proper address
@@ -273,34 +275,34 @@ do { \
273 typeof(var) pxo_new__ = (nval); \ 275 typeof(var) pxo_new__ = (nval); \
274 switch (sizeof(var)) { \ 276 switch (sizeof(var)) { \
275 case 1: \ 277 case 1: \
276 asm("\n1:mov "__percpu_arg(1)",%%al" \ 278 asm("\n\tmov "__percpu_arg(1)",%%al" \
277 "\n\tcmpxchgb %2, "__percpu_arg(1) \ 279 "\n1:\tcmpxchgb %2, "__percpu_arg(1) \
278 "\n\tjnz 1b" \ 280 "\n\tjnz 1b" \
279 : "=a" (pxo_ret__), "+m" (var) \ 281 : "=&a" (pxo_ret__), "+m" (var) \
280 : "q" (pxo_new__) \ 282 : "q" (pxo_new__) \
281 : "memory"); \ 283 : "memory"); \
282 break; \ 284 break; \
283 case 2: \ 285 case 2: \
284 asm("\n1:mov "__percpu_arg(1)",%%ax" \ 286 asm("\n\tmov "__percpu_arg(1)",%%ax" \
285 "\n\tcmpxchgw %2, "__percpu_arg(1) \ 287 "\n1:\tcmpxchgw %2, "__percpu_arg(1) \
286 "\n\tjnz 1b" \ 288 "\n\tjnz 1b" \
287 : "=a" (pxo_ret__), "+m" (var) \ 289 : "=&a" (pxo_ret__), "+m" (var) \
288 : "r" (pxo_new__) \ 290 : "r" (pxo_new__) \
289 : "memory"); \ 291 : "memory"); \
290 break; \ 292 break; \
291 case 4: \ 293 case 4: \
292 asm("\n1:mov "__percpu_arg(1)",%%eax" \ 294 asm("\n\tmov "__percpu_arg(1)",%%eax" \
293 "\n\tcmpxchgl %2, "__percpu_arg(1) \ 295 "\n1:\tcmpxchgl %2, "__percpu_arg(1) \
294 "\n\tjnz 1b" \ 296 "\n\tjnz 1b" \
295 : "=a" (pxo_ret__), "+m" (var) \ 297 : "=&a" (pxo_ret__), "+m" (var) \
296 : "r" (pxo_new__) \ 298 : "r" (pxo_new__) \
297 : "memory"); \ 299 : "memory"); \
298 break; \ 300 break; \
299 case 8: \ 301 case 8: \
300 asm("\n1:mov "__percpu_arg(1)",%%rax" \ 302 asm("\n\tmov "__percpu_arg(1)",%%rax" \
301 "\n\tcmpxchgq %2, "__percpu_arg(1) \ 303 "\n1:\tcmpxchgq %2, "__percpu_arg(1) \
302 "\n\tjnz 1b" \ 304 "\n\tjnz 1b" \
303 : "=a" (pxo_ret__), "+m" (var) \ 305 : "=&a" (pxo_ret__), "+m" (var) \
304 : "r" (pxo_new__) \ 306 : "r" (pxo_new__) \
305 : "memory"); \ 307 : "memory"); \
306 break; \ 308 break; \
@@ -414,8 +416,6 @@ do { \
414#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) 416#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
415#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) 417#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
416#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) 418#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
417#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
418#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
419 419
420#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) 420#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
421#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) 421#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
@@ -432,8 +432,6 @@ do { \
432#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) 432#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
433#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) 433#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
434#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) 434#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
435#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
436#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
437 435
438#ifndef CONFIG_M386 436#ifndef CONFIG_M386
439#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) 437#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
@@ -455,6 +453,26 @@ do { \
455#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 453#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
456#endif /* !CONFIG_M386 */ 454#endif /* !CONFIG_M386 */
457 455
456#ifdef CONFIG_X86_CMPXCHG64
457#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \
458({ \
459 char __ret; \
460 typeof(o1) __o1 = o1; \
461 typeof(o1) __n1 = n1; \
462 typeof(o2) __o2 = o2; \
463 typeof(o2) __n2 = n2; \
464 typeof(o2) __dummy = n2; \
465 asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \
466 : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \
467 : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \
468 __ret; \
469})
470
471#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
472#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
473#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
474#endif /* CONFIG_X86_CMPXCHG64 */
475
458/* 476/*
459 * Per cpu atomic 64 bit operations are only available under 64 bit. 477 * Per cpu atomic 64 bit operations are only available under 64 bit.
460 * 32 bit must fall back to generic operations. 478 * 32 bit must fall back to generic operations.
@@ -475,11 +493,43 @@ do { \
475#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 493#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
476#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 494#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
477#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) 495#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
496#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
497#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
478 498
479#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) 499#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
480#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 500#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
481#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 501#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
482#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 502#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
503#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
504#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
505
506/*
507 * Pretty complex macro to generate cmpxchg16 instruction. The instruction
508 * is not supported on early AMD64 processors so we must be able to emulate
509 * it in software. The address used in the cmpxchg16 instruction must be
510 * aligned to a 16 byte boundary.
511 */
512#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
513({ \
514 char __ret; \
515 typeof(o1) __o1 = o1; \
516 typeof(o1) __n1 = n1; \
517 typeof(o2) __o2 = o2; \
518 typeof(o2) __n2 = n2; \
519 typeof(o2) __dummy; \
520 alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \
521 "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \
522 X86_FEATURE_CX16, \
523 ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
524 "S" (&pcp1), "b"(__n1), "c"(__n2), \
525 "a"(__o1), "d"(__o2) : "memory"); \
526 __ret; \
527})
528
529#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
530#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
531#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
532
483#endif 533#endif
484 534
485/* This is not atomic against other CPUs -- CPU preemption needs to be off */ 535/* This is not atomic against other CPUs -- CPU preemption needs to be off */
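
The cmpxchg_double operations added above compare and replace two adjacent per-CPU words in a single cmpxchg8b/cmpxchg16b, which is what lock-free per-CPU freelist updates want. A hedged sketch: the struct, its field names and the generic this_cpu_cmpxchg_double() wrapper are assumptions for illustration, and the alignment mirrors the 16-byte requirement stated in the comment above:

    #include <linux/percpu.h>

    struct pcp_pair {
            void *head;             /* e.g. a freelist pointer */
            unsigned long gen;      /* generation counter to defeat ABA */
    } __aligned(2 * sizeof(void *));

    static DEFINE_PER_CPU(struct pcp_pair, pair);

    static int push_pair(void *old_head, unsigned long old_gen,
                         void *new_head, unsigned long new_gen)
    {
            /* Both words are checked and written as one atomic unit. */
            return this_cpu_cmpxchg_double(pair.head, pair.gen,
                                           old_head, old_gen, new_head, new_gen);
    }
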
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index e2f6a99f14ab..56fd9e3abbda 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Netburst Perfomance Events (P4, old Xeon) 2 * Netburst Performance Events (P4, old Xeon)
3 */ 3 */
4 4
5#ifndef PERF_EVENT_P4_H 5#ifndef PERF_EVENT_P4_H
@@ -9,7 +9,7 @@
9#include <linux/bitops.h> 9#include <linux/bitops.h>
10 10
11/* 11/*
12 * NetBurst has perfomance MSRs shared between 12 * NetBurst has performance MSRs shared between
13 * threads if HT is turned on, ie for both logical 13 * threads if HT is turned on, ie for both logical
14 * processors (mem: in turn in Atom with HT support 14 * processors (mem: in turn in Atom with HT support
15 * perf-MSRs are not shared and every thread has its 15 * perf-MSRs are not shared and every thread has its
@@ -22,6 +22,7 @@
22 22
23#define ARCH_P4_CNTRVAL_BITS (40) 23#define ARCH_P4_CNTRVAL_BITS (40)
24#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1) 24#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
25#define ARCH_P4_UNFLAGGED_BIT ((1ULL) << (ARCH_P4_CNTRVAL_BITS - 1))
25 26
26#define P4_ESCR_EVENT_MASK 0x7e000000U 27#define P4_ESCR_EVENT_MASK 0x7e000000U
27#define P4_ESCR_EVENT_SHIFT 25 28#define P4_ESCR_EVENT_SHIFT 25
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 94b979d1b58d..effff47a3c82 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -69,8 +69,6 @@ static inline void native_pmd_clear(pmd_t *pmd)
69 69
70static inline void pud_clear(pud_t *pudp) 70static inline void pud_clear(pud_t *pudp)
71{ 71{
72 unsigned long pgd;
73
74 set_pud(pudp, __pud(0)); 72 set_pud(pudp, __pud(0));
75 73
76 /* 74 /*
@@ -79,13 +77,10 @@ static inline void pud_clear(pud_t *pudp)
79 * section 8.1: in PAE mode we explicitly have to flush the 77 * section 8.1: in PAE mode we explicitly have to flush the
80 * TLB via cr3 if the top-level pgd is changed... 78 * TLB via cr3 if the top-level pgd is changed...
81 * 79 *
82 * Make sure the pud entry we're updating is within the 80 * Currently all places where pud_clear() is called either have
83 * current pgd to avoid unnecessary TLB flushes. 81 * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or
82 * pud_clear_bad()), so we don't need TLB flush here.
84 */ 83 */
85 pgd = read_cr3();
86 if (__pa(pudp) >= pgd && __pa(pudp) <
87 (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
88 write_cr3(pgd);
89} 84}
90 85
91#ifdef CONFIG_SMP 86#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 7a3e836eb2a9..a898a2b6e10c 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -7,7 +7,7 @@
7 */ 7 */
8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ 8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ 9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
10#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ 10#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ 11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ 12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ 13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 45636cefa186..4c25ab48257b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -94,10 +94,6 @@ struct cpuinfo_x86 {
94 int x86_cache_alignment; /* In bytes */ 94 int x86_cache_alignment; /* In bytes */
95 int x86_power; 95 int x86_power;
96 unsigned long loops_per_jiffy; 96 unsigned long loops_per_jiffy;
97#ifdef CONFIG_SMP
98 /* cpus sharing the last level cache: */
99 cpumask_var_t llc_shared_map;
100#endif
101 /* cpuid returned max cores value: */ 97 /* cpuid returned max cores value: */
102 u16 x86_max_cores; 98 u16 x86_max_cores;
103 u16 apicid; 99 u16 apicid;
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index b4ec95f07518..971e0b46446e 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -1 +1,69 @@
1/* dummy prom.h; here to make linux/of.h's #includes happy */ 1/*
2 * Definitions for Device tree / OpenFirmware handling on X86
3 *
4 * based on arch/powerpc/include/asm/prom.h which is
5 * Copyright (C) 1996-2005 Paul Mackerras.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#ifndef _ASM_X86_PROM_H
14#define _ASM_X86_PROM_H
15#ifndef __ASSEMBLY__
16
17#include <linux/of.h>
18#include <linux/types.h>
19#include <linux/pci.h>
20
21#include <asm/irq.h>
22#include <asm/atomic.h>
23#include <asm/setup.h>
24#include <asm/irq_controller.h>
25
26#ifdef CONFIG_OF
27extern int of_ioapic;
28extern u64 initial_dtb;
29extern void add_dtb(u64 data);
30extern void x86_add_irq_domains(void);
31void __cpuinit x86_of_pci_init(void);
32void x86_dtb_init(void);
33
34static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
35{
36 return pdev ? pdev->dev.of_node : NULL;
37}
38
39static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
40{
41 return pci_device_to_OF_node(bus->self);
42}
43
44#else
45static inline void add_dtb(u64 data) { }
46static inline void x86_add_irq_domains(void) { }
47static inline void x86_of_pci_init(void) { }
48static inline void x86_dtb_init(void) { }
49#define of_ioapic 0
50#endif
51
52extern char cmd_line[COMMAND_LINE_SIZE];
53
54#define pci_address_to_pio pci_address_to_pio
55unsigned long pci_address_to_pio(phys_addr_t addr);
56
57/**
58 * irq_dispose_mapping - Unmap an interrupt
59 * @virq: linux virq number of the interrupt to unmap
60 *
61 * FIXME: We really should implement proper virq handling like power,
62 * but that's going to be major surgery.
63 */
64static inline void irq_dispose_mapping(unsigned int virq) { }
65
66#define HAVE_ARCH_DEVTREE_FIXUPS
67
68#endif /* __ASSEMBLY__ */
69#endif
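
A minimal sketch of how a PCI driver's probe routine could pick up its device-tree node through the new helpers (hypothetical driver and property name, not part of this patch; assumes CONFIG_OF and a firmware-populated dev.of_node):

#include <linux/pci.h>
#include <linux/of.h>
#include <asm/prom.h>

static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct device_node *np = pci_device_to_OF_node(pdev);
	const void *prop;

	if (!np)
		return -ENODEV;		/* no firmware node for this device */

	/* "reg" is only an illustrative property name */
	prop = of_get_property(np, "reg", NULL);
	if (!prop)
		return -EINVAL;

	return 0;
}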
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 52b098a6eebb..7b0a55a88851 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -31,7 +31,7 @@
31#define R12 24 31#define R12 24
32#define RBP 32 32#define RBP 32
33#define RBX 40 33#define RBX 40
34/* arguments: interrupts/non tracing syscalls only save upto here*/ 34/* arguments: interrupts/non tracing syscalls only save up to here*/
35#define R11 48 35#define R11 48
36#define R10 56 36#define R10 56
37#define R9 64 37#define R9 64
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 78cd1ea94500..1babf8adecdf 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -73,7 +73,7 @@ struct pt_regs {
73 unsigned long r12; 73 unsigned long r12;
74 unsigned long rbp; 74 unsigned long rbp;
75 unsigned long rbx; 75 unsigned long rbx;
76/* arguments: non interrupts/non tracing syscalls only save upto here*/ 76/* arguments: non interrupts/non tracing syscalls only save up to here*/
77 unsigned long r11; 77 unsigned long r11;
78 unsigned long r10; 78 unsigned long r10;
79 unsigned long r9; 79 unsigned long r9;
@@ -103,7 +103,7 @@ struct pt_regs {
103 unsigned long r12; 103 unsigned long r12;
104 unsigned long bp; 104 unsigned long bp;
105 unsigned long bx; 105 unsigned long bx;
106/* arguments: non interrupts/non tracing syscalls only save upto here*/ 106/* arguments: non interrupts/non tracing syscalls only save up to here*/
107 unsigned long r11; 107 unsigned long r11;
108 unsigned long r10; 108 unsigned long r10;
109 unsigned long r9; 109 unsigned long r9;
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 562d4fd31ba8..3250e3d605d9 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -18,7 +18,10 @@ extern struct machine_ops machine_ops;
18 18
19void native_machine_crash_shutdown(struct pt_regs *regs); 19void native_machine_crash_shutdown(struct pt_regs *regs);
20void native_machine_shutdown(void); 20void native_machine_shutdown(void);
21void machine_real_restart(const unsigned char *code, int length); 21void machine_real_restart(unsigned int type);
22/* These must match dispatch_table in reboot_32.S */
23#define MRR_BIOS 0
24#define MRR_APM 1
22 25
23typedef void (*nmi_shootdown_cb)(int, struct die_args*); 26typedef void (*nmi_shootdown_cb)(int, struct die_args*);
24void nmi_shootdown_cpus(nmi_shootdown_cb callback); 27void nmi_shootdown_cpus(nmi_shootdown_cb callback);
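
A hedged sketch of the caller side after this change: instead of handing over a real-mode code blob and its length, callers now select an entry in the dispatch table (hypothetical caller, not part of this patch):

#include <asm/reboot.h>

static void example_bios_reboot(void)
{
	/* MRR_BIOS indexes the BIOS-reset stub in reboot_32.S */
	machine_real_restart(MRR_BIOS);
}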
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index d1e41b0f9b60..df4cd32b4cc6 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -37,26 +37,9 @@
37#endif 37#endif
38 38
39#ifdef __KERNEL__ 39#ifdef __KERNEL__
40
41#include <linux/list.h>
42#include <linux/spinlock.h>
43#include <linux/lockdep.h>
44#include <asm/asm.h> 40#include <asm/asm.h>
45 41
46struct rwsem_waiter;
47
48extern asmregparm struct rw_semaphore *
49 rwsem_down_read_failed(struct rw_semaphore *sem);
50extern asmregparm struct rw_semaphore *
51 rwsem_down_write_failed(struct rw_semaphore *sem);
52extern asmregparm struct rw_semaphore *
53 rwsem_wake(struct rw_semaphore *);
54extern asmregparm struct rw_semaphore *
55 rwsem_downgrade_wake(struct rw_semaphore *sem);
56
57/* 42/*
58 * the semaphore definition
59 *
60 * The bias values and the counter type limits the number of 43 * The bias values and the counter type limits the number of
61 * potential readers/writers to 32767 for 32 bits and 2147483647 44 * potential readers/writers to 32767 for 32 bits and 2147483647
62 * for 64 bits. 45 * for 64 bits.
@@ -74,43 +57,6 @@ extern asmregparm struct rw_semaphore *
74#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 57#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
75#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 58#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
76 59
77typedef signed long rwsem_count_t;
78
79struct rw_semaphore {
80 rwsem_count_t count;
81 spinlock_t wait_lock;
82 struct list_head wait_list;
83#ifdef CONFIG_DEBUG_LOCK_ALLOC
84 struct lockdep_map dep_map;
85#endif
86};
87
88#ifdef CONFIG_DEBUG_LOCK_ALLOC
89# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
90#else
91# define __RWSEM_DEP_MAP_INIT(lockname)
92#endif
93
94
95#define __RWSEM_INITIALIZER(name) \
96{ \
97 RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
98 LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
99}
100
101#define DECLARE_RWSEM(name) \
102 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
103
104extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
105 struct lock_class_key *key);
106
107#define init_rwsem(sem) \
108do { \
109 static struct lock_class_key __key; \
110 \
111 __init_rwsem((sem), #sem, &__key); \
112} while (0)
113
114/* 60/*
115 * lock for reading 61 * lock for reading
116 */ 62 */
@@ -133,7 +79,7 @@ static inline void __down_read(struct rw_semaphore *sem)
133 */ 79 */
134static inline int __down_read_trylock(struct rw_semaphore *sem) 80static inline int __down_read_trylock(struct rw_semaphore *sem)
135{ 81{
136 rwsem_count_t result, tmp; 82 long result, tmp;
137 asm volatile("# beginning __down_read_trylock\n\t" 83 asm volatile("# beginning __down_read_trylock\n\t"
138 " mov %0,%1\n\t" 84 " mov %0,%1\n\t"
139 "1:\n\t" 85 "1:\n\t"
@@ -155,7 +101,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
155 */ 101 */
156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 102static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
157{ 103{
158 rwsem_count_t tmp; 104 long tmp;
159 asm volatile("# beginning down_write\n\t" 105 asm volatile("# beginning down_write\n\t"
160 LOCK_PREFIX " xadd %1,(%2)\n\t" 106 LOCK_PREFIX " xadd %1,(%2)\n\t"
161 /* adds 0xffff0001, returns the old value */ 107 /* adds 0xffff0001, returns the old value */
@@ -180,9 +126,8 @@ static inline void __down_write(struct rw_semaphore *sem)
180 */ 126 */
181static inline int __down_write_trylock(struct rw_semaphore *sem) 127static inline int __down_write_trylock(struct rw_semaphore *sem)
182{ 128{
183 rwsem_count_t ret = cmpxchg(&sem->count, 129 long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
184 RWSEM_UNLOCKED_VALUE, 130 RWSEM_ACTIVE_WRITE_BIAS);
185 RWSEM_ACTIVE_WRITE_BIAS);
186 if (ret == RWSEM_UNLOCKED_VALUE) 131 if (ret == RWSEM_UNLOCKED_VALUE)
187 return 1; 132 return 1;
188 return 0; 133 return 0;
@@ -193,7 +138,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
193 */ 138 */
194static inline void __up_read(struct rw_semaphore *sem) 139static inline void __up_read(struct rw_semaphore *sem)
195{ 140{
196 rwsem_count_t tmp; 141 long tmp;
197 asm volatile("# beginning __up_read\n\t" 142 asm volatile("# beginning __up_read\n\t"
198 LOCK_PREFIX " xadd %1,(%2)\n\t" 143 LOCK_PREFIX " xadd %1,(%2)\n\t"
199 /* subtracts 1, returns the old value */ 144 /* subtracts 1, returns the old value */
@@ -211,7 +156,7 @@ static inline void __up_read(struct rw_semaphore *sem)
211 */ 156 */
212static inline void __up_write(struct rw_semaphore *sem) 157static inline void __up_write(struct rw_semaphore *sem)
213{ 158{
214 rwsem_count_t tmp; 159 long tmp;
215 asm volatile("# beginning __up_write\n\t" 160 asm volatile("# beginning __up_write\n\t"
216 LOCK_PREFIX " xadd %1,(%2)\n\t" 161 LOCK_PREFIX " xadd %1,(%2)\n\t"
217 /* subtracts 0xffff0001, returns the old value */ 162 /* subtracts 0xffff0001, returns the old value */
@@ -247,8 +192,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
247/* 192/*
248 * implement atomic add functionality 193 * implement atomic add functionality
249 */ 194 */
250static inline void rwsem_atomic_add(rwsem_count_t delta, 195static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem)
251 struct rw_semaphore *sem)
252{ 196{
253 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" 197 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
254 : "+m" (sem->count) 198 : "+m" (sem->count)
@@ -258,10 +202,9 @@ static inline void rwsem_atomic_add(rwsem_count_t delta,
258/* 202/*
259 * implement exchange and add functionality 203 * implement exchange and add functionality
260 */ 204 */
261static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, 205static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
262 struct rw_semaphore *sem)
263{ 206{
264 rwsem_count_t tmp = delta; 207 long tmp = delta;
265 208
266 asm volatile(LOCK_PREFIX "xadd %0,%1" 209 asm volatile(LOCK_PREFIX "xadd %0,%1"
267 : "+r" (tmp), "+m" (sem->count) 210 : "+r" (tmp), "+m" (sem->count)
@@ -270,10 +213,5 @@ static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
270 return tmp + delta; 213 return tmp + delta;
271} 214}
272 215
273static inline int rwsem_is_locked(struct rw_semaphore *sem)
274{
275 return (sem->count != 0);
276}
277
278#endif /* __KERNEL__ */ 216#endif /* __KERNEL__ */
279#endif /* _ASM_X86_RWSEM_H */ 217#endif /* _ASM_X86_RWSEM_H */
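
With the structure definition, rwsem_count_t and the declaration helpers gone from the arch header, callers are unchanged: they keep going through the generic <linux/rwsem.h> API, which now carries those pieces, while this header only supplies the fast paths. A minimal sketch (hypothetical semaphore):

#include <linux/rwsem.h>

static DECLARE_RWSEM(example_sem);

static void example_reader(void)
{
	down_read(&example_sem);	/* fast path inlines __down_read() above */
	/* ... read-side critical section ... */
	up_read(&example_sem);
}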
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 231f1c1d6607..cd84f7208f76 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -1,14 +1,16 @@
1#ifndef _ASM_X86_SEGMENT_H 1#ifndef _ASM_X86_SEGMENT_H
2#define _ASM_X86_SEGMENT_H 2#define _ASM_X86_SEGMENT_H
3 3
4#include <linux/const.h>
5
4/* Constructor for a conventional segment GDT (or LDT) entry */ 6/* Constructor for a conventional segment GDT (or LDT) entry */
5/* This is a macro so it can be used in initializers */ 7/* This is a macro so it can be used in initializers */
6#define GDT_ENTRY(flags, base, limit) \ 8#define GDT_ENTRY(flags, base, limit) \
7 ((((base) & 0xff000000ULL) << (56-24)) | \ 9 ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \
8 (((flags) & 0x0000f0ffULL) << 40) | \ 10 (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \
9 (((limit) & 0x000f0000ULL) << (48-16)) | \ 11 (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \
10 (((base) & 0x00ffffffULL) << 16) | \ 12 (((base) & _AC(0x00ffffff,ULL)) << 16) | \
11 (((limit) & 0x0000ffffULL))) 13 (((limit) & _AC(0x0000ffff,ULL))))
12 14
13/* Simple and small GDT entries for booting only */ 15/* Simple and small GDT entries for booting only */
14 16
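
The point of wrapping the constants in _AC() is that <linux/const.h> expands it differently for C and for the assembler, so GDT_ENTRY() becomes usable from .S files as well. A rough sketch of the idea (simplified; the real header adds a level of macro indirection so arguments expand first):

#ifdef __ASSEMBLY__
#define _AC(X, Y)	X		/* assembler: no integer-constant suffix */
#else
#define _AC(X, Y)	(X##Y)		/* C: keep the ULL suffix */
#endif

/*
 * With that, the same GDT_ENTRY() works from C ...
 *	u64 desc = GDT_ENTRY(0xc09b, 0, 0xfffff);
 * ... and from 16-bit assembly:
 *	.quad GDT_ENTRY(0x809b, 0, 0xfffff)
 */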
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4c2f63c7fc1b..73b11bc0ae6f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -17,12 +17,24 @@
17#endif 17#endif
18#include <asm/thread_info.h> 18#include <asm/thread_info.h>
19#include <asm/cpumask.h> 19#include <asm/cpumask.h>
20#include <asm/cpufeature.h>
20 21
21extern int smp_num_siblings; 22extern int smp_num_siblings;
22extern unsigned int num_processors; 23extern unsigned int num_processors;
23 24
25static inline bool cpu_has_ht_siblings(void)
26{
27 bool has_siblings = false;
28#ifdef CONFIG_SMP
29 has_siblings = cpu_has_ht && smp_num_siblings > 1;
30#endif
31 return has_siblings;
32}
33
24DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); 34DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
25DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); 35DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
36/* cpus sharing the last level cache: */
37DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
26DECLARE_PER_CPU(u16, cpu_llc_id); 38DECLARE_PER_CPU(u16, cpu_llc_id);
27DECLARE_PER_CPU(int, cpu_number); 39DECLARE_PER_CPU(int, cpu_number);
28 40
@@ -36,14 +48,19 @@ static inline struct cpumask *cpu_core_mask(int cpu)
36 return per_cpu(cpu_core_map, cpu); 48 return per_cpu(cpu_core_map, cpu);
37} 49}
38 50
51static inline struct cpumask *cpu_llc_shared_mask(int cpu)
52{
53 return per_cpu(cpu_llc_shared_map, cpu);
54}
55
39DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); 56DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
40DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 57DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
58#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
59DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
60#endif
41 61
42/* Static state in head.S used to set up a CPU */ 62/* Static state in head.S used to set up a CPU */
43extern struct { 63extern unsigned long stack_start; /* Initial stack pointer address */
44 void *sp;
45 unsigned short ss;
46} stack_start;
47 64
48struct smp_ops { 65struct smp_ops {
49 void (*smp_prepare_boot_cpu)(void); 66 void (*smp_prepare_boot_cpu)(void);
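
A hedged sketch of how topology code can use the new per-cpu mask (hypothetical helper; assumes smpboot has populated cpu_llc_shared_map):

#include <linux/cpumask.h>
#include <asm/smp.h>

static unsigned int count_llc_siblings(int cpu)
{
	unsigned int n = 0;
	int sibling;

	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
		n++;				/* counts 'cpu' itself as well */

	return n;
}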
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 6c22bf353f26..725b77831993 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -34,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
34 */ 34 */
35 CMOS_WRITE(0, 0xf); 35 CMOS_WRITE(0, 0xf);
36 36
37 *((volatile long *)phys_to_virt(apic->trampoline_phys_low)) = 0; 37 *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
38} 38}
39 39
40static inline void __init smpboot_setup_io_apic(void) 40static inline void __init smpboot_setup_io_apic(void)
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 52b5c7ed3608..d7e89c83645d 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -47,7 +47,7 @@ struct stacktrace_ops {
47}; 47};
48 48
49void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 49void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
50 unsigned long *stack, 50 unsigned long *stack, unsigned long bp,
51 const struct stacktrace_ops *ops, void *data); 51 const struct stacktrace_ops *ops, void *data);
52 52
53#ifdef CONFIG_X86_32 53#ifdef CONFIG_X86_32
@@ -86,11 +86,11 @@ stack_frame(struct task_struct *task, struct pt_regs *regs)
86 86
87extern void 87extern void
88show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 88show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
89 unsigned long *stack, char *log_lvl); 89 unsigned long *stack, unsigned long bp, char *log_lvl);
90 90
91extern void 91extern void
92show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 92show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
93 unsigned long *sp, char *log_lvl); 93 unsigned long *sp, unsigned long bp, char *log_lvl);
94 94
95extern unsigned int code_bytes; 95extern unsigned int code_bytes;
96 96
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 33ecc3ea8782..12569e691ce3 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -98,8 +98,6 @@ do { \
98 */ 98 */
99#define HAVE_DISABLE_HLT 99#define HAVE_DISABLE_HLT
100#else 100#else
101#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
102#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
103 101
104/* frame pointer must be last for get_wchan */ 102/* frame pointer must be last for get_wchan */
105#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" 103#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h
deleted file mode 100644
index 1159e091ad09..000000000000
--- a/arch/x86/include/asm/system_64.h
+++ /dev/null
@@ -1,22 +0,0 @@
1#ifndef _ASM_X86_SYSTEM_64_H
2#define _ASM_X86_SYSTEM_64_H
3
4#include <asm/segment.h>
5#include <asm/cmpxchg.h>
6
7
8static inline unsigned long read_cr8(void)
9{
10 unsigned long cr8;
11 asm volatile("movq %%cr8,%0" : "=r" (cr8));
12 return cr8;
13}
14
15static inline void write_cr8(unsigned long val)
16{
17 asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
18}
19
20#include <linux/irqflags.h>
21
22#endif /* _ASM_X86_SYSTEM_64_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f0b6e5dbc5a0..1f2e61e28981 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -161,8 +161,14 @@ struct thread_info {
161 161
162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR 162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
163 163
164#define alloc_thread_info(tsk) \ 164#define alloc_thread_info_node(tsk, node) \
165 ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) 165({ \
166 struct page *page = alloc_pages_node(node, THREAD_FLAGS, \
167 THREAD_ORDER); \
168 struct thread_info *ret = page ? page_address(page) : NULL; \
169 \
170 ret; \
171})
166 172
167#ifdef CONFIG_X86_32 173#ifdef CONFIG_X86_32
168 174
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 21899cc31e52..910a7084f7f2 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -47,21 +47,6 @@
47 47
48#include <asm/mpspec.h> 48#include <asm/mpspec.h>
49 49
50#ifdef CONFIG_X86_32
51
52/* Mappings between logical cpu number and node number */
53extern int cpu_to_node_map[];
54
55/* Returns the number of the node containing CPU 'cpu' */
56static inline int __cpu_to_node(int cpu)
57{
58 return cpu_to_node_map[cpu];
59}
60#define early_cpu_to_node __cpu_to_node
61#define cpu_to_node __cpu_to_node
62
63#else /* CONFIG_X86_64 */
64
65/* Mappings between logical cpu number and node number */ 50/* Mappings between logical cpu number and node number */
66DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); 51DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
67 52
@@ -84,8 +69,6 @@ static inline int early_cpu_to_node(int cpu)
84 69
85#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 70#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
86 71
87#endif /* CONFIG_X86_64 */
88
89/* Mappings between node number and cpus on that node. */ 72/* Mappings between node number and cpus on that node. */
90extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 73extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
91 74
@@ -155,7 +138,7 @@ extern unsigned long node_remap_size[];
155 .balance_interval = 1, \ 138 .balance_interval = 1, \
156} 139}
157 140
158#ifdef CONFIG_X86_64_ACPI_NUMA 141#ifdef CONFIG_X86_64
159extern int __node_distance(int, int); 142extern int __node_distance(int, int);
160#define node_distance(a, b) __node_distance(a, b) 143#define node_distance(a, b) __node_distance(a, b)
161#endif 144#endif
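
With the guard widened from CONFIG_X86_64_ACPI_NUMA to CONFIG_X86_64, node_distance() resolves to the arch implementation for all 64-bit NUMA configurations. A small hedged sketch of a consumer (hypothetical helper):

#include <linux/topology.h>

/* pick whichever candidate node is closer to 'from' */
static int example_closer_node(int from, int a, int b)
{
	return node_distance(from, a) <= node_distance(from, b) ? a : b;
}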
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index f4500fb3b485..feca3118a73b 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -3,25 +3,36 @@
3 3
4#ifndef __ASSEMBLY__ 4#ifndef __ASSEMBLY__
5 5
6#ifdef CONFIG_X86_TRAMPOLINE 6#include <linux/types.h>
7#include <asm/io.h>
8
7/* 9/*
8 * Trampoline 80x86 program as an array. 10 * Trampoline 80x86 program as an array. These are in the init rodata
11 * segment, but that's okay, because we only care about the relative
12 * addresses of the symbols.
9 */ 13 */
10extern const unsigned char trampoline_data []; 14extern const unsigned char x86_trampoline_start [];
11extern const unsigned char trampoline_end []; 15extern const unsigned char x86_trampoline_end [];
12extern unsigned char *trampoline_base; 16extern unsigned char *x86_trampoline_base;
13 17
14extern unsigned long init_rsp; 18extern unsigned long init_rsp;
15extern unsigned long initial_code; 19extern unsigned long initial_code;
16extern unsigned long initial_gs; 20extern unsigned long initial_gs;
17 21
18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) 22extern void __init setup_trampolines(void);
23
24extern const unsigned char trampoline_data[];
25extern const unsigned char trampoline_status[];
26
27#define TRAMPOLINE_SYM(x) \
28 ((void *)(x86_trampoline_base + \
29 ((const unsigned char *)(x) - x86_trampoline_start)))
19 30
20extern unsigned long setup_trampoline(void); 31/* Address of the SMP trampoline */
21extern void __init reserve_trampoline_memory(void); 32static inline unsigned long trampoline_address(void)
22#else 33{
23static inline void reserve_trampoline_memory(void) {} 34 return virt_to_phys(TRAMPOLINE_SYM(trampoline_data));
24#endif /* CONFIG_X86_TRAMPOLINE */ 35}
25 36
26#endif /* __ASSEMBLY__ */ 37#endif /* __ASSEMBLY__ */
27 38
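
A hedged illustration of what TRAMPOLINE_SYM() buys callers: a symbol inside the linked trampoline image is rebased onto the low-memory copy set up by setup_trampolines(), and the physical address of that copy is what an AP or the ACPI wakeup vector gets pointed at (hypothetical helper, equivalent to trampoline_address() above):

#include <asm/io.h>
#include <asm/trampoline.h>

static unsigned long example_trampoline_phys(void)
{
	/* rebase the symbol from the image onto the copy below 1 MB */
	void *low_copy = TRAMPOLINE_SYM(trampoline_data);

	return virt_to_phys(low_copy);
}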
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 1ca132fc0d03..83e2efd181e2 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -35,7 +35,7 @@ static inline cycles_t get_cycles(void)
35static __always_inline cycles_t vget_cycles(void) 35static __always_inline cycles_t vget_cycles(void)
36{ 36{
37 /* 37 /*
38 * We only do VDSOs on TSC capable CPUs, so this shouldnt 38 * We only do VDSOs on TSC capable CPUs, so this shouldn't
39 * access boot_cpu_data (which is not VDSO-safe): 39 * access boot_cpu_data (which is not VDSO-safe):
40 */ 40 */
41#ifndef CONFIG_X86_TSC 41#ifndef CONFIG_X86_TSC
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
index df1da20f4534..8e8c23fef08c 100644
--- a/arch/x86/include/asm/types.h
+++ b/arch/x86/include/asm/types.h
@@ -1,22 +1,6 @@
1#ifndef _ASM_X86_TYPES_H 1#ifndef _ASM_X86_TYPES_H
2#define _ASM_X86_TYPES_H 2#define _ASM_X86_TYPES_H
3 3
4#define dma_addr_t dma_addr_t
5
6#include <asm-generic/types.h> 4#include <asm-generic/types.h>
7 5
8#ifdef __KERNEL__
9#ifndef __ASSEMBLY__
10
11typedef u64 dma64_addr_t;
12#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G)
13/* DMA addresses come in 32-bit and 64-bit flavours. */
14typedef u64 dma_addr_t;
15#else
16typedef u32 dma_addr_t;
17#endif
18
19#endif /* __ASSEMBLY__ */
20#endif /* __KERNEL__ */
21
22#endif /* _ASM_X86_TYPES_H */ 6#endif /* _ASM_X86_TYPES_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index b766a5e8ba0e..a755ef5e5977 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -346,10 +346,14 @@
346#define __NR_fanotify_init 338 346#define __NR_fanotify_init 338
347#define __NR_fanotify_mark 339 347#define __NR_fanotify_mark 339
348#define __NR_prlimit64 340 348#define __NR_prlimit64 340
349#define __NR_name_to_handle_at 341
350#define __NR_open_by_handle_at 342
351#define __NR_clock_adjtime 343
352#define __NR_syncfs 344
349 353
350#ifdef __KERNEL__ 354#ifdef __KERNEL__
351 355
352#define NR_syscalls 341 356#define NR_syscalls 345
353 357
354#define __ARCH_WANT_IPC_PARSE_VERSION 358#define __ARCH_WANT_IPC_PARSE_VERSION
355#define __ARCH_WANT_OLD_READDIR 359#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 363e9b8a715b..160fa76bd578 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -669,6 +669,14 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) 669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
670#define __NR_prlimit64 302 670#define __NR_prlimit64 302
671__SYSCALL(__NR_prlimit64, sys_prlimit64) 671__SYSCALL(__NR_prlimit64, sys_prlimit64)
672#define __NR_name_to_handle_at 303
673__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
674#define __NR_open_by_handle_at 304
675__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
676#define __NR_clock_adjtime 305
677__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
678#define __NR_syncfs 306
679__SYSCALL(__NR_syncfs, sys_syncfs)
672 680
673#ifndef __NO_STUBS 681#ifndef __NO_STUBS
674#define __ARCH_WANT_OLD_READDIR 682#define __ARCH_WANT_OLD_READDIR
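
A hedged userspace sketch (not kernel code): until libc grows wrappers, the new entries can be exercised through syscall(2), assuming installed kernel headers that already carry these numbers:

#include <unistd.h>
#include <fcntl.h>
#include <sys/syscall.h>

int main(void)
{
	int fd = open("/", O_RDONLY);

	if (fd < 0)
		return 1;
	/* 344 on 32-bit x86, 306 on x86-64, per the hunks above */
	return syscall(__NR_syncfs, fd) ? 1 : 0;
}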
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index ce1d54c8a433..3e094af443c3 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -176,7 +176,7 @@ struct bau_msg_payload {
176struct bau_msg_header { 176struct bau_msg_header {
177 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ 177 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
178 /* bits 5:0 */ 178 /* bits 5:0 */
179 unsigned int base_dest_nodeid:15; /* nasid (pnode<<1) of */ 179 unsigned int base_dest_nodeid:15; /* nasid of the */
180 /* bits 20:6 */ /* first bit in uvhub map */ 180 /* bits 20:6 */ /* first bit in uvhub map */
181 unsigned int command:8; /* message type */ 181 unsigned int command:8; /* message type */
182 /* bits 28:21 */ 182 /* bits 28:21 */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 64642ad019fb..643ebf2e2ad8 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -83,11 +83,13 @@ struct x86_init_paging {
83 * boot cpu 83 * boot cpu
84 * @tsc_pre_init: platform function called before TSC init 84 * @tsc_pre_init: platform function called before TSC init
85 * @timer_init: initialize the platform timer (default PIT/HPET) 85 * @timer_init: initialize the platform timer (default PIT/HPET)
86 * @wallclock_init: init the wallclock device
86 */ 87 */
87struct x86_init_timers { 88struct x86_init_timers {
88 void (*setup_percpu_clockev)(void); 89 void (*setup_percpu_clockev)(void);
89 void (*tsc_pre_init)(void); 90 void (*tsc_pre_init)(void);
90 void (*timer_init)(void); 91 void (*timer_init)(void);
92 void (*wallclock_init)(void);
91}; 93};
92 94
93/** 95/**
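
A hedged sketch of how a platform can use the new hook (hypothetical platform setup code, not part of this patch):

#include <asm/x86_init.h>

static void __init example_wallclock_init(void)
{
	/* bring up the platform RTC / read the initial wall-clock time */
}

static void __init example_platform_setup(void)
{
	x86_init.timers.wallclock_init = example_wallclock_init;
}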
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a3c28ae4025b..8508bfe52296 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -287,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set)
287static inline int 287static inline int
288HYPERVISOR_sched_op(int cmd, void *arg) 288HYPERVISOR_sched_op(int cmd, void *arg)
289{ 289{
290 return _hypercall2(int, sched_op_new, cmd, arg); 290 return _hypercall2(int, sched_op, cmd, arg);
291} 291}
292 292
293static inline long 293static inline long
@@ -422,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value)
422#endif 422#endif
423 423
424static inline int 424static inline int
425HYPERVISOR_suspend(unsigned long srec) 425HYPERVISOR_suspend(unsigned long start_info_mfn)
426{ 426{
427 return _hypercall3(int, sched_op, SCHEDOP_shutdown, 427 struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
428 SHUTDOWN_suspend, srec); 428
429 /*
430 * For a PV guest the tools require that the start_info mfn be
431 * present in rdx/edx when the hypercall is made. Per the
432 * hypercall calling convention this is the third hypercall
433 * argument, which is start_info_mfn here.
434 */
435 return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
429} 436}
430 437
431static inline int 438static inline int
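
A hedged sketch of the PV call site this comment describes (hypothetical caller; assumes a PV guest with xen_start_info mapped):

#include <xen/interface/xen.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>

static int example_pv_suspend(void)
{
	/* the start_info mfn lands in rdx/edx as the comment requires */
	return HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
}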
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 1c10c88ee4e1..5d4922ad4b9b 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -86,7 +86,7 @@ DEFINE_GUEST_HANDLE(void);
86 * The privilege level specifies which modes may enter a trap via a software 86 * The privilege level specifies which modes may enter a trap via a software
87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate 87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
88 * privilege levels as follows: 88 * privilege levels as follows:
89 * Level == 0: Noone may enter 89 * Level == 0: No one may enter
90 * Level == 1: Kernel may enter 90 * Level == 1: Kernel may enter
91 * Level == 2: Kernel may enter 91 * Level == 2: Kernel may enter
92 * Level == 3: Everyone may enter 92 * Level == 3: Everyone may enter
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index f25bdf238a33..c61934fbf22a 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -29,8 +29,10 @@ typedef struct xpaddr {
29 29
30/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ 30/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
31#define INVALID_P2M_ENTRY (~0UL) 31#define INVALID_P2M_ENTRY (~0UL)
32#define FOREIGN_FRAME_BIT (1UL<<31) 32#define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1))
33#define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2))
33#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) 34#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
35#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
34 36
35/* Maximum amount of memory we can handle in a domain in pages */ 37/* Maximum amount of memory we can handle in a domain in pages */
36#define MAX_DOMAIN_PAGES \ 38#define MAX_DOMAIN_PAGES \
@@ -41,12 +43,18 @@ extern unsigned int machine_to_phys_order;
41 43
42extern unsigned long get_phys_to_machine(unsigned long pfn); 44extern unsigned long get_phys_to_machine(unsigned long pfn);
43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 45extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
46extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
47extern unsigned long set_phys_range_identity(unsigned long pfn_s,
48 unsigned long pfn_e);
44 49
45extern int m2p_add_override(unsigned long mfn, struct page *page); 50extern int m2p_add_override(unsigned long mfn, struct page *page);
46extern int m2p_remove_override(struct page *page); 51extern int m2p_remove_override(struct page *page);
47extern struct page *m2p_find_override(unsigned long mfn); 52extern struct page *m2p_find_override(unsigned long mfn);
48extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); 53extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
49 54
55#ifdef CONFIG_XEN_DEBUG_FS
56extern int p2m_dump_show(struct seq_file *m, void *v);
57#endif
50static inline unsigned long pfn_to_mfn(unsigned long pfn) 58static inline unsigned long pfn_to_mfn(unsigned long pfn)
51{ 59{
52 unsigned long mfn; 60 unsigned long mfn;
@@ -57,7 +65,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
57 mfn = get_phys_to_machine(pfn); 65 mfn = get_phys_to_machine(pfn);
58 66
59 if (mfn != INVALID_P2M_ENTRY) 67 if (mfn != INVALID_P2M_ENTRY)
60 mfn &= ~FOREIGN_FRAME_BIT; 68 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
61 69
62 return mfn; 70 return mfn;
63} 71}
@@ -73,25 +81,44 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
73static inline unsigned long mfn_to_pfn(unsigned long mfn) 81static inline unsigned long mfn_to_pfn(unsigned long mfn)
74{ 82{
75 unsigned long pfn; 83 unsigned long pfn;
84 int ret = 0;
76 85
77 if (xen_feature(XENFEAT_auto_translated_physmap)) 86 if (xen_feature(XENFEAT_auto_translated_physmap))
78 return mfn; 87 return mfn;
79 88
89 if (unlikely((mfn >> machine_to_phys_order) != 0)) {
90 pfn = ~0;
91 goto try_override;
92 }
80 pfn = 0; 93 pfn = 0;
81 /* 94 /*
82 * The array access can fail (e.g., device space beyond end of RAM). 95 * The array access can fail (e.g., device space beyond end of RAM).
83 * In such cases it doesn't matter what we return (we return garbage), 96 * In such cases it doesn't matter what we return (we return garbage),
84 * but we must handle the fault without crashing! 97 * but we must handle the fault without crashing!
85 */ 98 */
86 __get_user(pfn, &machine_to_phys_mapping[mfn]); 99 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
87 100try_override:
88 /* 101 /* ret might be < 0 if there are no entries in the m2p for mfn */
89 * If this appears to be a foreign mfn (because the pfn 102 if (ret < 0)
90 * doesn't map back to the mfn), then check the local override 103 pfn = ~0;
91 * table to see if there's a better pfn to use. 104 else if (get_phys_to_machine(pfn) != mfn)
105 /*
106 * If this appears to be a foreign mfn (because the pfn
107 * doesn't map back to the mfn), then check the local override
108 * table to see if there's a better pfn to use.
109 *
110 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
111 */
112 pfn = m2p_find_override_pfn(mfn, ~0);
113
114 /*
115 * pfn is ~0 if there are no entries in the m2p for mfn or if the
116 * entry doesn't map back to the mfn and m2p_override doesn't have a
117 * valid entry for it.
92 */ 118 */
93 if (get_phys_to_machine(pfn) != mfn) 119 if (pfn == ~0 &&
94 pfn = m2p_find_override_pfn(mfn, pfn); 120 get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
121 pfn = mfn;
95 122
96 return pfn; 123 return pfn;
97} 124}
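
A hedged sketch of the setup side that the identity-frame handling above relies on (hypothetical caller): set_phys_range_identity() marks a pfn range as 1:1 in the p2m, so pfn_to_mfn() returns the pfn itself and mfn_to_pfn() recognizes it via IDENTITY_FRAME(mfn).

#include <asm/xen/page.h>

static unsigned long example_mark_identity(unsigned long start_pfn,
					   unsigned long end_pfn)
{
	/* returns how many p2m entries were actually set */
	return set_phys_range_identity(start_pfn, end_pfn);
}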
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 2329b3eaf8d3..aa8620989162 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -27,16 +27,16 @@ static inline void __init xen_setup_pirqs(void)
27 * its own functions. 27 * its own functions.
28 */ 28 */
29struct xen_pci_frontend_ops { 29struct xen_pci_frontend_ops {
30 int (*enable_msi)(struct pci_dev *dev, int **vectors); 30 int (*enable_msi)(struct pci_dev *dev, int vectors[]);
31 void (*disable_msi)(struct pci_dev *dev); 31 void (*disable_msi)(struct pci_dev *dev);
32 int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec); 32 int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec);
33 void (*disable_msix)(struct pci_dev *dev); 33 void (*disable_msix)(struct pci_dev *dev);
34}; 34};
35 35
36extern struct xen_pci_frontend_ops *xen_pci_frontend; 36extern struct xen_pci_frontend_ops *xen_pci_frontend;
37 37
38static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev, 38static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
39 int **vectors) 39 int vectors[])
40{ 40{
41 if (xen_pci_frontend && xen_pci_frontend->enable_msi) 41 if (xen_pci_frontend && xen_pci_frontend->enable_msi)
42 return xen_pci_frontend->enable_msi(dev, vectors); 42 return xen_pci_frontend->enable_msi(dev, vectors);
@@ -48,7 +48,7 @@ static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
48 xen_pci_frontend->disable_msi(dev); 48 xen_pci_frontend->disable_msi(dev);
49} 49}
50static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev, 50static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
51 int **vectors, int nvec) 51 int vectors[], int nvec)
52{ 52{
53 if (xen_pci_frontend && xen_pci_frontend->enable_msix) 53 if (xen_pci_frontend && xen_pci_frontend->enable_msix)
54 return xen_pci_frontend->enable_msix(dev, vectors, nvec); 54 return xen_pci_frontend->enable_msix(dev, vectors, nvec);
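
A hedged sketch of the calling convention after this change: the caller owns a plain int array that the frontend fills in, instead of receiving one through an int ** (hypothetical caller):

#include <linux/slab.h>
#include <linux/pci.h>
#include <asm/xen/pci.h>

static int example_enable_msix(struct pci_dev *dev, int nvec)
{
	int *v = kcalloc(nvec, sizeof(*v), GFP_KERNEL);
	int ret;

	if (!v)
		return -ENOMEM;

	/* on success v[0..nvec-1] holds the allocated vectors */
	ret = xen_pci_frontend_enable_msix(dev, v, nvec);

	kfree(v);
	return ret;
}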
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34244b2cd880..7338ef2218bc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -41,13 +41,13 @@ obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
43obj-y += bootflag.o e820.o 43obj-y += bootflag.o e820.o
44obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 44obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
46obj-y += tsc.o io_delay.o rtc.o 46obj-y += tsc.o io_delay.o rtc.o
47obj-y += pci-iommu_table.o 47obj-y += pci-iommu_table.o
48obj-y += resource.o 48obj-y += resource.o
49 49
50obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 50obj-y += trampoline.o trampoline_$(BITS).o
51obj-y += process.o 51obj-y += process.o
52obj-y += i387.o xsave.o 52obj-y += i387.o xsave.o
53obj-y += ptrace.o 53obj-y += ptrace.o
@@ -55,10 +55,12 @@ obj-$(CONFIG_X86_32) += tls.o
55obj-$(CONFIG_IA32_EMULATION) += tls.o 55obj-$(CONFIG_IA32_EMULATION) += tls.o
56obj-y += step.o 56obj-y += step.o
57obj-$(CONFIG_INTEL_TXT) += tboot.o 57obj-$(CONFIG_INTEL_TXT) += tboot.o
58obj-$(CONFIG_ISA_DMA_API) += i8237.o
58obj-$(CONFIG_STACKTRACE) += stacktrace.o 59obj-$(CONFIG_STACKTRACE) += stacktrace.o
59obj-y += cpu/ 60obj-y += cpu/
60obj-y += acpi/ 61obj-y += acpi/
61obj-y += reboot.o 62obj-y += reboot.o
63obj-$(CONFIG_X86_32) += reboot_32.o
62obj-$(CONFIG_MCA) += mca_32.o 64obj-$(CONFIG_MCA) += mca_32.o
63obj-$(CONFIG_X86_MSR) += msr.o 65obj-$(CONFIG_X86_MSR) += msr.o
64obj-$(CONFIG_X86_CPUID) += cpuid.o 66obj-$(CONFIG_X86_CPUID) += cpuid.o
@@ -66,10 +68,9 @@ obj-$(CONFIG_PCI) += early-quirks.o
66apm-y := apm_32.o 68apm-y := apm_32.o
67obj-$(CONFIG_APM) += apm.o 69obj-$(CONFIG_APM) += apm.o
68obj-$(CONFIG_SMP) += smp.o 70obj-$(CONFIG_SMP) += smp.o
69obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o 71obj-$(CONFIG_SMP) += smpboot.o
72obj-$(CONFIG_SMP) += tsc_sync.o
70obj-$(CONFIG_SMP) += setup_percpu.o 73obj-$(CONFIG_SMP) += setup_percpu.o
71obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
72obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
73obj-$(CONFIG_X86_MPPARSE) += mpparse.o 74obj-$(CONFIG_X86_MPPARSE) += mpparse.o
74obj-y += apic/ 75obj-y += apic/
75obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 76obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
@@ -109,6 +110,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o
109obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 110obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
110 111
111obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 112obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
113obj-$(CONFIG_OF) += devicetree.o
112 114
113### 115###
114# 64 bit specific files 116# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index b3a71137983a..9a966c579af5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -72,6 +72,7 @@ u8 acpi_sci_flags __initdata;
72int acpi_sci_override_gsi __initdata; 72int acpi_sci_override_gsi __initdata;
73int acpi_skip_timer_override __initdata; 73int acpi_skip_timer_override __initdata;
74int acpi_use_timer_override __initdata; 74int acpi_use_timer_override __initdata;
75int acpi_fix_pin2_polarity __initdata;
75 76
76#ifdef CONFIG_X86_LOCAL_APIC 77#ifdef CONFIG_X86_LOCAL_APIC
77static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; 78static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
@@ -415,10 +416,15 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
415 return 0; 416 return 0;
416 } 417 }
417 418
418 if (acpi_skip_timer_override && 419 if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
419 intsrc->source_irq == 0 && intsrc->global_irq == 2) { 420 if (acpi_skip_timer_override) {
420 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); 421 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
421 return 0; 422 return 0;
423 }
424 if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
425 intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
426 printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
427 }
422 } 428 }
423 429
424 mp_override_legacy_irq(intsrc->source_irq, 430 mp_override_legacy_irq(intsrc->source_irq,
@@ -589,14 +595,8 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
589 nid = acpi_get_node(handle); 595 nid = acpi_get_node(handle);
590 if (nid == -1 || !node_online(nid)) 596 if (nid == -1 || !node_online(nid))
591 return; 597 return;
592#ifdef CONFIG_X86_64 598 set_apicid_to_node(physid, nid);
593 apicid_to_node[physid] = nid;
594 numa_set_node(cpu, nid); 599 numa_set_node(cpu, nid);
595#else /* CONFIG_X86_32 */
596 apicid_2_node[physid] = nid;
597 cpu_to_node_map[cpu] = nid;
598#endif
599
600#endif 600#endif
601} 601}
602 602
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 28595d6df47c..ead21b663117 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -6,11 +6,17 @@
6#include <asm/page_types.h> 6#include <asm/page_types.h>
7#include <asm/pgtable_types.h> 7#include <asm/pgtable_types.h>
8#include <asm/processor-flags.h> 8#include <asm/processor-flags.h>
9#include "wakeup.h"
9 10
10 .code16 11 .code16
11 .section ".header", "a" 12 .section ".jump", "ax"
13 .globl _start
14_start:
15 cli
16 jmp wakeup_code
12 17
13/* This should match the structure in wakeup.h */ 18/* This should match the structure in wakeup.h */
19 .section ".header", "a"
14 .globl wakeup_header 20 .globl wakeup_header
15wakeup_header: 21wakeup_header:
16video_mode: .short 0 /* Video mode number */ 22video_mode: .short 0 /* Video mode number */
@@ -30,14 +36,11 @@ wakeup_jmp: .byte 0xea /* ljmpw */
30wakeup_jmp_off: .word 3f 36wakeup_jmp_off: .word 3f
31wakeup_jmp_seg: .word 0 37wakeup_jmp_seg: .word 0
32wakeup_gdt: .quad 0, 0, 0 38wakeup_gdt: .quad 0, 0, 0
33signature: .long 0x51ee1111 39signature: .long WAKEUP_HEADER_SIGNATURE
34 40
35 .text 41 .text
36 .globl _start
37 .code16 42 .code16
38wakeup_code: 43wakeup_code:
39_start:
40 cli
41 cld 44 cld
42 45
43 /* Apparently some dimwit BIOS programmers don't know how to 46 /* Apparently some dimwit BIOS programmers don't know how to
@@ -77,12 +80,12 @@ _start:
77 80
78 /* Check header signature... */ 81 /* Check header signature... */
79 movl signature, %eax 82 movl signature, %eax
80 cmpl $0x51ee1111, %eax 83 cmpl $WAKEUP_HEADER_SIGNATURE, %eax
81 jne bogus_real_magic 84 jne bogus_real_magic
82 85
83 /* Check we really have everything... */ 86 /* Check we really have everything... */
84 movl end_signature, %eax 87 movl end_signature, %eax
85 cmpl $0x65a22c82, %eax 88 cmpl $WAKEUP_END_SIGNATURE, %eax
86 jne bogus_real_magic 89 jne bogus_real_magic
87 90
88 /* Call the C code */ 91 /* Call the C code */
@@ -147,3 +150,7 @@ wakeup_heap:
147wakeup_stack: 150wakeup_stack:
148 .space 2048 151 .space 2048
149wakeup_stack_end: 152wakeup_stack_end:
153
154 .section ".signature","a"
155end_signature:
156 .long WAKEUP_END_SIGNATURE
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index 69d38d0b2b64..e1828c07e79c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -35,7 +35,8 @@ struct wakeup_header {
35extern struct wakeup_header wakeup_header; 35extern struct wakeup_header wakeup_header;
36#endif 36#endif
37 37
38#define HEADER_OFFSET 0x3f00 38#define WAKEUP_HEADER_OFFSET 8
39#define WAKEUP_SIZE 0x4000 39#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
40#define WAKEUP_END_SIGNATURE 0x65a22c82
40 41
41#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ 42#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 060fff8f5c5b..d4f8010a5b1b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -13,9 +13,19 @@ ENTRY(_start)
13SECTIONS 13SECTIONS
14{ 14{
15 . = 0; 15 . = 0;
16 .jump : {
17 *(.jump)
18 } = 0x90909090
19
20 . = WAKEUP_HEADER_OFFSET;
21 .header : {
22 *(.header)
23 }
24
25 . = ALIGN(16);
16 .text : { 26 .text : {
17 *(.text*) 27 *(.text*)
18 } 28 } = 0x90909090
19 29
20 . = ALIGN(16); 30 . = ALIGN(16);
21 .rodata : { 31 .rodata : {
@@ -33,11 +43,6 @@ SECTIONS
33 *(.data*) 43 *(.data*)
34 } 44 }
35 45
36 .signature : {
37 end_signature = .;
38 LONG(0x65a22c82)
39 }
40
41 . = ALIGN(16); 46 . = ALIGN(16);
42 .bss : { 47 .bss : {
43 __bss_start = .; 48 __bss_start = .;
@@ -45,20 +50,13 @@ SECTIONS
45 __bss_end = .; 50 __bss_end = .;
46 } 51 }
47 52
48 . = HEADER_OFFSET; 53 .signature : {
49 .header : { 54 *(.signature)
50 *(.header)
51 } 55 }
52 56
53 . = ALIGN(16);
54 _end = .; 57 _end = .;
55 58
56 /DISCARD/ : { 59 /DISCARD/ : {
57 *(.note*) 60 *(.note*)
58 } 61 }
59
60 /*
61 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
62 */
63 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
64} 62}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 69fd72aa5594..ff93bc1b09c3 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -12,45 +12,34 @@
12#include <linux/cpumask.h> 12#include <linux/cpumask.h>
13#include <asm/segment.h> 13#include <asm/segment.h>
14#include <asm/desc.h> 14#include <asm/desc.h>
15
16#ifdef CONFIG_X86_32
17#include <asm/pgtable.h> 15#include <asm/pgtable.h>
18#endif 16#include <asm/cacheflush.h>
19 17
20#include "realmode/wakeup.h" 18#include "realmode/wakeup.h"
21#include "sleep.h" 19#include "sleep.h"
22 20
23unsigned long acpi_wakeup_address;
24unsigned long acpi_realmode_flags; 21unsigned long acpi_realmode_flags;
25 22
26/* address in low memory of the wakeup routine. */
27static unsigned long acpi_realmode;
28
29#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) 23#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
30static char temp_stack[4096]; 24static char temp_stack[4096];
31#endif 25#endif
32 26
33/** 27/**
34 * acpi_save_state_mem - save kernel state 28 * acpi_suspend_lowlevel - save kernel state
35 * 29 *
36 * Create an identity mapped page table and copy the wakeup routine to 30 * Create an identity mapped page table and copy the wakeup routine to
37 * low memory. 31 * low memory.
38 *
39 * Note that this is too late to change acpi_wakeup_address.
40 */ 32 */
41int acpi_save_state_mem(void) 33int acpi_suspend_lowlevel(void)
42{ 34{
43 struct wakeup_header *header; 35 struct wakeup_header *header;
36 /* address in low memory of the wakeup routine. */
37 char *acpi_realmode;
44 38
45 if (!acpi_realmode) { 39 acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
46 printk(KERN_ERR "Could not allocate memory during boot, "
47 "S3 disabled\n");
48 return -ENOMEM;
49 }
50 memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
51 40
52 header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); 41 header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
53 if (header->signature != 0x51ee1111) { 42 if (header->signature != WAKEUP_HEADER_SIGNATURE) {
54 printk(KERN_ERR "wakeup header does not match\n"); 43 printk(KERN_ERR "wakeup header does not match\n");
55 return -EINVAL; 44 return -EINVAL;
56 } 45 }
@@ -70,9 +59,7 @@ int acpi_save_state_mem(void)
70 /* GDT[0]: GDT self-pointer */ 59 /* GDT[0]: GDT self-pointer */
71 header->wakeup_gdt[0] = 60 header->wakeup_gdt[0] =
72 (u64)(sizeof(header->wakeup_gdt) - 1) + 61 (u64)(sizeof(header->wakeup_gdt) - 1) +
73 ((u64)(acpi_wakeup_address + 62 ((u64)__pa(&header->wakeup_gdt) << 16);
74 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
75 << 16);
76 /* GDT[1]: big real mode-like code segment */ 63 /* GDT[1]: big real mode-like code segment */
77 header->wakeup_gdt[1] = 64 header->wakeup_gdt[1] =
78 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); 65 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
@@ -98,9 +85,9 @@ int acpi_save_state_mem(void)
98 header->pmode_cr3 = (u32)__pa(&initial_page_table); 85 header->pmode_cr3 = (u32)__pa(&initial_page_table);
99 saved_magic = 0x12345678; 86 saved_magic = 0x12345678;
100#else /* CONFIG_64BIT */ 87#else /* CONFIG_64BIT */
101 header->trampoline_segment = setup_trampoline() >> 4; 88 header->trampoline_segment = trampoline_address() >> 4;
102#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
103 stack_start.sp = temp_stack + sizeof(temp_stack); 90 stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
104 early_gdt_descr.address = 91 early_gdt_descr.address =
105 (unsigned long)get_cpu_gdt_table(smp_processor_id()); 92 (unsigned long)get_cpu_gdt_table(smp_processor_id());
106 initial_gs = per_cpu_offset(smp_processor_id()); 93 initial_gs = per_cpu_offset(smp_processor_id());
@@ -109,47 +96,10 @@ int acpi_save_state_mem(void)
109 saved_magic = 0x123456789abcdef0L; 96 saved_magic = 0x123456789abcdef0L;
110#endif /* CONFIG_64BIT */ 97#endif /* CONFIG_64BIT */
111 98
99 do_suspend_lowlevel();
112 return 0; 100 return 0;
113} 101}
114 102
115/*
116 * acpi_restore_state - undo effects of acpi_save_state_mem
117 */
118void acpi_restore_state_mem(void)
119{
120}
121
122
123/**
124 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
125 *
126 * We allocate a page from the first 1MB of memory for the wakeup
127 * routine for when we come back from a sleep state. The
128 * runtime allocator allows specification of <16MB pages, but not
129 * <1MB pages.
130 */
131void __init acpi_reserve_wakeup_memory(void)
132{
133 phys_addr_t mem;
134
135 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
136 printk(KERN_ERR
137 "ACPI: Wakeup code way too big, S3 disabled.\n");
138 return;
139 }
140
141 mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
142
143 if (mem == MEMBLOCK_ERROR) {
144 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
145 return;
146 }
147 acpi_realmode = (unsigned long) phys_to_virt(mem);
148 acpi_wakeup_address = mem;
149 memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
150}
151
152
153static int __init acpi_sleep_setup(char *str) 103static int __init acpi_sleep_setup(char *str)
154{ 104{
155 while ((str != NULL) && (*str != '\0')) { 105 while ((str != NULL) && (*str != '\0')) {
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index adbcbaa6f1df..416d4be13fef 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -4,13 +4,12 @@
4 4
5#include <asm/trampoline.h> 5#include <asm/trampoline.h>
6 6
7extern char wakeup_code_start, wakeup_code_end;
8
9extern unsigned long saved_video_mode; 7extern unsigned long saved_video_mode;
10extern long saved_magic; 8extern long saved_magic;
11 9
12extern int wakeup_pmode_return; 10extern int wakeup_pmode_return;
13extern char swsusp_pg_dir[PAGE_SIZE];
14 11
15extern unsigned long acpi_copy_wakeup_routine(unsigned long); 12extern unsigned long acpi_copy_wakeup_routine(unsigned long);
16extern void wakeup_long64(void); 13extern void wakeup_long64(void);
14
15extern void do_suspend_lowlevel(void);
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
index 6ff3b5730575..63b8ab524f2c 100644
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ b/arch/x86/kernel/acpi/wakeup_rm.S
@@ -2,9 +2,11 @@
2 * Wrapper script for the realmode binary as a transport object 2 * Wrapper script for the realmode binary as a transport object
3 * before copying to low memory. 3 * before copying to low memory.
4 */ 4 */
5 .section ".rodata","a" 5#include <asm/page_types.h>
6 .globl wakeup_code_start, wakeup_code_end 6
7wakeup_code_start: 7 .section ".x86_trampoline","a"
8 .balign PAGE_SIZE
9 .globl acpi_wakeup_code
10acpi_wakeup_code:
8 .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" 11 .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
9wakeup_code_end: 12 .size acpi_wakeup_code, .-acpi_wakeup_code
10 .size wakeup_code_start, .-wakeup_code_start
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 123608531c8f..4a234677e213 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -199,7 +199,7 @@ void *text_poke_early(void *addr, const void *opcode, size_t len);
199 199
200/* Replace instructions with better alternatives for this CPU type. 200/* Replace instructions with better alternatives for this CPU type.
201 This runs before SMP is initialized to avoid SMP problems with 201 This runs before SMP is initialized to avoid SMP problems with
202 self modifying code. This implies that assymetric systems where 202 self modifying code. This implies that asymmetric systems where
203 APs have less capabilities than the boot processor are not handled. 203 APs have less capabilities than the boot processor are not handled.
204 Tough. Make sure you disable such features by hand. */ 204 Tough. Make sure you disable such features by hand. */
205 205
@@ -620,7 +620,12 @@ static int __kprobes stop_machine_text_poke(void *data)
620 flush_icache_range((unsigned long)p->addr, 620 flush_icache_range((unsigned long)p->addr,
621 (unsigned long)p->addr + p->len); 621 (unsigned long)p->addr + p->len);
622 } 622 }
623 623 /*
 624 * Intel Architecture Software Developer's Manual section 7.1.3 specifies
625 * that a core serializing instruction such as "cpuid" should be
626 * executed on _each_ core before the new instruction is made visible.
627 */
628 sync_core();
624 return 0; 629 return 0;
625} 630}
626 631
@@ -671,7 +676,7 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
671 676
672 atomic_set(&stop_machine_first, 1); 677 atomic_set(&stop_machine_first, 1);
673 wrote_text = 0; 678 wrote_text = 0;
674 stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); 679 __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
675} 680}
676 681
677#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) 682#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6e11c8134158..246d727b65b7 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -21,7 +21,7 @@
21#include <linux/acpi.h> 21#include <linux/acpi.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/sysdev.h> 24#include <linux/syscore_ops.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
@@ -1260,7 +1260,7 @@ static void disable_iommus(void)
1260 * disable suspend until real resume implemented 1260 * disable suspend until real resume implemented
1261 */ 1261 */
1262 1262
1263static int amd_iommu_resume(struct sys_device *dev) 1263static void amd_iommu_resume(void)
1264{ 1264{
1265 struct amd_iommu *iommu; 1265 struct amd_iommu *iommu;
1266 1266
@@ -1276,11 +1276,9 @@ static int amd_iommu_resume(struct sys_device *dev)
1276 */ 1276 */
1277 amd_iommu_flush_all_devices(); 1277 amd_iommu_flush_all_devices();
1278 amd_iommu_flush_all_domains(); 1278 amd_iommu_flush_all_domains();
1279
1280 return 0;
1281} 1279}
1282 1280
1283static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) 1281static int amd_iommu_suspend(void)
1284{ 1282{
1285 /* disable IOMMUs to go out of the way for BIOS */ 1283 /* disable IOMMUs to go out of the way for BIOS */
1286 disable_iommus(); 1284 disable_iommus();
@@ -1288,17 +1286,11 @@ static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
1288 return 0; 1286 return 0;
1289} 1287}
1290 1288
1291static struct sysdev_class amd_iommu_sysdev_class = { 1289static struct syscore_ops amd_iommu_syscore_ops = {
1292 .name = "amd_iommu",
1293 .suspend = amd_iommu_suspend, 1290 .suspend = amd_iommu_suspend,
1294 .resume = amd_iommu_resume, 1291 .resume = amd_iommu_resume,
1295}; 1292};
1296 1293
1297static struct sys_device device_amd_iommu = {
1298 .id = 0,
1299 .cls = &amd_iommu_sysdev_class,
1300};
1301
1302/* 1294/*
1303 * This is the core init function for AMD IOMMU hardware in the system. 1295 * This is the core init function for AMD IOMMU hardware in the system.
1304 * This function is called from the generic x86 DMA layer initialization 1296 * This function is called from the generic x86 DMA layer initialization
@@ -1415,14 +1407,6 @@ static int __init amd_iommu_init(void)
1415 goto free; 1407 goto free;
1416 } 1408 }
1417 1409
1418 ret = sysdev_class_register(&amd_iommu_sysdev_class);
1419 if (ret)
1420 goto free;
1421
1422 ret = sysdev_register(&device_amd_iommu);
1423 if (ret)
1424 goto free;
1425
1426 ret = amd_iommu_init_devices(); 1410 ret = amd_iommu_init_devices();
1427 if (ret) 1411 if (ret)
1428 goto free; 1412 goto free;
@@ -1441,6 +1425,8 @@ static int __init amd_iommu_init(void)
1441 1425
1442 amd_iommu_init_notifier(); 1426 amd_iommu_init_notifier();
1443 1427
1428 register_syscore_ops(&amd_iommu_syscore_ops);
1429
1444 if (iommu_pass_through) 1430 if (iommu_pass_through)
1445 goto out; 1431 goto out;
1446 1432
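
A hedged sketch of the general pattern this hunk adopts: syscore_ops replaces the sysdev class/device pair, needs no device registration, and its callbacks run once system-wide with interrupts disabled (hypothetical driver, not part of this patch):

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int example_suspend(void)
{
	/* quiesce the hardware before the firmware/image takes over */
	return 0;
}

static void example_resume(void)
{
	/* re-enable the hardware on the way back up */
}

static struct syscore_ops example_syscore_ops = {
	.suspend = example_suspend,
	.resume  = example_resume,
};

static int __init example_init(void)
{
	register_syscore_ops(&example_syscore_ops);
	return 0;
}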
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 0a99f7198bc3..4c39baa8facc 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -12,14 +12,19 @@
12 12
13static u32 *flush_words; 13static u32 *flush_words;
14 14
15struct pci_device_id amd_nb_misc_ids[] = { 15const struct pci_device_id amd_nb_misc_ids[] = {
16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, 16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, 17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, 18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
19 {} 19 {}
20}; 20};
21EXPORT_SYMBOL(amd_nb_misc_ids); 21EXPORT_SYMBOL(amd_nb_misc_ids);
22 22
23static struct pci_device_id amd_nb_link_ids[] = {
24 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
25 {}
26};
27
23const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { 28const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
24 { 0x00, 0x18, 0x20 }, 29 { 0x00, 0x18, 0x20 },
25 { 0xff, 0x00, 0x20 }, 30 { 0xff, 0x00, 0x20 },
@@ -31,7 +36,7 @@ struct amd_northbridge_info amd_northbridges;
31EXPORT_SYMBOL(amd_northbridges); 36EXPORT_SYMBOL(amd_northbridges);
32 37
33static struct pci_dev *next_northbridge(struct pci_dev *dev, 38static struct pci_dev *next_northbridge(struct pci_dev *dev,
34 struct pci_device_id *ids) 39 const struct pci_device_id *ids)
35{ 40{
36 do { 41 do {
37 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); 42 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
@@ -43,9 +48,9 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev,
43 48
44int amd_cache_northbridges(void) 49int amd_cache_northbridges(void)
45{ 50{
46 int i = 0; 51 u16 i = 0;
47 struct amd_northbridge *nb; 52 struct amd_northbridge *nb;
48 struct pci_dev *misc; 53 struct pci_dev *misc, *link;
49 54
50 if (amd_nb_num()) 55 if (amd_nb_num())
51 return 0; 56 return 0;
@@ -64,10 +69,12 @@ int amd_cache_northbridges(void)
64 amd_northbridges.nb = nb; 69 amd_northbridges.nb = nb;
65 amd_northbridges.num = i; 70 amd_northbridges.num = i;
66 71
67 misc = NULL; 72 link = misc = NULL;
68 for (i = 0; i != amd_nb_num(); i++) { 73 for (i = 0; i != amd_nb_num(); i++) {
69 node_to_amd_nb(i)->misc = misc = 74 node_to_amd_nb(i)->misc = misc =
70 next_northbridge(misc, amd_nb_misc_ids); 75 next_northbridge(misc, amd_nb_misc_ids);
76 node_to_amd_nb(i)->link = link =
77 next_northbridge(link, amd_nb_link_ids);
71 } 78 }
72 79
73 /* some CPU families (e.g. family 0x11) do not support GART */ 80 /* some CPU families (e.g. family 0x11) do not support GART */
@@ -85,26 +92,95 @@ int amd_cache_northbridges(void)
85 boot_cpu_data.x86_mask >= 0x1)) 92 boot_cpu_data.x86_mask >= 0x1))
86 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; 93 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
87 94
95 if (boot_cpu_data.x86 == 0x15)
96 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
97
98 /* L3 cache partitioning is supported on family 0x15 */
99 if (boot_cpu_data.x86 == 0x15)
100 amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
101
88 return 0; 102 return 0;
89} 103}
90EXPORT_SYMBOL_GPL(amd_cache_northbridges); 104EXPORT_SYMBOL_GPL(amd_cache_northbridges);
91 105
92/* Ignores subdevice/subvendor but as far as I can figure out 106/*
93 they're useless anyways */ 107 * Ignores subdevice/subvendor but as far as I can figure out
94int __init early_is_amd_nb(u32 device) 108 * they're useless anyways
109 */
110bool __init early_is_amd_nb(u32 device)
95{ 111{
96 struct pci_device_id *id; 112 const struct pci_device_id *id;
97 u32 vendor = device & 0xffff; 113 u32 vendor = device & 0xffff;
114
98 device >>= 16; 115 device >>= 16;
99 for (id = amd_nb_misc_ids; id->vendor; id++) 116 for (id = amd_nb_misc_ids; id->vendor; id++)
100 if (vendor == id->vendor && device == id->device) 117 if (vendor == id->vendor && device == id->device)
101 return 1; 118 return true;
119 return false;
120}
121
122int amd_get_subcaches(int cpu)
123{
124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
125 unsigned int mask;
126 int cuid = 0;
127
128 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
129 return 0;
130
131 pci_read_config_dword(link, 0x1d4, &mask);
132
133#ifdef CONFIG_SMP
134 cuid = cpu_data(cpu).compute_unit_id;
135#endif
136 return (mask >> (4 * cuid)) & 0xf;
137}
138
139int amd_set_subcaches(int cpu, int mask)
140{
141 static unsigned int reset, ban;
142 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
143 unsigned int reg;
144 int cuid = 0;
145
146 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
147 return -EINVAL;
148
149 /* if necessary, collect reset state of L3 partitioning and BAN mode */
150 if (reset == 0) {
151 pci_read_config_dword(nb->link, 0x1d4, &reset);
152 pci_read_config_dword(nb->misc, 0x1b8, &ban);
153 ban &= 0x180000;
154 }
155
156 /* deactivate BAN mode if any subcaches are to be disabled */
157 if (mask != 0xf) {
158 pci_read_config_dword(nb->misc, 0x1b8, &reg);
159 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
160 }
161
162#ifdef CONFIG_SMP
163 cuid = cpu_data(cpu).compute_unit_id;
164#endif
165 mask <<= 4 * cuid;
166 mask |= (0xf ^ (1 << cuid)) << 26;
167
168 pci_write_config_dword(nb->link, 0x1d4, mask);
169
170 /* reset BAN mode if L3 partitioning returned to reset state */
171 pci_read_config_dword(nb->link, 0x1d4, &reg);
172 if (reg == reset) {
173 pci_read_config_dword(nb->misc, 0x1b8, &reg);
174 reg &= ~0x180000;
175 pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
176 }
177
102 return 0; 178 return 0;
103} 179}
104 180
105int amd_cache_gart(void) 181static int amd_cache_gart(void)
106{ 182{
107 int i; 183 u16 i;
108 184
109 if (!amd_nb_has_feature(AMD_NB_GART)) 185 if (!amd_nb_has_feature(AMD_NB_GART))
110 return 0; 186 return 0;
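Besides the const-ification of the device-id tables, amd_nb.c now caches the F4 ("link") PCI function alongside F3 ("misc") and, for family 0x15, exposes L3 cache partitioning through the F4 register at offset 0x1d4, which carries one 4-bit subcache-enable field per compute unit. A hedged sketch of the read side, assuming a cached link device and a compute-unit id:

#include <linux/pci.h>

/* illustrative mirror of the amd_get_subcaches() access pattern */
static unsigned int example_read_subcaches(struct pci_dev *link, int cuid)
{
	u32 mask;

	/* D18F4x1D4: 4 subcache-enable bits per compute unit */
	pci_read_config_dword(link, 0x1d4, &mask);

	return (mask >> (4 * cuid)) & 0xf;
}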
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 51ef31a89be9..cd1ffed4ee22 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -284,7 +284,7 @@ static int __init apbt_clockevent_register(void)
284 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); 284 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
285 285
286 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { 286 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
287 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; 287 adev->evt.rating = APBT_CLOCKEVENT_RATING - 100;
288 global_clock_event = &adev->evt; 288 global_clock_event = &adev->evt;
289 printk(KERN_DEBUG "%s clockevent registered as global\n", 289 printk(KERN_DEBUG "%s clockevent registered as global\n",
290 global_clock_event->name); 290 global_clock_event->name);
@@ -316,7 +316,7 @@ static void apbt_setup_irq(struct apbt_dev *adev)
316 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); 316 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
317 irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); 317 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
318 /* APB timer irqs are set up as mp_irqs, timer is edge type */ 318 /* APB timer irqs are set up as mp_irqs, timer is edge type */
319 __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); 319 __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
320 320
321 if (system_state == SYSTEM_BOOTING) { 321 if (system_state == SYSTEM_BOOTING) {
322 if (request_irq(adev->irq, apbt_interrupt_handler, 322 if (request_irq(adev->irq, apbt_interrupt_handler,
@@ -508,64 +508,12 @@ static int apbt_next_event(unsigned long delta,
508 return 0; 508 return 0;
509} 509}
510 510
511/*
512 * APB timer clock is not in sync with pclk on Langwell, which translates to
513 * unreliable read value caused by sampling error. the error does not add up
514 * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
515 * would go backwards. the following code is trying to prevent time traveling
516 * backwards. little bit paranoid.
517 */
518static cycle_t apbt_read_clocksource(struct clocksource *cs) 511static cycle_t apbt_read_clocksource(struct clocksource *cs)
519{ 512{
520 unsigned long t0, t1, t2; 513 unsigned long current_count;
521 static unsigned long last_read; 514
522 515 current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
523bad_count: 516 return (cycle_t)~current_count;
524 t1 = apbt_readl(phy_cs_timer_id,
525 APBTMR_N_CURRENT_VALUE);
526 t2 = apbt_readl(phy_cs_timer_id,
527 APBTMR_N_CURRENT_VALUE);
528 if (unlikely(t1 < t2)) {
529 pr_debug("APBT: read current count error %lx:%lx:%lx\n",
530 t1, t2, t2 - t1);
531 goto bad_count;
532 }
533 /*
534 * check against cached last read, makes sure time does not go back.
535 * it could be a normal rollover but we will do tripple check anyway
536 */
537 if (unlikely(t2 > last_read)) {
538 /* check if we have a normal rollover */
539 unsigned long raw_intr_status =
540 apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
541 /*
542 * cs timer interrupt is masked but raw intr bit is set if
543 * rollover occurs. then we read EOI reg to clear it.
544 */
545 if (raw_intr_status & (1 << phy_cs_timer_id)) {
546 apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
547 goto out;
548 }
549 pr_debug("APB CS going back %lx:%lx:%lx ",
550 t2, last_read, t2 - last_read);
551bad_count_x3:
552 pr_debug("triple check enforced\n");
553 t0 = apbt_readl(phy_cs_timer_id,
554 APBTMR_N_CURRENT_VALUE);
555 udelay(1);
556 t1 = apbt_readl(phy_cs_timer_id,
557 APBTMR_N_CURRENT_VALUE);
558 udelay(1);
559 t2 = apbt_readl(phy_cs_timer_id,
560 APBTMR_N_CURRENT_VALUE);
561 if ((t2 > t1) || (t1 > t0)) {
562 printk(KERN_ERR "Error: APB CS tripple check failed\n");
563 goto bad_count_x3;
564 }
565 }
566out:
567 last_read = t2;
568 return (cycle_t)~t2;
569} 517}
570 518
571static int apbt_clocksource_register(void) 519static int apbt_clocksource_register(void)
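The first apb_timer.c hunk looks like an ordering fix: the template clock_event_device has already been memcpy()'d into the per-cpu adev->evt, so lowering apbt_clockevent.rating afterwards never reached the device that actually gets registered; the copy itself has to be adjusted. The later hunk then drops the triple-read rollover workaround and reads the down-counting timer once, complementing the raw value so the returned cycle count still increases monotonically. Roughly, for the rating fix (context lines reused from the file):

/* copy the template first ... */
memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));

/* ... then tweak the copy that will be registered, not the template */
if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
	adev->evt.rating = APBT_CLOCKEVENT_RATING - 100;
	global_clock_event = &adev->evt;
}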
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 5955a7800a96..73fb469908c6 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -13,7 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/bootmem.h> 16#include <linux/memblock.h>
17#include <linux/mmzone.h> 17#include <linux/mmzone.h>
18#include <linux/pci_ids.h> 18#include <linux/pci_ids.h>
19#include <linux/pci.h> 19#include <linux/pci.h>
@@ -57,7 +57,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
57static u32 __init allocate_aperture(void) 57static u32 __init allocate_aperture(void)
58{ 58{
59 u32 aper_size; 59 u32 aper_size;
60 void *p; 60 unsigned long addr;
61 61
62 /* aper_size should <= 1G */ 62 /* aper_size should <= 1G */
63 if (fallback_aper_order > 5) 63 if (fallback_aper_order > 5)
@@ -73,7 +73,7 @@ static u32 __init allocate_aperture(void)
73 /* 73 /*
74 * using 512M as goal, in case kexec will load kernel_big 74 * using 512M as goal, in case kexec will load kernel_big
75 * that will do the on position decompress, and could overlap with 75 * that will do the on position decompress, and could overlap with
76 * that positon with gart that is used. 76 * that position with gart that is used.
77 * sequende: 77 * sequende:
78 * kernel_small 78 * kernel_small
79 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) 79 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
@@ -83,27 +83,26 @@ static u32 __init allocate_aperture(void)
83 * so don't use 512M below as gart iommu, leave the space for kernel 83 * so don't use 512M below as gart iommu, leave the space for kernel
84 * code for safe 84 * code for safe
85 */ 85 */
86 p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); 86 addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
87 if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
88 printk(KERN_ERR
89 "Cannot allocate aperture memory hole (%lx,%uK)\n",
90 addr, aper_size>>10);
91 return 0;
92 }
93 memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
87 /* 94 /*
88 * Kmemleak should not scan this block as it may not be mapped via the 95 * Kmemleak should not scan this block as it may not be mapped via the
89 * kernel direct mapping. 96 * kernel direct mapping.
90 */ 97 */
91 kmemleak_ignore(p); 98 kmemleak_ignore(phys_to_virt(addr));
92 if (!p || __pa(p)+aper_size > 0xffffffff) {
93 printk(KERN_ERR
94 "Cannot allocate aperture memory hole (%p,%uK)\n",
95 p, aper_size>>10);
96 if (p)
97 free_bootmem(__pa(p), aper_size);
98 return 0;
99 }
100 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", 99 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
101 aper_size >> 10, __pa(p)); 100 aper_size >> 10, addr);
102 insert_aperture_resource((u32)__pa(p), aper_size); 101 insert_aperture_resource((u32)addr, aper_size);
103 register_nosave_region((u32)__pa(p) >> PAGE_SHIFT, 102 register_nosave_region(addr >> PAGE_SHIFT,
104 (u32)__pa(p+aper_size) >> PAGE_SHIFT); 103 (addr+aper_size) >> PAGE_SHIFT);
105 104
106 return (u32)__pa(p); 105 return (u32)addr;
107} 106}
108 107
109 108
@@ -500,7 +499,7 @@ out:
500 * Don't enable translation yet but enable GART IO and CPU 499 * Don't enable translation yet but enable GART IO and CPU
501 * accesses and set DISTLBWALKPRB since GART table memory is UC. 500 * accesses and set DISTLBWALKPRB since GART table memory is UC.
502 */ 501 */
503 u32 ctl = DISTLBWALKPRB | aper_order << 1; 502 u32 ctl = aper_order << 1;
504 503
505 bus = amd_nb_bus_dev_ranges[i].bus; 504 bus = amd_nb_bus_dev_ranges[i].bus;
506 dev_base = amd_nb_bus_dev_ranges[i].dev_base; 505 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
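allocate_aperture() moves from bootmem to memblock: the range search only locates a candidate, so the range has to be reserved explicitly, and the code now works in physical addresses (hence phys_to_virt() for the kmemleak annotation and no __pa() conversions later). A sketch of that pattern under the memblock API of this era (MEMBLOCK_ERROR and memblock_x86_reserve_range(), as used in the hunk):

#include <linux/memblock.h>

/* illustrative: find and reserve 'size' bytes below 4GB at 'align' alignment */
static unsigned long __init example_alloc_below_4g(u64 size, u64 align)
{
	u64 addr;

	addr = memblock_find_in_range(0, 1ULL << 32, size, align);
	if (addr == MEMBLOCK_ERROR || addr + size > 0xffffffff)
		return 0;

	/* the search does not claim the range; reserve it before using it */
	memblock_x86_reserve_range(addr, addr + size, "example");

	return addr;	/* physical; map via phys_to_virt() before touching it */
}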
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 06c196d7e59c..fabf01eff771 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -24,7 +24,7 @@
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/ioport.h> 25#include <linux/ioport.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/sysdev.h> 27#include <linux/syscore_ops.h>
28#include <linux/delay.h> 28#include <linux/delay.h>
29#include <linux/timex.h> 29#include <linux/timex.h>
30#include <linux/dmar.h> 30#include <linux/dmar.h>
@@ -43,6 +43,7 @@
43#include <asm/i8259.h> 43#include <asm/i8259.h>
44#include <asm/proto.h> 44#include <asm/proto.h>
45#include <asm/apic.h> 45#include <asm/apic.h>
46#include <asm/io_apic.h>
46#include <asm/desc.h> 47#include <asm/desc.h>
47#include <asm/hpet.h> 48#include <asm/hpet.h>
48#include <asm/idle.h> 49#include <asm/idle.h>
@@ -78,12 +79,21 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
78EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); 79EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
79 80
80#ifdef CONFIG_X86_32 81#ifdef CONFIG_X86_32
82
83/*
84 * On x86_32, the mapping between cpu and logical apicid may vary
85 * depending on apic in use. The following early percpu variable is
86 * used for the mapping. This is where the behaviors of x86_64 and 32
87 * actually diverge. Let's keep it ugly for now.
88 */
89DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
90
81/* 91/*
82 * Knob to control our willingness to enable the local APIC. 92 * Knob to control our willingness to enable the local APIC.
83 * 93 *
84 * +1=force-enable 94 * +1=force-enable
85 */ 95 */
86static int force_enable_local_apic; 96static int force_enable_local_apic __initdata;
87/* 97/*
88 * APIC command line parameters 98 * APIC command line parameters
89 */ 99 */
@@ -153,7 +163,7 @@ early_param("nox2apic", setup_nox2apic);
153unsigned long mp_lapic_addr; 163unsigned long mp_lapic_addr;
154int disable_apic; 164int disable_apic;
155/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 165/* Disable local APIC timer from the kernel commandline or via dmi quirk */
156static int disable_apic_timer __cpuinitdata; 166static int disable_apic_timer __initdata;
157/* Local APIC timer works in C2 */ 167/* Local APIC timer works in C2 */
158int local_apic_timer_c2_ok; 168int local_apic_timer_c2_ok;
159EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 169EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -177,29 +187,8 @@ static struct resource lapic_resource = {
177 187
178static unsigned int calibration_result; 188static unsigned int calibration_result;
179 189
180static int lapic_next_event(unsigned long delta,
181 struct clock_event_device *evt);
182static void lapic_timer_setup(enum clock_event_mode mode,
183 struct clock_event_device *evt);
184static void lapic_timer_broadcast(const struct cpumask *mask);
185static void apic_pm_activate(void); 190static void apic_pm_activate(void);
186 191
187/*
188 * The local apic timer can be used for any function which is CPU local.
189 */
190static struct clock_event_device lapic_clockevent = {
191 .name = "lapic",
192 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
193 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
194 .shift = 32,
195 .set_mode = lapic_timer_setup,
196 .set_next_event = lapic_next_event,
197 .broadcast = lapic_timer_broadcast,
198 .rating = 100,
199 .irq = -1,
200};
201static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
202
203static unsigned long apic_phys; 192static unsigned long apic_phys;
204 193
205/* 194/*
@@ -238,7 +227,7 @@ static int modern_apic(void)
238 * right after this call apic become NOOP driven 227 * right after this call apic become NOOP driven
239 * so apic->write/read doesn't do anything 228 * so apic->write/read doesn't do anything
240 */ 229 */
241void apic_disable(void) 230static void __init apic_disable(void)
242{ 231{
243 pr_info("APIC: switched to apic NOOP\n"); 232 pr_info("APIC: switched to apic NOOP\n");
244 apic = &apic_noop; 233 apic = &apic_noop;
@@ -282,23 +271,6 @@ u64 native_apic_icr_read(void)
282 return icr1 | ((u64)icr2 << 32); 271 return icr1 | ((u64)icr2 << 32);
283} 272}
284 273
285/**
286 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
287 */
288void __cpuinit enable_NMI_through_LVT0(void)
289{
290 unsigned int v;
291
292 /* unmask and set to NMI */
293 v = APIC_DM_NMI;
294
295 /* Level triggered for 82489DX (32bit mode) */
296 if (!lapic_is_integrated())
297 v |= APIC_LVT_LEVEL_TRIGGER;
298
299 apic_write(APIC_LVT0, v);
300}
301
302#ifdef CONFIG_X86_32 274#ifdef CONFIG_X86_32
303/** 275/**
304 * get_physical_broadcast - Get number of physical broadcast IDs 276 * get_physical_broadcast - Get number of physical broadcast IDs
@@ -508,6 +480,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
508#endif 480#endif
509} 481}
510 482
483
484/*
485 * The local apic timer can be used for any function which is CPU local.
486 */
487static struct clock_event_device lapic_clockevent = {
488 .name = "lapic",
489 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
490 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
491 .shift = 32,
492 .set_mode = lapic_timer_setup,
493 .set_next_event = lapic_next_event,
494 .broadcast = lapic_timer_broadcast,
495 .rating = 100,
496 .irq = -1,
497};
498static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
499
511/* 500/*
512 * Setup the local APIC timer for this CPU. Copy the initialized values 501 * Setup the local APIC timer for this CPU. Copy the initialized values
513 * of the boot CPU and register the clock event in the framework. 502 * of the boot CPU and register the clock event in the framework.
@@ -1209,7 +1198,7 @@ void __cpuinit setup_local_APIC(void)
1209 rdtscll(tsc); 1198 rdtscll(tsc);
1210 1199
1211 if (disable_apic) { 1200 if (disable_apic) {
1212 arch_disable_smp_support(); 1201 disable_ioapic_support();
1213 return; 1202 return;
1214 } 1203 }
1215 1204
@@ -1237,6 +1226,19 @@ void __cpuinit setup_local_APIC(void)
1237 */ 1226 */
1238 apic->init_apic_ldr(); 1227 apic->init_apic_ldr();
1239 1228
1229#ifdef CONFIG_X86_32
1230 /*
1231 * APIC LDR is initialized. If logical_apicid mapping was
1232 * initialized during get_smp_config(), make sure it matches the
1233 * actual value.
1234 */
1235 i = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
1236 WARN_ON(i != BAD_APICID && i != logical_smp_processor_id());
1237 /* always use the value from LDR */
1238 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1239 logical_smp_processor_id();
1240#endif
1241
1240 /* 1242 /*
1241 * Set Task Priority to 'accept all'. We never change this 1243 * Set Task Priority to 'accept all'. We never change this
1242 * later on. 1244 * later on.
@@ -1381,12 +1383,17 @@ void __cpuinit end_local_APIC_setup(void)
1381#endif 1383#endif
1382 1384
1383 apic_pm_activate(); 1385 apic_pm_activate();
1386}
1387
1388void __init bsp_end_local_APIC_setup(void)
1389{
1390 end_local_APIC_setup();
1384 1391
1385 /* 1392 /*
1386 * Now that local APIC setup is completed for BP, configure the fault 1393 * Now that local APIC setup is completed for BP, configure the fault
1387 * handling for interrupt remapping. 1394 * handling for interrupt remapping.
1388 */ 1395 */
1389 if (!smp_processor_id() && intr_remapping_enabled) 1396 if (intr_remapping_enabled)
1390 enable_drhd_fault_handling(); 1397 enable_drhd_fault_handling();
1391 1398
1392} 1399}
@@ -1443,7 +1450,7 @@ int __init enable_IR(void)
1443void __init enable_IR_x2apic(void) 1450void __init enable_IR_x2apic(void)
1444{ 1451{
1445 unsigned long flags; 1452 unsigned long flags;
1446 struct IO_APIC_route_entry **ioapic_entries = NULL; 1453 struct IO_APIC_route_entry **ioapic_entries;
1447 int ret, x2apic_enabled = 0; 1454 int ret, x2apic_enabled = 0;
1448 int dmar_table_init_ret; 1455 int dmar_table_init_ret;
1449 1456
@@ -1532,7 +1539,7 @@ static int __init detect_init_APIC(void)
1532} 1539}
1533#else 1540#else
1534 1541
1535static int apic_verify(void) 1542static int __init apic_verify(void)
1536{ 1543{
1537 u32 features, h, l; 1544 u32 features, h, l;
1538 1545
@@ -1557,7 +1564,7 @@ static int apic_verify(void)
1557 return 0; 1564 return 0;
1558} 1565}
1559 1566
1560int apic_force_enable(void) 1567int __init apic_force_enable(unsigned long addr)
1561{ 1568{
1562 u32 h, l; 1569 u32 h, l;
1563 1570
@@ -1573,7 +1580,7 @@ int apic_force_enable(void)
1573 if (!(l & MSR_IA32_APICBASE_ENABLE)) { 1580 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1574 pr_info("Local APIC disabled by BIOS -- reenabling.\n"); 1581 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1575 l &= ~MSR_IA32_APICBASE_BASE; 1582 l &= ~MSR_IA32_APICBASE_BASE;
1576 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; 1583 l |= MSR_IA32_APICBASE_ENABLE | addr;
1577 wrmsr(MSR_IA32_APICBASE, l, h); 1584 wrmsr(MSR_IA32_APICBASE, l, h);
1578 enabled_via_apicbase = 1; 1585 enabled_via_apicbase = 1;
1579 } 1586 }
@@ -1614,7 +1621,7 @@ static int __init detect_init_APIC(void)
1614 "you can enable it with \"lapic\"\n"); 1621 "you can enable it with \"lapic\"\n");
1615 return -1; 1622 return -1;
1616 } 1623 }
1617 if (apic_force_enable()) 1624 if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
1618 return -1; 1625 return -1;
1619 } else { 1626 } else {
1620 if (apic_verify()) 1627 if (apic_verify())
@@ -1756,7 +1763,7 @@ int __init APIC_init_uniprocessor(void)
1756 enable_IO_APIC(); 1763 enable_IO_APIC();
1757#endif 1764#endif
1758 1765
1759 end_local_APIC_setup(); 1766 bsp_end_local_APIC_setup();
1760 1767
1761#ifdef CONFIG_X86_IO_APIC 1768#ifdef CONFIG_X86_IO_APIC
1762 if (smp_found_config && !skip_ioapic_setup && nr_ioapics) 1769 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
@@ -1925,17 +1932,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1925{ 1932{
1926 int cpu; 1933 int cpu;
1927 1934
1928 /*
1929 * Validate version
1930 */
1931 if (version == 0x0) {
1932 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
1933 "fixing up to 0x10. (tell your hw vendor)\n",
1934 version);
1935 version = 0x10;
1936 }
1937 apic_version[apicid] = version;
1938
1939 if (num_processors >= nr_cpu_ids) { 1935 if (num_processors >= nr_cpu_ids) {
1940 int max = nr_cpu_ids; 1936 int max = nr_cpu_ids;
1941 int thiscpu = max + disabled_cpus; 1937 int thiscpu = max + disabled_cpus;
@@ -1949,22 +1945,34 @@ void __cpuinit generic_processor_info(int apicid, int version)
1949 } 1945 }
1950 1946
1951 num_processors++; 1947 num_processors++;
1952 cpu = cpumask_next_zero(-1, cpu_present_mask);
1953
1954 if (version != apic_version[boot_cpu_physical_apicid])
1955 WARN_ONCE(1,
1956 "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
1957 apic_version[boot_cpu_physical_apicid], cpu, version);
1958
1959 physid_set(apicid, phys_cpu_present_map);
1960 if (apicid == boot_cpu_physical_apicid) { 1948 if (apicid == boot_cpu_physical_apicid) {
1961 /* 1949 /*
1962 * x86_bios_cpu_apicid is required to have processors listed 1950 * x86_bios_cpu_apicid is required to have processors listed
1963 * in same order as logical cpu numbers. Hence the first 1951 * in same order as logical cpu numbers. Hence the first
1964 * entry is BSP, and so on. 1952 * entry is BSP, and so on.
1953 * boot_cpu_init() already hold bit 0 in cpu_present_mask
1954 * for BSP.
1965 */ 1955 */
1966 cpu = 0; 1956 cpu = 0;
1957 } else
1958 cpu = cpumask_next_zero(-1, cpu_present_mask);
1959
1960 /*
1961 * Validate version
1962 */
1963 if (version == 0x0) {
1964 pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
1965 cpu, apicid);
1966 version = 0x10;
1967 } 1967 }
1968 apic_version[apicid] = version;
1969
1970 if (version != apic_version[boot_cpu_physical_apicid]) {
1971 pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
1972 apic_version[boot_cpu_physical_apicid], cpu, version);
1973 }
1974
1975 physid_set(apicid, phys_cpu_present_map);
1968 if (apicid > max_physical_apicid) 1976 if (apicid > max_physical_apicid)
1969 max_physical_apicid = apicid; 1977 max_physical_apicid = apicid;
1970 1978
@@ -1972,7 +1980,10 @@ void __cpuinit generic_processor_info(int apicid, int version)
1972 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1980 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1973 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1981 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1974#endif 1982#endif
1975 1983#ifdef CONFIG_X86_32
1984 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1985 apic->x86_32_early_logical_apicid(cpu);
1986#endif
1976 set_cpu_possible(cpu, true); 1987 set_cpu_possible(cpu, true);
1977 set_cpu_present(cpu, true); 1988 set_cpu_present(cpu, true);
1978} 1989}
@@ -1993,10 +2004,14 @@ void default_init_apic_ldr(void)
1993} 2004}
1994 2005
1995#ifdef CONFIG_X86_32 2006#ifdef CONFIG_X86_32
1996int default_apicid_to_node(int logical_apicid) 2007int default_x86_32_numa_cpu_node(int cpu)
1997{ 2008{
1998#ifdef CONFIG_SMP 2009#ifdef CONFIG_NUMA
1999 return apicid_2_node[hard_smp_processor_id()]; 2010 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
2011
2012 if (apicid != BAD_APICID)
2013 return __apicid_to_node[apicid];
2014 return NUMA_NO_NODE;
2000#else 2015#else
2001 return 0; 2016 return 0;
2002#endif 2017#endif
@@ -2031,7 +2046,7 @@ static struct {
2031 unsigned int apic_thmr; 2046 unsigned int apic_thmr;
2032} apic_pm_state; 2047} apic_pm_state;
2033 2048
2034static int lapic_suspend(struct sys_device *dev, pm_message_t state) 2049static int lapic_suspend(void)
2035{ 2050{
2036 unsigned long flags; 2051 unsigned long flags;
2037 int maxlvt; 2052 int maxlvt;
@@ -2069,23 +2084,21 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
2069 return 0; 2084 return 0;
2070} 2085}
2071 2086
2072static int lapic_resume(struct sys_device *dev) 2087static void lapic_resume(void)
2073{ 2088{
2074 unsigned int l, h; 2089 unsigned int l, h;
2075 unsigned long flags; 2090 unsigned long flags;
2076 int maxlvt; 2091 int maxlvt, ret;
2077 int ret = 0;
2078 struct IO_APIC_route_entry **ioapic_entries = NULL; 2092 struct IO_APIC_route_entry **ioapic_entries = NULL;
2079 2093
2080 if (!apic_pm_state.active) 2094 if (!apic_pm_state.active)
2081 return 0; 2095 return;
2082 2096
2083 local_irq_save(flags); 2097 local_irq_save(flags);
2084 if (intr_remapping_enabled) { 2098 if (intr_remapping_enabled) {
2085 ioapic_entries = alloc_ioapic_entries(); 2099 ioapic_entries = alloc_ioapic_entries();
2086 if (!ioapic_entries) { 2100 if (!ioapic_entries) {
2087 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2101 WARN(1, "Alloc ioapic_entries in lapic resume failed.");
2088 ret = -ENOMEM;
2089 goto restore; 2102 goto restore;
2090 } 2103 }
2091 2104
@@ -2147,8 +2160,6 @@ static int lapic_resume(struct sys_device *dev)
2147 } 2160 }
2148restore: 2161restore:
2149 local_irq_restore(flags); 2162 local_irq_restore(flags);
2150
2151 return ret;
2152} 2163}
2153 2164
2154/* 2165/*
@@ -2156,17 +2167,11 @@ restore:
2156 * are needed on every CPU up until machine_halt/restart/poweroff. 2167 * are needed on every CPU up until machine_halt/restart/poweroff.
2157 */ 2168 */
2158 2169
2159static struct sysdev_class lapic_sysclass = { 2170static struct syscore_ops lapic_syscore_ops = {
2160 .name = "lapic",
2161 .resume = lapic_resume, 2171 .resume = lapic_resume,
2162 .suspend = lapic_suspend, 2172 .suspend = lapic_suspend,
2163}; 2173};
2164 2174
2165static struct sys_device device_lapic = {
2166 .id = 0,
2167 .cls = &lapic_sysclass,
2168};
2169
2170static void __cpuinit apic_pm_activate(void) 2175static void __cpuinit apic_pm_activate(void)
2171{ 2176{
2172 apic_pm_state.active = 1; 2177 apic_pm_state.active = 1;
@@ -2174,16 +2179,11 @@ static void __cpuinit apic_pm_activate(void)
2174 2179
2175static int __init init_lapic_sysfs(void) 2180static int __init init_lapic_sysfs(void)
2176{ 2181{
2177 int error;
2178
2179 if (!cpu_has_apic)
2180 return 0;
2181 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ 2182 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
2183 if (cpu_has_apic)
2184 register_syscore_ops(&lapic_syscore_ops);
2182 2185
2183 error = sysdev_class_register(&lapic_sysclass); 2186 return 0;
2184 if (!error)
2185 error = sysdev_register(&device_lapic);
2186 return error;
2187} 2187}
2188 2188
2189/* local apic needs to resume before other devices access its registers. */ 2189/* local apic needs to resume before other devices access its registers. */
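Worth calling out in the apic.c changes: the 32-bit apicid_to_node() callback gives way to a numa_cpu_node(cpu) callback, and the default implementation translates cpu -> physical apicid (via the early percpu map) -> node through __apicid_to_node[]. A sketch of that lookup, assuming the platform NUMA code has populated __apicid_to_node[]:

#ifdef CONFIG_NUMA
/* illustrative: the cpu -> node translation behind the new callback */
static int example_numa_cpu_node(int cpu)
{
	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid == BAD_APICID)
		return NUMA_NO_NODE;

	return __apicid_to_node[apicid];
}
#endif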
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 09d3b17ce0c2..5652d31fe108 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -185,8 +185,6 @@ struct apic apic_flat = {
185 .ioapic_phys_id_map = NULL, 185 .ioapic_phys_id_map = NULL,
186 .setup_apic_routing = NULL, 186 .setup_apic_routing = NULL,
187 .multi_timer_check = NULL, 187 .multi_timer_check = NULL,
188 .apicid_to_node = NULL,
189 .cpu_to_logical_apicid = NULL,
190 .cpu_present_to_apicid = default_cpu_present_to_apicid, 188 .cpu_present_to_apicid = default_cpu_present_to_apicid,
191 .apicid_to_cpu_present = NULL, 189 .apicid_to_cpu_present = NULL,
192 .setup_portio_remap = NULL, 190 .setup_portio_remap = NULL,
@@ -337,8 +335,6 @@ struct apic apic_physflat = {
337 .ioapic_phys_id_map = NULL, 335 .ioapic_phys_id_map = NULL,
338 .setup_apic_routing = NULL, 336 .setup_apic_routing = NULL,
339 .multi_timer_check = NULL, 337 .multi_timer_check = NULL,
340 .apicid_to_node = NULL,
341 .cpu_to_logical_apicid = NULL,
342 .cpu_present_to_apicid = default_cpu_present_to_apicid, 338 .cpu_present_to_apicid = default_cpu_present_to_apicid,
343 .apicid_to_cpu_present = NULL, 339 .apicid_to_cpu_present = NULL,
344 .setup_portio_remap = NULL, 340 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e31b9ffe25f5..f1baa2dc087a 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void)
54 return 0; 54 return 0;
55} 55}
56 56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb) 57static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{ 58{
64 return 0; 59 return 0;
@@ -113,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
113 cpumask_set_cpu(cpu, retmask); 108 cpumask_set_cpu(cpu, retmask);
114} 109}
115 110
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg) 111static u32 noop_apic_read(u32 reg)
123{ 112{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic)); 113 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
@@ -130,6 +119,14 @@ static void noop_apic_write(u32 reg, u32 v)
130 WARN_ON_ONCE(cpu_has_apic && !disable_apic); 119 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
131} 120}
132 121
122#ifdef CONFIG_X86_32
123static int noop_x86_32_numa_cpu_node(int cpu)
124{
125 /* we're always on node 0 */
126 return 0;
127}
128#endif
129
133struct apic apic_noop = { 130struct apic apic_noop = {
134 .name = "noop", 131 .name = "noop",
135 .probe = noop_probe, 132 .probe = noop_probe,
@@ -153,9 +150,7 @@ struct apic apic_noop = {
153 .ioapic_phys_id_map = default_ioapic_phys_id_map, 150 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL, 151 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL, 152 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157 153
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid, 154 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid, 155 .apicid_to_cpu_present = physid_set_mask_of_physid,
161 156
@@ -197,4 +192,9 @@ struct apic apic_noop = {
197 .icr_write = noop_apic_icr_write, 192 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle, 193 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, 194 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
195
196#ifdef CONFIG_X86_32
197 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
198 .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node,
199#endif
200}; 200};
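On 32-bit, struct apic now carries two callbacks in place of apicid_to_node/cpu_to_logical_apicid: x86_32_early_logical_apicid, consulted at enumeration time before the LDR has been programmed, and x86_32_numa_cpu_node. A sketch of how a flat-style driver might fill them; the field names are from the patch, the bodies are purely illustrative:

#ifdef CONFIG_X86_32
static int example_early_logical_apicid(int cpu)
{
	/* flat logical delivery: one destination bit per cpu */
	return 1 << cpu;
}

static int example_numa_cpu_node(int cpu)
{
	return 0;	/* single-node platform */
}
#endif

static struct apic apic_example = {
	.name				= "example",
	/* ... remaining callbacks elided ... */
#ifdef CONFIG_X86_32
	.x86_32_early_logical_apicid	= example_early_logical_apicid,
	.x86_32_numa_cpu_node		= example_numa_cpu_node,
#endif
};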
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index cb804c5091b9..541a2e431659 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit)
45 return 1; 45 return 1;
46} 46}
47 47
48static int bigsmp_early_logical_apicid(int cpu)
49{
50 /* on bigsmp, logical apicid is the same as physical */
51 return early_per_cpu(x86_cpu_to_apicid, cpu);
52}
53
48static inline unsigned long calculate_ldr(int cpu) 54static inline unsigned long calculate_ldr(int cpu)
49{ 55{
50 unsigned long val, id; 56 unsigned long val, id;
@@ -80,11 +86,6 @@ static void bigsmp_setup_apic_routing(void)
80 nr_ioapics); 86 nr_ioapics);
81} 87}
82 88
83static int bigsmp_apicid_to_node(int logical_apicid)
84{
85 return apicid_2_node[hard_smp_processor_id()];
86}
87
88static int bigsmp_cpu_present_to_apicid(int mps_cpu) 89static int bigsmp_cpu_present_to_apicid(int mps_cpu)
89{ 90{
90 if (mps_cpu < nr_cpu_ids) 91 if (mps_cpu < nr_cpu_ids)
@@ -93,14 +94,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 94 return BAD_APICID;
94} 95}
95 96
96/* Mapping from cpu number to logical apicid */
97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
98{
99 if (cpu >= nr_cpu_ids)
100 return BAD_APICID;
101 return cpu_physical_id(cpu);
102}
103
104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 97static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
105{ 98{
106 /* For clustered we don't have a good way to do this yet - hack */ 99 /* For clustered we don't have a good way to do this yet - hack */
@@ -115,7 +108,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
115/* As we are using single CPU as destination, pick only one CPU here */ 108/* As we are using single CPU as destination, pick only one CPU here */
116static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) 109static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
117{ 110{
118 return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); 111 int cpu = cpumask_first(cpumask);
112
113 if (cpu < nr_cpu_ids)
114 return cpu_physical_id(cpu);
115 return BAD_APICID;
119} 116}
120 117
121static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 118static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
@@ -129,9 +126,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
129 */ 126 */
130 for_each_cpu_and(cpu, cpumask, andmask) { 127 for_each_cpu_and(cpu, cpumask, andmask) {
131 if (cpumask_test_cpu(cpu, cpu_online_mask)) 128 if (cpumask_test_cpu(cpu, cpu_online_mask))
132 break; 129 return cpu_physical_id(cpu);
133 } 130 }
134 return bigsmp_cpu_to_logical_apicid(cpu); 131 return BAD_APICID;
135} 132}
136 133
137static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 134static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -219,8 +216,6 @@ struct apic apic_bigsmp = {
219 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, 216 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
220 .setup_apic_routing = bigsmp_setup_apic_routing, 217 .setup_apic_routing = bigsmp_setup_apic_routing,
221 .multi_timer_check = NULL, 218 .multi_timer_check = NULL,
222 .apicid_to_node = bigsmp_apicid_to_node,
223 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
224 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 219 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
225 .apicid_to_cpu_present = physid_set_mask_of_physid, 220 .apicid_to_cpu_present = physid_set_mask_of_physid,
226 .setup_portio_remap = NULL, 221 .setup_portio_remap = NULL,
@@ -256,4 +251,7 @@ struct apic apic_bigsmp = {
256 .icr_write = native_apic_icr_write, 251 .icr_write = native_apic_icr_write,
257 .wait_icr_idle = native_apic_wait_icr_idle, 252 .wait_icr_idle = native_apic_wait_icr_idle,
258 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
254
255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
256 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
259}; 257};
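With bigsmp_cpu_to_logical_apicid() gone, the mask-to-apicid helpers return cpu_physical_id() for the first usable CPU directly and fall back to BAD_APICID when nothing in the mask is online; the old code left the loop with cpu == nr_cpu_ids and relied on the helper's bounds check. The resulting shape, roughly:

/* illustrative: pick the first online cpu in (cpumask & andmask) */
static unsigned int example_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
						   const struct cpumask *andmask)
{
	int cpu;

	for_each_cpu_and(cpu, cpumask, andmask) {
		if (cpumask_test_cpu(cpu, cpu_online_mask))
			return cpu_physical_id(cpu);
	}

	return BAD_APICID;
}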
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 8593582d8022..3e9de4854c5b 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit)
460 return physid_isset(bit, phys_cpu_present_map); 460 return physid_isset(bit, phys_cpu_present_map);
461} 461}
462 462
463static int es7000_early_logical_apicid(int cpu)
464{
465 /* on es7000, logical apicid is the same as physical */
466 return early_per_cpu(x86_bios_cpu_apicid, cpu);
467}
468
463static unsigned long calculate_ldr(int cpu) 469static unsigned long calculate_ldr(int cpu)
464{ 470{
465 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); 471 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
@@ -504,12 +510,11 @@ static void es7000_setup_apic_routing(void)
504 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
505} 511}
506 512
507static int es7000_apicid_to_node(int logical_apicid) 513static int es7000_numa_cpu_node(int cpu)
508{ 514{
509 return 0; 515 return 0;
510} 516}
511 517
512
513static int es7000_cpu_present_to_apicid(int mps_cpu) 518static int es7000_cpu_present_to_apicid(int mps_cpu)
514{ 519{
515 if (!mps_cpu) 520 if (!mps_cpu)
@@ -528,18 +533,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
528 ++cpu_id; 533 ++cpu_id;
529} 534}
530 535
531/* Mapping from cpu number to logical apicid */
532static int es7000_cpu_to_logical_apicid(int cpu)
533{
534#ifdef CONFIG_SMP
535 if (cpu >= nr_cpu_ids)
536 return BAD_APICID;
537 return cpu_2_logical_apicid[cpu];
538#else
539 return logical_smp_processor_id();
540#endif
541}
542
543static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 536static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
544{ 537{
545 /* For clustered we don't have a good way to do this yet - hack */ 538 /* For clustered we don't have a good way to do this yet - hack */
@@ -561,7 +554,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
561 * The cpus in the mask must all be on the apic cluster. 554 * The cpus in the mask must all be on the apic cluster.
562 */ 555 */
563 for_each_cpu(cpu, cpumask) { 556 for_each_cpu(cpu, cpumask) {
564 int new_apicid = es7000_cpu_to_logical_apicid(cpu); 557 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
565 558
566 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 559 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
567 WARN(1, "Not a valid mask!"); 560 WARN(1, "Not a valid mask!");
@@ -578,7 +571,7 @@ static unsigned int
578es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, 571es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
579 const struct cpumask *andmask) 572 const struct cpumask *andmask)
580{ 573{
581 int apicid = es7000_cpu_to_logical_apicid(0); 574 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
582 cpumask_var_t cpumask; 575 cpumask_var_t cpumask;
583 576
584 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 577 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -655,8 +648,6 @@ struct apic __refdata apic_es7000_cluster = {
655 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 648 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
656 .setup_apic_routing = es7000_setup_apic_routing, 649 .setup_apic_routing = es7000_setup_apic_routing,
657 .multi_timer_check = NULL, 650 .multi_timer_check = NULL,
658 .apicid_to_node = es7000_apicid_to_node,
659 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
660 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 651 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
661 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 652 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
662 .setup_portio_remap = NULL, 653 .setup_portio_remap = NULL,
@@ -695,6 +686,9 @@ struct apic __refdata apic_es7000_cluster = {
695 .icr_write = native_apic_icr_write, 686 .icr_write = native_apic_icr_write,
696 .wait_icr_idle = native_apic_wait_icr_idle, 687 .wait_icr_idle = native_apic_wait_icr_idle,
697 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 688 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
689
690 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
691 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
698}; 692};
699 693
700struct apic __refdata apic_es7000 = { 694struct apic __refdata apic_es7000 = {
@@ -720,8 +714,6 @@ struct apic __refdata apic_es7000 = {
720 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 714 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
721 .setup_apic_routing = es7000_setup_apic_routing, 715 .setup_apic_routing = es7000_setup_apic_routing,
722 .multi_timer_check = NULL, 716 .multi_timer_check = NULL,
723 .apicid_to_node = es7000_apicid_to_node,
724 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
725 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 717 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
726 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 718 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
727 .setup_portio_remap = NULL, 719 .setup_portio_remap = NULL,
@@ -758,4 +750,7 @@ struct apic __refdata apic_es7000 = {
758 .icr_write = native_apic_icr_write, 750 .icr_write = native_apic_icr_write,
759 .wait_icr_idle = native_apic_wait_icr_idle, 751 .wait_icr_idle = native_apic_wait_icr_idle,
760 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 752 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
753
754 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
755 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
761}; 756};
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 79fd43ca6f96..5260fe91bcb6 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -16,6 +16,7 @@
16#include <linux/kprobes.h> 16#include <linux/kprobes.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/delay.h>
19 20
20#ifdef CONFIG_HARDLOCKUP_DETECTOR 21#ifdef CONFIG_HARDLOCKUP_DETECTOR
21u64 hw_nmi_get_sample_period(void) 22u64 hw_nmi_get_sample_period(void)
@@ -83,7 +84,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
83 arch_spin_lock(&lock); 84 arch_spin_lock(&lock);
84 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 85 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
85 show_regs(regs); 86 show_regs(regs);
86 dump_stack();
87 arch_spin_unlock(&lock); 87 arch_spin_unlock(&lock);
88 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); 88 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
89 return NOTIFY_STOP; 89 return NOTIFY_STOP;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 697dc34b7b87..68df09bba92e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -30,7 +30,7 @@
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/acpi.h> 31#include <linux/acpi.h>
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/sysdev.h> 33#include <linux/syscore_ops.h>
34#include <linux/msi.h> 34#include <linux/msi.h>
35#include <linux/htirq.h> 35#include <linux/htirq.h>
36#include <linux/freezer.h> 36#include <linux/freezer.h>
@@ -108,7 +108,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
108 108
109int skip_ioapic_setup; 109int skip_ioapic_setup;
110 110
111void arch_disable_smp_support(void) 111/**
112 * disable_ioapic_support() - disables ioapic support at runtime
113 */
114void disable_ioapic_support(void)
112{ 115{
113#ifdef CONFIG_PCI 116#ifdef CONFIG_PCI
114 noioapicquirk = 1; 117 noioapicquirk = 1;
@@ -120,11 +123,14 @@ void arch_disable_smp_support(void)
120static int __init parse_noapic(char *str) 123static int __init parse_noapic(char *str)
121{ 124{
122 /* disable IO-APIC */ 125 /* disable IO-APIC */
123 arch_disable_smp_support(); 126 disable_ioapic_support();
124 return 0; 127 return 0;
125} 128}
126early_param("noapic", parse_noapic); 129early_param("noapic", parse_noapic);
127 130
131static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
132 struct io_apic_irq_attr *attr);
133
128/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ 134/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
129void mp_save_irq(struct mpc_intsrc *m) 135void mp_save_irq(struct mpc_intsrc *m)
130{ 136{
@@ -181,7 +187,7 @@ int __init arch_early_irq_init(void)
181 irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); 187 irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
182 188
183 for (i = 0; i < count; i++) { 189 for (i = 0; i < count; i++) {
184 set_irq_chip_data(i, &cfg[i]); 190 irq_set_chip_data(i, &cfg[i]);
185 zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); 191 zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
186 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); 192 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
187 /* 193 /*
@@ -200,7 +206,7 @@ int __init arch_early_irq_init(void)
200#ifdef CONFIG_SPARSE_IRQ 206#ifdef CONFIG_SPARSE_IRQ
201static struct irq_cfg *irq_cfg(unsigned int irq) 207static struct irq_cfg *irq_cfg(unsigned int irq)
202{ 208{
203 return get_irq_chip_data(irq); 209 return irq_get_chip_data(irq);
204} 210}
205 211
206static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) 212static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
@@ -226,7 +232,7 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
226{ 232{
227 if (!cfg) 233 if (!cfg)
228 return; 234 return;
229 set_irq_chip_data(at, NULL); 235 irq_set_chip_data(at, NULL);
230 free_cpumask_var(cfg->domain); 236 free_cpumask_var(cfg->domain);
231 free_cpumask_var(cfg->old_domain); 237 free_cpumask_var(cfg->old_domain);
232 kfree(cfg); 238 kfree(cfg);
@@ -256,14 +262,14 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
256 if (res < 0) { 262 if (res < 0) {
257 if (res != -EEXIST) 263 if (res != -EEXIST)
258 return NULL; 264 return NULL;
259 cfg = get_irq_chip_data(at); 265 cfg = irq_get_chip_data(at);
260 if (cfg) 266 if (cfg)
261 return cfg; 267 return cfg;
262 } 268 }
263 269
264 cfg = alloc_irq_cfg(at, node); 270 cfg = alloc_irq_cfg(at, node);
265 if (cfg) 271 if (cfg)
266 set_irq_chip_data(at, cfg); 272 irq_set_chip_data(at, cfg);
267 else 273 else
268 irq_free_desc(at); 274 irq_free_desc(at);
269 return cfg; 275 return cfg;
@@ -818,7 +824,7 @@ static int EISA_ELCR(unsigned int irq)
818#define default_MCA_trigger(idx) (1) 824#define default_MCA_trigger(idx) (1)
819#define default_MCA_polarity(idx) default_ISA_polarity(idx) 825#define default_MCA_polarity(idx) default_ISA_polarity(idx)
820 826
821static int MPBIOS_polarity(int idx) 827static int irq_polarity(int idx)
822{ 828{
823 int bus = mp_irqs[idx].srcbus; 829 int bus = mp_irqs[idx].srcbus;
824 int polarity; 830 int polarity;
@@ -860,7 +866,7 @@ static int MPBIOS_polarity(int idx)
860 return polarity; 866 return polarity;
861} 867}
862 868
863static int MPBIOS_trigger(int idx) 869static int irq_trigger(int idx)
864{ 870{
865 int bus = mp_irqs[idx].srcbus; 871 int bus = mp_irqs[idx].srcbus;
866 int trigger; 872 int trigger;
@@ -932,16 +938,6 @@ static int MPBIOS_trigger(int idx)
932 return trigger; 938 return trigger;
933} 939}
934 940
935static inline int irq_polarity(int idx)
936{
937 return MPBIOS_polarity(idx);
938}
939
940static inline int irq_trigger(int idx)
941{
942 return MPBIOS_trigger(idx);
943}
944
945static int pin_2_irq(int idx, int apic, int pin) 941static int pin_2_irq(int idx, int apic, int pin)
946{ 942{
947 int irq; 943 int irq;
@@ -1189,7 +1185,7 @@ void __setup_vector_irq(int cpu)
1189 raw_spin_lock(&vector_lock); 1185 raw_spin_lock(&vector_lock);
1190 /* Mark the inuse vectors */ 1186 /* Mark the inuse vectors */
1191 for_each_active_irq(irq) { 1187 for_each_active_irq(irq) {
1192 cfg = get_irq_chip_data(irq); 1188 cfg = irq_get_chip_data(irq);
1193 if (!cfg) 1189 if (!cfg)
1194 continue; 1190 continue;
1195 /* 1191 /*
@@ -1220,10 +1216,6 @@ void __setup_vector_irq(int cpu)
1220static struct irq_chip ioapic_chip; 1216static struct irq_chip ioapic_chip;
1221static struct irq_chip ir_ioapic_chip; 1217static struct irq_chip ir_ioapic_chip;
1222 1218
1223#define IOAPIC_AUTO -1
1224#define IOAPIC_EDGE 0
1225#define IOAPIC_LEVEL 1
1226
1227#ifdef CONFIG_X86_32 1219#ifdef CONFIG_X86_32
1228static inline int IO_APIC_irq_trigger(int irq) 1220static inline int IO_APIC_irq_trigger(int irq)
1229{ 1221{
@@ -1248,35 +1240,31 @@ static inline int IO_APIC_irq_trigger(int irq)
1248} 1240}
1249#endif 1241#endif
1250 1242
1251static void ioapic_register_intr(unsigned int irq, unsigned long trigger) 1243static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
1244 unsigned long trigger)
1252{ 1245{
1246 struct irq_chip *chip = &ioapic_chip;
1247 irq_flow_handler_t hdl;
1248 bool fasteoi;
1253 1249
1254 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1250 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1255 trigger == IOAPIC_LEVEL) 1251 trigger == IOAPIC_LEVEL) {
1256 irq_set_status_flags(irq, IRQ_LEVEL); 1252 irq_set_status_flags(irq, IRQ_LEVEL);
1257 else 1253 fasteoi = true;
1254 } else {
1258 irq_clear_status_flags(irq, IRQ_LEVEL); 1255 irq_clear_status_flags(irq, IRQ_LEVEL);
1256 fasteoi = false;
1257 }
1259 1258
1260 if (irq_remapped(get_irq_chip_data(irq))) { 1259 if (irq_remapped(cfg)) {
1261 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 1260 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
1262 if (trigger) 1261 chip = &ir_ioapic_chip;
1263 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, 1262 fasteoi = trigger != 0;
1264 handle_fasteoi_irq,
1265 "fasteoi");
1266 else
1267 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1268 handle_edge_irq, "edge");
1269 return;
1270 } 1263 }
1271 1264
1272 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1265 hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
1273 trigger == IOAPIC_LEVEL) 1266 irq_set_chip_and_handler_name(irq, chip, hdl,
1274 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1267 fasteoi ? "fasteoi" : "edge");
1275 handle_fasteoi_irq,
1276 "fasteoi");
1277 else
1278 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1279 handle_edge_irq, "edge");
1280} 1268}
1281 1269
1282static int setup_ioapic_entry(int apic_id, int irq, 1270static int setup_ioapic_entry(int apic_id, int irq,
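Two themes run through the io_apic.c changes: the genirq accessors move to their new names (irq_set_chip_data(), irq_get_chip_data(), irq_set_chip_and_handler_name(), irq_move_irq(), irqd_is_setaffinity_pending()), and ioapic_register_intr() collapses four near-identical set_irq_chip_and_handler_name() branches into a single call by deciding the chip and flow handler up front. A condensed sketch of that consolidation, reusing the names from the hunk above:

static void example_register_intr(unsigned int irq, struct irq_cfg *cfg,
				  unsigned long trigger)
{
	struct irq_chip *chip = &ioapic_chip;
	bool fasteoi;

	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
	    trigger == IOAPIC_LEVEL) {
		irq_set_status_flags(irq, IRQ_LEVEL);
		fasteoi = true;
	} else {
		irq_clear_status_flags(irq, IRQ_LEVEL);
		fasteoi = false;
	}

	if (irq_remapped(cfg)) {
		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
		chip = &ir_ioapic_chip;
		fasteoi = trigger != 0;
	}

	/* one registration instead of four branches */
	irq_set_chip_and_handler_name(irq, chip,
				      fasteoi ? handle_fasteoi_irq : handle_edge_irq,
				      fasteoi ? "fasteoi" : "edge");
}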
@@ -1374,7 +1362,7 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1374 return; 1362 return;
1375 } 1363 }
1376 1364
1377 ioapic_register_intr(irq, trigger); 1365 ioapic_register_intr(irq, cfg, trigger);
1378 if (irq < legacy_pic->nr_legacy_irqs) 1366 if (irq < legacy_pic->nr_legacy_irqs)
1379 legacy_pic->mask(irq); 1367 legacy_pic->mask(irq);
1380 1368
@@ -1385,33 +1373,26 @@ static struct {
1385 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); 1373 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1386} mp_ioapic_routing[MAX_IO_APICS]; 1374} mp_ioapic_routing[MAX_IO_APICS];
1387 1375
1388static void __init setup_IO_APIC_irqs(void) 1376static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
1389{ 1377{
1390 int apic_id, pin, idx, irq, notcon = 0; 1378 if (idx != -1)
1391 int node = cpu_to_node(0); 1379 return false;
1392 struct irq_cfg *cfg;
1393 1380
1394 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1381 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
1382 mp_ioapics[apic_id].apicid, pin);
1383 return true;
1384}
1385
1386static void __init __io_apic_setup_irqs(unsigned int apic_id)
1387{
1388 int idx, node = cpu_to_node(0);
1389 struct io_apic_irq_attr attr;
1390 unsigned int pin, irq;
1395 1391
1396 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1397 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1392 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1398 idx = find_irq_entry(apic_id, pin, mp_INT); 1393 idx = find_irq_entry(apic_id, pin, mp_INT);
1399 if (idx == -1) { 1394 if (io_apic_pin_not_connected(idx, apic_id, pin))
1400 if (!notcon) {
1401 notcon = 1;
1402 apic_printk(APIC_VERBOSE,
1403 KERN_DEBUG " %d-%d",
1404 mp_ioapics[apic_id].apicid, pin);
1405 } else
1406 apic_printk(APIC_VERBOSE, " %d-%d",
1407 mp_ioapics[apic_id].apicid, pin);
1408 continue; 1395 continue;
1409 }
1410 if (notcon) {
1411 apic_printk(APIC_VERBOSE,
1412 " (apicid-pin) not connected\n");
1413 notcon = 0;
1414 }
1415 1396
1416 irq = pin_2_irq(idx, apic_id, pin); 1397 irq = pin_2_irq(idx, apic_id, pin);
1417 1398
@@ -1423,25 +1404,24 @@ static void __init setup_IO_APIC_irqs(void)
1423 * installed and if it returns 1: 1404 * installed and if it returns 1:
1424 */ 1405 */
1425 if (apic->multi_timer_check && 1406 if (apic->multi_timer_check &&
1426 apic->multi_timer_check(apic_id, irq)) 1407 apic->multi_timer_check(apic_id, irq))
1427 continue; 1408 continue;
1428 1409
1429 cfg = alloc_irq_and_cfg_at(irq, node); 1410 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1430 if (!cfg) 1411 irq_polarity(idx));
1431 continue;
1432 1412
1433 add_pin_to_irq_node(cfg, node, apic_id, pin); 1413 io_apic_setup_irq_pin(irq, node, &attr);
1434 /*
1435 * don't mark it in pin_programmed, so later acpi could
1436 * set it correctly when irq < 16
1437 */
1438 setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx),
1439 irq_polarity(idx));
1440 } 1414 }
1415}
1441 1416
1442 if (notcon) 1417static void __init setup_IO_APIC_irqs(void)
1443 apic_printk(APIC_VERBOSE, 1418{
1444 " (apicid-pin) not connected\n"); 1419 unsigned int apic_id;
1420
1421 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1422
1423 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1424 __io_apic_setup_irqs(apic_id);
1445} 1425}
1446 1426
1447/* 1427/*
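setup_IO_APIC_irqs() is split into a per-IO-APIC walker that builds an io_apic_irq_attr from the MP-table trigger/polarity and hands it to io_apic_setup_irq_pin(), while setup_IO_APIC_irq_extra() goes through io_apic_setup_irq_pin_once(), which takes over the pin_programmed bookkeeping that used to be open-coded here so a pin is only set up the first time it is seen. The caller-side pattern, roughly (helper names as in the patch):

/* illustrative: describe one pin and let the helper allocate cfg and program it */
static int __init example_setup_one_pin(int apic_id, int pin, int idx,
					unsigned int irq, int node)
{
	struct io_apic_irq_attr attr;

	/* trigger and polarity come from the MP-table entry at 'idx' */
	set_io_apic_irq_attr(&attr, apic_id, pin,
			     irq_trigger(idx), irq_polarity(idx));

	return io_apic_setup_irq_pin_once(irq, node, &attr);
}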
@@ -1452,7 +1432,7 @@ static void __init setup_IO_APIC_irqs(void)
1452void setup_IO_APIC_irq_extra(u32 gsi) 1432void setup_IO_APIC_irq_extra(u32 gsi)
1453{ 1433{
1454 int apic_id = 0, pin, idx, irq, node = cpu_to_node(0); 1434 int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
1455 struct irq_cfg *cfg; 1435 struct io_apic_irq_attr attr;
1456 1436
1457 /* 1437 /*
1458 * Convert 'gsi' to 'ioapic.pin'. 1438 * Convert 'gsi' to 'ioapic.pin'.
@@ -1472,21 +1452,10 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1472 if (apic_id == 0 || irq < NR_IRQS_LEGACY) 1452 if (apic_id == 0 || irq < NR_IRQS_LEGACY)
1473 return; 1453 return;
1474 1454
1475 cfg = alloc_irq_and_cfg_at(irq, node); 1455 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1476 if (!cfg) 1456 irq_polarity(idx));
1477 return;
1478
1479 add_pin_to_irq_node(cfg, node, apic_id, pin);
1480
1481 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
1482 pr_debug("Pin %d-%d already programmed\n",
1483 mp_ioapics[apic_id].apicid, pin);
1484 return;
1485 }
1486 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1487 1457
1488 setup_ioapic_irq(apic_id, pin, irq, cfg, 1458 io_apic_setup_irq_pin_once(irq, node, &attr);
1489 irq_trigger(idx), irq_polarity(idx));
1490} 1459}
1491 1460
1492/* 1461/*
@@ -1518,7 +1487,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1518 * The timer IRQ doesn't have to know that behind the 1487 * The timer IRQ doesn't have to know that behind the
1519 * scene we may have a 8259A-master in AEOI mode ... 1488 * scene we may have a 8259A-master in AEOI mode ...
1520 */ 1489 */
1521 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 1490 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
1491 "edge");
1522 1492
1523 /* 1493 /*
1524 * Add it to the IO-APIC irq-routing table: 1494 * Add it to the IO-APIC irq-routing table:
@@ -1625,7 +1595,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1625 for_each_active_irq(irq) { 1595 for_each_active_irq(irq) {
1626 struct irq_pin_list *entry; 1596 struct irq_pin_list *entry;
1627 1597
1628 cfg = get_irq_chip_data(irq); 1598 cfg = irq_get_chip_data(irq);
1629 if (!cfg) 1599 if (!cfg)
1630 continue; 1600 continue;
1631 entry = cfg->irq_2_pin; 1601 entry = cfg->irq_2_pin;
@@ -1916,7 +1886,7 @@ void disable_IO_APIC(void)
1916 * 1886 *
1917 * With interrupt-remapping, for now we will use virtual wire A mode, 1887 * With interrupt-remapping, for now we will use virtual wire A mode,
1918 * as virtual wire B is little complex (need to configure both 1888 * as virtual wire B is little complex (need to configure both
1919 * IOAPIC RTE aswell as interrupt-remapping table entry). 1889 * IOAPIC RTE as well as interrupt-remapping table entry).
1920 * As this gets called during crash dump, keep this simple for now. 1890 * As this gets called during crash dump, keep this simple for now.
1921 */ 1891 */
1922 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { 1892 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
@@ -2391,7 +2361,7 @@ static void irq_complete_move(struct irq_cfg *cfg)
2391 2361
2392void irq_force_complete_move(int irq) 2362void irq_force_complete_move(int irq)
2393{ 2363{
2394 struct irq_cfg *cfg = get_irq_chip_data(irq); 2364 struct irq_cfg *cfg = irq_get_chip_data(irq);
2395 2365
2396 if (!cfg) 2366 if (!cfg)
2397 return; 2367 return;
@@ -2405,7 +2375,7 @@ static inline void irq_complete_move(struct irq_cfg *cfg) { }
2405static void ack_apic_edge(struct irq_data *data) 2375static void ack_apic_edge(struct irq_data *data)
2406{ 2376{
2407 irq_complete_move(data->chip_data); 2377 irq_complete_move(data->chip_data);
2408 move_native_irq(data->irq); 2378 irq_move_irq(data);
2409 ack_APIC_irq(); 2379 ack_APIC_irq();
2410} 2380}
2411 2381
@@ -2462,7 +2432,7 @@ static void ack_apic_level(struct irq_data *data)
2462 irq_complete_move(cfg); 2432 irq_complete_move(cfg);
2463#ifdef CONFIG_GENERIC_PENDING_IRQ 2433#ifdef CONFIG_GENERIC_PENDING_IRQ
2464 /* If we are moving the irq we need to mask it */ 2434 /* If we are moving the irq we need to mask it */
2465 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { 2435 if (unlikely(irqd_is_setaffinity_pending(data))) {
2466 do_unmask_irq = 1; 2436 do_unmask_irq = 1;
2467 mask_ioapic(cfg); 2437 mask_ioapic(cfg);
2468 } 2438 }
@@ -2551,7 +2521,7 @@ static void ack_apic_level(struct irq_data *data)
2551 * and you can go talk to the chipset vendor about it. 2521 * and you can go talk to the chipset vendor about it.
2552 */ 2522 */
2553 if (!io_apic_level_ack_pending(cfg)) 2523 if (!io_apic_level_ack_pending(cfg))
2554 move_masked_irq(irq); 2524 irq_move_masked_irq(data);
2555 unmask_ioapic(cfg); 2525 unmask_ioapic(cfg);
2556 } 2526 }
2557} 2527}
@@ -2614,7 +2584,7 @@ static inline void init_IO_APIC_traps(void)
2614 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2584 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2615 */ 2585 */
2616 for_each_active_irq(irq) { 2586 for_each_active_irq(irq) {
2617 cfg = get_irq_chip_data(irq); 2587 cfg = irq_get_chip_data(irq);
2618 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { 2588 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2619 /* 2589 /*
2620 * Hmm.. We don't have an entry for this, 2590 * Hmm.. We don't have an entry for this,
@@ -2625,7 +2595,7 @@ static inline void init_IO_APIC_traps(void)
2625 legacy_pic->make_irq(irq); 2595 legacy_pic->make_irq(irq);
2626 else 2596 else
2627 /* Strange. Oh, well.. */ 2597 /* Strange. Oh, well.. */
2628 set_irq_chip(irq, &no_irq_chip); 2598 irq_set_chip(irq, &no_irq_chip);
2629 } 2599 }
2630 } 2600 }
2631} 2601}
@@ -2665,7 +2635,7 @@ static struct irq_chip lapic_chip __read_mostly = {
2665static void lapic_register_intr(int irq) 2635static void lapic_register_intr(int irq)
2666{ 2636{
2667 irq_clear_status_flags(irq, IRQ_LEVEL); 2637 irq_clear_status_flags(irq, IRQ_LEVEL);
2668 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2638 irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2669 "edge"); 2639 "edge");
2670} 2640}
2671 2641
@@ -2749,7 +2719,7 @@ int timer_through_8259 __initdata;
2749 */ 2719 */
2750static inline void __init check_timer(void) 2720static inline void __init check_timer(void)
2751{ 2721{
2752 struct irq_cfg *cfg = get_irq_chip_data(0); 2722 struct irq_cfg *cfg = irq_get_chip_data(0);
2753 int node = cpu_to_node(0); 2723 int node = cpu_to_node(0);
2754 int apic1, pin1, apic2, pin2; 2724 int apic1, pin1, apic2, pin2;
2755 unsigned long flags; 2725 unsigned long flags;
@@ -2935,7 +2905,7 @@ void __init setup_IO_APIC(void)
2935} 2905}
2936 2906
2937/* 2907/*
2938 * Called after all the initialization is done. If we didnt find any 2908 * Called after all the initialization is done. If we didn't find any
2939 * APIC bugs then we can allow the modify fast path 2909 * APIC bugs then we can allow the modify fast path
2940 */ 2910 */
2941 2911
@@ -2948,89 +2918,84 @@ static int __init io_apic_bug_finalize(void)
2948 2918
2949late_initcall(io_apic_bug_finalize); 2919late_initcall(io_apic_bug_finalize);
2950 2920
2951struct sysfs_ioapic_data { 2921static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS];
2952 struct sys_device dev;
2953 struct IO_APIC_route_entry entry[0];
2954};
2955static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
2956 2922
2957static int ioapic_suspend(struct sys_device *dev, pm_message_t state) 2923static void suspend_ioapic(int ioapic_id)
2958{ 2924{
2959 struct IO_APIC_route_entry *entry; 2925 struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
2960 struct sysfs_ioapic_data *data;
2961 int i; 2926 int i;
2962 2927
2963 data = container_of(dev, struct sysfs_ioapic_data, dev); 2928 if (!saved_data)
2964 entry = data->entry; 2929 return;
2965 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) 2930
2966 *entry = ioapic_read_entry(dev->id, i); 2931 for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
2932 saved_data[i] = ioapic_read_entry(ioapic_id, i);
2933}
2934
2935static int ioapic_suspend(void)
2936{
2937 int ioapic_id;
2938
2939 for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++)
2940 suspend_ioapic(ioapic_id);
2967 2941
2968 return 0; 2942 return 0;
2969} 2943}
2970 2944
2971static int ioapic_resume(struct sys_device *dev) 2945static void resume_ioapic(int ioapic_id)
2972{ 2946{
2973 struct IO_APIC_route_entry *entry; 2947 struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
2974 struct sysfs_ioapic_data *data;
2975 unsigned long flags; 2948 unsigned long flags;
2976 union IO_APIC_reg_00 reg_00; 2949 union IO_APIC_reg_00 reg_00;
2977 int i; 2950 int i;
2978 2951
2979 data = container_of(dev, struct sysfs_ioapic_data, dev); 2952 if (!saved_data)
2980 entry = data->entry; 2953 return;
2981 2954
2982 raw_spin_lock_irqsave(&ioapic_lock, flags); 2955 raw_spin_lock_irqsave(&ioapic_lock, flags);
2983 reg_00.raw = io_apic_read(dev->id, 0); 2956 reg_00.raw = io_apic_read(ioapic_id, 0);
2984 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { 2957 if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) {
2985 reg_00.bits.ID = mp_ioapics[dev->id].apicid; 2958 reg_00.bits.ID = mp_ioapics[ioapic_id].apicid;
2986 io_apic_write(dev->id, 0, reg_00.raw); 2959 io_apic_write(ioapic_id, 0, reg_00.raw);
2987 } 2960 }
2988 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2961 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2989 for (i = 0; i < nr_ioapic_registers[dev->id]; i++) 2962 for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
2990 ioapic_write_entry(dev->id, i, entry[i]); 2963 ioapic_write_entry(ioapic_id, i, saved_data[i]);
2964}
2991 2965
2992 return 0; 2966static void ioapic_resume(void)
2967{
2968 int ioapic_id;
2969
2970 for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
2971 resume_ioapic(ioapic_id);
2993} 2972}
2994 2973
2995static struct sysdev_class ioapic_sysdev_class = { 2974static struct syscore_ops ioapic_syscore_ops = {
2996 .name = "ioapic",
2997 .suspend = ioapic_suspend, 2975 .suspend = ioapic_suspend,
2998 .resume = ioapic_resume, 2976 .resume = ioapic_resume,
2999}; 2977};
3000 2978
3001static int __init ioapic_init_sysfs(void) 2979static int __init ioapic_init_ops(void)
3002{ 2980{
3003 struct sys_device * dev; 2981 int i;
3004 int i, size, error;
3005 2982
3006 error = sysdev_class_register(&ioapic_sysdev_class); 2983 for (i = 0; i < nr_ioapics; i++) {
3007 if (error) 2984 unsigned int size;
3008 return error;
3009 2985
3010 for (i = 0; i < nr_ioapics; i++ ) { 2986 size = nr_ioapic_registers[i]
3011 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
3012 * sizeof(struct IO_APIC_route_entry); 2987 * sizeof(struct IO_APIC_route_entry);
3013 mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); 2988 ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL);
3014 if (!mp_ioapic_data[i]) { 2989 if (!ioapic_saved_data[i])
3015 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); 2990 pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
3016 continue;
3017 }
3018 dev = &mp_ioapic_data[i]->dev;
3019 dev->id = i;
3020 dev->cls = &ioapic_sysdev_class;
3021 error = sysdev_register(dev);
3022 if (error) {
3023 kfree(mp_ioapic_data[i]);
3024 mp_ioapic_data[i] = NULL;
3025 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
3026 continue;
3027 }
3028 } 2991 }
3029 2992
2993 register_syscore_ops(&ioapic_syscore_ops);
2994
3030 return 0; 2995 return 0;
3031} 2996}
3032 2997
3033device_initcall(ioapic_init_sysfs); 2998device_initcall(ioapic_init_ops);
3034 2999
3035/* 3000/*
3036 * Dynamic irq allocate and deallocation 3001 * Dynamic irq allocate and deallocation
@@ -3060,7 +3025,7 @@ unsigned int create_irq_nr(unsigned int from, int node)
3060 raw_spin_unlock_irqrestore(&vector_lock, flags); 3025 raw_spin_unlock_irqrestore(&vector_lock, flags);
3061 3026
3062 if (ret) { 3027 if (ret) {
3063 set_irq_chip_data(irq, cfg); 3028 irq_set_chip_data(irq, cfg);
3064 irq_clear_status_flags(irq, IRQ_NOREQUEST); 3029 irq_clear_status_flags(irq, IRQ_NOREQUEST);
3065 } else { 3030 } else {
3066 free_irq_at(irq, cfg); 3031 free_irq_at(irq, cfg);
@@ -3085,7 +3050,7 @@ int create_irq(void)
3085 3050
3086void destroy_irq(unsigned int irq) 3051void destroy_irq(unsigned int irq)
3087{ 3052{
3088 struct irq_cfg *cfg = get_irq_chip_data(irq); 3053 struct irq_cfg *cfg = irq_get_chip_data(irq);
3089 unsigned long flags; 3054 unsigned long flags;
3090 3055
3091 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); 3056 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
@@ -3119,7 +3084,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3119 3084
3120 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3085 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3121 3086
3122 if (irq_remapped(get_irq_chip_data(irq))) { 3087 if (irq_remapped(cfg)) {
3123 struct irte irte; 3088 struct irte irte;
3124 int ir_index; 3089 int ir_index;
3125 u16 sub_handle; 3090 u16 sub_handle;
@@ -3291,6 +3256,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3291 3256
3292static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3257static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3293{ 3258{
3259 struct irq_chip *chip = &msi_chip;
3294 struct msi_msg msg; 3260 struct msi_msg msg;
3295 int ret; 3261 int ret;
3296 3262
@@ -3298,14 +3264,15 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3298 if (ret < 0) 3264 if (ret < 0)
3299 return ret; 3265 return ret;
3300 3266
3301 set_irq_msi(irq, msidesc); 3267 irq_set_msi_desc(irq, msidesc);
3302 write_msi_msg(irq, &msg); 3268 write_msi_msg(irq, &msg);
3303 3269
3304 if (irq_remapped(get_irq_chip_data(irq))) { 3270 if (irq_remapped(irq_get_chip_data(irq))) {
3305 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 3271 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3306 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); 3272 chip = &msi_ir_chip;
3307 } else 3273 }
3308 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3274
3275 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3309 3276
3310 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); 3277 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
3311 3278
@@ -3423,8 +3390,8 @@ int arch_setup_dmar_msi(unsigned int irq)
3423 if (ret < 0) 3390 if (ret < 0)
3424 return ret; 3391 return ret;
3425 dmar_msi_write(irq, &msg); 3392 dmar_msi_write(irq, &msg);
3426 set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, 3393 irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
3427 "edge"); 3394 "edge");
3428 return 0; 3395 return 0;
3429} 3396}
3430#endif 3397#endif
@@ -3482,6 +3449,7 @@ static struct irq_chip hpet_msi_type = {
3482 3449
3483int arch_setup_hpet_msi(unsigned int irq, unsigned int id) 3450int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3484{ 3451{
3452 struct irq_chip *chip = &hpet_msi_type;
3485 struct msi_msg msg; 3453 struct msi_msg msg;
3486 int ret; 3454 int ret;
3487 3455
@@ -3501,15 +3469,12 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3501 if (ret < 0) 3469 if (ret < 0)
3502 return ret; 3470 return ret;
3503 3471
3504 hpet_msi_write(get_irq_data(irq), &msg); 3472 hpet_msi_write(irq_get_handler_data(irq), &msg);
3505 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 3473 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3506 if (irq_remapped(get_irq_chip_data(irq))) 3474 if (irq_remapped(irq_get_chip_data(irq)))
3507 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, 3475 chip = &ir_hpet_msi_type;
3508 handle_edge_irq, "edge");
3509 else
3510 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3511 handle_edge_irq, "edge");
3512 3476
3477 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3513 return 0; 3478 return 0;
3514} 3479}
3515#endif 3480#endif
@@ -3596,7 +3561,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3596 3561
3597 write_ht_irq_msg(irq, &msg); 3562 write_ht_irq_msg(irq, &msg);
3598 3563
3599 set_irq_chip_and_handler_name(irq, &ht_irq_chip, 3564 irq_set_chip_and_handler_name(irq, &ht_irq_chip,
3600 handle_edge_irq, "edge"); 3565 handle_edge_irq, "edge");
3601 3566
3602 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); 3567 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
@@ -3605,7 +3570,40 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3605} 3570}
3606#endif /* CONFIG_HT_IRQ */ 3571#endif /* CONFIG_HT_IRQ */
3607 3572
3608int __init io_apic_get_redir_entries (int ioapic) 3573int
3574io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
3575{
3576 struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
3577 int ret;
3578
3579 if (!cfg)
3580 return -EINVAL;
3581 ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
3582 if (!ret)
3583 setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg,
3584 attr->trigger, attr->polarity);
3585 return ret;
3586}
3587
3588static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
3589 struct io_apic_irq_attr *attr)
3590{
3591 unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
3592 int ret;
3593
3594 /* Avoid redundant programming */
3595 if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) {
3596 pr_debug("Pin %d-%d already programmed\n",
3597 mp_ioapics[id].apicid, pin);
3598 return 0;
3599 }
3600 ret = io_apic_setup_irq_pin(irq, node, attr);
3601 if (!ret)
3602 set_bit(pin, mp_ioapic_routing[id].pin_programmed);
3603 return ret;
3604}
3605
3606static int __init io_apic_get_redir_entries(int ioapic)
3609{ 3607{
3610 union IO_APIC_reg_01 reg_01; 3608 union IO_APIC_reg_01 reg_01;
3611 unsigned long flags; 3609 unsigned long flags;
@@ -3659,96 +3657,24 @@ int __init arch_probe_nr_irqs(void)
3659} 3657}
3660#endif 3658#endif
3661 3659
3662static int __io_apic_set_pci_routing(struct device *dev, int irq, 3660int io_apic_set_pci_routing(struct device *dev, int irq,
3663 struct io_apic_irq_attr *irq_attr) 3661 struct io_apic_irq_attr *irq_attr)
3664{ 3662{
3665 struct irq_cfg *cfg;
3666 int node; 3663 int node;
3667 int ioapic, pin;
3668 int trigger, polarity;
3669 3664
3670 ioapic = irq_attr->ioapic;
3671 if (!IO_APIC_IRQ(irq)) { 3665 if (!IO_APIC_IRQ(irq)) {
3672 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3666 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3673 ioapic); 3667 irq_attr->ioapic);
3674 return -EINVAL; 3668 return -EINVAL;
3675 } 3669 }
3676 3670
3677 if (dev) 3671 node = dev ? dev_to_node(dev) : cpu_to_node(0);
3678 node = dev_to_node(dev);
3679 else
3680 node = cpu_to_node(0);
3681
3682 cfg = alloc_irq_and_cfg_at(irq, node);
3683 if (!cfg)
3684 return 0;
3685
3686 pin = irq_attr->ioapic_pin;
3687 trigger = irq_attr->trigger;
3688 polarity = irq_attr->polarity;
3689
3690 /*
3691 * IRQs < 16 are already in the irq_2_pin[] map
3692 */
3693 if (irq >= legacy_pic->nr_legacy_irqs) {
3694 if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) {
3695 printk(KERN_INFO "can not add pin %d for irq %d\n",
3696 pin, irq);
3697 return 0;
3698 }
3699 }
3700
3701 setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity);
3702
3703 return 0;
3704}
3705
3706int io_apic_set_pci_routing(struct device *dev, int irq,
3707 struct io_apic_irq_attr *irq_attr)
3708{
3709 int ioapic, pin;
3710 /*
3711 * Avoid pin reprogramming. PRTs typically include entries
3712 * with redundant pin->gsi mappings (but unique PCI devices);
3713 * we only program the IOAPIC on the first.
3714 */
3715 ioapic = irq_attr->ioapic;
3716 pin = irq_attr->ioapic_pin;
3717 if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3718 pr_debug("Pin %d-%d already programmed\n",
3719 mp_ioapics[ioapic].apicid, pin);
3720 return 0;
3721 }
3722 set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
3723
3724 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3725}
3726
3727u8 __init io_apic_unique_id(u8 id)
3728{
3729#ifdef CONFIG_X86_32
3730 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3731 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3732 return io_apic_get_unique_id(nr_ioapics, id);
3733 else
3734 return id;
3735#else
3736 int i;
3737 DECLARE_BITMAP(used, 256);
3738 3672
3739 bitmap_zero(used, 256); 3673 return io_apic_setup_irq_pin_once(irq, node, irq_attr);
3740 for (i = 0; i < nr_ioapics; i++) {
3741 struct mpc_ioapic *ia = &mp_ioapics[i];
3742 __set_bit(ia->apicid, used);
3743 }
3744 if (!test_bit(id, used))
3745 return id;
3746 return find_first_zero_bit(used, 256);
3747#endif
3748} 3674}
3749 3675
3750#ifdef CONFIG_X86_32 3676#ifdef CONFIG_X86_32
3751int __init io_apic_get_unique_id(int ioapic, int apic_id) 3677static int __init io_apic_get_unique_id(int ioapic, int apic_id)
3752{ 3678{
3753 union IO_APIC_reg_00 reg_00; 3679 union IO_APIC_reg_00 reg_00;
3754 static physid_mask_t apic_id_map = PHYSID_MASK_NONE; 3680 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -3821,9 +3747,33 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3821 3747
3822 return apic_id; 3748 return apic_id;
3823} 3749}
3750
3751static u8 __init io_apic_unique_id(u8 id)
3752{
3753 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3754 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3755 return io_apic_get_unique_id(nr_ioapics, id);
3756 else
3757 return id;
3758}
3759#else
3760static u8 __init io_apic_unique_id(u8 id)
3761{
3762 int i;
3763 DECLARE_BITMAP(used, 256);
3764
3765 bitmap_zero(used, 256);
3766 for (i = 0; i < nr_ioapics; i++) {
3767 struct mpc_ioapic *ia = &mp_ioapics[i];
3768 __set_bit(ia->apicid, used);
3769 }
3770 if (!test_bit(id, used))
3771 return id;
3772 return find_first_zero_bit(used, 256);
3773}
3824#endif 3774#endif
3825 3775
3826int __init io_apic_get_version(int ioapic) 3776static int __init io_apic_get_version(int ioapic)
3827{ 3777{
3828 union IO_APIC_reg_01 reg_01; 3778 union IO_APIC_reg_01 reg_01;
3829 unsigned long flags; 3779 unsigned long flags;
@@ -3868,8 +3818,8 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
3868void __init setup_ioapic_dest(void) 3818void __init setup_ioapic_dest(void)
3869{ 3819{
3870 int pin, ioapic, irq, irq_entry; 3820 int pin, ioapic, irq, irq_entry;
3871 struct irq_desc *desc;
3872 const struct cpumask *mask; 3821 const struct cpumask *mask;
3822 struct irq_data *idata;
3873 3823
3874 if (skip_ioapic_setup == 1) 3824 if (skip_ioapic_setup == 1)
3875 return; 3825 return;
@@ -3884,21 +3834,20 @@ void __init setup_ioapic_dest(void)
3884 if ((ioapic > 0) && (irq > 16)) 3834 if ((ioapic > 0) && (irq > 16))
3885 continue; 3835 continue;
3886 3836
3887 desc = irq_to_desc(irq); 3837 idata = irq_get_irq_data(irq);
3888 3838
3889 /* 3839 /*
3890 * Honour affinities which have been set in early boot 3840 * Honour affinities which have been set in early boot
3891 */ 3841 */
3892 if (desc->status & 3842 if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
3893 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 3843 mask = idata->affinity;
3894 mask = desc->irq_data.affinity;
3895 else 3844 else
3896 mask = apic->target_cpus(); 3845 mask = apic->target_cpus();
3897 3846
3898 if (intr_remapping_enabled) 3847 if (intr_remapping_enabled)
3899 ir_ioapic_set_affinity(&desc->irq_data, mask, false); 3848 ir_ioapic_set_affinity(idata, mask, false);
3900 else 3849 else
3901 ioapic_set_affinity(&desc->irq_data, mask, false); 3850 ioapic_set_affinity(idata, mask, false);
3902 } 3851 }
3903 3852
3904} 3853}
@@ -4002,6 +3951,9 @@ int mp_find_ioapic(u32 gsi)
4002{ 3951{
4003 int i = 0; 3952 int i = 0;
4004 3953
3954 if (nr_ioapics == 0)
3955 return -1;
3956
4005 /* Find the IOAPIC that manages this GSI. */ 3957 /* Find the IOAPIC that manages this GSI. */
4006 for (i = 0; i < nr_ioapics; i++) { 3958 for (i = 0; i < nr_ioapics; i++) {
4007 if ((gsi >= mp_gsi_routing[i].gsi_base) 3959 if ((gsi >= mp_gsi_routing[i].gsi_base)
@@ -4023,10 +3975,10 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
4023 return gsi - mp_gsi_routing[ioapic].gsi_base; 3975 return gsi - mp_gsi_routing[ioapic].gsi_base;
4024} 3976}
4025 3977
4026static int bad_ioapic(unsigned long address) 3978static __init int bad_ioapic(unsigned long address)
4027{ 3979{
4028 if (nr_ioapics >= MAX_IO_APICS) { 3980 if (nr_ioapics >= MAX_IO_APICS) {
4029 printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " 3981 printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
4030 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); 3982 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
4031 return 1; 3983 return 1;
4032 } 3984 }
@@ -4083,20 +4035,16 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4083/* Enable IOAPIC early just for system timer */ 4035/* Enable IOAPIC early just for system timer */
4084void __init pre_init_apic_IRQ0(void) 4036void __init pre_init_apic_IRQ0(void)
4085{ 4037{
4086 struct irq_cfg *cfg; 4038 struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
4087 4039
4088 printk(KERN_INFO "Early APIC setup for system timer0\n"); 4040 printk(KERN_INFO "Early APIC setup for system timer0\n");
4089#ifndef CONFIG_SMP 4041#ifndef CONFIG_SMP
4090 physid_set_mask_of_physid(boot_cpu_physical_apicid, 4042 physid_set_mask_of_physid(boot_cpu_physical_apicid,
4091 &phys_cpu_present_map); 4043 &phys_cpu_present_map);
4092#endif 4044#endif
4093 /* Make sure the irq descriptor is set up */
4094 cfg = alloc_irq_and_cfg_at(0, 0);
4095
4096 setup_local_APIC(); 4045 setup_local_APIC();
4097 4046
4098 add_pin_to_irq_node(cfg, 0, 0, 0); 4047 io_apic_setup_irq_pin(0, 0, &attr);
4099 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 4048 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
4100 4049 "edge");
4101 setup_ioapic_irq(0, 0, 0, cfg, 0, 0);
4102} 4050}
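
The io_apic.c hunks above fold the repeated alloc_irq_and_cfg_at() + add_pin_to_irq_node() + setup_ioapic_irq() sequences into io_apic_setup_irq_pin() and io_apic_setup_irq_pin_once(). A minimal sketch of the resulting caller pattern follows; the ioapic/pin/trigger/polarity values are placeholders, not taken from the patch (real callers derive them from the MP or ACPI tables via irq_trigger()/irq_polarity()).

/*
 * Illustrative sketch only -- not part of the patch.
 * Programs a single IO-APIC pin through the consolidated helpers.
 */
static int __init example_route_one_pin(void)
{
	struct io_apic_irq_attr attr;
	int node = cpu_to_node(0);

	/* placeholder values: ioapic 0, pin 9, level trigger, active low */
	set_io_apic_irq_attr(&attr, 0, 9, 1, 1);

	/* programs the RTE once; a later call for the same pin is a no-op */
	return io_apic_setup_irq_pin_once(9, node, &attr);
}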
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 08385e090a6f..cce91bf26676 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
56 local_irq_restore(flags); 56 local_irq_restore(flags);
57} 57}
58 58
59#ifdef CONFIG_X86_32
60
59void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, 61void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
60 int vector) 62 int vector)
61{ 63{
@@ -71,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
71 local_irq_save(flags); 73 local_irq_save(flags);
72 for_each_cpu(query_cpu, mask) 74 for_each_cpu(query_cpu, mask)
73 __default_send_IPI_dest_field( 75 __default_send_IPI_dest_field(
74 apic->cpu_to_logical_apicid(query_cpu), vector, 76 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
75 apic->dest_logical); 77 vector, apic->dest_logical);
76 local_irq_restore(flags); 78 local_irq_restore(flags);
77} 79}
78 80
@@ -90,14 +92,12 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
90 if (query_cpu == this_cpu) 92 if (query_cpu == this_cpu)
91 continue; 93 continue;
92 __default_send_IPI_dest_field( 94 __default_send_IPI_dest_field(
93 apic->cpu_to_logical_apicid(query_cpu), vector, 95 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
94 apic->dest_logical); 96 vector, apic->dest_logical);
95 } 97 }
96 local_irq_restore(flags); 98 local_irq_restore(flags);
97} 99}
98 100
99#ifdef CONFIG_X86_32
100
101/* 101/*
102 * This is only used on smaller machines. 102 * This is only used on smaller machines.
103 */ 103 */
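
The ipi.c hunk above drops the apic->cpu_to_logical_apicid() callback in favour of reading the x86_cpu_to_logical_apicid early per-cpu map directly. A reduced sketch of the per-CPU send path (essentially one iteration of the loop above; assumes CONFIG_X86_32 and logical destination mode):

/* Sketch only: send one vector to one CPU using its early logical APIC ID. */
static void example_send_IPI_one_logical(int cpu, int vector)
{
	unsigned long flags;

	local_irq_save(flags);
	__default_send_IPI_dest_field(
		early_per_cpu(x86_cpu_to_logical_apicid, cpu),
		vector, apic->dest_logical);
	local_irq_restore(flags);
}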
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 960f26ab5c9f..6273eee5134b 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -373,13 +373,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask
373 return physids_promote(0xFUL, retmap); 373 return physids_promote(0xFUL, retmap);
374} 374}
375 375
376static inline int numaq_cpu_to_logical_apicid(int cpu)
377{
378 if (cpu >= nr_cpu_ids)
379 return BAD_APICID;
380 return cpu_2_logical_apicid[cpu];
381}
382
383/* 376/*
384 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent 377 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
385 * cpu to APIC ID relation to properly interact with the intelligent 378 * cpu to APIC ID relation to properly interact with the intelligent
@@ -398,6 +391,15 @@ static inline int numaq_apicid_to_node(int logical_apicid)
398 return logical_apicid >> 4; 391 return logical_apicid >> 4;
399} 392}
400 393
394static int numaq_numa_cpu_node(int cpu)
395{
396 int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
397
398 if (logical_apicid != BAD_APICID)
399 return numaq_apicid_to_node(logical_apicid);
400 return NUMA_NO_NODE;
401}
402
401static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) 403static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
402{ 404{
403 int node = numaq_apicid_to_node(logical_apicid); 405 int node = numaq_apicid_to_node(logical_apicid);
@@ -508,8 +510,6 @@ struct apic __refdata apic_numaq = {
508 .ioapic_phys_id_map = numaq_ioapic_phys_id_map, 510 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
509 .setup_apic_routing = numaq_setup_apic_routing, 511 .setup_apic_routing = numaq_setup_apic_routing,
510 .multi_timer_check = numaq_multi_timer_check, 512 .multi_timer_check = numaq_multi_timer_check,
511 .apicid_to_node = numaq_apicid_to_node,
512 .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
513 .cpu_present_to_apicid = numaq_cpu_present_to_apicid, 513 .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
514 .apicid_to_cpu_present = numaq_apicid_to_cpu_present, 514 .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
515 .setup_portio_remap = numaq_setup_portio_remap, 515 .setup_portio_remap = numaq_setup_portio_remap,
@@ -547,4 +547,7 @@ struct apic __refdata apic_numaq = {
547 .icr_write = native_apic_icr_write, 547 .icr_write = native_apic_icr_write,
548 .wait_icr_idle = native_apic_wait_icr_idle, 548 .wait_icr_idle = native_apic_wait_icr_idle,
549 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 549 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
550
551 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
552 .x86_32_numa_cpu_node = numaq_numa_cpu_node,
550}; 553};
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 99d2fe016084..fc84c7b61108 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -77,6 +77,11 @@ void __init default_setup_apic_routing(void)
77 apic->setup_apic_routing(); 77 apic->setup_apic_routing();
78} 78}
79 79
80static int default_x86_32_early_logical_apicid(int cpu)
81{
82 return 1 << cpu;
83}
84
80static void setup_apic_flat_routing(void) 85static void setup_apic_flat_routing(void)
81{ 86{
82#ifdef CONFIG_X86_IO_APIC 87#ifdef CONFIG_X86_IO_APIC
@@ -130,8 +135,6 @@ struct apic apic_default = {
130 .ioapic_phys_id_map = default_ioapic_phys_id_map, 135 .ioapic_phys_id_map = default_ioapic_phys_id_map,
131 .setup_apic_routing = setup_apic_flat_routing, 136 .setup_apic_routing = setup_apic_flat_routing,
132 .multi_timer_check = NULL, 137 .multi_timer_check = NULL,
133 .apicid_to_node = default_apicid_to_node,
134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
135 .cpu_present_to_apicid = default_cpu_present_to_apicid, 138 .cpu_present_to_apicid = default_cpu_present_to_apicid,
136 .apicid_to_cpu_present = physid_set_mask_of_physid, 139 .apicid_to_cpu_present = physid_set_mask_of_physid,
137 .setup_portio_remap = NULL, 140 .setup_portio_remap = NULL,
@@ -167,6 +170,9 @@ struct apic apic_default = {
167 .icr_write = native_apic_icr_write, 170 .icr_write = native_apic_icr_write,
168 .wait_icr_idle = native_apic_wait_icr_idle, 171 .wait_icr_idle = native_apic_wait_icr_idle,
169 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 172 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
173
174 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
175 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
170}; 176};
171 177
172extern struct apic apic_numaq; 178extern struct apic apic_numaq;
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9b419263d90d..e4b8059b414a 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit)
194 return 1; 194 return 1;
195} 195}
196 196
197static void summit_init_apic_ldr(void) 197static int summit_early_logical_apicid(int cpu)
198{ 198{
199 unsigned long val, id;
200 int count = 0; 199 int count = 0;
201 u8 my_id = (u8)hard_smp_processor_id(); 200 u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
202 u8 my_cluster = APIC_CLUSTER(my_id); 201 u8 my_cluster = APIC_CLUSTER(my_id);
203#ifdef CONFIG_SMP 202#ifdef CONFIG_SMP
204 u8 lid; 203 u8 lid;
@@ -206,7 +205,7 @@ static void summit_init_apic_ldr(void)
206 205
207 /* Create logical APIC IDs by counting CPUs already in cluster. */ 206 /* Create logical APIC IDs by counting CPUs already in cluster. */
208 for (count = 0, i = nr_cpu_ids; --i >= 0; ) { 207 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
209 lid = cpu_2_logical_apicid[i]; 208 lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
210 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) 209 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
211 ++count; 210 ++count;
212 } 211 }
@@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void)
214 /* We only have a 4 wide bitmap in cluster mode. If a deranged 213 /* We only have a 4 wide bitmap in cluster mode. If a deranged
215 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ 214 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
216 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); 215 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
217 id = my_cluster | (1UL << count); 216 return my_cluster | (1UL << count);
217}
218
219static void summit_init_apic_ldr(void)
220{
221 int cpu = smp_processor_id();
222 unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
223 unsigned long val;
224
218 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); 225 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
219 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; 226 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
220 val |= SET_APIC_LOGICAL_ID(id); 227 val |= SET_APIC_LOGICAL_ID(id);
@@ -232,27 +239,6 @@ static void summit_setup_apic_routing(void)
232 nr_ioapics); 239 nr_ioapics);
233} 240}
234 241
235static int summit_apicid_to_node(int logical_apicid)
236{
237#ifdef CONFIG_SMP
238 return apicid_2_node[hard_smp_processor_id()];
239#else
240 return 0;
241#endif
242}
243
244/* Mapping from cpu number to logical apicid */
245static inline int summit_cpu_to_logical_apicid(int cpu)
246{
247#ifdef CONFIG_SMP
248 if (cpu >= nr_cpu_ids)
249 return BAD_APICID;
250 return cpu_2_logical_apicid[cpu];
251#else
252 return logical_smp_processor_id();
253#endif
254}
255
256static int summit_cpu_present_to_apicid(int mps_cpu) 242static int summit_cpu_present_to_apicid(int mps_cpu)
257{ 243{
258 if (mps_cpu < nr_cpu_ids) 244 if (mps_cpu < nr_cpu_ids)
@@ -286,7 +272,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
286 * The cpus in the mask must all be on the apic cluster. 272 * The cpus in the mask must all be on the apic cluster.
287 */ 273 */
288 for_each_cpu(cpu, cpumask) { 274 for_each_cpu(cpu, cpumask) {
289 int new_apicid = summit_cpu_to_logical_apicid(cpu); 275 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
290 276
291 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 277 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
292 printk("%s: Not a valid mask!\n", __func__); 278 printk("%s: Not a valid mask!\n", __func__);
@@ -301,7 +287,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
301static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, 287static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
302 const struct cpumask *andmask) 288 const struct cpumask *andmask)
303{ 289{
304 int apicid = summit_cpu_to_logical_apicid(0); 290 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
305 cpumask_var_t cpumask; 291 cpumask_var_t cpumask;
306 292
307 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 293 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -528,8 +514,6 @@ struct apic apic_summit = {
528 .ioapic_phys_id_map = summit_ioapic_phys_id_map, 514 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
529 .setup_apic_routing = summit_setup_apic_routing, 515 .setup_apic_routing = summit_setup_apic_routing,
530 .multi_timer_check = NULL, 516 .multi_timer_check = NULL,
531 .apicid_to_node = summit_apicid_to_node,
532 .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
533 .cpu_present_to_apicid = summit_cpu_present_to_apicid, 517 .cpu_present_to_apicid = summit_cpu_present_to_apicid,
534 .apicid_to_cpu_present = summit_apicid_to_cpu_present, 518 .apicid_to_cpu_present = summit_apicid_to_cpu_present,
535 .setup_portio_remap = NULL, 519 .setup_portio_remap = NULL,
@@ -565,4 +549,7 @@ struct apic apic_summit = {
565 .icr_write = native_apic_icr_write, 549 .icr_write = native_apic_icr_write,
566 .wait_icr_idle = native_apic_wait_icr_idle, 550 .wait_icr_idle = native_apic_wait_icr_idle,
567 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
552
553 .x86_32_early_logical_apicid = summit_early_logical_apicid,
554 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
568}; 555};
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cf69c59f4910..90949bbd566d 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -206,8 +206,6 @@ struct apic apic_x2apic_cluster = {
206 .ioapic_phys_id_map = NULL, 206 .ioapic_phys_id_map = NULL,
207 .setup_apic_routing = NULL, 207 .setup_apic_routing = NULL,
208 .multi_timer_check = NULL, 208 .multi_timer_check = NULL,
209 .apicid_to_node = NULL,
210 .cpu_to_logical_apicid = NULL,
211 .cpu_present_to_apicid = default_cpu_present_to_apicid, 209 .cpu_present_to_apicid = default_cpu_present_to_apicid,
212 .apicid_to_cpu_present = NULL, 210 .apicid_to_cpu_present = NULL,
213 .setup_portio_remap = NULL, 211 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 8972f38c5ced..c7e6d6645bf4 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -195,8 +195,6 @@ struct apic apic_x2apic_phys = {
195 .ioapic_phys_id_map = NULL, 195 .ioapic_phys_id_map = NULL,
196 .setup_apic_routing = NULL, 196 .setup_apic_routing = NULL,
197 .multi_timer_check = NULL, 197 .multi_timer_check = NULL,
198 .apicid_to_node = NULL,
199 .cpu_to_logical_apicid = NULL,
200 .cpu_present_to_apicid = default_cpu_present_to_apicid, 198 .cpu_present_to_apicid = default_cpu_present_to_apicid,
201 .apicid_to_cpu_present = NULL, 199 .apicid_to_cpu_present = NULL,
202 .setup_portio_remap = NULL, 200 .setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index bd16b58b8850..33b10a0fc095 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -23,6 +23,8 @@
23#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/pci.h> 24#include <linux/pci.h>
25#include <linux/kdebug.h> 25#include <linux/kdebug.h>
26#include <linux/delay.h>
27#include <linux/crash_dump.h>
26 28
27#include <asm/uv/uv_mmrs.h> 29#include <asm/uv/uv_mmrs.h>
28#include <asm/uv/uv_hub.h> 30#include <asm/uv/uv_hub.h>
@@ -34,6 +36,7 @@
34#include <asm/ipi.h> 36#include <asm/ipi.h>
35#include <asm/smp.h> 37#include <asm/smp.h>
36#include <asm/x86_init.h> 38#include <asm/x86_init.h>
39#include <asm/emergency-restart.h>
37 40
38DEFINE_PER_CPU(int, x2apic_extra_bits); 41DEFINE_PER_CPU(int, x2apic_extra_bits);
39 42
@@ -338,8 +341,6 @@ struct apic __refdata apic_x2apic_uv_x = {
338 .ioapic_phys_id_map = NULL, 341 .ioapic_phys_id_map = NULL,
339 .setup_apic_routing = NULL, 342 .setup_apic_routing = NULL,
340 .multi_timer_check = NULL, 343 .multi_timer_check = NULL,
341 .apicid_to_node = NULL,
342 .cpu_to_logical_apicid = NULL,
343 .cpu_present_to_apicid = default_cpu_present_to_apicid, 344 .cpu_present_to_apicid = default_cpu_present_to_apicid,
344 .apicid_to_cpu_present = NULL, 345 .apicid_to_cpu_present = NULL,
345 .setup_portio_remap = NULL, 346 .setup_portio_remap = NULL,
@@ -812,4 +813,11 @@ void __init uv_system_init(void)
812 813
813 /* register Legacy VGA I/O redirection handler */ 814 /* register Legacy VGA I/O redirection handler */
814 pci_register_set_vga_state(uv_set_vga_state); 815 pci_register_set_vga_state(uv_set_vga_state);
816
817 /*
818 * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
819 * EFI is not enabled in the kdump kernel.
820 */
821 if (is_kdump_kernel())
822 reboot_type = BOOT_ACPI;
815} 823}
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 0e4f24c2a746..0b4be431c620 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -66,7 +66,7 @@
66 * 1.5: Fix segment register reloading (in case of bad segments saved 66 * 1.5: Fix segment register reloading (in case of bad segments saved
67 * across BIOS call). 67 * across BIOS call).
68 * Stephen Rothwell 68 * Stephen Rothwell
69 * 1.6: Cope with complier/assembler differences. 69 * 1.6: Cope with compiler/assembler differences.
70 * Only try to turn off the first display device. 70 * Only try to turn off the first display device.
71 * Fix OOPS at power off with no APM BIOS by Jan Echternach 71 * Fix OOPS at power off with no APM BIOS by Jan Echternach
72 * <echter@informatik.uni-rostock.de> 72 * <echter@informatik.uni-rostock.de>
@@ -227,6 +227,7 @@
227#include <linux/suspend.h> 227#include <linux/suspend.h>
228#include <linux/kthread.h> 228#include <linux/kthread.h>
229#include <linux/jiffies.h> 229#include <linux/jiffies.h>
230#include <linux/acpi.h>
230 231
231#include <asm/system.h> 232#include <asm/system.h>
232#include <asm/uaccess.h> 233#include <asm/uaccess.h>
@@ -975,20 +976,10 @@ recalc:
975 976
976static void apm_power_off(void) 977static void apm_power_off(void)
977{ 978{
978 unsigned char po_bios_call[] = {
979 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
980 0x8e, 0xd0, /* movw ax,ss */
981 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
982 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
983 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
984 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
985 0xcd, 0x15 /* int $0x15 */
986 };
987
988 /* Some bioses don't like being called from CPU != 0 */ 979 /* Some bioses don't like being called from CPU != 0 */
989 if (apm_info.realmode_power_off) { 980 if (apm_info.realmode_power_off) {
990 set_cpus_allowed_ptr(current, cpumask_of(0)); 981 set_cpus_allowed_ptr(current, cpumask_of(0));
991 machine_real_restart(po_bios_call, sizeof(po_bios_call)); 982 machine_real_restart(MRR_APM);
992 } else { 983 } else {
993 (void)set_system_power_state(APM_STATE_OFF); 984 (void)set_system_power_state(APM_STATE_OFF);
994 } 985 }
@@ -2331,12 +2322,11 @@ static int __init apm_init(void)
2331 apm_info.disabled = 1; 2322 apm_info.disabled = 1;
2332 return -ENODEV; 2323 return -ENODEV;
2333 } 2324 }
2334 if (pm_flags & PM_ACPI) { 2325 if (!acpi_disabled) {
2335 printk(KERN_NOTICE "apm: overridden by ACPI.\n"); 2326 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2336 apm_info.disabled = 1; 2327 apm_info.disabled = 1;
2337 return -ENODEV; 2328 return -ENODEV;
2338 } 2329 }
2339 pm_flags |= PM_APM;
2340 2330
2341 /* 2331 /*
2342 * Set up the long jump entry point to the APM BIOS, which is called 2332 * Set up the long jump entry point to the APM BIOS, which is called
@@ -2428,7 +2418,6 @@ static void __exit apm_exit(void)
2428 kthread_stop(kapmd_task); 2418 kthread_stop(kapmd_task);
2429 kapmd_task = NULL; 2419 kapmd_task = NULL;
2430 } 2420 }
2431 pm_flags &= ~PM_APM;
2432} 2421}
2433 2422
2434module_init(apm_init); 2423module_init(apm_init);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cfa82c899f47..4f13fafc5264 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -1,5 +1,70 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/hardirq.h>
12#include <linux/suspend.h>
13#include <linux/kbuild.h>
14#include <asm/processor.h>
15#include <asm/thread_info.h>
16#include <asm/sigframe.h>
17#include <asm/bootparam.h>
18#include <asm/suspend.h>
19
20#ifdef CONFIG_XEN
21#include <xen/interface/xen.h>
22#endif
23
1#ifdef CONFIG_X86_32 24#ifdef CONFIG_X86_32
2# include "asm-offsets_32.c" 25# include "asm-offsets_32.c"
3#else 26#else
4# include "asm-offsets_64.c" 27# include "asm-offsets_64.c"
5#endif 28#endif
29
30void common(void) {
31 BLANK();
32 OFFSET(TI_flags, thread_info, flags);
33 OFFSET(TI_status, thread_info, status);
34 OFFSET(TI_addr_limit, thread_info, addr_limit);
35 OFFSET(TI_preempt_count, thread_info, preempt_count);
36
37 BLANK();
38 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
39
40 BLANK();
41 OFFSET(pbe_address, pbe, address);
42 OFFSET(pbe_orig_address, pbe, orig_address);
43 OFFSET(pbe_next, pbe, next);
44
45#ifdef CONFIG_PARAVIRT
46 BLANK();
47 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
48 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
49 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
50 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
51 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
52 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
53 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
54 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
55 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
56#endif
57
58#ifdef CONFIG_XEN
59 BLANK();
60 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
61 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
62#endif
63
64 BLANK();
65 OFFSET(BP_scratch, boot_params, scratch);
66 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
68 OFFSET(BP_version, boot_params, hdr.version);
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70}
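
The new common() above gathers the offsets shared by the 32- and 64-bit asm-offsets files. As a reminder of what the kbuild macros produce, a small hypothetical example (the EXAMPLE_* name is a placeholder, not from the patch); each OFFSET()/DEFINE() line ends up as a #define in the generated asm-offsets.h:

/* Sketch only: kbuild.h macros turn C struct layout into assembler constants. */
#include <linux/kbuild.h>
#include <asm/thread_info.h>

void example_offsets(void)
{
	/* emits TI_flags as offsetof(struct thread_info, flags) */
	OFFSET(TI_flags, thread_info, flags);
	/* emits EXAMPLE_THREAD_SIZE as the value of THREAD_SIZE */
	DEFINE(EXAMPLE_THREAD_SIZE, THREAD_SIZE);
	BLANK();	/* blank line in the generated header */
}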
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a4088dda37a..c29d631af6fc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,26 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/signal.h>
10#include <linux/personality.h>
11#include <linux/suspend.h>
12#include <linux/kbuild.h>
13#include <asm/ucontext.h> 1#include <asm/ucontext.h>
14#include <asm/sigframe.h>
15#include <asm/pgtable.h>
16#include <asm/fixmap.h>
17#include <asm/processor.h>
18#include <asm/thread_info.h>
19#include <asm/bootparam.h>
20#include <asm/elf.h>
21#include <asm/suspend.h>
22
23#include <xen/interface/xen.h>
24 2
25#include <linux/lguest.h> 3#include <linux/lguest.h>
26#include "../../../drivers/lguest/lg.h" 4#include "../../../drivers/lguest/lg.h"
@@ -51,21 +29,10 @@ void foo(void)
51 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); 29 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
52 BLANK(); 30 BLANK();
53 31
54 OFFSET(TI_task, thread_info, task);
55 OFFSET(TI_exec_domain, thread_info, exec_domain);
56 OFFSET(TI_flags, thread_info, flags);
57 OFFSET(TI_status, thread_info, status);
58 OFFSET(TI_preempt_count, thread_info, preempt_count);
59 OFFSET(TI_addr_limit, thread_info, addr_limit);
60 OFFSET(TI_restart_block, thread_info, restart_block);
61 OFFSET(TI_sysenter_return, thread_info, sysenter_return); 32 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
62 OFFSET(TI_cpu, thread_info, cpu); 33 OFFSET(TI_cpu, thread_info, cpu);
63 BLANK(); 34 BLANK();
64 35
65 OFFSET(GDS_size, desc_ptr, size);
66 OFFSET(GDS_address, desc_ptr, address);
67 BLANK();
68
69 OFFSET(PT_EBX, pt_regs, bx); 36 OFFSET(PT_EBX, pt_regs, bx);
70 OFFSET(PT_ECX, pt_regs, cx); 37 OFFSET(PT_ECX, pt_regs, cx);
71 OFFSET(PT_EDX, pt_regs, dx); 38 OFFSET(PT_EDX, pt_regs, dx);
@@ -85,42 +52,13 @@ void foo(void)
85 OFFSET(PT_OLDSS, pt_regs, ss); 52 OFFSET(PT_OLDSS, pt_regs, ss);
86 BLANK(); 53 BLANK();
87 54
88 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
89 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 55 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
90 BLANK(); 56 BLANK();
91 57
92 OFFSET(pbe_address, pbe, address);
93 OFFSET(pbe_orig_address, pbe, orig_address);
94 OFFSET(pbe_next, pbe, next);
95
96 /* Offset from the sysenter stack to tss.sp0 */ 58 /* Offset from the sysenter stack to tss.sp0 */
97 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 59 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
98 sizeof(struct tss_struct)); 60 sizeof(struct tss_struct));
99 61
100 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
101 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
102 DEFINE(THREAD_SIZE_asm, THREAD_SIZE);
103
104 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
105
106#ifdef CONFIG_PARAVIRT
107 BLANK();
108 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
109 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
110 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
111 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
112 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
113 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
114 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
115 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
116#endif
117
118#ifdef CONFIG_XEN
119 BLANK();
120 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
121 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
122#endif
123
124#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 62#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
125 BLANK(); 63 BLANK();
126 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
@@ -139,11 +77,4 @@ void foo(void)
139 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 77 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
140 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 78 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
141#endif 79#endif
142
143 BLANK();
144 OFFSET(BP_scratch, boot_params, scratch);
145 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
146 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
147 OFFSET(BP_version, boot_params, hdr.version);
148 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
149} 80}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4a6aeedcd965..e72a1194af22 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,27 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/errno.h>
12#include <linux/hardirq.h>
13#include <linux/suspend.h>
14#include <linux/kbuild.h>
15#include <asm/processor.h>
16#include <asm/segment.h>
17#include <asm/thread_info.h>
18#include <asm/ia32.h> 1#include <asm/ia32.h>
19#include <asm/bootparam.h>
20#include <asm/suspend.h>
21
22#include <xen/interface/xen.h>
23
24#include <asm/sigframe.h>
25 2
26#define __NO_STUBS 1 3#define __NO_STUBS 1
27#undef __SYSCALL 4#undef __SYSCALL
@@ -33,41 +10,19 @@ static char syscalls[] = {
33 10
34int main(void) 11int main(void)
35{ 12{
36#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
37 ENTRY(state);
38 ENTRY(flags);
39 ENTRY(pid);
40 BLANK();
41#undef ENTRY
42#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
43 ENTRY(flags);
44 ENTRY(addr_limit);
45 ENTRY(preempt_count);
46 ENTRY(status);
47#ifdef CONFIG_IA32_EMULATION
48 ENTRY(sysenter_return);
49#endif
50 BLANK();
51#undef ENTRY
52#ifdef CONFIG_PARAVIRT 13#ifdef CONFIG_PARAVIRT
53 BLANK();
54 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
55 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
56 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
57 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
58 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
59 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); 14 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
60 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
61 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); 15 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
62 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); 16 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
63 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
64 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); 17 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
65 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); 18 BLANK();
66#endif 19#endif
67 20
68
69#ifdef CONFIG_IA32_EMULATION 21#ifdef CONFIG_IA32_EMULATION
70#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) 22 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
23 BLANK();
24
25#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
71 ENTRY(ax); 26 ENTRY(ax);
72 ENTRY(bx); 27 ENTRY(bx);
73 ENTRY(cx); 28 ENTRY(cx);
@@ -79,15 +34,12 @@ int main(void)
79 ENTRY(ip); 34 ENTRY(ip);
80 BLANK(); 35 BLANK();
81#undef ENTRY 36#undef ENTRY
82 DEFINE(IA32_RT_SIGFRAME_sigcontext, 37
83 offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); 38 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
84 BLANK(); 39 BLANK();
85#endif 40#endif
86 DEFINE(pbe_address, offsetof(struct pbe, address)); 41
87 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); 42#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
88 DEFINE(pbe_next, offsetof(struct pbe, next));
89 BLANK();
90#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
91 ENTRY(bx); 43 ENTRY(bx);
92 ENTRY(bx); 44 ENTRY(bx);
93 ENTRY(cx); 45 ENTRY(cx);
@@ -107,7 +59,8 @@ int main(void)
107 ENTRY(flags); 59 ENTRY(flags);
108 BLANK(); 60 BLANK();
109#undef ENTRY 61#undef ENTRY
110#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) 62
63#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
111 ENTRY(cr0); 64 ENTRY(cr0);
112 ENTRY(cr2); 65 ENTRY(cr2);
113 ENTRY(cr3); 66 ENTRY(cr3);
@@ -115,26 +68,11 @@ int main(void)
115 ENTRY(cr8); 68 ENTRY(cr8);
116 BLANK(); 69 BLANK();
117#undef ENTRY 70#undef ENTRY
118 DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
119 BLANK();
120 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
121 BLANK();
122 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
123 71
72 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
124 BLANK(); 73 BLANK();
125 OFFSET(BP_scratch, boot_params, scratch);
126 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
127 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
128 OFFSET(BP_version, boot_params, hdr.version);
129 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
130 74
131 BLANK(); 75 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
132 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 76
133#ifdef CONFIG_XEN
134 BLANK();
135 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
136 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
137#undef ENTRY
138#endif
139 return 0; 77 return 0;
140} 78}
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 13a389179514..452932d34730 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -106,8 +106,8 @@ void __init setup_bios_corruption_check(void)
106 addr += size; 106 addr += size;
107 } 107 }
108 108
109 printk(KERN_INFO "Scanning %d areas for low memory corruption\n", 109 if (num_scan_areas)
110 num_scan_areas); 110 printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas);
111} 111}
112 112
113 113
@@ -143,12 +143,12 @@ static void check_corruption(struct work_struct *dummy)
143{ 143{
144 check_for_bios_corruption(); 144 check_for_bios_corruption();
145 schedule_delayed_work(&bios_check_work, 145 schedule_delayed_work(&bios_check_work,
146 round_jiffies_relative(corruption_check_period*HZ)); 146 round_jiffies_relative(corruption_check_period*HZ));
147} 147}
148 148
149static int start_periodic_check_for_corruption(void) 149static int start_periodic_check_for_corruption(void)
150{ 150{
151 if (!memory_corruption_check || corruption_check_period == 0) 151 if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0)
152 return 0; 152 return 0;
153 153
154 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", 154 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7c7bedb83c5a..3532d3bf8105 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -233,18 +233,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
233} 233}
234#endif 234#endif
235 235
236#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 236#ifdef CONFIG_NUMA
237/*
238 * To workaround broken NUMA config. Read the comment in
239 * srat_detect_node().
240 */
237static int __cpuinit nearby_node(int apicid) 241static int __cpuinit nearby_node(int apicid)
238{ 242{
239 int i, node; 243 int i, node;
240 244
241 for (i = apicid - 1; i >= 0; i--) { 245 for (i = apicid - 1; i >= 0; i--) {
242 node = apicid_to_node[i]; 246 node = __apicid_to_node[i];
243 if (node != NUMA_NO_NODE && node_online(node)) 247 if (node != NUMA_NO_NODE && node_online(node))
244 return node; 248 return node;
245 } 249 }
246 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { 250 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
247 node = apicid_to_node[i]; 251 node = __apicid_to_node[i];
248 if (node != NUMA_NO_NODE && node_online(node)) 252 if (node != NUMA_NO_NODE && node_online(node))
249 return node; 253 return node;
250 } 254 }
@@ -261,7 +265,7 @@ static int __cpuinit nearby_node(int apicid)
261#ifdef CONFIG_X86_HT 265#ifdef CONFIG_X86_HT
262static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) 266static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
263{ 267{
264 u32 nodes; 268 u32 nodes, cores_per_cu = 1;
265 u8 node_id; 269 u8 node_id;
266 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
267 271
@@ -276,6 +280,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
276 /* get compute unit information */ 280 /* get compute unit information */
277 smp_num_siblings = ((ebx >> 8) & 3) + 1; 281 smp_num_siblings = ((ebx >> 8) & 3) + 1;
278 c->compute_unit_id = ebx & 0xff; 282 c->compute_unit_id = ebx & 0xff;
283 cores_per_cu += ((ebx >> 8) & 3);
279 } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { 284 } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
280 u64 value; 285 u64 value;
281 286
@@ -288,15 +293,18 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
288 /* fixup multi-node processor information */ 293 /* fixup multi-node processor information */
289 if (nodes > 1) { 294 if (nodes > 1) {
290 u32 cores_per_node; 295 u32 cores_per_node;
296 u32 cus_per_node;
291 297
292 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 298 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
293 cores_per_node = c->x86_max_cores / nodes; 299 cores_per_node = c->x86_max_cores / nodes;
300 cus_per_node = cores_per_node / cores_per_cu;
294 301
295 /* store NodeID, use llc_shared_map to store sibling info */ 302 /* store NodeID, use llc_shared_map to store sibling info */
296 per_cpu(cpu_llc_id, cpu) = node_id; 303 per_cpu(cpu_llc_id, cpu) = node_id;
297 304
298 /* core id to be in range from 0 to (cores_per_node - 1) */ 305 /* core id has to be in the [0 .. cores_per_node - 1] range */
299 c->cpu_core_id = c->cpu_core_id % cores_per_node; 306 c->cpu_core_id %= cores_per_node;
307 c->compute_unit_id %= cus_per_node;
300 } 308 }
301} 309}
302#endif 310#endif
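The topology fixup above folds package-wide core and compute-unit IDs down to node-local ones. A small worked example with made-up numbers (two nodes, eight cores, two cores per compute unit), assuming nothing beyond the arithmetic shown in the hunk:

#include <stdio.h>

int main(void)
{
	unsigned nodes = 2, x86_max_cores = 8, cores_per_cu = 2;
	unsigned cpu_core_id = 5, compute_unit_id = 2;	/* raw, package-wide IDs */

	unsigned cores_per_node = x86_max_cores / nodes;		/* 4 */
	unsigned cus_per_node   = cores_per_node / cores_per_cu;	/* 2 */

	cpu_core_id     %= cores_per_node;	/* 5 -> 1: now node-local */
	compute_unit_id %= cus_per_node;	/* 2 -> 0: now node-local */

	printf("core %u, compute unit %u\n", cpu_core_id, compute_unit_id);
	return 0;
}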
@@ -334,31 +342,40 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id);
334 342
335static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 343static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
336{ 344{
337#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 345#ifdef CONFIG_NUMA
338 int cpu = smp_processor_id(); 346 int cpu = smp_processor_id();
339 int node; 347 int node;
340 unsigned apicid = c->apicid; 348 unsigned apicid = c->apicid;
341 349
342 node = per_cpu(cpu_llc_id, cpu); 350 node = numa_cpu_node(cpu);
351 if (node == NUMA_NO_NODE)
352 node = per_cpu(cpu_llc_id, cpu);
343 353
344 if (apicid_to_node[apicid] != NUMA_NO_NODE)
345 node = apicid_to_node[apicid];
346 if (!node_online(node)) { 354 if (!node_online(node)) {
347 /* Two possibilities here: 355 /*
348 - The CPU is missing memory and no node was created. 356 * Two possibilities here:
349 In that case try picking one from a nearby CPU 357 *
350 - The APIC IDs differ from the HyperTransport node IDs 358 * - The CPU is missing memory and no node was created. In
351 which the K8 northbridge parsing fills in. 359 * that case try picking one from a nearby CPU.
352 Assume they are all increased by a constant offset, 360 *
353 but in the same order as the HT nodeids. 361 * - The APIC IDs differ from the HyperTransport node IDs
354 If that doesn't result in a usable node fall back to the 362 * which the K8 northbridge parsing fills in. Assume
355 path for the previous case. */ 363 * they are all increased by a constant offset, but in
356 364 * the same order as the HT nodeids. If that doesn't
365 * result in a usable node fall back to the path for the
366 * previous case.
367 *
368 * This workaround operates directly on the mapping between
369 * APIC ID and NUMA node, assuming certain relationship
370 * between APIC ID, HT node ID and NUMA topology. As going
371 * through CPU mapping may alter the outcome, directly
372 * access __apicid_to_node[].
373 */
357 int ht_nodeid = c->initial_apicid; 374 int ht_nodeid = c->initial_apicid;
358 375
359 if (ht_nodeid >= 0 && 376 if (ht_nodeid >= 0 &&
360 apicid_to_node[ht_nodeid] != NUMA_NO_NODE) 377 __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
361 node = apicid_to_node[ht_nodeid]; 378 node = __apicid_to_node[ht_nodeid];
362 /* Pick a nearby node */ 379 /* Pick a nearby node */
363 if (!node_online(node)) 380 if (!node_online(node))
364 node = nearby_node(apicid); 381 node = nearby_node(apicid);
@@ -594,6 +611,29 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
594 } 611 }
595 } 612 }
596#endif 613#endif
614
615 /* As a rule processors have APIC timer running in deep C states */
616 if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400))
617 set_cpu_cap(c, X86_FEATURE_ARAT);
618
619 /*
620 * Disable GART TLB Walk Errors on Fam10h. We do this here
621 * because this is always needed when GART is enabled, even in a
622 * kernel which has no MCE support built in.
623 */
624 if (c->x86 == 0x10) {
625 /*
 626	 * BIOS should disable GartTlbWlk Errors itself. If
 627	 * it doesn't, do it here as suggested by the BKDG.
628 *
629 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
630 */
631 u64 mask;
632
633 rdmsrl(MSR_AMD64_MCx_MASK(4), mask);
634 mask |= (1 << 10);
635 wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
636 }
597} 637}
598 638
599#ifdef CONFIG_X86_32 639#ifdef CONFIG_X86_32
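The Fam10h fixup above is a plain read-modify-write of bit 10 in the MC4 mask MSR. A minimal sketch of that pattern with the MSR access mocked by a variable (in the kernel this is rdmsrl()/wrmsrl() on MSR_AMD64_MCx_MASK(4)); the initial value here is invented:

#include <stdio.h>
#include <stdint.h>

static uint64_t fake_msr = 0x3;			/* pretend current mask value */

int main(void)
{
	uint64_t mask = fake_msr;		/* rdmsrl(MSR_AMD64_MCx_MASK(4), mask); */
	mask |= (1ULL << 10);			/* disable GART TLB walk error reporting */
	fake_msr = mask;			/* wrmsrl(MSR_AMD64_MCx_MASK(4), mask); */

	printf("mask = %#llx\n", (unsigned long long)fake_msr);
	return 0;
}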
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1d59834396bd..e2ced0074a45 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -675,7 +675,7 @@ void __init early_cpu_init(void)
675 const struct cpu_dev *const *cdev; 675 const struct cpu_dev *const *cdev;
676 int count = 0; 676 int count = 0;
677 677
678#ifdef PROCESSOR_SELECT 678#ifdef CONFIG_PROCESSOR_SELECT
679 printk(KERN_INFO "KERNEL supported cpus:\n"); 679 printk(KERN_INFO "KERNEL supported cpus:\n");
680#endif 680#endif
681 681
@@ -687,7 +687,7 @@ void __init early_cpu_init(void)
687 cpu_devs[count] = cpudev; 687 cpu_devs[count] = cpudev;
688 count++; 688 count++;
689 689
690#ifdef PROCESSOR_SELECT 690#ifdef CONFIG_PROCESSOR_SELECT
691 { 691 {
692 unsigned int j; 692 unsigned int j;
693 693
@@ -869,7 +869,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
869 869
870 select_idle_routine(c); 870 select_idle_routine(c);
871 871
872#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 872#ifdef CONFIG_NUMA
873 numa_add_cpu(smp_processor_id()); 873 numa_add_cpu(smp_processor_id());
874#endif 874#endif
875} 875}
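The guard change above matters because Kconfig options reach C code with a CONFIG_ prefix, so a bare PROCESSOR_SELECT is never defined. A tiny sketch of the difference (the puts() strings are illustrative only):

#include <stdio.h>

int main(void)
{
#ifdef PROCESSOR_SELECT			/* bare symbol: never defined by the build */
	puts("old guard: compiled in");
#endif
#ifdef CONFIG_PROCESSOR_SELECT		/* what the build system actually defines */
	puts("new guard: compiled in when the option is enabled");
#endif
	puts("done");
	return 0;
}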
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 03162dac6271..cf48cdd6907d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -444,7 +444,7 @@ static int __cpuinit longhaul_get_ranges(void)
444 return -EINVAL; 444 return -EINVAL;
445 } 445 }
446 /* Get max multiplier - as we always did. 446 /* Get max multiplier - as we always did.
447 * Longhaul MSR is usefull only when voltage scaling is enabled. 447 * Longhaul MSR is useful only when voltage scaling is enabled.
448 * C3 is booting at max anyway. */ 448 * C3 is booting at max anyway. */
449 maxmult = mult; 449 maxmult = mult;
450 /* Get min multiplier */ 450 /* Get min multiplier */
@@ -1011,7 +1011,7 @@ static void __exit longhaul_exit(void)
1011 * trigger frequency transition in some cases. */ 1011 * trigger frequency transition in some cases. */
1012module_param(disable_acpi_c3, int, 0644); 1012module_param(disable_acpi_c3, int, 0644);
1013MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); 1013MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
1014/* Change CPU voltage with frequency. Very usefull to save 1014/* Change CPU voltage with frequency. Very useful to save
1015 * power, but most VIA C3 processors aren't supporting it. */ 1015 * power, but most VIA C3 processors aren't supporting it. */
1016module_param(scale_voltage, int, 0644); 1016module_param(scale_voltage, int, 0644);
1017MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); 1017MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index bd1cac747f67..52c93648e492 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -158,9 +158,9 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
158{ 158{
159 if (c->x86 == 0x06) { 159 if (c->x86 == 0x06) {
160 if (cpu_has(c, X86_FEATURE_EST)) 160 if (cpu_has(c, X86_FEATURE_EST))
161 printk(KERN_WARNING PFX "Warning: EST-capable CPU " 161 printk_once(KERN_WARNING PFX "Warning: EST-capable "
162 "detected. The acpi-cpufreq module offers " 162 "CPU detected. The acpi-cpufreq module offers "
163 "voltage scaling in addition of frequency " 163 "voltage scaling in addition to frequency "
164 "scaling. You should use that instead of " 164 "scaling. You should use that instead of "
165 "p4-clockmod, if possible.\n"); 165 "p4-clockmod, if possible.\n");
166 switch (c->x86_model) { 166 switch (c->x86_model) {
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index 4f6f679f2799..755a31e0f5b0 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -195,7 +195,7 @@ static unsigned int pcc_get_freq(unsigned int cpu)
195cmd_incomplete: 195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status); 196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock); 197 spin_unlock(&pcc_lock);
198 return -EINVAL; 198 return 0;
199} 199}
200 200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy, 201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
@@ -315,8 +315,6 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
315 315
316 input.count = 4; 316 input.count = 4;
317 input.pointer = in_params; 317 input.pointer = in_params;
318 input.count = 4;
319 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER; 318 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16; 319 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID; 320 in_params[0].buffer.pointer = OSC_UUID;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 35c7e65e59be..2368e38327b3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -630,8 +630,7 @@ static void print_basics(struct powernow_k8_data *data)
630 data->powernow_table[j].frequency/1000); 630 data->powernow_table[j].frequency/1000);
631 } else { 631 } else {
632 printk(KERN_INFO PFX 632 printk(KERN_INFO PFX
633 " %d : fid 0x%x (%d MHz), vid 0x%x\n", 633 "fid 0x%x (%d MHz), vid 0x%x\n",
634 j,
635 data->powernow_table[j].index & 0xff, 634 data->powernow_table[j].index & 0xff,
636 data->powernow_table[j].frequency/1000, 635 data->powernow_table[j].frequency/1000,
637 data->powernow_table[j].index >> 8); 636 data->powernow_table[j].index >> 8);
@@ -1276,7 +1275,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1276 1275
1277 if (powernow_k8_cpu_init_acpi(data)) { 1276 if (powernow_k8_cpu_init_acpi(data)) {
1278 /* 1277 /*
1279 * Use the PSB BIOS structure. This is only availabe on 1278 * Use the PSB BIOS structure. This is only available on
1280 * an UP version, and is deprecated by AMD. 1279 * an UP version, and is deprecated by AMD.
1281 */ 1280 */
1282 if (num_online_cpus() != 1) { 1281 if (num_online_cpus() != 1) {
@@ -1537,6 +1536,7 @@ static struct notifier_block cpb_nb = {
1537static int __cpuinit powernowk8_init(void) 1536static int __cpuinit powernowk8_init(void)
1538{ 1537{
1539 unsigned int i, supported_cpus = 0, cpu; 1538 unsigned int i, supported_cpus = 0, cpu;
1539 int rv;
1540 1540
1541 for_each_online_cpu(i) { 1541 for_each_online_cpu(i) {
1542 int rc; 1542 int rc;
@@ -1555,14 +1555,14 @@ static int __cpuinit powernowk8_init(void)
1555 1555
1556 cpb_capable = true; 1556 cpb_capable = true;
1557 1557
1558 register_cpu_notifier(&cpb_nb);
1559
1560 msrs = msrs_alloc(); 1558 msrs = msrs_alloc();
1561 if (!msrs) { 1559 if (!msrs) {
1562 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); 1560 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1563 return -ENOMEM; 1561 return -ENOMEM;
1564 } 1562 }
1565 1563
1564 register_cpu_notifier(&cpb_nb);
1565
1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); 1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1567 1567
1568 for_each_cpu(cpu, cpu_online_mask) { 1568 for_each_cpu(cpu, cpu_online_mask) {
@@ -1574,7 +1574,13 @@ static int __cpuinit powernowk8_init(void)
1574 (cpb_enabled ? "on" : "off")); 1574 (cpb_enabled ? "on" : "off"));
1575 } 1575 }
1576 1576
1577 return cpufreq_register_driver(&cpufreq_amd64_driver); 1577 rv = cpufreq_register_driver(&cpufreq_amd64_driver);
1578 if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) {
1579 unregister_cpu_notifier(&cpb_nb);
1580 msrs_free(msrs);
1581 msrs = NULL;
1582 }
1583 return rv;
1578} 1584}
1579 1585
1580/* driver entry point for term */ 1586/* driver entry point for term */
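The powernow-k8 init change above makes the final driver registration unwind the earlier notifier registration and MSR allocation when it fails. A hedged stand-alone sketch of that ordering, with stub functions in place of the real cpufreq and MSR APIs:

#include <stdio.h>
#include <stdlib.h>

static int register_driver_stub(void) { return -1; }	/* pretend registration fails */

int main(void)
{
	void *msrs = malloc(64);		/* msrs_alloc() stand-in */
	if (!msrs)
		return -1;

	/* register_cpu_notifier(&cpb_nb) stand-in would go here */

	int rv = register_driver_stub();	/* cpufreq_register_driver() stand-in */
	if (rv < 0) {
		/* unregister_cpu_notifier(&cpb_nb) stand-in */
		free(msrs);			/* msrs_free() stand-in */
		msrs = NULL;
	}
	return rv;
}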
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 8abd869baabf..91bc25b67bc1 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -292,7 +292,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
292 292
293 result = speedstep_smi_ownership(); 293 result = speedstep_smi_ownership();
294 if (result) { 294 if (result) {
295 dprintk("fails in aquiring ownership of a SMI interface.\n"); 295 dprintk("fails in acquiring ownership of a SMI interface.\n");
296 return -EINVAL; 296 return -EINVAL;
297 } 297 }
298 298
@@ -360,7 +360,7 @@ static int speedstep_resume(struct cpufreq_policy *policy)
360 int result = speedstep_smi_ownership(); 360 int result = speedstep_smi_ownership();
361 361
362 if (result) 362 if (result)
363 dprintk("fails in re-aquiring ownership of a SMI interface.\n"); 363 dprintk("fails in re-acquiring ownership of a SMI interface.\n");
364 364
365 return result; 365 return result;
366} 366}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index d16c2c53d6bf..df86bc8c859d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -276,14 +276,13 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
276 276
277static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 277static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
278{ 278{
279#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 279#ifdef CONFIG_NUMA
280 unsigned node; 280 unsigned node;
281 int cpu = smp_processor_id(); 281 int cpu = smp_processor_id();
282 int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
283 282
284 /* Don't do the funky fallback heuristics the AMD version employs 283 /* Don't do the funky fallback heuristics the AMD version employs
285 for now. */ 284 for now. */
286 node = apicid_to_node[apicid]; 285 node = numa_cpu_node(cpu);
287 if (node == NUMA_NO_NODE || !node_online(node)) { 286 if (node == NUMA_NO_NODE || !node_online(node)) {
288 /* reuse the value from init_cpu_to_node() */ 287 /* reuse the value from init_cpu_to_node() */
289 node = cpu_to_node(cpu); 288 node = cpu_to_node(cpu);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7283e98deaae..1ce1af2899df 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ 45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ 46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ 47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
48 { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
48 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ 49 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
49 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 50 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
50 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 51 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
66 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ 67 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
67 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ 68 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
68 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ 69 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
70 { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
69 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ 71 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
70 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ 72 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
71 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ 73 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
@@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
87 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 89 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
88 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ 90 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
89 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ 91 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
92 { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
90 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ 93 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
91 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ 94 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
92 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ 95 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
@@ -301,8 +304,9 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
301 304
302struct _cache_attr { 305struct _cache_attr {
303 struct attribute attr; 306 struct attribute attr;
304 ssize_t (*show)(struct _cpuid4_info *, char *); 307 ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
305 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); 308 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
309 unsigned int);
306}; 310};
307 311
308#ifdef CONFIG_AMD_NB 312#ifdef CONFIG_AMD_NB
@@ -397,7 +401,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
397 401
398#define SHOW_CACHE_DISABLE(slot) \ 402#define SHOW_CACHE_DISABLE(slot) \
399static ssize_t \ 403static ssize_t \
400show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ 404show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
405 unsigned int cpu) \
401{ \ 406{ \
402 return show_cache_disable(this_leaf, buf, slot); \ 407 return show_cache_disable(this_leaf, buf, slot); \
403} 408}
@@ -509,7 +514,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
509#define STORE_CACHE_DISABLE(slot) \ 514#define STORE_CACHE_DISABLE(slot) \
510static ssize_t \ 515static ssize_t \
511store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ 516store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
512 const char *buf, size_t count) \ 517 const char *buf, size_t count, \
518 unsigned int cpu) \
513{ \ 519{ \
514 return store_cache_disable(this_leaf, buf, count, slot); \ 520 return store_cache_disable(this_leaf, buf, count, slot); \
515} 521}
@@ -521,6 +527,39 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
521static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 527static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
522 show_cache_disable_1, store_cache_disable_1); 528 show_cache_disable_1, store_cache_disable_1);
523 529
530static ssize_t
531show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
532{
533 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
534 return -EINVAL;
535
536 return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
537}
538
539static ssize_t
540store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
541 unsigned int cpu)
542{
543 unsigned long val;
544
545 if (!capable(CAP_SYS_ADMIN))
546 return -EPERM;
547
548 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
549 return -EINVAL;
550
551 if (strict_strtoul(buf, 16, &val) < 0)
552 return -EINVAL;
553
554 if (amd_set_subcaches(cpu, val))
555 return -EINVAL;
556
557 return count;
558}
559
560static struct _cache_attr subcaches =
561 __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
562
524#else /* CONFIG_AMD_NB */ 563#else /* CONFIG_AMD_NB */
525#define amd_init_l3_cache(x, y) 564#define amd_init_l3_cache(x, y)
526#endif /* CONFIG_AMD_NB */ 565#endif /* CONFIG_AMD_NB */
@@ -529,9 +568,9 @@ static int
529__cpuinit cpuid4_cache_lookup_regs(int index, 568__cpuinit cpuid4_cache_lookup_regs(int index,
530 struct _cpuid4_info_regs *this_leaf) 569 struct _cpuid4_info_regs *this_leaf)
531{ 570{
532 union _cpuid4_leaf_eax eax; 571 union _cpuid4_leaf_eax eax;
533 union _cpuid4_leaf_ebx ebx; 572 union _cpuid4_leaf_ebx ebx;
534 union _cpuid4_leaf_ecx ecx; 573 union _cpuid4_leaf_ecx ecx;
535 unsigned edx; 574 unsigned edx;
536 575
537 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 576 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
@@ -729,11 +768,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
729 struct cpuinfo_x86 *c = &cpu_data(cpu); 768 struct cpuinfo_x86 *c = &cpu_data(cpu);
730 769
731 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 770 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
732 for_each_cpu(i, c->llc_shared_map) { 771 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
733 if (!per_cpu(ici_cpuid4_info, i)) 772 if (!per_cpu(ici_cpuid4_info, i))
734 continue; 773 continue;
735 this_leaf = CPUID4_INFO_IDX(i, index); 774 this_leaf = CPUID4_INFO_IDX(i, index);
736 for_each_cpu(sibling, c->llc_shared_map) { 775 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
737 if (!cpu_online(sibling)) 776 if (!cpu_online(sibling))
738 continue; 777 continue;
739 set_bit(sibling, this_leaf->shared_cpu_map); 778 set_bit(sibling, this_leaf->shared_cpu_map);
@@ -867,8 +906,8 @@ static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
867#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) 906#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
868 907
869#define show_one_plus(file_name, object, val) \ 908#define show_one_plus(file_name, object, val) \
870static ssize_t show_##file_name \ 909static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
871 (struct _cpuid4_info *this_leaf, char *buf) \ 910 unsigned int cpu) \
872{ \ 911{ \
873 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ 912 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
874} 913}
@@ -879,7 +918,8 @@ show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
879show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); 918show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
880show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); 919show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
881 920
882static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) 921static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
922 unsigned int cpu)
883{ 923{
884 return sprintf(buf, "%luK\n", this_leaf->size / 1024); 924 return sprintf(buf, "%luK\n", this_leaf->size / 1024);
885} 925}
@@ -903,17 +943,20 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
903 return n; 943 return n;
904} 944}
905 945
906static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) 946static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
947 unsigned int cpu)
907{ 948{
908 return show_shared_cpu_map_func(leaf, 0, buf); 949 return show_shared_cpu_map_func(leaf, 0, buf);
909} 950}
910 951
911static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) 952static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
953 unsigned int cpu)
912{ 954{
913 return show_shared_cpu_map_func(leaf, 1, buf); 955 return show_shared_cpu_map_func(leaf, 1, buf);
914} 956}
915 957
916static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) 958static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
959 unsigned int cpu)
917{ 960{
918 switch (this_leaf->eax.split.type) { 961 switch (this_leaf->eax.split.type) {
919 case CACHE_TYPE_DATA: 962 case CACHE_TYPE_DATA:
@@ -971,6 +1014,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void)
971 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) 1014 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
972 n += 2; 1015 n += 2;
973 1016
1017 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1018 n += 1;
1019
974 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); 1020 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
975 if (attrs == NULL) 1021 if (attrs == NULL)
976 return attrs = default_attrs; 1022 return attrs = default_attrs;
@@ -983,6 +1029,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void)
983 attrs[n++] = &cache_disable_1.attr; 1029 attrs[n++] = &cache_disable_1.attr;
984 } 1030 }
985 1031
1032 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1033 attrs[n++] = &subcaches.attr;
1034
986 return attrs; 1035 return attrs;
987} 1036}
988#endif 1037#endif
@@ -995,7 +1044,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
995 1044
996 ret = fattr->show ? 1045 ret = fattr->show ?
997 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1046 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
998 buf) : 1047 buf, this_leaf->cpu) :
999 0; 1048 0;
1000 return ret; 1049 return ret;
1001} 1050}
@@ -1009,7 +1058,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
1009 1058
1010 ret = fattr->store ? 1059 ret = fattr->store ?
1011 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1060 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1012 buf, count) : 1061 buf, count, this_leaf->cpu) :
1013 0; 1062 0;
1014 return ret; 1063 return ret;
1015} 1064}
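The cacheinfo changes above thread the owning CPU number through every show/store callback so attributes such as the new subcaches file can act on the right CPU. A minimal user-space sketch of that callback-table shape; the types and names are illustrative, not the kernel's sysfs structures:

#include <stdio.h>
#include <sys/types.h>

struct leaf { unsigned long size; };

struct cache_attr {
	const char *name;
	ssize_t (*show)(struct leaf *, char *, unsigned int cpu);
};

static ssize_t show_size(struct leaf *l, char *buf, unsigned int cpu)
{
	return sprintf(buf, "cpu%u: %luK\n", cpu, l->size / 1024);
}

static struct cache_attr size_attr = { "size", show_size };

int main(void)
{
	struct leaf l = { .size = 32768 };
	char buf[64];

	/* generic show(): forward the owning CPU to the per-attribute callback */
	if (size_attr.show(&l, buf, 0) > 0)
		fputs(buf, stdout);
	return 0;
}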
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 8209472b27a5..83930deec3c6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m)
106ssize_t apei_read_mce(struct mce *m, u64 *record_id) 106ssize_t apei_read_mce(struct mce *m, u64 *record_id)
107{ 107{
108 struct cper_mce_record rcd; 108 struct cper_mce_record rcd;
109 ssize_t len; 109 int rc, pos;
110 110
111 len = erst_read_next(&rcd.hdr, sizeof(rcd)); 111 rc = erst_get_record_id_begin(&pos);
112 if (len <= 0) 112 if (rc)
113 return len; 113 return rc;
114 /* Can not skip other records in storage via ERST unless clear them */ 114retry:
115 else if (len != sizeof(rcd) || 115 rc = erst_get_record_id_next(&pos, record_id);
116 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { 116 if (rc)
117 if (printk_ratelimit()) 117 goto out;
118 pr_warning( 118 /* no more record */
119 "MCE-APEI: Can not skip the unknown record in ERST"); 119 if (*record_id == APEI_ERST_INVALID_RECORD_ID)
120 return -EIO; 120 goto out;
121 } 121 rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
122 122 /* someone else has cleared the record, try next one */
123 if (rc == -ENOENT)
124 goto retry;
125 else if (rc < 0)
126 goto out;
127 /* try to skip other type records in storage */
128 else if (rc != sizeof(rcd) ||
129 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
130 goto retry;
123 memcpy(m, &rcd.mce, sizeof(*m)); 131 memcpy(m, &rcd.mce, sizeof(*m));
124 *record_id = rcd.hdr.record_id; 132 rc = sizeof(*m);
133out:
134 erst_get_record_id_end();
125 135
126 return sizeof(*m); 136 return rc;
127} 137}
128 138
129/* Check whether there is record in ERST */ 139/* Check whether there is record in ERST */
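The new apei_read_mce() above walks ERST records by ID and retries when a record has vanished or belongs to another creator. A self-contained sketch of just that control flow, with the erst_* calls replaced by fakes:

#include <stdio.h>
#include <errno.h>

#define INVALID_ID 0xffffffffu

static int fake_next(int *pos, unsigned *id)	/* erst_get_record_id_next() stand-in */
{
	static const unsigned ids[] = { 7, 9, INVALID_ID };
	*id = ids[(*pos)++];
	return 0;
}

static int fake_read(unsigned id)		/* erst_read() stand-in */
{
	return (id == 7) ? -ENOENT : 42;	/* record 7 "vanished" */
}

int main(void)
{
	int pos = 0, rc;
	unsigned id;
retry:
	rc = fake_next(&pos, &id);
	if (rc || id == INVALID_ID)
		goto out;
	rc = fake_read(id);
	if (rc == -ENOENT)			/* someone cleared it: try the next one */
		goto retry;
	printf("got record %u (%d bytes)\n", id, rc);
out:
	/* erst_get_record_id_end() would be called here in the kernel */
	return rc < 0 ? rc : 0;
}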
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index a77971979564..0ed633c5048b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -32,7 +32,7 @@ static void inject_mce(struct mce *m)
32{ 32{
33 struct mce *i = &per_cpu(injectm, m->extcpu); 33 struct mce *i = &per_cpu(injectm, m->extcpu);
34 34
35 /* Make sure noone reads partially written injectm */ 35 /* Make sure no one reads partially written injectm */
36 i->finished = 0; 36 i->finished = 0;
37 mb(); 37 mb();
38 m->finished = 0; 38 m->finished = 0;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d916183b7f9c..3385ea26f684 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/sysdev.h> 23#include <linux/sysdev.h>
24#include <linux/syscore_ops.h>
24#include <linux/delay.h> 25#include <linux/delay.h>
25#include <linux/ctype.h> 26#include <linux/ctype.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
@@ -881,7 +882,7 @@ reset:
881 * Check if the address reported by the CPU is in a format we can parse. 882 * Check if the address reported by the CPU is in a format we can parse.
882 * It would be possible to add code for most other cases, but all would 883 * It would be possible to add code for most other cases, but all would
883 * be somewhat complicated (e.g. segment offset would require an instruction 884 * be somewhat complicated (e.g. segment offset would require an instruction
884 * parser). So only support physical addresses upto page granuality for now. 885 * parser). So only support physical addresses up to page granuality for now.
885 */ 886 */
886static int mce_usable_address(struct mce *m) 887static int mce_usable_address(struct mce *m)
887{ 888{
@@ -1625,7 +1626,7 @@ out:
1625static unsigned int mce_poll(struct file *file, poll_table *wait) 1626static unsigned int mce_poll(struct file *file, poll_table *wait)
1626{ 1627{
1627 poll_wait(file, &mce_wait, wait); 1628 poll_wait(file, &mce_wait, wait);
1628 if (rcu_dereference_check_mce(mcelog.next)) 1629 if (rcu_access_index(mcelog.next))
1629 return POLLIN | POLLRDNORM; 1630 return POLLIN | POLLRDNORM;
1630 if (!mce_apei_read_done && apei_check_mce()) 1631 if (!mce_apei_read_done && apei_check_mce())
1631 return POLLIN | POLLRDNORM; 1632 return POLLIN | POLLRDNORM;
@@ -1749,14 +1750,14 @@ static int mce_disable_error_reporting(void)
1749 return 0; 1750 return 0;
1750} 1751}
1751 1752
1752static int mce_suspend(struct sys_device *dev, pm_message_t state) 1753static int mce_suspend(void)
1753{ 1754{
1754 return mce_disable_error_reporting(); 1755 return mce_disable_error_reporting();
1755} 1756}
1756 1757
1757static int mce_shutdown(struct sys_device *dev) 1758static void mce_shutdown(void)
1758{ 1759{
1759 return mce_disable_error_reporting(); 1760 mce_disable_error_reporting();
1760} 1761}
1761 1762
1762/* 1763/*
@@ -1764,14 +1765,18 @@ static int mce_shutdown(struct sys_device *dev)
1764 * Only one CPU is active at this time, the others get re-added later using 1765 * Only one CPU is active at this time, the others get re-added later using
1765 * CPU hotplug: 1766 * CPU hotplug:
1766 */ 1767 */
1767static int mce_resume(struct sys_device *dev) 1768static void mce_resume(void)
1768{ 1769{
1769 __mcheck_cpu_init_generic(); 1770 __mcheck_cpu_init_generic();
1770 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1771 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1771
1772 return 0;
1773} 1772}
1774 1773
1774static struct syscore_ops mce_syscore_ops = {
1775 .suspend = mce_suspend,
1776 .shutdown = mce_shutdown,
1777 .resume = mce_resume,
1778};
1779
1775static void mce_cpu_restart(void *data) 1780static void mce_cpu_restart(void *data)
1776{ 1781{
1777 del_timer_sync(&__get_cpu_var(mce_timer)); 1782 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1808,9 +1813,6 @@ static void mce_enable_ce(void *all)
1808} 1813}
1809 1814
1810static struct sysdev_class mce_sysclass = { 1815static struct sysdev_class mce_sysclass = {
1811 .suspend = mce_suspend,
1812 .shutdown = mce_shutdown,
1813 .resume = mce_resume,
1814 .name = "machinecheck", 1816 .name = "machinecheck",
1815}; 1817};
1816 1818
@@ -2139,6 +2141,7 @@ static __init int mcheck_init_device(void)
2139 return err; 2141 return err;
2140 } 2142 }
2141 2143
2144 register_syscore_ops(&mce_syscore_ops);
2142 register_hotcpu_notifier(&mce_cpu_notifier); 2145 register_hotcpu_notifier(&mce_cpu_notifier);
2143 misc_register(&mce_log_device); 2146 misc_register(&mce_log_device);
2144 2147
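The MCE power-management hooks above move from per-sysdev callbacks to syscore_ops, where suspend returns int and shutdown/resume return void with no device argument. A mocked sketch of that shape (the struct here is a stand-in for <linux/syscore_ops.h>):

#include <stdio.h>

struct syscore_ops {
	int  (*suspend)(void);
	void (*resume)(void);
	void (*shutdown)(void);
};

static int  demo_suspend(void)  { puts("disable error reporting"); return 0; }
static void demo_resume(void)   { puts("re-init banks on the boot CPU"); }
static void demo_shutdown(void) { puts("disable error reporting"); }

static struct syscore_ops demo_ops = {
	.suspend  = demo_suspend,
	.resume   = demo_resume,
	.shutdown = demo_shutdown,
};

int main(void)
{
	/* register_syscore_ops(&demo_ops) stand-in: just exercise the hooks */
	if (demo_ops.suspend() == 0)
		demo_ops.resume();
	demo_ops.shutdown();
	return 0;
}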
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 5bf2fac52aca..167f97b5596e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -527,15 +527,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 int i, err = 0; 527 int i, err = 0;
528 struct threshold_bank *b = NULL; 528 struct threshold_bank *b = NULL;
529 char name[32]; 529 char name[32];
530#ifdef CONFIG_SMP
531 struct cpuinfo_x86 *c = &cpu_data(cpu);
532#endif
533 530
534 sprintf(name, "threshold_bank%i", bank); 531 sprintf(name, "threshold_bank%i", bank);
535 532
536#ifdef CONFIG_SMP 533#ifdef CONFIG_SMP
537 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 534 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
538 i = cpumask_first(c->llc_shared_map); 535 i = cpumask_first(cpu_llc_shared_mask(cpu));
539 536
540 /* first core not up yet */ 537 /* first core not up yet */
541 if (cpu_data(i).cpu_core_id) 538 if (cpu_data(i).cpu_core_id)
@@ -555,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
555 if (err) 552 if (err)
556 goto out; 553 goto out;
557 554
558 cpumask_copy(b->cpus, c->llc_shared_map); 555 cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
559 per_cpu(threshold_banks, cpu)[bank] = b; 556 per_cpu(threshold_banks, cpu)[bank] = b;
560 557
561 goto out; 558 goto out;
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e12246ff5aa6..6f8c5e9da97f 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -59,6 +59,7 @@ struct thermal_state {
59 59
60/* Callback to handle core threshold interrupts */ 60/* Callback to handle core threshold interrupts */
61int (*platform_thermal_notify)(__u64 msr_val); 61int (*platform_thermal_notify)(__u64 msr_val);
62EXPORT_SYMBOL(platform_thermal_notify);
62 63
63static DEFINE_PER_CPU(struct thermal_state, thermal_state); 64static DEFINE_PER_CPU(struct thermal_state, thermal_state);
64 65
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 9f27228ceffd..a71efcdbb092 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong 2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
3 * because MTRRs can span upto 40 bits (36bits on most modern x86) 3 * because MTRRs can span up to 40 bits (36bits on most modern x86)
4 */ 4 */
5#define DEBUG 5#define DEBUG
6 6
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 01c0f3ee6cc3..929739a653d1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -45,6 +45,7 @@
45#include <linux/cpu.h> 45#include <linux/cpu.h>
46#include <linux/pci.h> 46#include <linux/pci.h>
47#include <linux/smp.h> 47#include <linux/smp.h>
48#include <linux/syscore_ops.h>
48 49
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/e820.h> 51#include <asm/e820.h>
@@ -292,14 +293,24 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
292 293
293 /* 294 /*
294 * HACK! 295 * HACK!
295 * We use this same function to initialize the mtrrs on boot. 296 *
296 * The state of the boot cpu's mtrrs has been saved, and we want 297 * We use this same function to initialize the mtrrs during boot,
297 * to replicate across all the APs. 298 * resume, runtime cpu online and on an explicit request to set a
298 * If we're doing that @reg is set to something special... 299 * specific MTRR.
300 *
301 * During boot or suspend, the state of the boot cpu's mtrrs has been
302 * saved, and we want to replicate that across all the cpus that come
303 * online (either at the end of boot or resume or during a runtime cpu
304 * online). If we're doing that, @reg is set to something special and on
305 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
306 * is unnecessary if at this point we are still on the cpu that started
307 * the boot/resume sequence. But there is no guarantee that we are still
308 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
309 * sure that we are in sync with everyone else.
299 */ 310 */
300 if (reg != ~0U) 311 if (reg != ~0U)
301 mtrr_if->set(reg, base, size, type); 312 mtrr_if->set(reg, base, size, type);
302 else if (!mtrr_aps_delayed_init) 313 else
303 mtrr_if->set_all(); 314 mtrr_if->set_all();
304 315
305 /* Wait for the others */ 316 /* Wait for the others */
@@ -630,7 +641,7 @@ struct mtrr_value {
630 641
631static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; 642static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
632 643
633static int mtrr_save(struct sys_device *sysdev, pm_message_t state) 644static int mtrr_save(void)
634{ 645{
635 int i; 646 int i;
636 647
@@ -642,7 +653,7 @@ static int mtrr_save(struct sys_device *sysdev, pm_message_t state)
642 return 0; 653 return 0;
643} 654}
644 655
645static int mtrr_restore(struct sys_device *sysdev) 656static void mtrr_restore(void)
646{ 657{
647 int i; 658 int i;
648 659
@@ -653,12 +664,11 @@ static int mtrr_restore(struct sys_device *sysdev)
653 mtrr_value[i].ltype); 664 mtrr_value[i].ltype);
654 } 665 }
655 } 666 }
656 return 0;
657} 667}
658 668
659 669
660 670
661static struct sysdev_driver mtrr_sysdev_driver = { 671static struct syscore_ops mtrr_syscore_ops = {
662 .suspend = mtrr_save, 672 .suspend = mtrr_save,
663 .resume = mtrr_restore, 673 .resume = mtrr_restore,
664}; 674};
@@ -793,13 +803,21 @@ void set_mtrr_aps_delayed_init(void)
793} 803}
794 804
795/* 805/*
796 * MTRR initialization for all AP's 806 * Delayed MTRR initialization for all AP's
797 */ 807 */
798void mtrr_aps_init(void) 808void mtrr_aps_init(void)
799{ 809{
800 if (!use_intel()) 810 if (!use_intel())
801 return; 811 return;
802 812
813 /*
814 * Check if someone has requested the delay of AP MTRR initialization,
815 * by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
816 * then we are done.
817 */
818 if (!mtrr_aps_delayed_init)
819 return;
820
803 set_mtrr(~0U, 0, 0, 0); 821 set_mtrr(~0U, 0, 0, 0);
804 mtrr_aps_delayed_init = false; 822 mtrr_aps_delayed_init = false;
805} 823}
@@ -831,7 +849,7 @@ static int __init mtrr_init_finialize(void)
831 * TBD: is there any system with such CPU which supports 849 * TBD: is there any system with such CPU which supports
832 * suspend/resume? If no, we should remove the code. 850 * suspend/resume? If no, we should remove the code.
833 */ 851 */
834 sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); 852 register_syscore_ops(&mtrr_syscore_ops);
835 853
836 return 0; 854 return 0;
837} 855}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9d977a2ea693..eed3673a8656 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -30,6 +30,7 @@
30#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33#include <asm/smp.h>
33 34
34#if 0 35#if 0
35#undef wrmsrl 36#undef wrmsrl
@@ -93,6 +94,8 @@ struct amd_nb {
93 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 94 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
94}; 95};
95 96
97struct intel_percore;
98
96#define MAX_LBR_ENTRIES 16 99#define MAX_LBR_ENTRIES 16
97 100
98struct cpu_hw_events { 101struct cpu_hw_events {
@@ -128,6 +131,13 @@ struct cpu_hw_events {
128 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 131 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
129 132
130 /* 133 /*
134 * Intel percore register state.
135 * Coordinate shared resources between HT threads.
136 */
137 int percore_used; /* Used by this CPU? */
138 struct intel_percore *per_core;
139
140 /*
131 * AMD specific bits 141 * AMD specific bits
132 */ 142 */
133 struct amd_nb *amd_nb; 143 struct amd_nb *amd_nb;
@@ -166,7 +176,7 @@ struct cpu_hw_events {
166/* 176/*
167 * Constraint on the Event code + UMask 177 * Constraint on the Event code + UMask
168 */ 178 */
169#define PEBS_EVENT_CONSTRAINT(c, n) \ 179#define INTEL_UEVENT_CONSTRAINT(c, n) \
170 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) 180 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
171 181
172#define EVENT_CONSTRAINT_END \ 182#define EVENT_CONSTRAINT_END \
@@ -175,6 +185,28 @@ struct cpu_hw_events {
175#define for_each_event_constraint(e, c) \ 185#define for_each_event_constraint(e, c) \
176 for ((e) = (c); (e)->weight; (e)++) 186 for ((e) = (c); (e)->weight; (e)++)
177 187
188/*
189 * Extra registers for specific events.
190 * Some events need large masks and require external MSRs.
191 * Define a mapping to these extra registers.
192 */
193struct extra_reg {
194 unsigned int event;
195 unsigned int msr;
196 u64 config_mask;
197 u64 valid_mask;
198};
199
200#define EVENT_EXTRA_REG(e, ms, m, vm) { \
201 .event = (e), \
202 .msr = (ms), \
203 .config_mask = (m), \
204 .valid_mask = (vm), \
205 }
206#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
207 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
208#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
209
178union perf_capabilities { 210union perf_capabilities {
179 struct { 211 struct {
180 u64 lbr_format : 6; 212 u64 lbr_format : 6;
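The extra_reg table introduced above maps an event-select value to a companion MSR plus a mask of valid user-supplied bits. A stand-alone sketch of that lookup; the 0xb7/0x1a6 pairing follows the usual OFFCORE_RESPONSE convention but should be read as illustrative here:

#include <stdio.h>
#include <stdint.h>

struct extra_reg {
	unsigned int event;		/* event-select value this entry applies to */
	unsigned int msr;		/* companion MSR to program */
	uint64_t config_mask;		/* which config bits select the event */
	uint64_t valid_mask;		/* legal bits in the extra value */
};

#define EVENTSEL_EVENT 0xffULL

static const struct extra_reg regs[] = {
	{ .event = 0xb7, .msr = 0x1a6,
	  .config_mask = EVENTSEL_EVENT, .valid_mask = 0xffffULL },
	{ 0 },				/* table terminator */
};

int main(void)
{
	uint64_t config = 0x5301b7;	/* some event using code 0xb7 */
	uint64_t extra  = 0x10ff;	/* user-supplied config1 value */

	for (const struct extra_reg *er = regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (extra & ~er->valid_mask)
			return -1;	/* reject out-of-range bits */
		printf("program MSR %#x with %#llx\n",
		       er->msr, (unsigned long long)extra);
		break;
	}
	return 0;
}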
@@ -219,6 +251,7 @@ struct x86_pmu {
219 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 251 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
220 struct perf_event *event); 252 struct perf_event *event);
221 struct event_constraint *event_constraints; 253 struct event_constraint *event_constraints;
254 struct event_constraint *percore_constraints;
222 void (*quirks)(void); 255 void (*quirks)(void);
223 int perfctr_second_write; 256 int perfctr_second_write;
224 257
@@ -247,6 +280,11 @@ struct x86_pmu {
247 */ 280 */
248 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ 281 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
249 int lbr_nr; /* hardware stack size */ 282 int lbr_nr; /* hardware stack size */
283
284 /*
285 * Extra registers for events
286 */
287 struct extra_reg *extra_regs;
250}; 288};
251 289
252static struct x86_pmu x86_pmu __read_mostly; 290static struct x86_pmu x86_pmu __read_mostly;
@@ -271,6 +309,10 @@ static u64 __read_mostly hw_cache_event_ids
271 [PERF_COUNT_HW_CACHE_MAX] 309 [PERF_COUNT_HW_CACHE_MAX]
272 [PERF_COUNT_HW_CACHE_OP_MAX] 310 [PERF_COUNT_HW_CACHE_OP_MAX]
273 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 311 [PERF_COUNT_HW_CACHE_RESULT_MAX];
312static u64 __read_mostly hw_cache_extra_regs
313 [PERF_COUNT_HW_CACHE_MAX]
314 [PERF_COUNT_HW_CACHE_OP_MAX]
315 [PERF_COUNT_HW_CACHE_RESULT_MAX];
274 316
275/* 317/*
276 * Propagate event elapsed time into the generic event. 318 * Propagate event elapsed time into the generic event.
@@ -298,7 +340,7 @@ x86_perf_event_update(struct perf_event *event)
298 */ 340 */
299again: 341again:
300 prev_raw_count = local64_read(&hwc->prev_count); 342 prev_raw_count = local64_read(&hwc->prev_count);
301 rdmsrl(hwc->event_base + idx, new_raw_count); 343 rdmsrl(hwc->event_base, new_raw_count);
302 344
303 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 345 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
304 new_raw_count) != prev_raw_count) 346 new_raw_count) != prev_raw_count)
@@ -321,6 +363,49 @@ again:
321 return new_raw_count; 363 return new_raw_count;
322} 364}
323 365
366/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */
367static inline int x86_pmu_addr_offset(int index)
368{
369 if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
370 return index << 1;
371 return index;
372}
373
374static inline unsigned int x86_pmu_config_addr(int index)
375{
376 return x86_pmu.eventsel + x86_pmu_addr_offset(index);
377}
378
379static inline unsigned int x86_pmu_event_addr(int index)
380{
381 return x86_pmu.perfctr + x86_pmu_addr_offset(index);
382}
383
384/*
385 * Find and validate any extra registers to set up.
386 */
387static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
388{
389 struct extra_reg *er;
390
391 event->hw.extra_reg = 0;
392 event->hw.extra_config = 0;
393
394 if (!x86_pmu.extra_regs)
395 return 0;
396
397 for (er = x86_pmu.extra_regs; er->msr; er++) {
398 if (er->event != (config & er->config_mask))
399 continue;
400 if (event->attr.config1 & ~er->valid_mask)
401 return -EINVAL;
402 event->hw.extra_reg = er->msr;
403 event->hw.extra_config = event->attr.config1;
404 break;
405 }
406 return 0;
407}
408
324static atomic_t active_events; 409static atomic_t active_events;
325static DEFINE_MUTEX(pmc_reserve_mutex); 410static DEFINE_MUTEX(pmc_reserve_mutex);
326 411
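The helpers added above translate a counter index into MSR addresses, doubling the offset on CPUs with the PERFCTR_CORE extension because their per-counter MSRs are interleaved. A small sketch of that mapping, using conventional (but here illustrative) AMD base addresses:

#include <stdio.h>

static int has_perfctr_core = 1;	/* pretend the CPU has the extension */

static unsigned int addr_offset(int index)
{
	return has_perfctr_core ? index << 1 : index;
}

int main(void)
{
	unsigned int eventsel = 0xc0010200;	/* PERF_CTL0 on PERFCTR_CORE CPUs */
	unsigned int perfctr  = 0xc0010201;	/* PERF_CTR0 */

	for (int idx = 0; idx < 4; idx++)
		printf("counter %d: config %#x, count %#x\n",
		       idx, eventsel + addr_offset(idx),
		       perfctr + addr_offset(idx));
	return 0;
}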
@@ -331,12 +416,12 @@ static bool reserve_pmc_hardware(void)
331 int i; 416 int i;
332 417
333 for (i = 0; i < x86_pmu.num_counters; i++) { 418 for (i = 0; i < x86_pmu.num_counters; i++) {
334 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 419 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
335 goto perfctr_fail; 420 goto perfctr_fail;
336 } 421 }
337 422
338 for (i = 0; i < x86_pmu.num_counters; i++) { 423 for (i = 0; i < x86_pmu.num_counters; i++) {
339 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 424 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
340 goto eventsel_fail; 425 goto eventsel_fail;
341 } 426 }
342 427
@@ -344,13 +429,13 @@ static bool reserve_pmc_hardware(void)
344 429
345eventsel_fail: 430eventsel_fail:
346 for (i--; i >= 0; i--) 431 for (i--; i >= 0; i--)
347 release_evntsel_nmi(x86_pmu.eventsel + i); 432 release_evntsel_nmi(x86_pmu_config_addr(i));
348 433
349 i = x86_pmu.num_counters; 434 i = x86_pmu.num_counters;
350 435
351perfctr_fail: 436perfctr_fail:
352 for (i--; i >= 0; i--) 437 for (i--; i >= 0; i--)
353 release_perfctr_nmi(x86_pmu.perfctr + i); 438 release_perfctr_nmi(x86_pmu_event_addr(i));
354 439
355 return false; 440 return false;
356} 441}
@@ -360,8 +445,8 @@ static void release_pmc_hardware(void)
360 int i; 445 int i;
361 446
362 for (i = 0; i < x86_pmu.num_counters; i++) { 447 for (i = 0; i < x86_pmu.num_counters; i++) {
363 release_perfctr_nmi(x86_pmu.perfctr + i); 448 release_perfctr_nmi(x86_pmu_event_addr(i));
364 release_evntsel_nmi(x86_pmu.eventsel + i); 449 release_evntsel_nmi(x86_pmu_config_addr(i));
365 } 450 }
366} 451}
367 452
@@ -382,7 +467,7 @@ static bool check_hw_exists(void)
382 * complain and bail. 467 * complain and bail.
383 */ 468 */
384 for (i = 0; i < x86_pmu.num_counters; i++) { 469 for (i = 0; i < x86_pmu.num_counters; i++) {
385 reg = x86_pmu.eventsel + i; 470 reg = x86_pmu_config_addr(i);
386 ret = rdmsrl_safe(reg, &val); 471 ret = rdmsrl_safe(reg, &val);
387 if (ret) 472 if (ret)
388 goto msr_fail; 473 goto msr_fail;
@@ -407,20 +492,25 @@ static bool check_hw_exists(void)
407 * that don't trap on the MSR access and always return 0s. 492 * that don't trap on the MSR access and always return 0s.
408 */ 493 */
409 val = 0xabcdUL; 494 val = 0xabcdUL;
410 ret = checking_wrmsrl(x86_pmu.perfctr, val); 495 ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
411 ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); 496 ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
412 if (ret || val != val_new) 497 if (ret || val != val_new)
413 goto msr_fail; 498 goto msr_fail;
414 499
415 return true; 500 return true;
416 501
417bios_fail: 502bios_fail:
418 printk(KERN_CONT "Broken BIOS detected, using software events only.\n"); 503 /*
504 * We still allow the PMU driver to operate:
505 */
506 printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
419 printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); 507 printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
420 return false; 508
509 return true;
421 510
422msr_fail: 511msr_fail:
423 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); 512 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
513
424 return false; 514 return false;
425} 515}
426 516
@@ -442,8 +532,9 @@ static inline int x86_pmu_initialized(void)
442} 532}
443 533
444static inline int 534static inline int
445set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) 535set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
446{ 536{
537 struct perf_event_attr *attr = &event->attr;
447 unsigned int cache_type, cache_op, cache_result; 538 unsigned int cache_type, cache_op, cache_result;
448 u64 config, val; 539 u64 config, val;
449 540
@@ -470,8 +561,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
470 return -EINVAL; 561 return -EINVAL;
471 562
472 hwc->config |= val; 563 hwc->config |= val;
473 564 attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
474 return 0; 565 return x86_pmu_extra_regs(val, event);
475} 566}
476 567
477static int x86_setup_perfctr(struct perf_event *event) 568static int x86_setup_perfctr(struct perf_event *event)
@@ -496,10 +587,10 @@ static int x86_setup_perfctr(struct perf_event *event)
496 } 587 }
497 588
498 if (attr->type == PERF_TYPE_RAW) 589 if (attr->type == PERF_TYPE_RAW)
499 return 0; 590 return x86_pmu_extra_regs(event->attr.config, event);
500 591
501 if (attr->type == PERF_TYPE_HW_CACHE) 592 if (attr->type == PERF_TYPE_HW_CACHE)
502 return set_ext_hw_attr(hwc, attr); 593 return set_ext_hw_attr(hwc, event);
503 594
504 if (attr->config >= x86_pmu.max_events) 595 if (attr->config >= x86_pmu.max_events)
505 return -EINVAL; 596 return -EINVAL;
@@ -617,11 +708,11 @@ static void x86_pmu_disable_all(void)
617 708
618 if (!test_bit(idx, cpuc->active_mask)) 709 if (!test_bit(idx, cpuc->active_mask))
619 continue; 710 continue;
620 rdmsrl(x86_pmu.eventsel + idx, val); 711 rdmsrl(x86_pmu_config_addr(idx), val);
621 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) 712 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
622 continue; 713 continue;
623 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; 714 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
624 wrmsrl(x86_pmu.eventsel + idx, val); 715 wrmsrl(x86_pmu_config_addr(idx), val);
625 } 716 }
626} 717}
627 718
@@ -642,21 +733,26 @@ static void x86_pmu_disable(struct pmu *pmu)
642 x86_pmu.disable_all(); 733 x86_pmu.disable_all();
643} 734}
644 735
736static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
737 u64 enable_mask)
738{
739 if (hwc->extra_reg)
740 wrmsrl(hwc->extra_reg, hwc->extra_config);
741 wrmsrl(hwc->config_base, hwc->config | enable_mask);
742}
743
645static void x86_pmu_enable_all(int added) 744static void x86_pmu_enable_all(int added)
646{ 745{
647 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 746 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
648 int idx; 747 int idx;
649 748
650 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 749 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
651 struct perf_event *event = cpuc->events[idx]; 750 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
652 u64 val;
653 751
654 if (!test_bit(idx, cpuc->active_mask)) 752 if (!test_bit(idx, cpuc->active_mask))
655 continue; 753 continue;
656 754
657 val = event->hw.config; 755 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
658 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
659 wrmsrl(x86_pmu.eventsel + idx, val);
660 } 756 }
661} 757}
662 758
@@ -821,15 +917,10 @@ static inline void x86_assign_hw_event(struct perf_event *event,
821 hwc->event_base = 0; 917 hwc->event_base = 0;
822 } else if (hwc->idx >= X86_PMC_IDX_FIXED) { 918 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
823 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 919 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
824 /* 920 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
825 * We set it so that event_base + idx in wrmsr/rdmsr maps to
826 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
827 */
828 hwc->event_base =
829 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
830 } else { 921 } else {
831 hwc->config_base = x86_pmu.eventsel; 922 hwc->config_base = x86_pmu_config_addr(hwc->idx);
832 hwc->event_base = x86_pmu.perfctr; 923 hwc->event_base = x86_pmu_event_addr(hwc->idx);
833 } 924 }
834} 925}
835 926
@@ -915,17 +1006,11 @@ static void x86_pmu_enable(struct pmu *pmu)
915 x86_pmu.enable_all(added); 1006 x86_pmu.enable_all(added);
916} 1007}
917 1008
918static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
919 u64 enable_mask)
920{
921 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
922}
923
924static inline void x86_pmu_disable_event(struct perf_event *event) 1009static inline void x86_pmu_disable_event(struct perf_event *event)
925{ 1010{
926 struct hw_perf_event *hwc = &event->hw; 1011 struct hw_perf_event *hwc = &event->hw;
927 1012
928 wrmsrl(hwc->config_base + hwc->idx, hwc->config); 1013 wrmsrl(hwc->config_base, hwc->config);
929} 1014}
930 1015
931static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1016static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -978,7 +1063,7 @@ x86_perf_event_set_period(struct perf_event *event)
978 */ 1063 */
979 local64_set(&hwc->prev_count, (u64)-left); 1064 local64_set(&hwc->prev_count, (u64)-left);
980 1065
981 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); 1066 wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
982 1067
983 /* 1068 /*
984 * Due to erratum on certan cpu we need 1069 * Due to erratum on certan cpu we need
@@ -986,7 +1071,7 @@ x86_perf_event_set_period(struct perf_event *event)
986 * is updated properly 1071 * is updated properly
987 */ 1072 */
988 if (x86_pmu.perfctr_second_write) { 1073 if (x86_pmu.perfctr_second_write) {
989 wrmsrl(hwc->event_base + idx, 1074 wrmsrl(hwc->event_base,
990 (u64)(-left) & x86_pmu.cntval_mask); 1075 (u64)(-left) & x86_pmu.cntval_mask);
991 } 1076 }
992 1077
@@ -1029,7 +1114,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
1029 1114
1030 /* 1115 /*
1031 * If group events scheduling transaction was started, 1116 * If group events scheduling transaction was started,
1032 * skip the schedulability test here, it will be peformed 1117 * skip the schedulability test here, it will be performed
1033 * at commit time (->commit_txn) as a whole 1118 * at commit time (->commit_txn) as a whole
1034 */ 1119 */
1035 if (cpuc->group_flag & PERF_EVENT_TXN) 1120 if (cpuc->group_flag & PERF_EVENT_TXN)
@@ -1113,8 +1198,8 @@ void perf_event_print_debug(void)
1113 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1198 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1114 1199
1115 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1200 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1116 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1201 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1117 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1202 rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1118 1203
1119 prev_left = per_cpu(pmc_prev_left[idx], cpu); 1204 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1120 1205
@@ -1389,7 +1474,7 @@ static void __init pmu_check_apic(void)
1389 pr_info("no hardware sampling interrupt available.\n"); 1474 pr_info("no hardware sampling interrupt available.\n");
1390} 1475}
1391 1476
1392int __init init_hw_perf_events(void) 1477static int __init init_hw_perf_events(void)
1393{ 1478{
1394 struct event_constraint *c; 1479 struct event_constraint *c;
1395 int err; 1480 int err;
@@ -1608,7 +1693,7 @@ out:
1608 return ret; 1693 return ret;
1609} 1694}
1610 1695
1611int x86_pmu_event_init(struct perf_event *event) 1696static int x86_pmu_event_init(struct perf_event *event)
1612{ 1697{
1613 struct pmu *tmp; 1698 struct pmu *tmp;
1614 int err; 1699 int err;
@@ -1710,7 +1795,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1710 1795
1711 perf_callchain_store(entry, regs->ip); 1796 perf_callchain_store(entry, regs->ip);
1712 1797
1713 dump_trace(NULL, regs, NULL, &backtrace_ops, entry); 1798 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1714} 1799}
1715 1800
1716#ifdef CONFIG_COMPAT 1801#ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 67e2202a6039..cf4e369cea67 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -8,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids
8 [ C(L1D) ] = { 8 [ C(L1D) ] = {
9 [ C(OP_READ) ] = { 9 [ C(OP_READ) ] = {
10 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ 10 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
11 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ 11 [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
12 }, 12 },
13 [ C(OP_WRITE) ] = { 13 [ C(OP_WRITE) ] = {
14 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ 14 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
@@ -127,6 +127,11 @@ static int amd_pmu_hw_config(struct perf_event *event)
127/* 127/*
128 * AMD64 events are detected based on their event codes. 128 * AMD64 events are detected based on their event codes.
129 */ 129 */
130static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
131{
132 return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
133}
134
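amd_get_event_code() reassembles the 12-bit AMD event code, whose low 8 bits sit in config[7:0] and whose upper 4 bits sit in config[35:32]. A worked example with a hypothetical raw config value (not taken from the patch):

	u64 config = 0x1000000d6ULL;	/* bits 35:32 = 0x1, bits 7:0 = 0xd6 */
	unsigned int code;

	/* (config >> 24) & 0x0f00 == 0x100,  config & 0x00ff == 0xd6 */
	code = ((config >> 24) & 0x0f00) | (config & 0x00ff);	/* code == 0x1d6 */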
130static inline int amd_is_nb_event(struct hw_perf_event *hwc) 135static inline int amd_is_nb_event(struct hw_perf_event *hwc)
131{ 136{
132 return (hwc->config & 0xe0) == 0xe0; 137 return (hwc->config & 0xe0) == 0xe0;
@@ -385,13 +390,195 @@ static __initconst const struct x86_pmu amd_pmu = {
385 .cpu_dead = amd_pmu_cpu_dead, 390 .cpu_dead = amd_pmu_cpu_dead,
386}; 391};
387 392
393/* AMD Family 15h */
394
395#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
396
397#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
398#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
399#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
400#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
401#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
402#define AMD_EVENT_EX_LS 0x000000C0ULL
403#define AMD_EVENT_DE 0x000000D0ULL
404#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
405
406/*
407 * AMD family 15h event code/PMC mappings:
408 *
409 * type = event_code & 0x0F0:
410 *
411 * 0x000 FP PERF_CTL[5:3]
412 * 0x010 FP PERF_CTL[5:3]
413 * 0x020 LS PERF_CTL[5:0]
414 * 0x030 LS PERF_CTL[5:0]
415 * 0x040 DC PERF_CTL[5:0]
416 * 0x050 DC PERF_CTL[5:0]
417 * 0x060 CU PERF_CTL[2:0]
418 * 0x070 CU PERF_CTL[2:0]
419 * 0x080 IC/DE PERF_CTL[2:0]
420 * 0x090 IC/DE PERF_CTL[2:0]
421 * 0x0A0 ---
422 * 0x0B0 ---
423 * 0x0C0 EX/LS PERF_CTL[5:0]
424 * 0x0D0 DE PERF_CTL[2:0]
425 * 0x0E0 NB NB_PERF_CTL[3:0]
426 * 0x0F0 NB NB_PERF_CTL[3:0]
427 *
428 * Exceptions:
429 *
430 * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
431 * 0x003 FP PERF_CTL[3]
432 * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
433 * 0x00B FP PERF_CTL[3]
434 * 0x00D FP PERF_CTL[3]
435 * 0x023 DE PERF_CTL[2:0]
436 * 0x02D LS PERF_CTL[3]
437 * 0x02E LS PERF_CTL[3,0]
438 * 0x043 CU PERF_CTL[2:0]
439 * 0x045 CU PERF_CTL[2:0]
440 * 0x046 CU PERF_CTL[2:0]
441 * 0x054 CU PERF_CTL[2:0]
442 * 0x055 CU PERF_CTL[2:0]
443 * 0x08F IC PERF_CTL[0]
444 * 0x187 DE PERF_CTL[0]
445 * 0x188 DE PERF_CTL[0]
446 * 0x0DB EX PERF_CTL[5:0]
447 * 0x0DC LS PERF_CTL[5:0]
448 * 0x0DD LS PERF_CTL[5:0]
449 * 0x0DE LS PERF_CTL[5:0]
450 * 0x0DF LS PERF_CTL[5:0]
451 * 0x1D6 EX PERF_CTL[5:0]
452 * 0x1D8 EX PERF_CTL[5:0]
453 *
454 * (*) depending on the umask all FPU counters may be used
455 */
456
457static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
458static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
459static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
460static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
461static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
462static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
463
464static struct event_constraint *
465amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
466{
467 struct hw_perf_event *hwc = &event->hw;
468 unsigned int event_code = amd_get_event_code(hwc);
469
470 switch (event_code & AMD_EVENT_TYPE_MASK) {
471 case AMD_EVENT_FP:
472 switch (event_code) {
473 case 0x000:
474 if (!(hwc->config & 0x0000F000ULL))
475 break;
476 if (!(hwc->config & 0x00000F00ULL))
477 break;
478 return &amd_f15_PMC3;
479 case 0x004:
480 if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
481 break;
482 return &amd_f15_PMC3;
483 case 0x003:
484 case 0x00B:
485 case 0x00D:
486 return &amd_f15_PMC3;
487 }
488 return &amd_f15_PMC53;
489 case AMD_EVENT_LS:
490 case AMD_EVENT_DC:
491 case AMD_EVENT_EX_LS:
492 switch (event_code) {
493 case 0x023:
494 case 0x043:
495 case 0x045:
496 case 0x046:
497 case 0x054:
498 case 0x055:
499 return &amd_f15_PMC20;
500 case 0x02D:
501 return &amd_f15_PMC3;
502 case 0x02E:
503 return &amd_f15_PMC30;
504 default:
505 return &amd_f15_PMC50;
506 }
507 case AMD_EVENT_CU:
508 case AMD_EVENT_IC_DE:
509 case AMD_EVENT_DE:
510 switch (event_code) {
511 case 0x08F:
512 case 0x187:
513 case 0x188:
514 return &amd_f15_PMC0;
515 case 0x0DB ... 0x0DF:
516 case 0x1D6:
517 case 0x1D8:
518 return &amd_f15_PMC50;
519 default:
520 return &amd_f15_PMC20;
521 }
522 case AMD_EVENT_NB:
523 /* not yet implemented */
524 return &emptyconstraint;
525 default:
526 return &emptyconstraint;
527 }
528}
529
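To see how the new constraint routine behaves end to end, the sketch below (purely illustrative, not part of the patch) feeds a few raw configs through amd_get_event_constraints_f15h(); the function ignores its cpuc argument, and idxmsk64 is the counter bitmask packed by EVENT_CONSTRAINT(), so the expected outputs follow directly from the mapping table and switch statement above:

static void __init amd_f15h_constraint_demo(void)
{
	static const u64 configs[] = {
		0x0000000076ULL,	/* code 0x076, CU group  -> amd_f15_PMC20, mask 0x07 */
		0x000000002eULL,	/* code 0x02E, exception -> amd_f15_PMC30, mask 0x09 */
		0x1000000d6ULL,		/* code 0x1D6, exception -> amd_f15_PMC50, mask 0x3f */
	};
	static struct perf_event event;
	int i;

	for (i = 0; i < ARRAY_SIZE(configs); i++) {
		struct event_constraint *c;

		memset(&event, 0, sizeof(event));
		event.hw.config = configs[i];
		c = amd_get_event_constraints_f15h(NULL, &event);
		pr_info("config %#llx -> counter mask %#llx\n",
			(unsigned long long)configs[i],
			(unsigned long long)c->idxmsk64);
	}
}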
530static __initconst const struct x86_pmu amd_pmu_f15h = {
531 .name = "AMD Family 15h",
532 .handle_irq = x86_pmu_handle_irq,
533 .disable_all = x86_pmu_disable_all,
534 .enable_all = x86_pmu_enable_all,
535 .enable = x86_pmu_enable_event,
536 .disable = x86_pmu_disable_event,
537 .hw_config = amd_pmu_hw_config,
538 .schedule_events = x86_schedule_events,
539 .eventsel = MSR_F15H_PERF_CTL,
540 .perfctr = MSR_F15H_PERF_CTR,
541 .event_map = amd_pmu_event_map,
542 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
543 .num_counters = 6,
544 .cntval_bits = 48,
545 .cntval_mask = (1ULL << 48) - 1,
546 .apic = 1,
547 /* use highest bit to detect overflow */
548 .max_period = (1ULL << 47) - 1,
549 .get_event_constraints = amd_get_event_constraints_f15h,
550 /* northbridge counters not yet implemented: */
551#if 0
552 .put_event_constraints = amd_put_event_constraints,
553
554 .cpu_prepare = amd_pmu_cpu_prepare,
555 .cpu_starting = amd_pmu_cpu_starting,
556 .cpu_dead = amd_pmu_cpu_dead,
557#endif
558};
559
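amd_pmu_f15h programs six core counters whose control and counter MSRs are interleaved: PERF_CTL0, PERF_CTR0, PERF_CTL1, PERF_CTR1, and so on, starting at MSR_F15H_PERF_CTL / MSR_F15H_PERF_CTR. The in-tree code hides the resulting address arithmetic behind x86_pmu_addr_offset(); a hedged sketch of what that layout implies (the 0xc0010200/0xc0010201 base values are taken from AMD's documentation and should be treated as assumptions here):

/* Illustrative helpers only -- not part of this patch. */
static inline unsigned int f15h_config_msr(int index)
{
	return MSR_F15H_PERF_CTL + (index << 1);	/* 0xc0010200, 0xc0010202, ... */
}

static inline unsigned int f15h_counter_msr(int index)
{
	return MSR_F15H_PERF_CTR + (index << 1);	/* 0xc0010201, 0xc0010203, ... */
}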
388static __init int amd_pmu_init(void) 560static __init int amd_pmu_init(void)
389{ 561{
390 /* Performance-monitoring supported from K7 and later: */ 562 /* Performance-monitoring supported from K7 and later: */
391 if (boot_cpu_data.x86 < 6) 563 if (boot_cpu_data.x86 < 6)
392 return -ENODEV; 564 return -ENODEV;
393 565
394 x86_pmu = amd_pmu; 566 /*
567 * If core performance counter extensions exist, it must be
568 * family 15h, otherwise fail. See x86_pmu_addr_offset().
569 */
570 switch (boot_cpu_data.x86) {
571 case 0x15:
572 if (!cpu_has_perfctr_core)
573 return -ENODEV;
574 x86_pmu = amd_pmu_f15h;
575 break;
576 default:
577 if (cpu_has_perfctr_core)
578 return -ENODEV;
579 x86_pmu = amd_pmu;
580 break;
581 }
395 582
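cpu_has_perfctr_core presumably resolves to a feature-bit test along the following lines (the cpufeature.h change is part of this series but outside this excerpt; the exact CPUID location, Fn8000_0001 ECX bit 23 per AMD's documentation, is an assumption):

/* Sketch of the feature test used by the switch above: */
#define cpu_has_perfctr_core	boot_cpu_has(X86_FEATURE_PERFCTR_CORE)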
396 /* Events are common for all AMDs */ 583 /* Events are common for all AMDs */
397 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 584 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 008835c1d79c..8fc2b2cee1da 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,5 +1,27 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/*
15 * Per core state
16 * This is used to coordinate shared registers between HT threads.
17 */
18struct intel_percore {
19 raw_spinlock_t lock; /* protect structure */
20 struct er_account regs[MAX_EXTRA_REGS];
21 int refcnt; /* number of threads */
22 unsigned core_id;
23};
24
3/* 25/*
4 * Intel PerfMon, used on Core and later. 26 * Intel PerfMon, used on Core and later.
5 */ 27 */
@@ -64,6 +86,18 @@ static struct event_constraint intel_nehalem_event_constraints[] =
64 EVENT_CONSTRAINT_END 86 EVENT_CONSTRAINT_END
65}; 87};
66 88
89static struct extra_reg intel_nehalem_extra_regs[] =
90{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
92 EVENT_EXTRA_END
93};
94
95static struct event_constraint intel_nehalem_percore_constraints[] =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
67static struct event_constraint intel_westmere_event_constraints[] = 101static struct event_constraint intel_westmere_event_constraints[] =
68{ 102{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -76,6 +110,33 @@ static struct event_constraint intel_westmere_event_constraints[] =
76 EVENT_CONSTRAINT_END 110 EVENT_CONSTRAINT_END
77}; 111};
78 112
113static struct event_constraint intel_snb_event_constraints[] =
114{
115 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END
124};
125
126static struct extra_reg intel_westmere_extra_regs[] =
127{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
130 EVENT_EXTRA_END
131};
132
133static struct event_constraint intel_westmere_percore_constraints[] =
134{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END
138};
139
79static struct event_constraint intel_gen_event_constraints[] = 140static struct event_constraint intel_gen_event_constraints[] =
80{ 141{
81 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 142 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -89,6 +150,106 @@ static u64 intel_pmu_event_map(int hw_event)
89 return intel_perfmon_event_map[hw_event]; 150 return intel_perfmon_event_map[hw_event];
90} 151}
91 152
153static __initconst const u64 snb_hw_cache_event_ids
154 [PERF_COUNT_HW_CACHE_MAX]
155 [PERF_COUNT_HW_CACHE_OP_MAX]
156 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
157{
158 [ C(L1D) ] = {
159 [ C(OP_READ) ] = {
160 [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
161 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
162 },
163 [ C(OP_WRITE) ] = {
164 [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
165 [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
166 },
167 [ C(OP_PREFETCH) ] = {
168 [ C(RESULT_ACCESS) ] = 0x0,
169 [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
170 },
171 },
172 [ C(L1I ) ] = {
173 [ C(OP_READ) ] = {
174 [ C(RESULT_ACCESS) ] = 0x0,
175 [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
176 },
177 [ C(OP_WRITE) ] = {
178 [ C(RESULT_ACCESS) ] = -1,
179 [ C(RESULT_MISS) ] = -1,
180 },
181 [ C(OP_PREFETCH) ] = {
182 [ C(RESULT_ACCESS) ] = 0x0,
183 [ C(RESULT_MISS) ] = 0x0,
184 },
185 },
186 [ C(LL ) ] = {
187 /*
188 * TBD: Need Off-core Response Performance Monitoring support
189 */
190 [ C(OP_READ) ] = {
191 /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
192 [ C(RESULT_ACCESS) ] = 0x01b7,
193 /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
194 [ C(RESULT_MISS) ] = 0x01bb,
195 },
196 [ C(OP_WRITE) ] = {
197 /* OFFCORE_RESPONSE_0.ANY_RFO.LOCAL_CACHE */
198 [ C(RESULT_ACCESS) ] = 0x01b7,
199 /* OFFCORE_RESPONSE_1.ANY_RFO.ANY_LLC_MISS */
200 [ C(RESULT_MISS) ] = 0x01bb,
201 },
202 [ C(OP_PREFETCH) ] = {
203 /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
204 [ C(RESULT_ACCESS) ] = 0x01b7,
205 /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
206 [ C(RESULT_MISS) ] = 0x01bb,
207 },
208 },
209 [ C(DTLB) ] = {
210 [ C(OP_READ) ] = {
211 [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
212 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
213 },
214 [ C(OP_WRITE) ] = {
215 [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
216 [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
217 },
218 [ C(OP_PREFETCH) ] = {
219 [ C(RESULT_ACCESS) ] = 0x0,
220 [ C(RESULT_MISS) ] = 0x0,
221 },
222 },
223 [ C(ITLB) ] = {
224 [ C(OP_READ) ] = {
225 [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
226 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
227 },
228 [ C(OP_WRITE) ] = {
229 [ C(RESULT_ACCESS) ] = -1,
230 [ C(RESULT_MISS) ] = -1,
231 },
232 [ C(OP_PREFETCH) ] = {
233 [ C(RESULT_ACCESS) ] = -1,
234 [ C(RESULT_MISS) ] = -1,
235 },
236 },
237 [ C(BPU ) ] = {
238 [ C(OP_READ) ] = {
239 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
240 [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
241 },
242 [ C(OP_WRITE) ] = {
243 [ C(RESULT_ACCESS) ] = -1,
244 [ C(RESULT_MISS) ] = -1,
245 },
246 [ C(OP_PREFETCH) ] = {
247 [ C(RESULT_ACCESS) ] = -1,
248 [ C(RESULT_MISS) ] = -1,
249 },
250 },
251};
252
92static __initconst const u64 westmere_hw_cache_event_ids 253static __initconst const u64 westmere_hw_cache_event_ids
93 [PERF_COUNT_HW_CACHE_MAX] 254 [PERF_COUNT_HW_CACHE_MAX]
94 [PERF_COUNT_HW_CACHE_OP_MAX] 255 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -124,16 +285,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
124 }, 285 },
125 [ C(LL ) ] = { 286 [ C(LL ) ] = {
126 [ C(OP_READ) ] = { 287 [ C(OP_READ) ] = {
127 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 288 /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
128 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 289 [ C(RESULT_ACCESS) ] = 0x01b7,
290 /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
291 [ C(RESULT_MISS) ] = 0x01bb,
129 }, 292 },
293 /*
294 * Use RFO, not WRITEBACK, because a write miss would typically occur
295 * on RFO.
296 */
130 [ C(OP_WRITE) ] = { 297 [ C(OP_WRITE) ] = {
131 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 298 /* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */
132 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 299 [ C(RESULT_ACCESS) ] = 0x01bb,
300 /* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */
301 [ C(RESULT_MISS) ] = 0x01b7,
133 }, 302 },
134 [ C(OP_PREFETCH) ] = { 303 [ C(OP_PREFETCH) ] = {
135 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 304 /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
136 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 305 [ C(RESULT_ACCESS) ] = 0x01b7,
306 /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
307 [ C(RESULT_MISS) ] = 0x01bb,
137 }, 308 },
138 }, 309 },
139 [ C(DTLB) ] = { 310 [ C(DTLB) ] = {
@@ -180,6 +351,39 @@ static __initconst const u64 westmere_hw_cache_event_ids
180 }, 351 },
181}; 352};
182 353
354/*
355 * OFFCORE_RESPONSE MSR bits (subset), See IA32 SDM Vol 3 30.6.1.3
356 */
357
358#define DMND_DATA_RD (1 << 0)
359#define DMND_RFO (1 << 1)
360#define DMND_WB (1 << 3)
361#define PF_DATA_RD (1 << 4)
362#define PF_DATA_RFO (1 << 5)
363#define RESP_UNCORE_HIT (1 << 8)
364#define RESP_MISS (0xf600) /* non uncore hit */
365
366static __initconst const u64 nehalem_hw_cache_extra_regs
367 [PERF_COUNT_HW_CACHE_MAX]
368 [PERF_COUNT_HW_CACHE_OP_MAX]
369 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
370{
371 [ C(LL ) ] = {
372 [ C(OP_READ) ] = {
373 [ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT,
374 [ C(RESULT_MISS) ] = DMND_DATA_RD|RESP_MISS,
375 },
376 [ C(OP_WRITE) ] = {
377 [ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT,
378 [ C(RESULT_MISS) ] = DMND_RFO|DMND_WB|RESP_MISS,
379 },
380 [ C(OP_PREFETCH) ] = {
381 [ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT,
382 [ C(RESULT_MISS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS,
383 },
384 }
385};
386
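The DMND_*/PF_*/RESP_* bits above are OR-ed into the value written to the OFFCORE_RSP MSR, while the counter itself is programmed with event 0x01b7 (OFFCORE_RESPONSE_0). A worked example derived directly from the table (no new kernel code):

#define NHM_LLC_READ_ACCESS	(DMND_DATA_RD | RESP_UNCORE_HIT)	/* 0x0001 | 0x0100 = 0x0101 */
#define NHM_LLC_READ_MISS	(DMND_DATA_RD | RESP_MISS)		/* 0x0001 | 0xf600 = 0xf601 */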
183static __initconst const u64 nehalem_hw_cache_event_ids 387static __initconst const u64 nehalem_hw_cache_event_ids
184 [PERF_COUNT_HW_CACHE_MAX] 388 [PERF_COUNT_HW_CACHE_MAX]
185 [PERF_COUNT_HW_CACHE_OP_MAX] 389 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -215,16 +419,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids
215 }, 419 },
216 [ C(LL ) ] = { 420 [ C(LL ) ] = {
217 [ C(OP_READ) ] = { 421 [ C(OP_READ) ] = {
218 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 422 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
219 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 423 [ C(RESULT_ACCESS) ] = 0x01b7,
424 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
425 [ C(RESULT_MISS) ] = 0x01b7,
220 }, 426 },
427 /*
428 * Use RFO, not WRITEBACK, because a write miss would typically occur
429 * on RFO.
430 */
221 [ C(OP_WRITE) ] = { 431 [ C(OP_WRITE) ] = {
222 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 432 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
223 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 433 [ C(RESULT_ACCESS) ] = 0x01b7,
434 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
435 [ C(RESULT_MISS) ] = 0x01b7,
224 }, 436 },
225 [ C(OP_PREFETCH) ] = { 437 [ C(OP_PREFETCH) ] = {
226 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 438 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
227 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 439 [ C(RESULT_ACCESS) ] = 0x01b7,
440 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
441 [ C(RESULT_MISS) ] = 0x01b7,
228 }, 442 },
229 }, 443 },
230 [ C(DTLB) ] = { 444 [ C(DTLB) ] = {
@@ -691,8 +905,8 @@ static void intel_pmu_reset(void)
691 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 905 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
692 906
693 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 907 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
694 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 908 checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
695 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 909 checking_wrmsrl(x86_pmu_event_addr(idx), 0ull);
696 } 910 }
697 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) 911 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
698 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 912 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
@@ -794,6 +1008,67 @@ intel_bts_constraints(struct perf_event *event)
794} 1008}
795 1009
796static struct event_constraint * 1010static struct event_constraint *
1011intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1012{
1013 struct hw_perf_event *hwc = &event->hw;
1014 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
1015 struct event_constraint *c;
1016 struct intel_percore *pc;
1017 struct er_account *era;
1018 int i;
1019 int free_slot;
1020 int found;
1021
1022 if (!x86_pmu.percore_constraints || hwc->extra_alloc)
1023 return NULL;
1024
1025 for (c = x86_pmu.percore_constraints; c->cmask; c++) {
1026 if (e != c->code)
1027 continue;
1028
1029 /*
1030 * Allocate resource per core.
1031 */
1032 pc = cpuc->per_core;
1033 if (!pc)
1034 break;
1035 c = &emptyconstraint;
1036 raw_spin_lock(&pc->lock);
1037 free_slot = -1;
1038 found = 0;
1039 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1040 era = &pc->regs[i];
1041 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1042 /* Allow sharing same config */
1043 if (hwc->extra_config == era->extra_config) {
1044 era->ref++;
1045 cpuc->percore_used = 1;
1046 hwc->extra_alloc = 1;
1047 c = NULL;
1048 }
1049 /* else conflict */
1050 found = 1;
1051 break;
1052 } else if (era->ref == 0 && free_slot == -1)
1053 free_slot = i;
1054 }
1055 if (!found && free_slot != -1) {
1056 era = &pc->regs[free_slot];
1057 era->ref = 1;
1058 era->extra_reg = hwc->extra_reg;
1059 era->extra_config = hwc->extra_config;
1060 cpuc->percore_used = 1;
1061 hwc->extra_alloc = 1;
1062 c = NULL;
1063 }
1064 raw_spin_unlock(&pc->lock);
1065 return c;
1066 }
1067
1068 return NULL;
1069}
1070
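The allocation rule implemented above amounts to the following scenario for two HT siblings on one core (illustrative, the extra_config values are hypothetical):

/*
 *   Sibling A opens event 0xb7 with extra_config 0x0101
 *       -> a free er_account slot is claimed, ref = 1, NULL is returned
 *          (no extra constraint, the event schedules normally).
 *   Sibling B opens event 0xb7 with the same extra_config 0x0101
 *       -> the existing slot matches, ref = 2, sharing is allowed.
 *   Sibling B opens event 0xb7 with a different extra_config (say 0xf601)
 *       -> the slot is owned with a conflicting config, emptyconstraint is
 *          returned and the event cannot be scheduled until A releases the
 *          slot via intel_put_event_constraints().
 */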
1071static struct event_constraint *
797intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1072intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
798{ 1073{
799 struct event_constraint *c; 1074 struct event_constraint *c;
@@ -806,9 +1081,51 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
806 if (c) 1081 if (c)
807 return c; 1082 return c;
808 1083
1084 c = intel_percore_constraints(cpuc, event);
1085 if (c)
1086 return c;
1087
809 return x86_get_event_constraints(cpuc, event); 1088 return x86_get_event_constraints(cpuc, event);
810} 1089}
811 1090
1091static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1092 struct perf_event *event)
1093{
1094 struct extra_reg *er;
1095 struct intel_percore *pc;
1096 struct er_account *era;
1097 struct hw_perf_event *hwc = &event->hw;
1098 int i, allref;
1099
1100 if (!cpuc->percore_used)
1101 return;
1102
1103 for (er = x86_pmu.extra_regs; er->msr; er++) {
1104 if (er->event != (hwc->config & er->config_mask))
1105 continue;
1106
1107 pc = cpuc->per_core;
1108 raw_spin_lock(&pc->lock);
1109 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1110 era = &pc->regs[i];
1111 if (era->ref > 0 &&
1112 era->extra_config == hwc->extra_config &&
1113 era->extra_reg == er->msr) {
1114 era->ref--;
1115 hwc->extra_alloc = 0;
1116 break;
1117 }
1118 }
1119 allref = 0;
1120 for (i = 0; i < MAX_EXTRA_REGS; i++)
1121 allref += pc->regs[i].ref;
1122 if (allref == 0)
1123 cpuc->percore_used = 0;
1124 raw_spin_unlock(&pc->lock);
1125 break;
1126 }
1127}
1128
812static int intel_pmu_hw_config(struct perf_event *event) 1129static int intel_pmu_hw_config(struct perf_event *event)
813{ 1130{
814 int ret = x86_pmu_hw_config(event); 1131 int ret = x86_pmu_hw_config(event);
@@ -880,20 +1197,67 @@ static __initconst const struct x86_pmu core_pmu = {
880 */ 1197 */
881 .max_period = (1ULL << 31) - 1, 1198 .max_period = (1ULL << 31) - 1,
882 .get_event_constraints = intel_get_event_constraints, 1199 .get_event_constraints = intel_get_event_constraints,
1200 .put_event_constraints = intel_put_event_constraints,
883 .event_constraints = intel_core_event_constraints, 1201 .event_constraints = intel_core_event_constraints,
884}; 1202};
885 1203
1204static int intel_pmu_cpu_prepare(int cpu)
1205{
1206 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1207
1208 if (!cpu_has_ht_siblings())
1209 return NOTIFY_OK;
1210
1211 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
1212 GFP_KERNEL, cpu_to_node(cpu));
1213 if (!cpuc->per_core)
1214 return NOTIFY_BAD;
1215
1216 raw_spin_lock_init(&cpuc->per_core->lock);
1217 cpuc->per_core->core_id = -1;
1218 return NOTIFY_OK;
1219}
1220
886static void intel_pmu_cpu_starting(int cpu) 1221static void intel_pmu_cpu_starting(int cpu)
887{ 1222{
1223 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1224 int core_id = topology_core_id(cpu);
1225 int i;
1226
888 init_debug_store_on_cpu(cpu); 1227 init_debug_store_on_cpu(cpu);
889 /* 1228 /*
890 * Deal with CPUs that don't clear their LBRs on power-up. 1229 * Deal with CPUs that don't clear their LBRs on power-up.
891 */ 1230 */
892 intel_pmu_lbr_reset(); 1231 intel_pmu_lbr_reset();
1232
1233 if (!cpu_has_ht_siblings())
1234 return;
1235
1236 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1237 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
1238
1239 if (pc && pc->core_id == core_id) {
1240 kfree(cpuc->per_core);
1241 cpuc->per_core = pc;
1242 break;
1243 }
1244 }
1245
1246 cpuc->per_core->core_id = core_id;
1247 cpuc->per_core->refcnt++;
893} 1248}
894 1249
895static void intel_pmu_cpu_dying(int cpu) 1250static void intel_pmu_cpu_dying(int cpu)
896{ 1251{
1252 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1253 struct intel_percore *pc = cpuc->per_core;
1254
1255 if (pc) {
1256 if (pc->core_id == -1 || --pc->refcnt == 0)
1257 kfree(pc);
1258 cpuc->per_core = NULL;
1259 }
1260
897 fini_debug_store_on_cpu(cpu); 1261 fini_debug_store_on_cpu(cpu);
898} 1262}
899 1263
@@ -918,7 +1282,9 @@ static __initconst const struct x86_pmu intel_pmu = {
918 */ 1282 */
919 .max_period = (1ULL << 31) - 1, 1283 .max_period = (1ULL << 31) - 1,
920 .get_event_constraints = intel_get_event_constraints, 1284 .get_event_constraints = intel_get_event_constraints,
1285 .put_event_constraints = intel_put_event_constraints,
921 1286
1287 .cpu_prepare = intel_pmu_cpu_prepare,
922 .cpu_starting = intel_pmu_cpu_starting, 1288 .cpu_starting = intel_pmu_cpu_starting,
923 .cpu_dying = intel_pmu_cpu_dying, 1289 .cpu_dying = intel_pmu_cpu_dying,
924}; 1290};
@@ -1024,6 +1390,7 @@ static __init int intel_pmu_init(void)
1024 intel_pmu_lbr_init_core(); 1390 intel_pmu_lbr_init_core();
1025 1391
1026 x86_pmu.event_constraints = intel_core2_event_constraints; 1392 x86_pmu.event_constraints = intel_core2_event_constraints;
1393 x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
1027 pr_cont("Core2 events, "); 1394 pr_cont("Core2 events, ");
1028 break; 1395 break;
1029 1396
@@ -1032,11 +1399,16 @@ static __init int intel_pmu_init(void)
1032 case 46: /* 45 nm nehalem-ex, "Beckton" */ 1399 case 46: /* 45 nm nehalem-ex, "Beckton" */
1033 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 1400 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1034 sizeof(hw_cache_event_ids)); 1401 sizeof(hw_cache_event_ids));
1402 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1403 sizeof(hw_cache_extra_regs));
1035 1404
1036 intel_pmu_lbr_init_nhm(); 1405 intel_pmu_lbr_init_nhm();
1037 1406
1038 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1407 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1408 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1409 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1039 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1410 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1411 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1040 pr_cont("Nehalem events, "); 1412 pr_cont("Nehalem events, ");
1041 break; 1413 break;
1042 1414
@@ -1047,6 +1419,7 @@ static __init int intel_pmu_init(void)
1047 intel_pmu_lbr_init_atom(); 1419 intel_pmu_lbr_init_atom();
1048 1420
1049 x86_pmu.event_constraints = intel_gen_event_constraints; 1421 x86_pmu.event_constraints = intel_gen_event_constraints;
1422 x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
1050 pr_cont("Atom events, "); 1423 pr_cont("Atom events, ");
1051 break; 1424 break;
1052 1425
@@ -1054,14 +1427,30 @@ static __init int intel_pmu_init(void)
1054 case 44: /* 32 nm nehalem, "Gulftown" */ 1427 case 44: /* 32 nm nehalem, "Gulftown" */
1055 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 1428 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
1056 sizeof(hw_cache_event_ids)); 1429 sizeof(hw_cache_event_ids));
1430 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1431 sizeof(hw_cache_extra_regs));
1057 1432
1058 intel_pmu_lbr_init_nhm(); 1433 intel_pmu_lbr_init_nhm();
1059 1434
1060 x86_pmu.event_constraints = intel_westmere_event_constraints; 1435 x86_pmu.event_constraints = intel_westmere_event_constraints;
1436 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1061 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1437 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1438 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1439 x86_pmu.extra_regs = intel_westmere_extra_regs;
1062 pr_cont("Westmere events, "); 1440 pr_cont("Westmere events, ");
1063 break; 1441 break;
1064 1442
1443 case 42: /* SandyBridge */
1444 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1445 sizeof(hw_cache_event_ids));
1446
1447 intel_pmu_lbr_init_nhm();
1448
1449 x86_pmu.event_constraints = intel_snb_event_constraints;
1450 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1451 pr_cont("SandyBridge events, ");
1452 break;
1453
1065 default: 1454 default:
1066 /* 1455 /*
1067 * default constraints for v2 and up 1456 * default constraints for v2 and up
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index b7dcd9f2b8a0..bab491b8ee25 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -361,30 +361,70 @@ static int intel_pmu_drain_bts_buffer(void)
361/* 361/*
362 * PEBS 362 * PEBS
363 */ 363 */
364static struct event_constraint intel_core2_pebs_event_constraints[] = {
365 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
366 INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
367 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
368 INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
369 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
370 EVENT_CONSTRAINT_END
371};
372
373static struct event_constraint intel_atom_pebs_event_constraints[] = {
374 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
375 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
376 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
377 EVENT_CONSTRAINT_END
378};
364 379
365static struct event_constraint intel_core_pebs_events[] = { 380static struct event_constraint intel_nehalem_pebs_event_constraints[] = {
366 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ 381 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
367 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ 382 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
368 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ 383 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
369 PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ 384 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
370 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ 385 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
371 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 386 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
372 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ 387 INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
373 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 388 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
374 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ 389 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
390 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
391 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
375 EVENT_CONSTRAINT_END 392 EVENT_CONSTRAINT_END
376}; 393};
377 394
378static struct event_constraint intel_nehalem_pebs_events[] = { 395static struct event_constraint intel_westmere_pebs_event_constraints[] = {
379 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ 396 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
380 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ 397 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
381 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ 398 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
382 PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */ 399 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
383 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ 400 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
384 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 401 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
385 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ 402 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
386 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 403 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
387 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ 404 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
405 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
406 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
407 EVENT_CONSTRAINT_END
408};
409
410static struct event_constraint intel_snb_pebs_events[] = {
411 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
412 INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
413 INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
414 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
415 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
416 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
417 INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
418 INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
419 INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
420 INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
421 INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
422 INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
423 INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
424 INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
425 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
426 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
427 INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
388 EVENT_CONSTRAINT_END 428 EVENT_CONSTRAINT_END
389}; 429};
390 430
@@ -695,20 +735,17 @@ static void intel_ds_init(void)
695 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); 735 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
696 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); 736 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
697 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; 737 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
698 x86_pmu.pebs_constraints = intel_core_pebs_events;
699 break; 738 break;
700 739
701 case 1: 740 case 1:
702 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); 741 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
703 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); 742 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
704 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; 743 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
705 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
706 break; 744 break;
707 745
708 default: 746 default:
709 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); 747 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
710 x86_pmu.pebs = 0; 748 x86_pmu.pebs = 0;
711 break;
712 } 749 }
713 } 750 }
714} 751}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index e56b9bfbabd1..c2520e178d32 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Netburst Perfomance Events (P4, old Xeon) 2 * Netburst Performance Events (P4, old Xeon)
3 * 3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> 4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> 5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
@@ -679,10 +679,10 @@ static int p4_validate_raw_event(struct perf_event *event)
679 */ 679 */
680 680
681 /* 681 /*
682 * if an event is shared accross the logical threads 682 * if an event is shared across the logical threads
683 * the user needs special permissions to be able to use it 683 * the user needs special permissions to be able to use it
684 */ 684 */
685 if (p4_event_bind_map[v].shared) { 685 if (p4_ht_active() && p4_event_bind_map[v].shared) {
686 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 686 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
687 return -EACCES; 687 return -EACCES;
688 } 688 }
@@ -727,7 +727,8 @@ static int p4_hw_config(struct perf_event *event)
727 event->hw.config = p4_set_ht_bit(event->hw.config); 727 event->hw.config = p4_set_ht_bit(event->hw.config);
728 728
729 if (event->attr.type == PERF_TYPE_RAW) { 729 if (event->attr.type == PERF_TYPE_RAW) {
730 730 struct p4_event_bind *bind;
731 unsigned int esel;
731 /* 732 /*
732 * Clear bits we reserve to be managed by kernel itself 733 * Clear bits we reserve to be managed by kernel itself
733 * and never allowed from a user space 734 * and never allowed from a user space
@@ -743,6 +744,13 @@ static int p4_hw_config(struct perf_event *event)
743 * bits since we keep additional info here (for cache events and etc) 744 * bits since we keep additional info here (for cache events and etc)
744 */ 745 */
745 event->hw.config |= event->attr.config; 746 event->hw.config |= event->attr.config;
747 bind = p4_config_get_bind(event->attr.config);
748 if (!bind) {
749 rc = -EINVAL;
750 goto out;
751 }
752 esel = P4_OPCODE_ESEL(bind->opcode);
753 event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
746 } 754 }
747 755
748 rc = x86_setup_perfctr(event); 756 rc = x86_setup_perfctr(event);
@@ -756,15 +764,21 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
756 u64 v; 764 u64 v;
757 765
758 /* an official way for overflow indication */ 766 /* an official way for overflow indication */
759 rdmsrl(hwc->config_base + hwc->idx, v); 767 rdmsrl(hwc->config_base, v);
760 if (v & P4_CCCR_OVF) { 768 if (v & P4_CCCR_OVF) {
761 wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF); 769 wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
762 return 1; 770 return 1;
763 } 771 }
764 772
765 /* it might be unflagged overflow */ 773 /*
766 rdmsrl(hwc->event_base + hwc->idx, v); 774 * In some circumstances the overflow might issue an NMI without
767 if (!(v & ARCH_P4_CNTRVAL_MASK)) 775 * setting the P4_CCCR_OVF bit. Because the counter holds a negative value
776 * we simply check whether the high bit is set; if it is cleared, the
777 * counter has crossed zero and continued counting before the
778 * real NMI signal was received:
779 */
780 rdmsrl(hwc->event_base, v);
781 if (!(v & ARCH_P4_UNFLAGGED_BIT))
768 return 1; 782 return 1;
769 783
770 return 0; 784 return 0;
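The new test works because x86_perf_event_set_period() programs the counter with a negative value (-left), so the top bit of the 40-bit Netburst counter stays set until the count crosses zero. ARCH_P4_UNFLAGGED_BIT comes from the matching perf_event_p4.h change (listed in the diffstat but not shown here); presumably something along the lines of:

#define ARCH_P4_CNTRVAL_BITS	(40)
#define ARCH_P4_CNTRVAL_MASK	((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
#define ARCH_P4_UNFLAGGED_BIT	((1ULL) << (ARCH_P4_CNTRVAL_BITS - 1))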
@@ -777,13 +791,13 @@ static void p4_pmu_disable_pebs(void)
777 * 791 *
778 * It's still allowed that two threads setup same cache 792 * It's still allowed that two threads setup same cache
779 * events so we can't simply clear metrics until we knew 793 * events so we can't simply clear metrics until we knew
780 * noone is depending on us, so we need kind of counter 794 * no one is depending on us, so we need kind of counter
781 * for "ReplayEvent" users. 795 * for "ReplayEvent" users.
782 * 796 *
783 * What is more complex -- RAW events, if user (for some 797 * What is more complex -- RAW events, if user (for some
784 * reason) will pass some cache event metric with improper 798 * reason) will pass some cache event metric with improper
785 * event opcode -- it's fine from hardware point of view 799 * event opcode -- it's fine from hardware point of view
786 * but completely nonsence from "meaning" of such action. 800 * but completely nonsense from "meaning" of such action.
787 * 801 *
788 * So at moment let leave metrics turned on forever -- it's 802 * So at moment let leave metrics turned on forever -- it's
789 * ok for now but need to be revisited! 803 * ok for now but need to be revisited!
@@ -802,7 +816,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
802 * state we need to clear P4_CCCR_OVF, otherwise interrupt get 816 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
803 * asserted again and again 817 * asserted again and again
804 */ 818 */
805 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 819 (void)checking_wrmsrl(hwc->config_base,
806 (u64)(p4_config_unpack_cccr(hwc->config)) & 820 (u64)(p4_config_unpack_cccr(hwc->config)) &
807 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); 821 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
808} 822}
@@ -872,7 +886,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
872 p4_pmu_enable_pebs(hwc->config); 886 p4_pmu_enable_pebs(hwc->config);
873 887
874 (void)checking_wrmsrl(escr_addr, escr_conf); 888 (void)checking_wrmsrl(escr_addr, escr_conf);
875 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 889 (void)checking_wrmsrl(hwc->config_base,
876 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); 890 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
877} 891}
878 892
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 34ba07be2cda..20c097e33860 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -68,7 +68,7 @@ p6_pmu_disable_event(struct perf_event *event)
68 if (cpuc->enabled) 68 if (cpuc->enabled)
69 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 69 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
70 70
71 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 71 (void)checking_wrmsrl(hwc->config_base, val);
72} 72}
73 73
74static void p6_pmu_enable_event(struct perf_event *event) 74static void p6_pmu_enable_event(struct perf_event *event)
@@ -81,7 +81,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
81 if (cpuc->enabled) 81 if (cpuc->enabled)
82 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 82 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
83 83
84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base, val);
85} 85}
86 86
87static __initconst const struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d5a236615501..966512b2cacf 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -46,6 +46,8 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
46 /* returns the bit offset of the performance counter register */ 46 /* returns the bit offset of the performance counter register */
47 switch (boot_cpu_data.x86_vendor) { 47 switch (boot_cpu_data.x86_vendor) {
48 case X86_VENDOR_AMD: 48 case X86_VENDOR_AMD:
49 if (msr >= MSR_F15H_PERF_CTR)
50 return (msr - MSR_F15H_PERF_CTR) >> 1;
49 return msr - MSR_K7_PERFCTR0; 51 return msr - MSR_K7_PERFCTR0;
50 case X86_VENDOR_INTEL: 52 case X86_VENDOR_INTEL:
51 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 53 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -70,6 +72,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
70 /* returns the bit offset of the event selection register */ 72 /* returns the bit offset of the event selection register */
71 switch (boot_cpu_data.x86_vendor) { 73 switch (boot_cpu_data.x86_vendor) {
72 case X86_VENDOR_AMD: 74 case X86_VENDOR_AMD:
75 if (msr >= MSR_F15H_PERF_CTL)
76 return (msr - MSR_F15H_PERF_CTL) >> 1;
73 return msr - MSR_K7_EVNTSEL0; 77 return msr - MSR_K7_EVNTSEL0;
74 case X86_VENDOR_INTEL: 78 case X86_VENDOR_INTEL:
75 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 79 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
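The >> 1 works because the family 15h control and counter MSRs are interleaved, so halving the distance from the base recovers the counter index. A worked example (the 0xc0010201/0xc0010205 addresses are assumed from AMD's documentation):

/* PERF_CTR2 sits at MSR_F15H_PERF_CTR + 4 in the interleaved CTL/CTR layout: */
/*   (0xc0010205 - 0xc0010201) >> 1 == 2  ->  watchdog bit offset 2           */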
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 227b0448960d..d22d0c4edcfd 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -86,7 +86,7 @@ static void __init vmware_platform_setup(void)
86} 86}
87 87
88/* 88/*
89 * While checking the dmi string infomation, just checking the product 89 * While checking the dmi string information, just checking the product
90 * serial key should be enough, as this will always have a VMware 90 * serial key should be enough, as this will always have a VMware
91 * specific string when running under VMware hypervisor. 91 * specific string when running under VMware hypervisor.
92 */ 92 */
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index d5cd13945d5a..642f75a68cd5 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -14,9 +14,6 @@
14 14
15static void *kdump_buf_page; 15static void *kdump_buf_page;
16 16
17/* Stores the physical address of elf header of crash image. */
18unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
19
20static inline bool is_crashed_pfn_valid(unsigned long pfn) 17static inline bool is_crashed_pfn_valid(unsigned long pfn)
21{ 18{
22#ifndef CONFIG_X86_PAE 19#ifndef CONFIG_X86_PAE
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 994828899e09..afa64adb75ee 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,9 +10,6 @@
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/io.h> 11#include <linux/io.h>
12 12
13/* Stores the physical address of elf header of crash image. */
14unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
15
16/** 13/**
17 * copy_oldmem_page - copy one page from "oldmem" 14 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied 15 * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 000000000000..706a9fb46a58
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,439 @@
1/*
2 * Architecture specific OF callbacks.
3 */
4#include <linux/bootmem.h>
5#include <linux/io.h>
6#include <linux/interrupt.h>
7#include <linux/list.h>
8#include <linux/of.h>
9#include <linux/of_fdt.h>
10#include <linux/of_address.h>
11#include <linux/of_platform.h>
12#include <linux/of_irq.h>
13#include <linux/slab.h>
14#include <linux/pci.h>
15#include <linux/of_pci.h>
16
17#include <asm/hpet.h>
18#include <asm/irq_controller.h>
19#include <asm/apic.h>
20#include <asm/pci_x86.h>
21
22__initdata u64 initial_dtb;
23char __initdata cmd_line[COMMAND_LINE_SIZE];
24static LIST_HEAD(irq_domains);
25static DEFINE_RAW_SPINLOCK(big_irq_lock);
26
27int __initdata of_ioapic;
28
29#ifdef CONFIG_X86_IO_APIC
30static void add_interrupt_host(struct irq_domain *ih)
31{
32 unsigned long flags;
33
34 raw_spin_lock_irqsave(&big_irq_lock, flags);
35 list_add(&ih->l, &irq_domains);
36 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
37}
38#endif
39
40static struct irq_domain *get_ih_from_node(struct device_node *controller)
41{
42 struct irq_domain *ih, *found = NULL;
43 unsigned long flags;
44
45 raw_spin_lock_irqsave(&big_irq_lock, flags);
46 list_for_each_entry(ih, &irq_domains, l) {
47 if (ih->controller == controller) {
48 found = ih;
49 break;
50 }
51 }
52 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
53 return found;
54}
55
56unsigned int irq_create_of_mapping(struct device_node *controller,
57 const u32 *intspec, unsigned int intsize)
58{
59 struct irq_domain *ih;
60 u32 virq, type;
61 int ret;
62
63 ih = get_ih_from_node(controller);
64 if (!ih)
65 return 0;
66 ret = ih->xlate(ih, intspec, intsize, &virq, &type);
67 if (ret)
68 return 0;
69 if (type == IRQ_TYPE_NONE)
70 return virq;
71 irq_set_irq_type(virq, type);
72 return virq;
73}
74EXPORT_SYMBOL_GPL(irq_create_of_mapping);
75
76unsigned long pci_address_to_pio(phys_addr_t address)
77{
78 /*
79 * The ioport address can be directly used by inX / outX
80 */
81 BUG_ON(address >= (1 << 16));
82 return (unsigned long)address;
83}
84EXPORT_SYMBOL_GPL(pci_address_to_pio);
85
86void __init early_init_dt_scan_chosen_arch(unsigned long node)
87{
88 BUG();
89}
90
91void __init early_init_dt_add_memory_arch(u64 base, u64 size)
92{
93 BUG();
94}
95
96void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
97{
98 return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
99}
100
101void __init add_dtb(u64 data)
102{
103 initial_dtb = data + offsetof(struct setup_data, data);
104}
105
106/*
107 * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
108 */
109static struct of_device_id __initdata ce4100_ids[] = {
110 { .compatible = "intel,ce4100-cp", },
111 { .compatible = "isa", },
112 { .compatible = "pci", },
113 {},
114};
115
116static int __init add_bus_probe(void)
117{
118 if (!of_have_populated_dt())
119 return 0;
120
121 return of_platform_bus_probe(NULL, ce4100_ids, NULL);
122}
123module_init(add_bus_probe);
124
125#ifdef CONFIG_PCI
126static int x86_of_pci_irq_enable(struct pci_dev *dev)
127{
128 struct of_irq oirq;
129 u32 virq;
130 int ret;
131 u8 pin;
132
133 ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
134 if (ret)
135 return ret;
136 if (!pin)
137 return 0;
138
139 ret = of_irq_map_pci(dev, &oirq);
140 if (ret)
141 return ret;
142
143 virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
144 oirq.size);
145 if (virq == 0)
146 return -EINVAL;
147 dev->irq = virq;
148 return 0;
149}
150
151static void x86_of_pci_irq_disable(struct pci_dev *dev)
152{
153}
154
155void __cpuinit x86_of_pci_init(void)
156{
157 struct device_node *np;
158
159 pcibios_enable_irq = x86_of_pci_irq_enable;
160 pcibios_disable_irq = x86_of_pci_irq_disable;
161
162 for_each_node_by_type(np, "pci") {
163 const void *prop;
164 struct pci_bus *bus;
165 unsigned int bus_min;
166 struct device_node *child;
167
168 prop = of_get_property(np, "bus-range", NULL);
169 if (!prop)
170 continue;
171 bus_min = be32_to_cpup(prop);
172
173 bus = pci_find_bus(0, bus_min);
174 if (!bus) {
175 printk(KERN_ERR "Can't find a node for bus %s.\n",
176 np->full_name);
177 continue;
178 }
179
180 if (bus->self)
181 bus->self->dev.of_node = np;
182 else
183 bus->dev.of_node = np;
184
185 for_each_child_of_node(np, child) {
186 struct pci_dev *dev;
187 u32 devfn;
188
189 prop = of_get_property(child, "reg", NULL);
190 if (!prop)
191 continue;
192
193 devfn = (be32_to_cpup(prop) >> 8) & 0xff;
194 dev = pci_get_slot(bus, devfn);
195 if (!dev)
196 continue;
197 dev->dev.of_node = child;
198 pci_dev_put(dev);
199 }
200 }
201}
202#endif
203
204static void __init dtb_setup_hpet(void)
205{
206#ifdef CONFIG_HPET_TIMER
207 struct device_node *dn;
208 struct resource r;
209 int ret;
210
211 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
212 if (!dn)
213 return;
214 ret = of_address_to_resource(dn, 0, &r);
215 if (ret) {
216 WARN_ON(1);
217 return;
218 }
219 hpet_address = r.start;
220#endif
221}
222
223static void __init dtb_lapic_setup(void)
224{
225#ifdef CONFIG_X86_LOCAL_APIC
226 struct device_node *dn;
227 struct resource r;
228 int ret;
229
230 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
231 if (!dn)
232 return;
233
234 ret = of_address_to_resource(dn, 0, &r);
235 if (WARN_ON(ret))
236 return;
237
238 /* Did the boot loader set up the local APIC? */
239 if (!cpu_has_apic) {
240 if (apic_force_enable(r.start))
241 return;
242 }
243 smp_found_config = 1;
244 pic_mode = 1;
245 register_lapic_address(r.start);
246 generic_processor_info(boot_cpu_physical_apicid,
247 GET_APIC_VERSION(apic_read(APIC_LVR)));
248#endif
249}
250
251#ifdef CONFIG_X86_IO_APIC
252static unsigned int ioapic_id;
253
254static void __init dtb_add_ioapic(struct device_node *dn)
255{
256 struct resource r;
257 int ret;
258
259 ret = of_address_to_resource(dn, 0, &r);
260 if (ret) {
261 printk(KERN_ERR "Can't obtain address from node %s.\n",
262 dn->full_name);
263 return;
264 }
265 mp_register_ioapic(++ioapic_id, r.start, gsi_top);
266}
267
268static void __init dtb_ioapic_setup(void)
269{
270 struct device_node *dn;
271
272 for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
273 dtb_add_ioapic(dn);
274
275 if (nr_ioapics) {
276 of_ioapic = 1;
277 return;
278 }
279 printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
280}
281#else
282static void __init dtb_ioapic_setup(void) {}
283#endif
284
285static void __init dtb_apic_setup(void)
286{
287 dtb_lapic_setup();
288 dtb_ioapic_setup();
289}
290
291#ifdef CONFIG_OF_FLATTREE
292static void __init x86_flattree_get_config(void)
293{
294 u32 size, map_len;
295 void *new_dtb;
296
297 if (!initial_dtb)
298 return;
299
300 map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK),
301 (u64)sizeof(struct boot_param_header));
302
303 initial_boot_params = early_memremap(initial_dtb, map_len);
304 size = be32_to_cpu(initial_boot_params->totalsize);
305 if (map_len < size) {
306 early_iounmap(initial_boot_params, map_len);
307 initial_boot_params = early_memremap(initial_dtb, size);
308 map_len = size;
309 }
310
311 new_dtb = alloc_bootmem(size);
312 memcpy(new_dtb, initial_boot_params, size);
313 early_iounmap(initial_boot_params, map_len);
314
315 initial_boot_params = new_dtb;
316
317 /* root level address cells */
318 of_scan_flat_dt(early_init_dt_scan_root, NULL);
319
320 unflatten_device_tree();
321}
322#else
323static inline void x86_flattree_get_config(void) { }
324#endif
325
326void __init x86_dtb_init(void)
327{
328 x86_flattree_get_config();
329
330 if (!of_have_populated_dt())
331 return;
332
333 dtb_setup_hpet();
334 dtb_apic_setup();
335}
336
337#ifdef CONFIG_X86_IO_APIC
338
339struct of_ioapic_type {
340 u32 out_type;
341 u32 trigger;
342 u32 polarity;
343};
344
345static struct of_ioapic_type of_ioapic_type[] =
346{
347 {
348 .out_type = IRQ_TYPE_EDGE_RISING,
349 .trigger = IOAPIC_EDGE,
350 .polarity = 1,
351 },
352 {
353 .out_type = IRQ_TYPE_LEVEL_LOW,
354 .trigger = IOAPIC_LEVEL,
355 .polarity = 0,
356 },
357 {
358 .out_type = IRQ_TYPE_LEVEL_HIGH,
359 .trigger = IOAPIC_LEVEL,
360 .polarity = 1,
361 },
362 {
363 .out_type = IRQ_TYPE_EDGE_FALLING,
364 .trigger = IOAPIC_EDGE,
365 .polarity = 0,
366 },
367};
368
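ioapic_xlate() below consumes a two-cell interrupt specifier: cell 0 is the IO-APIC input line and cell 1 indexes the of_ioapic_type table above. A worked example with a hypothetical device-tree property (not from the patch):

/*
 * "interrupts = <3 1>;"  ->  intspec[0] = 3: line 3, hwirq = 3 + that IO-APIC's gsi_base
 *                            intspec[1] = 1: of_ioapic_type[1] = IRQ_TYPE_LEVEL_LOW,
 *                                            IOAPIC_LEVEL trigger, polarity 0
 */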
369static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
370 u32 *out_hwirq, u32 *out_type)
371{
372 struct io_apic_irq_attr attr;
373 struct of_ioapic_type *it;
374 u32 line, idx, type;
375
376 if (intsize < 2)
377 return -EINVAL;
378
379 line = *intspec;
380 idx = (u32) id->priv;
381 *out_hwirq = line + mp_gsi_routing[idx].gsi_base;
382
383 intspec++;
384 type = *intspec;
385
386 if (type >= ARRAY_SIZE(of_ioapic_type))
387 return -EINVAL;
388
389 it = of_ioapic_type + type;
390 *out_type = it->out_type;
391
392 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
393
394 return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr);
395}
396
397static void __init ioapic_add_ofnode(struct device_node *np)
398{
399 struct resource r;
400 int i, ret;
401
402 ret = of_address_to_resource(np, 0, &r);
403 if (ret) {
404 printk(KERN_ERR "Failed to obtain address for %s\n",
405 np->full_name);
406 return;
407 }
408
409 for (i = 0; i < nr_ioapics; i++) {
410 if (r.start == mp_ioapics[i].apicaddr) {
411 struct irq_domain *id;
412
413 id = kzalloc(sizeof(*id), GFP_KERNEL);
414 BUG_ON(!id);
415 id->controller = np;
416 id->xlate = ioapic_xlate;
417 id->priv = (void *)i;
418 add_interrupt_host(id);
419 return;
420 }
421 }
422 printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
423}
424
425void __init x86_add_irq_domains(void)
426{
427 struct device_node *dp;
428
429 if (!of_have_populated_dt())
430 return;
431
432 for_each_node_with_property(dp, "interrupt-controller") {
433 if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
434 ioapic_add_ofnode(dp);
435 }
436}
437#else
438void __init x86_add_irq_domains(void) { }
439#endif
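A stand-alone sketch of the match-and-register walk that x86_add_irq_domains()
performs above: visit every node that has an "interrupt-controller" property
and act only on the ones compatible with "intel,ce4100-ioapic".  The node
table and field names here are made up for illustration.

#include <stdio.h>
#include <string.h>

struct node {
	const char *path;
	const char *compatible;
	int has_intc_prop;	/* carries an "interrupt-controller" property */
};

static const struct node nodes[] = {
	{ "/soc/ioapic@fec00000", "intel,ce4100-ioapic", 1 },
	{ "/soc/hpet@fed00000",   "intel,ce4100-hpet",   0 },
	{ "/soc/other-pic",       "vendor,other-intc",   1 },
};

int main(void)
{
	for (unsigned int i = 0; i < sizeof(nodes) / sizeof(nodes[0]); i++) {
		if (!nodes[i].has_intc_prop)
			continue;
		if (!strcmp(nodes[i].compatible, "intel,ce4100-ioapic"))
			printf("registering irq domain for %s\n",
			       nodes[i].path);
	}
	return 0;
}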
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index df20723a6a1b..e2a3f0606da4 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,7 +27,7 @@ static int die_counter;
27 27
28void printk_address(unsigned long address, int reliable) 28void printk_address(unsigned long address, int reliable)
29{ 29{
30 printk(" [<%p>] %s%pS\n", (void *) address, 30 printk(" [<%p>] %s%pB\n", (void *) address,
31 reliable ? "" : "? ", (void *) address); 31 reliable ? "" : "? ", (void *) address);
32} 32}
33 33
@@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = {
175 175
176void 176void
177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
178 unsigned long *stack, char *log_lvl) 178 unsigned long *stack, unsigned long bp, char *log_lvl)
179{ 179{
180 printk("%sCall Trace:\n", log_lvl); 180 printk("%sCall Trace:\n", log_lvl);
181 dump_trace(task, regs, stack, &print_trace_ops, log_lvl); 181 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
182} 182}
183 183
184void show_trace(struct task_struct *task, struct pt_regs *regs, 184void show_trace(struct task_struct *task, struct pt_regs *regs,
185 unsigned long *stack) 185 unsigned long *stack, unsigned long bp)
186{ 186{
187 show_trace_log_lvl(task, regs, stack, ""); 187 show_trace_log_lvl(task, regs, stack, bp, "");
188} 188}
189 189
190void show_stack(struct task_struct *task, unsigned long *sp) 190void show_stack(struct task_struct *task, unsigned long *sp)
191{ 191{
192 show_stack_log_lvl(task, NULL, sp, ""); 192 show_stack_log_lvl(task, NULL, sp, 0, "");
193} 193}
194 194
195/* 195/*
@@ -197,14 +197,16 @@ void show_stack(struct task_struct *task, unsigned long *sp)
197 */ 197 */
198void dump_stack(void) 198void dump_stack(void)
199{ 199{
200 unsigned long bp;
200 unsigned long stack; 201 unsigned long stack;
201 202
203 bp = stack_frame(current, NULL);
202 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 204 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
203 current->pid, current->comm, print_tainted(), 205 current->pid, current->comm, print_tainted(),
204 init_utsname()->release, 206 init_utsname()->release,
205 (int)strcspn(init_utsname()->version, " "), 207 (int)strcspn(init_utsname()->version, " "),
206 init_utsname()->version); 208 init_utsname()->version);
207 show_trace(NULL, NULL, &stack); 209 show_trace(NULL, NULL, &stack, bp);
208} 210}
209EXPORT_SYMBOL(dump_stack); 211EXPORT_SYMBOL(dump_stack);
210 212
@@ -320,41 +322,6 @@ void die(const char *str, struct pt_regs *regs, long err)
320 oops_end(flags, regs, sig); 322 oops_end(flags, regs, sig);
321} 323}
322 324
323void notrace __kprobes
324die_nmi(char *str, struct pt_regs *regs, int do_panic)
325{
326 unsigned long flags;
327
328 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
329 return;
330
331 /*
332 * We are in trouble anyway, lets at least try
333 * to get a message out.
334 */
335 flags = oops_begin();
336 printk(KERN_EMERG "%s", str);
337 printk(" on CPU%d, ip %08lx, registers:\n",
338 smp_processor_id(), regs->ip);
339 show_registers(regs);
340 oops_end(flags, regs, 0);
341 if (do_panic || panic_on_oops)
342 panic("Non maskable interrupt");
343 nmi_exit();
344 local_irq_enable();
345 do_exit(SIGBUS);
346}
347
348static int __init oops_setup(char *s)
349{
350 if (!s)
351 return -EINVAL;
352 if (!strcmp(s, "panic"))
353 panic_on_oops = 1;
354 return 0;
355}
356early_param("oops", oops_setup);
357
358static int __init kstack_setup(char *s) 325static int __init kstack_setup(char *s)
359{ 326{
360 if (!s) 327 if (!s)
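The dumpstack changes above thread a caller-supplied frame pointer (bp)
through show_trace()/show_stack_log_lvl() and only fall back to
stack_frame() when the caller passes 0.  A rough stand-alone sketch of that
"optional hint" pattern, with made-up function names:

#include <stdio.h>

/* Stand-in for stack_frame(): pretend this has to be recomputed. */
static unsigned long compute_frame_pointer(void)
{
	return 0x7ffc0000UL;
}

static void dump_trace(unsigned long bp)
{
	if (!bp)
		bp = compute_frame_pointer();	/* no hint from the caller */
	printf("walking frames starting at %#lx\n", bp);
}

int main(void)
{
	dump_trace(0);			/* caller has no frame pointer handy */
	dump_trace(0x7ffc1000UL);	/* caller already knows it */
	return 0;
}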
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 74cc1eda384b..3b97a80ce329 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,12 +17,11 @@
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19 19
20void dump_trace(struct task_struct *task, 20void dump_trace(struct task_struct *task, struct pt_regs *regs,
21 struct pt_regs *regs, unsigned long *stack, 21 unsigned long *stack, unsigned long bp,
22 const struct stacktrace_ops *ops, void *data) 22 const struct stacktrace_ops *ops, void *data)
23{ 23{
24 int graph = 0; 24 int graph = 0;
25 unsigned long bp;
26 25
27 if (!task) 26 if (!task)
28 task = current; 27 task = current;
@@ -35,7 +34,9 @@ void dump_trace(struct task_struct *task,
35 stack = (unsigned long *)task->thread.sp; 34 stack = (unsigned long *)task->thread.sp;
36 } 35 }
37 36
38 bp = stack_frame(task, regs); 37 if (!bp)
38 bp = stack_frame(task, regs);
39
39 for (;;) { 40 for (;;) {
40 struct thread_info *context; 41 struct thread_info *context;
41 42
@@ -55,7 +56,7 @@ EXPORT_SYMBOL(dump_trace);
55 56
56void 57void
57show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 58show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
58 unsigned long *sp, char *log_lvl) 59 unsigned long *sp, unsigned long bp, char *log_lvl)
59{ 60{
60 unsigned long *stack; 61 unsigned long *stack;
61 int i; 62 int i;
@@ -77,7 +78,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
77 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
78 } 79 }
79 printk(KERN_CONT "\n"); 80 printk(KERN_CONT "\n");
80 show_trace_log_lvl(task, regs, sp, log_lvl); 81 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
81} 82}
82 83
83 84
@@ -102,7 +103,7 @@ void show_registers(struct pt_regs *regs)
102 u8 *ip; 103 u8 *ip;
103 104
104 printk(KERN_EMERG "Stack:\n"); 105 printk(KERN_EMERG "Stack:\n");
105 show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG); 106 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
106 107
107 printk(KERN_EMERG "Code: "); 108 printk(KERN_EMERG "Code: ");
108 109
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 64101335de19..e71c98d3c0d2 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
140 */ 140 */
141 141
142void dump_trace(struct task_struct *task, 142void dump_trace(struct task_struct *task, struct pt_regs *regs,
143 struct pt_regs *regs, unsigned long *stack, 143 unsigned long *stack, unsigned long bp,
144 const struct stacktrace_ops *ops, void *data) 144 const struct stacktrace_ops *ops, void *data)
145{ 145{
146 const unsigned cpu = get_cpu(); 146 const unsigned cpu = get_cpu();
@@ -149,19 +149,19 @@ void dump_trace(struct task_struct *task,
149 unsigned used = 0; 149 unsigned used = 0;
150 struct thread_info *tinfo; 150 struct thread_info *tinfo;
151 int graph = 0; 151 int graph = 0;
152 unsigned long bp; 152 unsigned long dummy;
153 153
154 if (!task) 154 if (!task)
155 task = current; 155 task = current;
156 156
157 if (!stack) { 157 if (!stack) {
158 unsigned long dummy;
159 stack = &dummy; 158 stack = &dummy;
160 if (task && task != current) 159 if (task && task != current)
161 stack = (unsigned long *)task->thread.sp; 160 stack = (unsigned long *)task->thread.sp;
162 } 161 }
163 162
164 bp = stack_frame(task, regs); 163 if (!bp)
164 bp = stack_frame(task, regs);
165 /* 165 /*
166 * Print function call entries in all stacks, starting at the 166 * Print function call entries in all stacks, starting at the
167 * current stack address. If the stacks consist of nested 167 * current stack address. If the stacks consist of nested
@@ -225,7 +225,7 @@ EXPORT_SYMBOL(dump_trace);
225 225
226void 226void
227show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 227show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
228 unsigned long *sp, char *log_lvl) 228 unsigned long *sp, unsigned long bp, char *log_lvl)
229{ 229{
230 unsigned long *irq_stack_end; 230 unsigned long *irq_stack_end;
231 unsigned long *irq_stack; 231 unsigned long *irq_stack;
@@ -269,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
269 preempt_enable(); 269 preempt_enable();
270 270
271 printk(KERN_CONT "\n"); 271 printk(KERN_CONT "\n");
272 show_trace_log_lvl(task, regs, sp, log_lvl); 272 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
273} 273}
274 274
275void show_registers(struct pt_regs *regs) 275void show_registers(struct pt_regs *regs)
@@ -298,7 +298,7 @@ void show_registers(struct pt_regs *regs)
298 298
299 printk(KERN_EMERG "Stack:\n"); 299 printk(KERN_EMERG "Stack:\n");
300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
301 KERN_EMERG); 301 0, KERN_EMERG);
302 302
303 printk(KERN_EMERG "Code: "); 303 printk(KERN_EMERG "Code: ");
304 304
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 294f26da0c0c..3e2ef8425316 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -11,6 +11,7 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/crash_dump.h>
14#include <linux/bootmem.h> 15#include <linux/bootmem.h>
15#include <linux/pfn.h> 16#include <linux/pfn.h>
16#include <linux/suspend.h> 17#include <linux/suspend.h>
@@ -667,21 +668,15 @@ __init void e820_setup_gap(void)
667 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of 668 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
668 * linked list of struct setup_data, which is parsed here. 669 * linked list of struct setup_data, which is parsed here.
669 */ 670 */
670void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) 671void __init parse_e820_ext(struct setup_data *sdata)
671{ 672{
672 u32 map_len;
673 int entries; 673 int entries;
674 struct e820entry *extmap; 674 struct e820entry *extmap;
675 675
676 entries = sdata->len / sizeof(struct e820entry); 676 entries = sdata->len / sizeof(struct e820entry);
677 map_len = sdata->len + sizeof(struct setup_data);
678 if (map_len > PAGE_SIZE)
679 sdata = early_ioremap(pa_data, map_len);
680 extmap = (struct e820entry *)(sdata->data); 677 extmap = (struct e820entry *)(sdata->data);
681 __append_e820_map(extmap, entries); 678 __append_e820_map(extmap, entries);
682 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 679 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
683 if (map_len > PAGE_SIZE)
684 early_iounmap(sdata, map_len);
685 printk(KERN_INFO "extended physical RAM map:\n"); 680 printk(KERN_INFO "extended physical RAM map:\n");
686 e820_print_map("extended"); 681 e820_print_map("extended");
687} 682}
@@ -847,15 +842,21 @@ static int __init parse_memopt(char *p)
847 if (!p) 842 if (!p)
848 return -EINVAL; 843 return -EINVAL;
849 844
850#ifdef CONFIG_X86_32
851 if (!strcmp(p, "nopentium")) { 845 if (!strcmp(p, "nopentium")) {
846#ifdef CONFIG_X86_32
852 setup_clear_cpu_cap(X86_FEATURE_PSE); 847 setup_clear_cpu_cap(X86_FEATURE_PSE);
853 return 0; 848 return 0;
854 } 849#else
850 printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
851 return -EINVAL;
855#endif 852#endif
853 }
856 854
857 userdef = 1; 855 userdef = 1;
858 mem_size = memparse(p, &p); 856 mem_size = memparse(p, &p);
857 /* don't remove all of memory when handling "mem={invalid}" param */
858 if (mem_size == 0)
859 return -EINVAL;
859 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); 860 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
860 861
861 return 0; 862 return 0;
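The parse_memopt() hunk above adds a guard so that "mem=" with an unparsable
or zero size no longer removes all of memory.  A stand-alone sketch of the
same idea, using strtoull() with K/M/G suffixes in place of the kernel's
memparse():

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *s)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': v <<= 10;	/* fall through */
	case 'M': case 'm': v <<= 10;	/* fall through */
	case 'K': case 'k': v <<= 10;
	}
	return v;
}

int main(void)
{
	const char *args[] = { "512M", "bogus", "0" };

	for (int i = 0; i < 3; i++) {
		unsigned long long limit = parse_size(args[i]);

		if (limit == 0)		/* the new check: reject, keep all RAM */
			printf("mem=%s rejected\n", args[i]);
		else
			printf("mem=%s -> limit %llu bytes\n", args[i], limit);
	}
	return 0;
}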
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953dee..3755ef494390 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -143,15 +143,10 @@ static void __init ati_bugs(int num, int slot, int func)
143 143
144static u32 __init ati_sbx00_rev(int num, int slot, int func) 144static u32 __init ati_sbx00_rev(int num, int slot, int func)
145{ 145{
146 u32 old, d; 146 u32 d;
147 147
148 d = read_pci_config(num, slot, func, 0x70);
149 old = d;
150 d &= ~(1<<8);
151 write_pci_config(num, slot, func, 0x70, d);
152 d = read_pci_config(num, slot, func, 0x8); 148 d = read_pci_config(num, slot, func, 0x8);
153 d &= 0xff; 149 d &= 0xff;
154 write_pci_config(num, slot, func, 0x70, old);
155 150
156 return d; 151 return d;
157} 152}
@@ -160,11 +155,19 @@ static void __init ati_bugs_contd(int num, int slot, int func)
160{ 155{
161 u32 d, rev; 156 u32 d, rev;
162 157
163 if (acpi_use_timer_override) 158 rev = ati_sbx00_rev(num, slot, func);
159 if (rev >= 0x40)
160 acpi_fix_pin2_polarity = 1;
161
162 /*
163 * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
164 * SB700: revisions 0x39, 0x3a, ...
165 * SB800: revisions 0x40, 0x41, ...
166 */
167 if (rev >= 0x39)
164 return; 168 return;
165 169
166 rev = ati_sbx00_rev(num, slot, func); 170 if (acpi_use_timer_override)
167 if (rev > 0x13)
168 return; 171 return;
169 172
170 /* check for IRQ0 interrupt swap */ 173 /* check for IRQ0 interrupt swap */
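A compact restatement of the revision gate introduced in ati_bugs_contd()
above: SB800 and later (rev >= 0x40) get the pin-2 polarity fix, anything
from SB700 on (rev >= 0x39) skips the IRQ0 override check entirely, and only
older SB600 parts still consult acpi_use_timer_override.  Names below are
local stand-ins:

#include <stdio.h>
#include <stdbool.h>

static void sbx00_quirk(unsigned int rev, bool use_timer_override)
{
	bool fix_pin2_polarity = rev >= 0x40;			/* SB800+ */
	bool check_irq0_swap = rev < 0x39 && !use_timer_override;

	printf("rev 0x%02x: pin2 fix=%d, irq0 swap check=%d\n",
	       rev, fix_pin2_polarity, check_irq0_swap);
}

int main(void)
{
	sbx00_quirk(0x13, false);	/* SB600 */
	sbx00_quirk(0x3a, false);	/* SB700 */
	sbx00_quirk(0x41, false);	/* SB800 */
	return 0;
}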
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efad7ebb..5c1a91974918 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
65#define sysexit_audit syscall_exit_work 65#define sysexit_audit syscall_exit_work
66#endif 66#endif
67 67
68 .section .entry.text, "ax"
69
68/* 70/*
69 * We use macros for low-level operations which need to be overridden 71 * We use macros for low-level operations which need to be overridden
70 * for paravirtualization. The following will never clobber any registers: 72 * for paravirtualization. The following will never clobber any registers:
@@ -395,7 +397,7 @@ sysenter_past_esp:
395 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 397 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
396 * pushed above; +8 corresponds to copy_thread's esp0 setting. 398 * pushed above; +8 corresponds to copy_thread's esp0 setting.
397 */ 399 */
398 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp) 400 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
399 CFI_REL_OFFSET eip, 0 401 CFI_REL_OFFSET eip, 0
400 402
401 pushl_cfi %eax 403 pushl_cfi %eax
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
788 */ 790 */
789.section .init.rodata,"a" 791.section .init.rodata,"a"
790ENTRY(interrupt) 792ENTRY(interrupt)
791.text 793.section .entry.text, "ax"
792 .p2align 5 794 .p2align 5
793 .p2align CONFIG_X86_L1_CACHE_SHIFT 795 .p2align CONFIG_X86_L1_CACHE_SHIFT
794ENTRY(irq_entries_start) 796ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
807 .endif 809 .endif
808 .previous 810 .previous
809 .long 1b 811 .long 1b
810 .text 812 .section .entry.text, "ax"
811vector=vector+1 813vector=vector+1
812 .endif 814 .endif
813 .endr 815 .endr
@@ -1409,11 +1411,10 @@ END(general_protection)
1409#ifdef CONFIG_KVM_GUEST 1411#ifdef CONFIG_KVM_GUEST
1410ENTRY(async_page_fault) 1412ENTRY(async_page_fault)
1411 RING0_EC_FRAME 1413 RING0_EC_FRAME
1412 pushl $do_async_page_fault 1414 pushl_cfi $do_async_page_fault
1413 CFI_ADJUST_CFA_OFFSET 4
1414 jmp error_code 1415 jmp error_code
1415 CFI_ENDPROC 1416 CFI_ENDPROC
1416END(apf_page_fault) 1417END(async_page_fault)
1417#endif 1418#endif
1418 1419
1419/* 1420/*
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index aed1ffbeb0c9..8a445a0c989e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -18,7 +18,7 @@
18 * A note on terminology: 18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP 19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers upto R11. 21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
23 * 23 *
24 * Some macro usage: 24 * Some macro usage:
@@ -61,6 +61,8 @@
61#define __AUDIT_ARCH_LE 0x40000000 61#define __AUDIT_ARCH_LE 0x40000000
62 62
63 .code64 63 .code64
64 .section .entry.text, "ax"
65
64#ifdef CONFIG_FUNCTION_TRACER 66#ifdef CONFIG_FUNCTION_TRACER
65#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 68ENTRY(mcount)
@@ -420,7 +422,7 @@ ENTRY(ret_from_fork)
420END(ret_from_fork) 422END(ret_from_fork)
421 423
422/* 424/*
423 * System call entry. Upto 6 arguments in registers are supported. 425 * System call entry. Up to 6 arguments in registers are supported.
424 * 426 *
425 * SYSCALL does not save anything on the stack and does not change the 427 * SYSCALL does not save anything on the stack and does not change the
426 * stack pointer. 428 * stack pointer.
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
744 */ 746 */
745 .section .init.rodata,"a" 747 .section .init.rodata,"a"
746ENTRY(interrupt) 748ENTRY(interrupt)
747 .text 749 .section .entry.text
748 .p2align 5 750 .p2align 5
749 .p2align CONFIG_X86_L1_CACHE_SHIFT 751 .p2align CONFIG_X86_L1_CACHE_SHIFT
750ENTRY(irq_entries_start) 752ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
763 .endif 765 .endif
764 .previous 766 .previous
765 .quad 1b 767 .quad 1b
766 .text 768 .section .entry.text
767vector=vector+1 769vector=vector+1
768 .endif 770 .endif
769 .endr 771 .endr
@@ -975,9 +977,12 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
975 x86_platform_ipi smp_x86_platform_ipi 977 x86_platform_ipi smp_x86_platform_ipi
976 978
977#ifdef CONFIG_SMP 979#ifdef CONFIG_SMP
978.irpc idx, "01234567" 980.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
981 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
982.if NUM_INVALIDATE_TLB_VECTORS > \idx
979apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ 983apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
980 invalidate_interrupt\idx smp_invalidate_interrupt 984 invalidate_interrupt\idx smp_invalidate_interrupt
985.endif
981.endr 986.endr
982#endif 987#endif
983 988
@@ -1248,7 +1253,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1248 decl PER_CPU_VAR(irq_count) 1253 decl PER_CPU_VAR(irq_count)
1249 jmp error_exit 1254 jmp error_exit
1250 CFI_ENDPROC 1255 CFI_ENDPROC
1251END(do_hypervisor_callback) 1256END(xen_do_hypervisor_callback)
1252 1257
1253/* 1258/*
1254 * Hypervisor uses this for application faults while it executes. 1259 * Hypervisor uses this for application faults while it executes.
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 382eb2936d4d..a93742a57468 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -437,18 +437,19 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
437 return; 437 return;
438 } 438 }
439 439
440 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
441 frame_pointer) == -EBUSY) {
442 *parent = old;
443 return;
444 }
445
446 trace.func = self_addr; 440 trace.func = self_addr;
441 trace.depth = current->curr_ret_stack + 1;
447 442
448 /* Only trace if the calling function expects to */ 443 /* Only trace if the calling function expects to */
449 if (!ftrace_graph_entry(&trace)) { 444 if (!ftrace_graph_entry(&trace)) {
450 current->curr_ret_stack--;
451 *parent = old; 445 *parent = old;
446 return;
447 }
448
449 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
450 frame_pointer) == -EBUSY) {
451 *parent = old;
452 return;
452 } 453 }
453} 454}
454#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 455#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
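The ftrace.c reordering above asks the tracer whether it wants the function
(ftrace_graph_entry) before pushing an entry onto the return stack, so a
rejected function no longer needs the manual curr_ret_stack-- cleanup.  A
tiny stand-alone analogy of "check first, then commit" with a fixed-size
stack:

#include <stdio.h>
#include <stdbool.h>

static unsigned long ret_stack[8];
static int depth = -1;

static bool want_to_trace(unsigned long addr)
{
	return addr != 0;	/* pretend address 0 is filtered out */
}

static bool trace_entry(unsigned long addr)
{
	if (!want_to_trace(addr))	/* ask first ... */
		return false;
	if (depth + 1 >= 8)
		return false;
	ret_stack[++depth] = addr;	/* ... then commit; no undo needed */
	return true;
}

int main(void)
{
	printf("trace 0x1234: %d\n", trace_entry(0x1234));
	printf("trace 0x0:    %d (rejected before the push)\n", trace_entry(0));
	return 0;
}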
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 7f138b3c3c52..d6d6bb361931 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -34,15 +34,6 @@ void __init i386_start_kernel(void)
 {
 	memblock_init();
 
-#ifdef CONFIG_X86_TRAMPOLINE
-	/*
-	 * But first pinch a few for the stack/trampoline stuff
-	 * FIXME: Don't need the extra page at 4K, but need to fix
-	 * trampoline before removing it. (see the GDT stuff)
-	 */
-	memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
-#endif
-
 	memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2d2673c28aff..5655c2272adb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -77,9 +77,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	/* Make NULL pointers segfault */
 	zap_identity_mappings();
 
-	/* Cleanup the over mapped high alias */
-	cleanup_highmap();
-
 	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fc293dc8dc35..ce0be7cd085e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -73,7 +73,7 @@ MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
73 */ 73 */
74KERNEL_PAGES = LOWMEM_PAGES 74KERNEL_PAGES = LOWMEM_PAGES
75 75
76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm 76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
77RESERVE_BRK(pagetables, INIT_MAP_SIZE) 77RESERVE_BRK(pagetables, INIT_MAP_SIZE)
78 78
79/* 79/*
@@ -85,6 +85,8 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
85 */ 85 */
86__HEAD 86__HEAD
87ENTRY(startup_32) 87ENTRY(startup_32)
88 movl pa(stack_start),%ecx
89
88 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 90 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
89 us to not reload segments */ 91 us to not reload segments */
90 testb $(1<<6), BP_loadflags(%esi) 92 testb $(1<<6), BP_loadflags(%esi)
@@ -99,7 +101,9 @@ ENTRY(startup_32)
99 movl %eax,%es 101 movl %eax,%es
100 movl %eax,%fs 102 movl %eax,%fs
101 movl %eax,%gs 103 movl %eax,%gs
104 movl %eax,%ss
1022: 1052:
106 leal -__PAGE_OFFSET(%ecx),%esp
103 107
104/* 108/*
105 * Clear BSS first so that there are no surprises... 109 * Clear BSS first so that there are no surprises...
@@ -133,7 +137,7 @@ ENTRY(startup_32)
133 movsl 137 movsl
1341: 1381:
135 139
136#ifdef CONFIG_OLPC_OPENFIRMWARE 140#ifdef CONFIG_OLPC
137 /* save OFW's pgdir table for later use when calling into OFW */ 141 /* save OFW's pgdir table for later use when calling into OFW */
138 movl %cr3, %eax 142 movl %cr3, %eax
139 movl %eax, pa(olpc_ofw_pgd) 143 movl %eax, pa(olpc_ofw_pgd)
@@ -145,8 +149,6 @@ ENTRY(startup_32)
145 * _brk_end is set up to point to the first "safe" location. 149 * _brk_end is set up to point to the first "safe" location.
146 * Mappings are created both at virtual address 0 (identity mapping) 150 * Mappings are created both at virtual address 0 (identity mapping)
147 * and PAGE_OFFSET for up to _end. 151 * and PAGE_OFFSET for up to _end.
148 *
149 * Note that the stack is not yet set up!
150 */ 152 */
151#ifdef CONFIG_X86_PAE 153#ifdef CONFIG_X86_PAE
152 154
@@ -282,6 +284,9 @@ ENTRY(startup_32_smp)
282 movl %eax,%es 284 movl %eax,%es
283 movl %eax,%fs 285 movl %eax,%fs
284 movl %eax,%gs 286 movl %eax,%gs
287 movl pa(stack_start),%ecx
288 movl %eax,%ss
289 leal -__PAGE_OFFSET(%ecx),%esp
285#endif /* CONFIG_SMP */ 290#endif /* CONFIG_SMP */
286default_entry: 291default_entry:
287 292
@@ -347,8 +352,8 @@ default_entry:
347 movl %eax,%cr0 /* ..and set paging (PG) bit */ 352 movl %eax,%cr0 /* ..and set paging (PG) bit */
348 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 353 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
3491: 3541:
350 /* Set up the stack pointer */ 355 /* Shift the stack pointer to a virtual address */
351 lss stack_start,%esp 356 addl $__PAGE_OFFSET, %esp
352 357
353/* 358/*
354 * Initialize eflags. Some BIOS's leave bits like NT set. This would 359 * Initialize eflags. Some BIOS's leave bits like NT set. This would
@@ -360,9 +365,7 @@ default_entry:
360 365
361#ifdef CONFIG_SMP 366#ifdef CONFIG_SMP
362 cmpb $0, ready 367 cmpb $0, ready
363 jz 1f /* Initial CPU cleans BSS */ 368 jnz checkCPUtype
364 jmp checkCPUtype
3651:
366#endif /* CONFIG_SMP */ 369#endif /* CONFIG_SMP */
367 370
368/* 371/*
@@ -470,14 +473,7 @@ is386: movl $2,%ecx # set MP
470 473
471 cld # gcc2 wants the direction flag cleared at all times 474 cld # gcc2 wants the direction flag cleared at all times
472 pushl $0 # fake return address for unwinder 475 pushl $0 # fake return address for unwinder
473#ifdef CONFIG_SMP
474 movb ready, %cl
475 movb $1, ready 476 movb $1, ready
476 cmpb $0,%cl # the first CPU calls start_kernel
477 je 1f
478 movl (stack_start), %esp
4791:
480#endif /* CONFIG_SMP */
481 jmp *(initial_code) 477 jmp *(initial_code)
482 478
483/* 479/*
@@ -627,7 +623,7 @@ ENTRY(initial_code)
627 * BSS section 623 * BSS section
628 */ 624 */
629__PAGE_ALIGNED_BSS 625__PAGE_ALIGNED_BSS
630 .align PAGE_SIZE_asm 626 .align PAGE_SIZE
631#ifdef CONFIG_X86_PAE 627#ifdef CONFIG_X86_PAE
632initial_pg_pmd: 628initial_pg_pmd:
633 .fill 1024*KPMDS,4,0 629 .fill 1024*KPMDS,4,0
@@ -648,7 +644,7 @@ ENTRY(swapper_pg_dir)
648#ifdef CONFIG_X86_PAE 644#ifdef CONFIG_X86_PAE
649__PAGE_ALIGNED_DATA 645__PAGE_ALIGNED_DATA
650 /* Page-aligned for the benefit of paravirt? */ 646 /* Page-aligned for the benefit of paravirt? */
651 .align PAGE_SIZE_asm 647 .align PAGE_SIZE
652ENTRY(initial_page_table) 648ENTRY(initial_page_table)
653 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ 649 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
654# if KPMDS == 3 650# if KPMDS == 3
@@ -666,19 +662,19 @@ ENTRY(initial_page_table)
666# else 662# else
667# error "Kernel PMDs should be 1, 2 or 3" 663# error "Kernel PMDs should be 1, 2 or 3"
668# endif 664# endif
669 .align PAGE_SIZE_asm /* needs to be page-sized too */ 665 .align PAGE_SIZE /* needs to be page-sized too */
670#endif 666#endif
671 667
672.data 668.data
669.balign 4
673ENTRY(stack_start) 670ENTRY(stack_start)
674 .long init_thread_union+THREAD_SIZE 671 .long init_thread_union+THREAD_SIZE
675 .long __BOOT_DS
676
677ready: .byte 0
678 672
679early_recursion_flag: 673early_recursion_flag:
680 .long 0 674 .long 0
681 675
676ready: .byte 0
677
682int_msg: 678int_msg:
683 .asciz "Unknown interrupt or fault at: %p %p %p\n" 679 .asciz "Unknown interrupt or fault at: %p %p %p\n"
684 680
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 239046bd447f..e11e39478a49 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -136,10 +136,9 @@ ident_complete:
136 /* Fixup phys_base */ 136 /* Fixup phys_base */
137 addq %rbp, phys_base(%rip) 137 addq %rbp, phys_base(%rip)
138 138
139#ifdef CONFIG_X86_TRAMPOLINE 139 /* Fixup trampoline */
140 addq %rbp, trampoline_level4_pgt + 0(%rip) 140 addq %rbp, trampoline_level4_pgt + 0(%rip)
141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip) 141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
142#endif
143 142
144 /* Due to ENTRY(), sometimes the empty space gets filled with 143 /* Due to ENTRY(), sometimes the empty space gets filled with
145 * zeros. Better take a jmp than relying on empty space being 144 * zeros. Better take a jmp than relying on empty space being
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 4ff5968f12d2..bfe8f729e086 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -503,7 +503,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
 	if (!irq)
 		return -EINVAL;
 
-	set_irq_data(irq, dev);
+	irq_set_handler_data(irq, dev);
 
 	if (hpet_setup_msi_irq(irq))
 		return -EINVAL;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e60c38cc0eed..12aff2537682 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -145,7 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_finit);
  * The _current_ task is using the FPU for the first time
  * so initialize it and set the mxcsr to its default
  * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
+ * remember the current task has used the FPU.
  */
 int init_fpu(struct task_struct *tsk)
 {
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index b42ca694dc68..8eeaa81de066 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -10,7 +10,7 @@
10 */ 10 */
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/sysdev.h> 13#include <linux/syscore_ops.h>
14 14
15#include <asm/dma.h> 15#include <asm/dma.h>
16 16
@@ -21,7 +21,7 @@
21 * in asm/dma.h. 21 * in asm/dma.h.
22 */ 22 */
23 23
24static int i8237A_resume(struct sys_device *dev) 24static void i8237A_resume(void)
25{ 25{
26 unsigned long flags; 26 unsigned long flags;
27 int i; 27 int i;
@@ -41,31 +41,15 @@ static int i8237A_resume(struct sys_device *dev)
41 enable_dma(4); 41 enable_dma(4);
42 42
43 release_dma_lock(flags); 43 release_dma_lock(flags);
44
45 return 0;
46} 44}
47 45
48static int i8237A_suspend(struct sys_device *dev, pm_message_t state) 46static struct syscore_ops i8237_syscore_ops = {
49{
50 return 0;
51}
52
53static struct sysdev_class i8237_sysdev_class = {
54 .name = "i8237",
55 .suspend = i8237A_suspend,
56 .resume = i8237A_resume, 47 .resume = i8237A_resume,
57}; 48};
58 49
59static struct sys_device device_i8237A = { 50static int __init i8237A_init_ops(void)
60 .id = 0,
61 .cls = &i8237_sysdev_class,
62};
63
64static int __init i8237A_init_sysfs(void)
65{ 51{
66 int error = sysdev_class_register(&i8237_sysdev_class); 52 register_syscore_ops(&i8237_syscore_ops);
67 if (!error) 53 return 0;
68 error = sysdev_register(&device_i8237A);
69 return error;
70} 54}
71device_initcall(i8237A_init_sysfs); 55device_initcall(i8237A_init_ops);
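Both this i8237 conversion and the i8259 one below replace a sysdev class
plus device with a plain table of callbacks handed to register_syscore_ops().
A stand-alone sketch of that registration pattern, with hypothetical names:

#include <stdio.h>

struct core_ops {
	void (*resume)(void);
	struct core_ops *next;
};

static struct core_ops *core_ops_list;

static void register_core_ops(struct core_ops *ops)
{
	ops->next = core_ops_list;
	core_ops_list = ops;
}

static void resume_all(void)	/* what a syscore resume pass would do */
{
	for (struct core_ops *ops = core_ops_list; ops; ops = ops->next)
		if (ops->resume)
			ops->resume();
}

static void dma_resume(void) { puts("reprogram the DMA controller"); }

static struct core_ops dma_ops = { .resume = dma_resume };

int main(void)
{
	register_core_ops(&dma_ops);
	resume_all();
	return 0;
}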
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 20757cb2efa3..65b8f5c2eebf 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -8,7 +8,7 @@
8#include <linux/random.h> 8#include <linux/random.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <linux/sysdev.h> 11#include <linux/syscore_ops.h>
12#include <linux/bitops.h> 12#include <linux/bitops.h>
13#include <linux/acpi.h> 13#include <linux/acpi.h>
14#include <linux/io.h> 14#include <linux/io.h>
@@ -112,7 +112,7 @@ static void make_8259A_irq(unsigned int irq)
112{ 112{
113 disable_irq_nosync(irq); 113 disable_irq_nosync(irq);
114 io_apic_irqs &= ~(1<<irq); 114 io_apic_irqs &= ~(1<<irq);
115 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, 115 irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
116 i8259A_chip.name); 116 i8259A_chip.name);
117 enable_irq(irq); 117 enable_irq(irq);
118} 118}
@@ -245,20 +245,19 @@ static void save_ELCR(char *trigger)
245 trigger[1] = inb(0x4d1) & 0xDE; 245 trigger[1] = inb(0x4d1) & 0xDE;
246} 246}
247 247
248static int i8259A_resume(struct sys_device *dev) 248static void i8259A_resume(void)
249{ 249{
250 init_8259A(i8259A_auto_eoi); 250 init_8259A(i8259A_auto_eoi);
251 restore_ELCR(irq_trigger); 251 restore_ELCR(irq_trigger);
252 return 0;
253} 252}
254 253
255static int i8259A_suspend(struct sys_device *dev, pm_message_t state) 254static int i8259A_suspend(void)
256{ 255{
257 save_ELCR(irq_trigger); 256 save_ELCR(irq_trigger);
258 return 0; 257 return 0;
259} 258}
260 259
261static int i8259A_shutdown(struct sys_device *dev) 260static void i8259A_shutdown(void)
262{ 261{
263 /* Put the i8259A into a quiescent state that 262 /* Put the i8259A into a quiescent state that
264 * the kernel initialization code can get it 263 * the kernel initialization code can get it
@@ -266,21 +265,14 @@ static int i8259A_shutdown(struct sys_device *dev)
266 */ 265 */
267 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 266 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
268 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ 267 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
269 return 0;
270} 268}
271 269
272static struct sysdev_class i8259_sysdev_class = { 270static struct syscore_ops i8259_syscore_ops = {
273 .name = "i8259",
274 .suspend = i8259A_suspend, 271 .suspend = i8259A_suspend,
275 .resume = i8259A_resume, 272 .resume = i8259A_resume,
276 .shutdown = i8259A_shutdown, 273 .shutdown = i8259A_shutdown,
277}; 274};
278 275
279static struct sys_device device_i8259A = {
280 .id = 0,
281 .cls = &i8259_sysdev_class,
282};
283
284static void mask_8259A(void) 276static void mask_8259A(void)
285{ 277{
286 unsigned long flags; 278 unsigned long flags;
@@ -399,17 +391,12 @@ struct legacy_pic default_legacy_pic = {
399 391
400struct legacy_pic *legacy_pic = &default_legacy_pic; 392struct legacy_pic *legacy_pic = &default_legacy_pic;
401 393
402static int __init i8259A_init_sysfs(void) 394static int __init i8259A_init_ops(void)
403{ 395{
404 int error; 396 if (legacy_pic == &default_legacy_pic)
405 397 register_syscore_ops(&i8259_syscore_ops);
406 if (legacy_pic != &default_legacy_pic)
407 return 0;
408 398
409 error = sysdev_class_register(&i8259_sysdev_class); 399 return 0;
410 if (!error)
411 error = sysdev_register(&device_i8259A);
412 return error;
413} 400}
414 401
415device_initcall(i8259A_init_sysfs); 402device_initcall(i8259A_init_ops);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8eec0ec59af2..8c968974253d 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,22 +14,9 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/bitmap.h>
17#include <asm/syscalls.h> 18#include <asm/syscalls.h>
18 19
19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
20static void set_bitmap(unsigned long *bitmap, unsigned int base,
21 unsigned int extent, int new_value)
22{
23 unsigned int i;
24
25 for (i = base; i < base + extent; i++) {
26 if (new_value)
27 __set_bit(i, bitmap);
28 else
29 __clear_bit(i, bitmap);
30 }
31}
32
33/* 20/*
34 * this changes the io permissions bitmap in the current task. 21 * this changes the io permissions bitmap in the current task.
35 */ 22 */
@@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
69 */ 56 */
70 tss = &per_cpu(init_tss, get_cpu()); 57 tss = &per_cpu(init_tss, get_cpu());
71 58
72 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); 59 if (turn_on)
60 bitmap_clear(t->io_bitmap_ptr, from, num);
61 else
62 bitmap_set(t->io_bitmap_ptr, from, num);
73 63
74 /* 64 /*
75 * Search for a (possibly new) maximum. This is simple and stupid, 65 * Search for a (possibly new) maximum. This is simple and stupid,
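The ioport.c hunk above replaces an open-coded loop with bitmap_clear() and
bitmap_set() over [from, from + num).  A small self-contained version of a
range set/clear helper on an array of unsigned longs (in sys_ioperm a
cleared bit means the port is permitted):

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static void bitmap_assign(unsigned long *map, unsigned int start,
			  unsigned int len, int set)
{
	for (unsigned int i = start; i < start + len; i++) {
		unsigned long mask = 1UL << (i % BITS_PER_LONG);

		if (set)
			map[i / BITS_PER_LONG] |= mask;
		else
			map[i / BITS_PER_LONG] &= ~mask;
	}
}

int main(void)
{
	unsigned long io_bitmap[4] = { 0 };	/* demo-sized, 1 bit per port */

	bitmap_assign(io_bitmap, 0x60, 4, 0);	/* allow ports 0x60-0x63 */
	bitmap_assign(io_bitmap, 0x70, 2, 1);	/* deny ports 0x70-0x71 */
	printf("word holding port 0x70: %#lx\n",
	       io_bitmap[0x70 / BITS_PER_LONG]);
	return 0;
}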
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 52945da52a94..1cb0b9fc78dc 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -8,6 +8,7 @@
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9#include <linux/smp.h> 9#include <linux/smp.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/delay.h>
11 12
12#include <asm/apic.h> 13#include <asm/apic.h>
13#include <asm/io_apic.h> 14#include <asm/io_apic.h>
@@ -44,9 +45,9 @@ void ack_bad_irq(unsigned int irq)
44 45
45#define irq_stats(x) (&per_cpu(irq_stat, x)) 46#define irq_stats(x) (&per_cpu(irq_stat, x))
46/* 47/*
47 * /proc/interrupts printing: 48 * /proc/interrupts printing for arch specific interrupts
48 */ 49 */
49static int show_other_interrupts(struct seq_file *p, int prec) 50int arch_show_interrupts(struct seq_file *p, int prec)
50{ 51{
51 int j; 52 int j;
52 53
@@ -122,59 +123,6 @@ static int show_other_interrupts(struct seq_file *p, int prec)
122 return 0; 123 return 0;
123} 124}
124 125
125int show_interrupts(struct seq_file *p, void *v)
126{
127 unsigned long flags, any_count = 0;
128 int i = *(loff_t *) v, j, prec;
129 struct irqaction *action;
130 struct irq_desc *desc;
131
132 if (i > nr_irqs)
133 return 0;
134
135 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
136 j *= 10;
137
138 if (i == nr_irqs)
139 return show_other_interrupts(p, prec);
140
141 /* print header */
142 if (i == 0) {
143 seq_printf(p, "%*s", prec + 8, "");
144 for_each_online_cpu(j)
145 seq_printf(p, "CPU%-8d", j);
146 seq_putc(p, '\n');
147 }
148
149 desc = irq_to_desc(i);
150 if (!desc)
151 return 0;
152
153 raw_spin_lock_irqsave(&desc->lock, flags);
154 for_each_online_cpu(j)
155 any_count |= kstat_irqs_cpu(i, j);
156 action = desc->action;
157 if (!action && !any_count)
158 goto out;
159
160 seq_printf(p, "%*d: ", prec, i);
161 for_each_online_cpu(j)
162 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
163 seq_printf(p, " %8s", desc->irq_data.chip->name);
164 seq_printf(p, "-%-8s", desc->name);
165
166 if (action) {
167 seq_printf(p, " %s", action->name);
168 while ((action = action->next) != NULL)
169 seq_printf(p, ", %s", action->name);
170 }
171
172 seq_putc(p, '\n');
173out:
174 raw_spin_unlock_irqrestore(&desc->lock, flags);
175 return 0;
176}
177
178/* 126/*
179 * /proc/stat helpers 127 * /proc/stat helpers
180 */ 128 */
@@ -276,15 +224,6 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
276 224
277EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 225EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
278 226
279#ifdef CONFIG_OF
280unsigned int irq_create_of_mapping(struct device_node *controller,
281 const u32 *intspec, unsigned int intsize)
282{
283 return intspec[0];
284}
285EXPORT_SYMBOL_GPL(irq_create_of_mapping);
286#endif
287
288#ifdef CONFIG_HOTPLUG_CPU 227#ifdef CONFIG_HOTPLUG_CPU
289/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 228/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
290void fixup_irqs(void) 229void fixup_irqs(void)
@@ -293,6 +232,7 @@ void fixup_irqs(void)
293 static int warned; 232 static int warned;
294 struct irq_desc *desc; 233 struct irq_desc *desc;
295 struct irq_data *data; 234 struct irq_data *data;
235 struct irq_chip *chip;
296 236
297 for_each_irq_desc(irq, desc) { 237 for_each_irq_desc(irq, desc) {
298 int break_affinity = 0; 238 int break_affinity = 0;
@@ -307,10 +247,10 @@ void fixup_irqs(void)
307 /* interrupt's are disabled at this point */ 247 /* interrupt's are disabled at this point */
308 raw_spin_lock(&desc->lock); 248 raw_spin_lock(&desc->lock);
309 249
310 data = &desc->irq_data; 250 data = irq_desc_get_irq_data(desc);
311 affinity = data->affinity; 251 affinity = data->affinity;
312 if (!irq_has_action(irq) || 252 if (!irq_has_action(irq) ||
313 cpumask_equal(affinity, cpu_online_mask)) { 253 cpumask_subset(affinity, cpu_online_mask)) {
314 raw_spin_unlock(&desc->lock); 254 raw_spin_unlock(&desc->lock);
315 continue; 255 continue;
316 } 256 }
@@ -327,16 +267,17 @@ void fixup_irqs(void)
327 affinity = cpu_all_mask; 267 affinity = cpu_all_mask;
328 } 268 }
329 269
330 if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask) 270 chip = irq_data_get_irq_chip(data);
331 data->chip->irq_mask(data); 271 if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
272 chip->irq_mask(data);
332 273
333 if (data->chip->irq_set_affinity) 274 if (chip->irq_set_affinity)
334 data->chip->irq_set_affinity(data, affinity, true); 275 chip->irq_set_affinity(data, affinity, true);
335 else if (!(warned++)) 276 else if (!(warned++))
336 set_affinity = 0; 277 set_affinity = 0;
337 278
338 if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask) 279 if (!irqd_can_move_in_process_context(data) && chip->irq_unmask)
339 data->chip->irq_unmask(data); 280 chip->irq_unmask(data);
340 281
341 raw_spin_unlock(&desc->lock); 282 raw_spin_unlock(&desc->lock);
342 283
@@ -367,10 +308,12 @@ void fixup_irqs(void)
367 if (irr & (1 << (vector % 32))) { 308 if (irr & (1 << (vector % 32))) {
368 irq = __this_cpu_read(vector_irq[vector]); 309 irq = __this_cpu_read(vector_irq[vector]);
369 310
370 data = irq_get_irq_data(irq); 311 desc = irq_to_desc(irq);
312 data = irq_desc_get_irq_data(desc);
313 chip = irq_data_get_irq_chip(data);
371 raw_spin_lock(&desc->lock); 314 raw_spin_lock(&desc->lock);
372 if (data->chip->irq_retrigger) 315 if (chip->irq_retrigger)
373 data->chip->irq_retrigger(data); 316 chip->irq_retrigger(data);
374 raw_spin_unlock(&desc->lock); 317 raw_spin_unlock(&desc->lock);
375 } 318 }
376 } 319 }
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 48ff6dcffa02..72090705a656 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -129,8 +129,7 @@ void __cpuinit irq_ctx_init(int cpu)
129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
130 THREAD_FLAGS, 130 THREAD_FLAGS,
131 THREAD_ORDER)); 131 THREAD_ORDER));
132 irqctx->tinfo.task = NULL; 132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
133 irqctx->tinfo.exec_domain = NULL;
134 irqctx->tinfo.cpu = cpu; 133 irqctx->tinfo.cpu = cpu;
135 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
136 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 135 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
@@ -140,10 +139,8 @@ void __cpuinit irq_ctx_init(int cpu)
140 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 139 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
141 THREAD_FLAGS, 140 THREAD_FLAGS,
142 THREAD_ORDER)); 141 THREAD_ORDER));
143 irqctx->tinfo.task = NULL; 142 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
144 irqctx->tinfo.exec_domain = NULL;
145 irqctx->tinfo.cpu = cpu; 143 irqctx->tinfo.cpu = cpu;
146 irqctx->tinfo.preempt_count = 0;
147 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 144 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
148 145
149 per_cpu(softirq_ctx, cpu) = irqctx; 146 per_cpu(softirq_ctx, cpu) = irqctx;
@@ -175,7 +172,7 @@ asmlinkage void do_softirq(void)
175 172
176 call_on_stack(__do_softirq, isp); 173 call_on_stack(__do_softirq, isp);
177 /* 174 /*
178 * Shouldnt happen, we returned above if in_interrupt(): 175 * Shouldn't happen, we returned above if in_interrupt():
179 */ 176 */
180 WARN_ON_ONCE(softirq_count()); 177 WARN_ON_ONCE(softirq_count());
181 } 178 }
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c752e973958d..f470e4ef993e 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -25,6 +25,7 @@
25#include <asm/setup.h> 25#include <asm/setup.h>
26#include <asm/i8259.h> 26#include <asm/i8259.h>
27#include <asm/traps.h> 27#include <asm/traps.h>
28#include <asm/prom.h>
28 29
29/* 30/*
30 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: 31 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -71,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
71static struct irqaction fpu_irq = { 72static struct irqaction fpu_irq = {
72 .handler = math_error_irq, 73 .handler = math_error_irq,
73 .name = "fpu", 74 .name = "fpu",
75 .flags = IRQF_NO_THREAD,
74}; 76};
75#endif 77#endif
76 78
@@ -80,6 +82,7 @@ static struct irqaction fpu_irq = {
80static struct irqaction irq2 = { 82static struct irqaction irq2 = {
81 .handler = no_action, 83 .handler = no_action,
82 .name = "cascade", 84 .name = "cascade",
85 .flags = IRQF_NO_THREAD,
83}; 86};
84 87
85DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 88DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@ -110,7 +113,7 @@ void __init init_ISA_irqs(void)
110 legacy_pic->init(0); 113 legacy_pic->init(0);
111 114
112 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) 115 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
113 set_irq_chip_and_handler_name(i, chip, handle_level_irq, name); 116 irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
114} 117}
115 118
116void __init init_IRQ(void) 119void __init init_IRQ(void)
@@ -118,6 +121,12 @@ void __init init_IRQ(void)
118 int i; 121 int i;
119 122
120 /* 123 /*
124 * We probably need a better place for this, but it works for
125 * now ...
126 */
127 x86_add_irq_domains();
128
129 /*
121 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. 130 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
122 * If these IRQ's are handled by legacy interrupt-controllers like PIC, 131 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
123 * then this configuration will likely be static after the boot. If 132 * then this configuration will likely be static after the boot. If
@@ -164,14 +173,77 @@ static void __init smp_intr_init(void)
164 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 173 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
165 174
166 /* IPIs for invalidation */ 175 /* IPIs for invalidation */
167 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); 176#define ALLOC_INVTLB_VEC(NR) \
168 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); 177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
169 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); 178 invalidate_interrupt##NR)
170 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); 179
171 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); 180 switch (NUM_INVALIDATE_TLB_VECTORS) {
172 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); 181 default:
173 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); 182 ALLOC_INVTLB_VEC(31);
174 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); 183 case 31:
184 ALLOC_INVTLB_VEC(30);
185 case 30:
186 ALLOC_INVTLB_VEC(29);
187 case 29:
188 ALLOC_INVTLB_VEC(28);
189 case 28:
190 ALLOC_INVTLB_VEC(27);
191 case 27:
192 ALLOC_INVTLB_VEC(26);
193 case 26:
194 ALLOC_INVTLB_VEC(25);
195 case 25:
196 ALLOC_INVTLB_VEC(24);
197 case 24:
198 ALLOC_INVTLB_VEC(23);
199 case 23:
200 ALLOC_INVTLB_VEC(22);
201 case 22:
202 ALLOC_INVTLB_VEC(21);
203 case 21:
204 ALLOC_INVTLB_VEC(20);
205 case 20:
206 ALLOC_INVTLB_VEC(19);
207 case 19:
208 ALLOC_INVTLB_VEC(18);
209 case 18:
210 ALLOC_INVTLB_VEC(17);
211 case 17:
212 ALLOC_INVTLB_VEC(16);
213 case 16:
214 ALLOC_INVTLB_VEC(15);
215 case 15:
216 ALLOC_INVTLB_VEC(14);
217 case 14:
218 ALLOC_INVTLB_VEC(13);
219 case 13:
220 ALLOC_INVTLB_VEC(12);
221 case 12:
222 ALLOC_INVTLB_VEC(11);
223 case 11:
224 ALLOC_INVTLB_VEC(10);
225 case 10:
226 ALLOC_INVTLB_VEC(9);
227 case 9:
228 ALLOC_INVTLB_VEC(8);
229 case 8:
230 ALLOC_INVTLB_VEC(7);
231 case 7:
232 ALLOC_INVTLB_VEC(6);
233 case 6:
234 ALLOC_INVTLB_VEC(5);
235 case 5:
236 ALLOC_INVTLB_VEC(4);
237 case 4:
238 ALLOC_INVTLB_VEC(3);
239 case 3:
240 ALLOC_INVTLB_VEC(2);
241 case 2:
242 ALLOC_INVTLB_VEC(1);
243 case 1:
244 ALLOC_INVTLB_VEC(0);
245 break;
246 }
175 247
176 /* IPI for generic function call */ 248 /* IPI for generic function call */
177 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 249 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -243,7 +315,7 @@ void __init native_init_IRQ(void)
243 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); 315 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
244 } 316 }
245 317
246 if (!acpi_ioapic) 318 if (!acpi_ioapic && !of_ioapic)
247 setup_irq(2, &irq2); 319 setup_irq(2, &irq2);
248 320
249#ifdef CONFIG_X86_32 321#ifdef CONFIG_X86_32
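The smp_intr_init() rework above uses a switch whose cases deliberately fall
through, so that exactly NUM_INVALIDATE_TLB_VECTORS gates get registered,
counting down from the highest index.  The same shape in miniature, with a
stand-in allocation function:

#include <stdio.h>

#define NUM_VECTORS 3	/* stand-in for NUM_INVALIDATE_TLB_VECTORS */

static void alloc_vec(int nr)
{
	printf("allocating invalidate vector %d\n", nr);
}

int main(void)
{
	switch (NUM_VECTORS) {	/* every case falls through to the next */
	default:
		alloc_vec(3);
	case 3:
		alloc_vec(2);
	case 2:
		alloc_vec(1);
	case 1:
		alloc_vec(0);
		break;
	}
	return 0;
}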
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index a4130005028a..5f9ecff328b5 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -121,8 +121,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
121 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, 121 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
122 dbg_reg_def[regno].size); 122 dbg_reg_def[regno].size);
123 123
124 switch (regno) {
125#ifdef CONFIG_X86_32 124#ifdef CONFIG_X86_32
125 switch (regno) {
126 case GDB_SS: 126 case GDB_SS:
127 if (!user_mode_vm(regs)) 127 if (!user_mode_vm(regs))
128 *(unsigned long *)mem = __KERNEL_DS; 128 *(unsigned long *)mem = __KERNEL_DS;
@@ -135,8 +135,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
135 case GDB_FS: 135 case GDB_FS:
136 *(unsigned long *)mem = 0xFFFF; 136 *(unsigned long *)mem = 0xFFFF;
137 break; 137 break;
138#endif
139 } 138 }
139#endif
140 return dbg_reg_def[regno].name; 140 return dbg_reg_def[regno].name;
141} 141}
142 142
@@ -278,7 +278,7 @@ static int hw_break_release_slot(int breakno)
278 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); 278 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
279 if (dbg_release_bp_slot(*pevent)) 279 if (dbg_release_bp_slot(*pevent))
280 /* 280 /*
281 * The debugger is responisble for handing the retry on 281 * The debugger is responsible for handing the retry on
282 * remove failure. 282 * remove failure.
283 */ 283 */
284 return -1; 284 return -1;
@@ -533,15 +533,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
533 } 533 }
534 return NOTIFY_DONE; 534 return NOTIFY_DONE;
535 535
536 case DIE_NMIWATCHDOG:
537 if (atomic_read(&kgdb_active) != -1) {
538 /* KGDB CPU roundup: */
539 kgdb_nmicallback(raw_smp_processor_id(), regs);
540 return NOTIFY_STOP;
541 }
542 /* Enter debugger: */
543 break;
544
545 case DIE_DEBUG: 536 case DIE_DEBUG:
546 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { 537 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
547 if (user_mode(regs)) 538 if (user_mode(regs))
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index d91c477b3f62..c969fd9d1566 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
1276 if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) 1276 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1277 return 0; 1277 return 0;
1278 1278
1279 /*
1280 * Do not optimize in the entry code due to the unstable
1281 * stack handling.
1282 */
1283 if ((paddr >= (unsigned long )__entry_text_start) &&
1284 (paddr < (unsigned long )__entry_text_end))
1285 return 0;
1286
1279 /* Check there is enough space for a relative jump. */ 1287 /* Check there is enough space for a relative jump. */
1280 if (size - offset < RELATIVEJUMP_SIZE) 1288 if (size - offset < RELATIVEJUMP_SIZE)
1281 return 0; 1289 return 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8dc44662394b..33c07b0b122e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -493,7 +493,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
 	native_smp_prepare_boot_cpu();
 }
 
-static void kvm_guest_cpu_online(void *dummy)
+static void __cpuinit kvm_guest_cpu_online(void *dummy)
 {
 	kvm_guest_cpu_init();
 }
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 63eaf6596233..177183cbb6ae 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -259,7 +259,7 @@ static int __init mca_init(void)
 	/*
 	 * WARNING: Be careful when making changes here. Putting an adapter
 	 * and the motherboard simultaneously into setup mode may result in
-	 * damage to chips (according to The Indispensible PC Hardware Book
+	 * damage to chips (according to The Indispensable PC Hardware Book
 	 * by Hans-Peter Messmer). Also, we disable system interrupts (so
 	 * that we are not disturbed in the middle of this).
 	 */
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 0fe6d1a66c38..c5610384ab16 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,7 +66,6 @@ struct microcode_amd {
66 unsigned int mpb[0]; 66 unsigned int mpb[0];
67}; 67};
68 68
69#define UCODE_MAX_SIZE 2048
70#define UCODE_CONTAINER_SECTION_HDR 8 69#define UCODE_CONTAINER_SECTION_HDR 8
71#define UCODE_CONTAINER_HEADER_SIZE 12 70#define UCODE_CONTAINER_HEADER_SIZE 12
72 71
@@ -77,20 +76,20 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
77 struct cpuinfo_x86 *c = &cpu_data(cpu); 76 struct cpuinfo_x86 *c = &cpu_data(cpu);
78 u32 dummy; 77 u32 dummy;
79 78
80 memset(csig, 0, sizeof(*csig));
81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 79 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " 80 pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
83 "supported\n", cpu, c->x86);
84 return -1; 81 return -1;
85 } 82 }
83
86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 84 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); 85 pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
86
88 return 0; 87 return 0;
89} 88}
90 89
91static int get_matching_microcode(int cpu, void *mc, int rev) 90static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
91 int rev)
92{ 92{
93 struct microcode_header_amd *mc_header = mc;
94 unsigned int current_cpu_id; 93 unsigned int current_cpu_id;
95 u16 equiv_cpu_id = 0; 94 u16 equiv_cpu_id = 0;
96 unsigned int i = 0; 95 unsigned int i = 0;
@@ -109,17 +108,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
109 if (!equiv_cpu_id) 108 if (!equiv_cpu_id)
110 return 0; 109 return 0;
111 110
112 if (mc_header->processor_rev_id != equiv_cpu_id) 111 if (mc_hdr->processor_rev_id != equiv_cpu_id)
113 return 0; 112 return 0;
114 113
115 /* ucode might be chipset specific -- currently we don't support this */ 114 /* ucode might be chipset specific -- currently we don't support this */
116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 115 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
117 pr_err("CPU%d: loading of chipset specific code not yet supported\n", 116 pr_err("CPU%d: chipset specific code not yet supported\n",
118 cpu); 117 cpu);
119 return 0; 118 return 0;
120 } 119 }
121 120
122 if (mc_header->patch_id <= rev) 121 if (mc_hdr->patch_id <= rev)
123 return 0; 122 return 0;
124 123
125 return 1; 124 return 1;
@@ -144,71 +143,93 @@ static int apply_microcode_amd(int cpu)
144 143
145 /* check current patch id and patch's id for match */ 144 /* check current patch id and patch's id for match */
146 if (rev != mc_amd->hdr.patch_id) { 145 if (rev != mc_amd->hdr.patch_id) {
147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n", 146 pr_err("CPU%d: update failed for patch_level=0x%08x\n",
148 cpu, mc_amd->hdr.patch_id); 147 cpu, mc_amd->hdr.patch_id);
149 return -1; 148 return -1;
150 } 149 }
151 150
152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); 151 pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
153 uci->cpu_sig.rev = rev; 152 uci->cpu_sig.rev = rev;
154 153
155 return 0; 154 return 0;
156} 155}
157 156
158static void * 157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
159get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
160{ 158{
161 unsigned int total_size; 159 struct cpuinfo_x86 *c = &cpu_data(cpu);
162 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 160 unsigned int max_size, actual_size;
163 void *mc; 161
162#define F1XH_MPB_MAX_SIZE 2048
163#define F14H_MPB_MAX_SIZE 1824
164#define F15H_MPB_MAX_SIZE 4096
165
166 switch (c->x86) {
167 case 0x14:
168 max_size = F14H_MPB_MAX_SIZE;
169 break;
170 case 0x15:
171 max_size = F15H_MPB_MAX_SIZE;
172 break;
173 default:
174 max_size = F1XH_MPB_MAX_SIZE;
175 break;
176 }
164 177
165 get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR); 178 actual_size = buf[4] + (buf[5] << 8);
166 179
167 if (section_hdr[0] != UCODE_UCODE_TYPE) { 180 if (actual_size > size || actual_size > max_size) {
168 pr_err("error: invalid type field in container file section header\n"); 181 pr_err("section size mismatch\n");
169 return NULL; 182 return 0;
170 } 183 }
171 184
172 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 185 return actual_size;
186}
173 187
174 if (total_size > size || total_size > UCODE_MAX_SIZE) { 188static struct microcode_header_amd *
175 pr_err("error: size mismatch\n"); 189get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
176 return NULL; 190{
191 struct microcode_header_amd *mc = NULL;
192 unsigned int actual_size = 0;
193
194 if (buf[0] != UCODE_UCODE_TYPE) {
195 pr_err("invalid type field in container file section header\n");
196 goto out;
177 } 197 }
178 198
179 mc = vzalloc(UCODE_MAX_SIZE); 199 actual_size = verify_ucode_size(cpu, buf, size);
200 if (!actual_size)
201 goto out;
202
203 mc = vzalloc(actual_size);
180 if (!mc) 204 if (!mc)
181 return NULL; 205 goto out;
182 206
183 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size); 207 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
184 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; 208 *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
185 209
210out:
186 return mc; 211 return mc;
187} 212}
188 213
189static int install_equiv_cpu_table(const u8 *buf) 214static int install_equiv_cpu_table(const u8 *buf)
190{ 215{
191 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 216 unsigned int *ibuf = (unsigned int *)buf;
192 unsigned int *buf_pos = (unsigned int *)container_hdr; 217 unsigned int type = ibuf[1];
193 unsigned long size; 218 unsigned int size = ibuf[2];
194 219
195 get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE); 220 if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
196 221 pr_err("empty section/"
197 size = buf_pos[2]; 222 "invalid type field in container file section header\n");
198 223 return -EINVAL;
199 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
200 pr_err("error: invalid type field in container file section header\n");
201 return 0;
202 } 224 }
203 225
204 equiv_cpu_table = vmalloc(size); 226 equiv_cpu_table = vmalloc(size);
205 if (!equiv_cpu_table) { 227 if (!equiv_cpu_table) {
206 pr_err("failed to allocate equivalent CPU table\n"); 228 pr_err("failed to allocate equivalent CPU table\n");
207 return 0; 229 return -ENOMEM;
208 } 230 }
209 231
210 buf += UCODE_CONTAINER_HEADER_SIZE; 232 get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
211 get_ucode_data(equiv_cpu_table, buf, size);
212 233
213 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 234 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
214} 235}
@@ -223,16 +244,16 @@ static enum ucode_state
223generic_load_microcode(int cpu, const u8 *data, size_t size) 244generic_load_microcode(int cpu, const u8 *data, size_t size)
224{ 245{
225 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 246 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
247 struct microcode_header_amd *mc_hdr = NULL;
248 unsigned int mc_size, leftover;
249 int offset;
226 const u8 *ucode_ptr = data; 250 const u8 *ucode_ptr = data;
227 void *new_mc = NULL; 251 void *new_mc = NULL;
228 void *mc; 252 unsigned int new_rev = uci->cpu_sig.rev;
229 int new_rev = uci->cpu_sig.rev;
230 unsigned int leftover;
231 unsigned long offset;
232 enum ucode_state state = UCODE_OK; 253 enum ucode_state state = UCODE_OK;
233 254
234 offset = install_equiv_cpu_table(ucode_ptr); 255 offset = install_equiv_cpu_table(ucode_ptr);
235 if (!offset) { 256 if (offset < 0) {
236 pr_err("failed to create equivalent cpu table\n"); 257 pr_err("failed to create equivalent cpu table\n");
237 return UCODE_ERROR; 258 return UCODE_ERROR;
238 } 259 }
@@ -241,64 +262,65 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
241 leftover = size - offset; 262 leftover = size - offset;
242 263
243 while (leftover) { 264 while (leftover) {
244 unsigned int uninitialized_var(mc_size); 265 mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
245 struct microcode_header_amd *mc_header; 266 if (!mc_hdr)
246
247 mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
248 if (!mc)
249 break; 267 break;
250 268
251 mc_header = (struct microcode_header_amd *)mc; 269 if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
252 if (get_matching_microcode(cpu, mc, new_rev)) {
253 vfree(new_mc); 270 vfree(new_mc);
254 new_rev = mc_header->patch_id; 271 new_rev = mc_hdr->patch_id;
255 new_mc = mc; 272 new_mc = mc_hdr;
256 } else 273 } else
257 vfree(mc); 274 vfree(mc_hdr);
258 275
259 ucode_ptr += mc_size; 276 ucode_ptr += mc_size;
260 leftover -= mc_size; 277 leftover -= mc_size;
261 } 278 }
262 279
263 if (new_mc) { 280 if (!new_mc) {
264 if (!leftover) {
265 vfree(uci->mc);
266 uci->mc = new_mc;
267 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
268 cpu, new_rev, uci->cpu_sig.rev);
269 } else {
270 vfree(new_mc);
271 state = UCODE_ERROR;
272 }
273 } else
274 state = UCODE_NFOUND; 281 state = UCODE_NFOUND;
282 goto free_table;
283 }
275 284
285 if (!leftover) {
286 vfree(uci->mc);
287 uci->mc = new_mc;
288 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
289 cpu, uci->cpu_sig.rev, new_rev);
290 } else {
291 vfree(new_mc);
292 state = UCODE_ERROR;
293 }
294
295free_table:
276 free_equiv_cpu_table(); 296 free_equiv_cpu_table();
277 297
278 return state; 298 return state;
279} 299}
280 300
281static enum ucode_state request_microcode_fw(int cpu, struct device *device) 301static enum ucode_state request_microcode_amd(int cpu, struct device *device)
282{ 302{
283 const char *fw_name = "amd-ucode/microcode_amd.bin"; 303 const char *fw_name = "amd-ucode/microcode_amd.bin";
284 const struct firmware *firmware; 304 const struct firmware *fw;
285 enum ucode_state ret; 305 enum ucode_state ret = UCODE_NFOUND;
286 306
287 if (request_firmware(&firmware, fw_name, device)) { 307 if (request_firmware(&fw, fw_name, device)) {
288 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 308 pr_err("failed to load file %s\n", fw_name);
289 return UCODE_NFOUND; 309 goto out;
290 } 310 }
291 311
292 if (*(u32 *)firmware->data != UCODE_MAGIC) { 312 ret = UCODE_ERROR;
293 pr_err("invalid UCODE_MAGIC (0x%08x)\n", 313 if (*(u32 *)fw->data != UCODE_MAGIC) {
294 *(u32 *)firmware->data); 314 pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
295 return UCODE_ERROR; 315 goto fw_release;
296 } 316 }
297 317
298 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 318 ret = generic_load_microcode(cpu, fw->data, fw->size);
299 319
300 release_firmware(firmware); 320fw_release:
321 release_firmware(fw);
301 322
323out:
302 return ret; 324 return ret;
303} 325}
304 326
@@ -319,7 +341,7 @@ static void microcode_fini_cpu_amd(int cpu)
319 341
320static struct microcode_ops microcode_amd_ops = { 342static struct microcode_ops microcode_amd_ops = {
321 .request_microcode_user = request_microcode_user, 343 .request_microcode_user = request_microcode_user,
322 .request_microcode_fw = request_microcode_fw, 344 .request_microcode_fw = request_microcode_amd,
323 .collect_cpu_info = collect_cpu_info_amd, 345 .collect_cpu_info = collect_cpu_info_amd,
324 .apply_microcode = apply_microcode_amd, 346 .apply_microcode = apply_microcode_amd,
325 .microcode_fini_cpu = microcode_fini_cpu_amd, 347 .microcode_fini_cpu = microcode_fini_cpu_amd,
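As context for the hunks above: the section header of an AMD microcode container stores the patch payload size as a little-endian 16-bit value at bytes 4-5, and the new verify_ucode_size() bounds that value by a per-CPU-family maximum instead of the old fixed UCODE_MAX_SIZE. A minimal standalone sketch of the same check (the helper name and parameters here are illustrative, not part of the patch):

/* Read the LE16 payload size from a section header and bound it per family. */
static unsigned int ucode_payload_size(const unsigned char *sec_hdr,
				       unsigned int bytes_left,
				       unsigned int family)
{
	unsigned int size = sec_hdr[4] | (sec_hdr[5] << 8);
	unsigned int max;

	switch (family) {
	case 0x14: max = 1824; break;	/* F14h */
	case 0x15: max = 4096; break;	/* F15h */
	default:   max = 2048; break;	/* other F1xh parts */
	}

	/* Reject sections that claim more data than is present or allowed. */
	if (size > bytes_left || size > max)
		return 0;
	return size;
}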
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 1cca374a2bac..f9242800bc84 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -82,6 +82,7 @@
82#include <linux/cpu.h> 82#include <linux/cpu.h>
83#include <linux/fs.h> 83#include <linux/fs.h>
84#include <linux/mm.h> 84#include <linux/mm.h>
85#include <linux/syscore_ops.h>
85 86
86#include <asm/microcode.h> 87#include <asm/microcode.h>
87#include <asm/processor.h> 88#include <asm/processor.h>
@@ -417,8 +418,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
417 if (err) 418 if (err)
418 return err; 419 return err;
419 420
420 if (microcode_init_cpu(cpu) == UCODE_ERROR) 421 if (microcode_init_cpu(cpu) == UCODE_ERROR) {
421 err = -EINVAL; 422 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
423 return -EINVAL;
424 }
422 425
423 return err; 426 return err;
424} 427}
@@ -436,33 +439,25 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
436 return 0; 439 return 0;
437} 440}
438 441
439static int mc_sysdev_resume(struct sys_device *dev) 442static struct sysdev_driver mc_sysdev_driver = {
443 .add = mc_sysdev_add,
444 .remove = mc_sysdev_remove,
445};
446
447/**
448 * mc_bp_resume - Update boot CPU microcode during resume.
449 */
450static void mc_bp_resume(void)
440{ 451{
441 int cpu = dev->id; 452 int cpu = smp_processor_id();
442 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 453 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
443 454
444 if (!cpu_online(cpu))
445 return 0;
446
447 /*
448 * All non-bootup cpus are still disabled,
449 * so only CPU 0 will apply ucode here.
450 *
451 * Moreover, there can be no concurrent
452 * updates from any other places at this point.
453 */
454 WARN_ON(cpu != 0);
455
456 if (uci->valid && uci->mc) 455 if (uci->valid && uci->mc)
457 microcode_ops->apply_microcode(cpu); 456 microcode_ops->apply_microcode(cpu);
458
459 return 0;
460} 457}
461 458
462static struct sysdev_driver mc_sysdev_driver = { 459static struct syscore_ops mc_syscore_ops = {
463 .add = mc_sysdev_add, 460 .resume = mc_bp_resume,
464 .remove = mc_sysdev_remove,
465 .resume = mc_sysdev_resume,
466}; 461};
467 462
468static __cpuinit int 463static __cpuinit int
@@ -540,6 +535,7 @@ static int __init microcode_init(void)
540 if (error) 535 if (error)
541 return error; 536 return error;
542 537
538 register_syscore_ops(&mc_syscore_ops);
543 register_hotcpu_notifier(&mc_cpu_notifier); 539 register_hotcpu_notifier(&mc_cpu_notifier);
544 540
545 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 541 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
@@ -554,6 +550,7 @@ static void __exit microcode_exit(void)
554 microcode_dev_exit(); 550 microcode_dev_exit();
555 551
556 unregister_hotcpu_notifier(&mc_cpu_notifier); 552 unregister_hotcpu_notifier(&mc_cpu_notifier);
553 unregister_syscore_ops(&mc_syscore_ops);
557 554
558 get_online_cpus(); 555 get_online_cpus();
559 mutex_lock(&microcode_mutex); 556 mutex_lock(&microcode_mutex);
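The driver's resume path moves from the per-device sysdev callback to a syscore_ops hook, which runs once on the boot CPU with interrupts disabled, so the old cpu_online()/WARN_ON(cpu != 0) checks become redundant. A hedged sketch of the same registration pattern, with placeholder names (the pci-gart_64.c hunks further down make the identical sysdev-to-syscore conversion):

#include <linux/syscore_ops.h>

static void my_resume(void)
{
	/* Runs on the boot CPU, IRQs off, before secondary CPUs are brought up. */
}

static struct syscore_ops my_syscore_ops = {
	.resume = my_resume,
};

static int __init my_init(void)
{
	register_syscore_ops(&my_syscore_ops);	/* returns void; nothing to check */
	return 0;
}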
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 01b0f6d06451..5a532ce646bf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -714,10 +714,6 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
714 *nr_m_spare += 1; 714 *nr_m_spare += 1;
715 } 715 }
716} 716}
717#else /* CONFIG_X86_IO_APIC */
718static
719inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
720#endif /* CONFIG_X86_IO_APIC */
721 717
722static int 718static int
723check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) 719check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
@@ -731,6 +727,10 @@ check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
731 727
732 return ret; 728 return ret;
733} 729}
730#else /* CONFIG_X86_IO_APIC */
731static
732inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
733#endif /* CONFIG_X86_IO_APIC */
734 734
735static int __init replace_intsrc_all(struct mpc_table *mpc, 735static int __init replace_intsrc_all(struct mpc_table *mpc,
736 unsigned long mpc_new_phys, 736 unsigned long mpc_new_phys,
@@ -883,7 +883,7 @@ static int __init update_mp_table(void)
883 883
884 if (!mpc_new_phys) { 884 if (!mpc_new_phys) {
885 unsigned char old, new; 885 unsigned char old, new;
886 /* check if we can change the postion */ 886 /* check if we can change the position */
887 mpc->checksum = 0; 887 mpc->checksum = 0;
888 old = mpf_checksum((unsigned char *)mpc, mpc->length); 888 old = mpf_checksum((unsigned char *)mpc, mpc->length);
889 mpc->checksum = 0xff; 889 mpc->checksum = 0xff;
@@ -892,7 +892,7 @@ static int __init update_mp_table(void)
892 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); 892 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
893 return 0; 893 return 0;
894 } 894 }
895 printk(KERN_INFO "use in-positon replacing\n"); 895 printk(KERN_INFO "use in-position replacing\n");
896 } else { 896 } else {
897 mpf->physptr = mpc_new_phys; 897 mpf->physptr = mpc_new_phys;
898 mpc_new = phys_to_virt(mpc_new_phys); 898 mpc_new = phys_to_virt(mpc_new_phys);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index f56a117cef68..e8c33a302006 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1279,7 +1279,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1279 1279
1280 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { 1280 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
1281 /* 1281 /*
1282 * FIXME: properly scan for devices accross the 1282 * FIXME: properly scan for devices across the
1283 * PCI-to-PCI bridge on every CalIOC2 port. 1283 * PCI-to-PCI bridge on every CalIOC2 port.
1284 */ 1284 */
1285 return 1; 1285 return 1;
@@ -1295,7 +1295,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1295 1295
1296/* 1296/*
1297 * calgary_init_bitmap_from_tce_table(): 1297 * calgary_init_bitmap_from_tce_table():
1298 * Funtion for kdump case. In the second/kdump kernel initialize 1298 * Function for kdump case. In the second/kdump kernel initialize
1299 * the bitmap based on the tce table entries obtained from first kernel 1299 * the bitmap based on the tce table entries obtained from first kernel
1300 */ 1300 */
1301static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) 1301static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c01ffa5b9b87..b117efd24f71 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,7 +27,7 @@
27#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
30#include <linux/sysdev.h> 30#include <linux/syscore_ops.h>
31#include <linux/io.h> 31#include <linux/io.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <asm/atomic.h> 33#include <asm/atomic.h>
@@ -81,6 +81,9 @@ static u32 gart_unmapped_entry;
81#define AGPEXTERN 81#define AGPEXTERN
82#endif 82#endif
83 83
84/* GART can only remap to physical addresses < 1TB */
85#define GART_MAX_PHYS_ADDR (1ULL << 40)
86
84/* backdoor interface to AGP driver */ 87/* backdoor interface to AGP driver */
85AGPEXTERN int agp_memory_reserved; 88AGPEXTERN int agp_memory_reserved;
86AGPEXTERN __u32 *agp_gatt_table; 89AGPEXTERN __u32 *agp_gatt_table;
@@ -212,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
212 size_t size, int dir, unsigned long align_mask) 215 size_t size, int dir, unsigned long align_mask)
213{ 216{
214 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); 217 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
215 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); 218 unsigned long iommu_page;
216 int i; 219 int i;
217 220
221 if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
222 return bad_dma_addr;
223
224 iommu_page = alloc_iommu(dev, npages, align_mask);
218 if (iommu_page == -1) { 225 if (iommu_page == -1) {
219 if (!nonforced_iommu(dev, phys_mem, size)) 226 if (!nonforced_iommu(dev, phys_mem, size))
220 return phys_mem; 227 return phys_mem;
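The new bound is 1ULL << 40 = 0x10000000000, i.e. 1 TiB: the GART aperture cannot remap physical addresses at or above that, so any request whose end would cross the limit is refused before touching the aperture allocator. A small sketch of the same arithmetic (illustrative helper name):

/* True when [phys, phys + size) stays below the 1 TiB GART limit. */
static inline int gart_phys_ok(unsigned long long phys, unsigned long long size)
{
	return phys + size <= (1ULL << 40);
}

For example, a 2-byte mapping starting at 0xffffffffff fails this test, which is exactly the case the added check turns into bad_dma_addr.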
@@ -589,7 +596,7 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
589 aperture_alloc = aper_alloc; 596 aperture_alloc = aper_alloc;
590} 597}
591 598
592static void gart_fixup_northbridges(struct sys_device *dev) 599static void gart_fixup_northbridges(void)
593{ 600{
594 int i; 601 int i;
595 602
@@ -613,33 +620,20 @@ static void gart_fixup_northbridges(struct sys_device *dev)
613 } 620 }
614} 621}
615 622
616static int gart_resume(struct sys_device *dev) 623static void gart_resume(void)
617{ 624{
618 pr_info("PCI-DMA: Resuming GART IOMMU\n"); 625 pr_info("PCI-DMA: Resuming GART IOMMU\n");
619 626
620 gart_fixup_northbridges(dev); 627 gart_fixup_northbridges();
621 628
622 enable_gart_translations(); 629 enable_gart_translations();
623
624 return 0;
625} 630}
626 631
627static int gart_suspend(struct sys_device *dev, pm_message_t state) 632static struct syscore_ops gart_syscore_ops = {
628{
629 return 0;
630}
631
632static struct sysdev_class gart_sysdev_class = {
633 .name = "gart",
634 .suspend = gart_suspend,
635 .resume = gart_resume, 633 .resume = gart_resume,
636 634
637}; 635};
638 636
639static struct sys_device device_gart = {
640 .cls = &gart_sysdev_class,
641};
642
643/* 637/*
644 * Private Northbridge GATT initialization in case we cannot use the 638 * Private Northbridge GATT initialization in case we cannot use the
645 * AGP driver for some reason. 639 * AGP driver for some reason.
@@ -650,7 +644,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
650 unsigned aper_base, new_aper_base; 644 unsigned aper_base, new_aper_base;
651 struct pci_dev *dev; 645 struct pci_dev *dev;
652 void *gatt; 646 void *gatt;
653 int i, error; 647 int i;
654 648
655 pr_info("PCI-DMA: Disabling AGP.\n"); 649 pr_info("PCI-DMA: Disabling AGP.\n");
656 650
@@ -685,12 +679,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
685 679
686 agp_gatt_table = gatt; 680 agp_gatt_table = gatt;
687 681
688 error = sysdev_class_register(&gart_sysdev_class); 682 register_syscore_ops(&gart_syscore_ops);
689 if (!error)
690 error = sysdev_register(&device_gart);
691 if (error)
692 panic("Could not register gart_sysdev -- "
693 "would corrupt data on next suspend");
694 683
695 flush_gart(); 684 flush_gart();
696 685
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d8286ed54ffa..d46cbe46b7ab 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h> 16#include <linux/hw_breakpoint.h>
17#include <asm/cpu.h>
17#include <asm/system.h> 18#include <asm/system.h>
18#include <asm/apic.h> 19#include <asm/apic.h>
19#include <asm/syscalls.h> 20#include <asm/syscalls.h>
@@ -86,26 +87,33 @@ void exit_thread(void)
86void show_regs(struct pt_regs *regs) 87void show_regs(struct pt_regs *regs)
87{ 88{
88 show_registers(regs); 89 show_registers(regs);
89 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs)); 90 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
90} 91}
91 92
92void show_regs_common(void) 93void show_regs_common(void)
93{ 94{
94 const char *board, *product; 95 const char *vendor, *product, *board;
95 96
96 board = dmi_get_system_info(DMI_BOARD_NAME); 97 vendor = dmi_get_system_info(DMI_SYS_VENDOR);
97 if (!board) 98 if (!vendor)
98 board = ""; 99 vendor = "";
99 product = dmi_get_system_info(DMI_PRODUCT_NAME); 100 product = dmi_get_system_info(DMI_PRODUCT_NAME);
100 if (!product) 101 if (!product)
101 product = ""; 102 product = "";
102 103
104 /* Board Name is optional */
105 board = dmi_get_system_info(DMI_BOARD_NAME);
106
103 printk(KERN_CONT "\n"); 107 printk(KERN_CONT "\n");
104 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", 108 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s",
105 current->pid, current->comm, print_tainted(), 109 current->pid, current->comm, print_tainted(),
106 init_utsname()->release, 110 init_utsname()->release,
107 (int)strcspn(init_utsname()->version, " "), 111 (int)strcspn(init_utsname()->version, " "),
108 init_utsname()->version, board, product); 112 init_utsname()->version);
113 printk(KERN_CONT " %s %s", vendor, product);
114 if (board)
115 printk(KERN_CONT "/%s", board);
116 printk(KERN_CONT "\n");
109} 117}
110 118
111void flush_thread(void) 119void flush_thread(void)
@@ -505,7 +513,7 @@ static void poll_idle(void)
505#define MWAIT_ECX_EXTENDED_INFO 0x01 513#define MWAIT_ECX_EXTENDED_INFO 0x01
506#define MWAIT_EDX_C1 0xf0 514#define MWAIT_EDX_C1 0xf0
507 515
508static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) 516int mwait_usable(const struct cpuinfo_x86 *c)
509{ 517{
510 u32 eax, ebx, ecx, edx; 518 u32 eax, ebx, ecx, edx;
511 519
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bd387e8f73b4..6c9dd922ac0d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -501,6 +501,10 @@ void set_personality_64bit(void)
501 /* Make sure to be in 64bit mode */ 501 /* Make sure to be in 64bit mode */
502 clear_thread_flag(TIF_IA32); 502 clear_thread_flag(TIF_IA32);
503 503
504 /* Ensure the corresponding mm is not marked. */
505 if (current->mm)
506 current->mm->context.ia32_compat = 0;
507
504 /* TBD: overwrites user setup. Should have two bits. 508 /* TBD: overwrites user setup. Should have two bits.
505 But 64bit processes have always behaved this way, 509 But 64bit processes have always behaved this way,
506 so it's not too bad. The main problem is just that 510 so it's not too bad. The main problem is just that
@@ -516,6 +520,10 @@ void set_personality_ia32(void)
516 set_thread_flag(TIF_IA32); 520 set_thread_flag(TIF_IA32);
517 current->personality |= force_personality32; 521 current->personality |= force_personality32;
518 522
523 /* Mark the associated mm as containing 32-bit tasks. */
524 if (current->mm)
525 current->mm->context.ia32_compat = 1;
526
519 /* Prepare the first "return" to user space */ 527 /* Prepare the first "return" to user space */
520 current_thread_info()->status |= TS_COMPAT; 528 current_thread_info()->status |= TS_COMPAT;
521} 529}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index fc7aae1e2bc7..08c44b08bf5b 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -6,6 +6,7 @@
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/tboot.h> 8#include <linux/tboot.h>
9#include <linux/delay.h>
9#include <acpi/reboot.h> 10#include <acpi/reboot.h>
10#include <asm/io.h> 11#include <asm/io.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
@@ -285,6 +286,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
285 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 286 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
286 }, 287 },
287 }, 288 },
289 { /* Handle problems with rebooting on VersaLogic Menlow boards */
290 .callback = set_bios_reboot,
291 .ident = "VersaLogic Menlow based board",
292 .matches = {
293 DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"),
294 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
295 },
296 },
288 { } 297 { }
289}; 298};
290 299
@@ -295,68 +304,16 @@ static int __init reboot_init(void)
295} 304}
296core_initcall(reboot_init); 305core_initcall(reboot_init);
297 306
298/* The following code and data reboots the machine by switching to real 307extern const unsigned char machine_real_restart_asm[];
299 mode and jumping to the BIOS reset entry point, as if the CPU has 308extern const u64 machine_real_restart_gdt[3];
300 really been reset. The previous version asked the keyboard
301 controller to pulse the CPU reset line, which is more thorough, but
302 doesn't work with at least one type of 486 motherboard. It is easy
303 to stop this code working; hence the copious comments. */
304static const unsigned long long
305real_mode_gdt_entries [3] =
306{
307 0x0000000000000000ULL, /* Null descriptor */
308 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
309 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
310};
311 309
312static const struct desc_ptr 310void machine_real_restart(unsigned int type)
313real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
314real_mode_idt = { 0x3ff, 0 };
315
316/* This is 16-bit protected mode code to disable paging and the cache,
317 switch to real mode and jump to the BIOS reset code.
318
319 The instruction that switches to real mode by writing to CR0 must be
320 followed immediately by a far jump instruction, which set CS to a
321 valid value for real mode, and flushes the prefetch queue to avoid
322 running instructions that have already been decoded in protected
323 mode.
324
325 Clears all the flags except ET, especially PG (paging), PE
326 (protected-mode enable) and TS (task switch for coprocessor state
327 save). Flushes the TLB after paging has been disabled. Sets CD and
328 NW, to disable the cache on a 486, and invalidates the cache. This
329 is more like the state of a 486 after reset. I don't know if
330 something else should be done for other chips.
331
332 More could be done here to set up the registers as if a CPU reset had
333 occurred; hopefully real BIOSs don't assume much. */
334static const unsigned char real_mode_switch [] =
335{
336 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
337 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
338 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */
339 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */
340 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
341 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
342 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
343 0x74, 0x02, /* jz f */
344 0x0f, 0x09, /* wbinvd */
345 0x24, 0x10, /* f: andb $0x10,al */
346 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
347};
348static const unsigned char jump_to_bios [] =
349{ 311{
350 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ 312 void *restart_va;
351}; 313 unsigned long restart_pa;
314 void (*restart_lowmem)(unsigned int);
315 u64 *lowmem_gdt;
352 316
353/*
354 * Switch to real mode and then execute the code
355 * specified by the code and length parameters.
356 * We assume that length will always be less than 100!
357 */
358void machine_real_restart(const unsigned char *code, int length)
359{
360 local_irq_disable(); 317 local_irq_disable();
361 318
362 /* Write zero to CMOS register number 0x0f, which the BIOS POST 319 /* Write zero to CMOS register number 0x0f, which the BIOS POST
@@ -384,41 +341,23 @@ void machine_real_restart(const unsigned char *code, int length)
384 too. */ 341 too. */
385 *((unsigned short *)0x472) = reboot_mode; 342 *((unsigned short *)0x472) = reboot_mode;
386 343
387 /* For the switch to real mode, copy some code to low memory. It has 344 /* Patch the GDT in the low memory trampoline */
388 to be in the first 64k because it is running in 16-bit mode, and it 345 lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
389 has to have the same physical and virtual address, because it turns 346
390 off paging. Copy it near the end of the first page, out of the way 347 restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
391 of BIOS variables. */ 348 restart_pa = virt_to_phys(restart_va);
392 memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), 349 restart_lowmem = (void (*)(unsigned int))restart_pa;
393 real_mode_switch, sizeof (real_mode_switch)); 350
394 memcpy((void *)(0x1000 - 100), code, length); 351 /* GDT[0]: GDT self-pointer */
395 352 lowmem_gdt[0] =
396 /* Set up the IDT for real mode. */ 353 (u64)(sizeof(machine_real_restart_gdt) - 1) +
397 load_idt(&real_mode_idt); 354 ((u64)virt_to_phys(lowmem_gdt) << 16);
398 355 /* GDT[1]: 64K real mode code segment */
399 /* Set up a GDT from which we can load segment descriptors for real 356 lowmem_gdt[1] =
400 mode. The GDT is not used in real mode; it is just needed here to 357 GDT_ENTRY(0x009b, restart_pa, 0xffff);
401 prepare the descriptors. */ 358
402 load_gdt(&real_mode_gdt); 359 /* Jump to the identity-mapped low memory code */
403 360 restart_lowmem(type);
404 /* Load the data segment registers, and thus the descriptors ready for
405 real mode. The base address of each segment is 0x100, 16 times the
406 selector value being loaded here. This is so that the segment
407 registers don't have to be reloaded after switching to real mode:
408 the values are consistent for real mode operation already. */
409 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
410 "\tmovl %%eax,%%ds\n"
411 "\tmovl %%eax,%%es\n"
412 "\tmovl %%eax,%%fs\n"
413 "\tmovl %%eax,%%gs\n"
414 "\tmovl %%eax,%%ss" : : : "eax");
415
416 /* Jump to the 16-bit code that we copied earlier. It disables paging
417 and the cache, switches to real mode, and jumps to the BIOS reset
418 entry point. */
419 __asm__ __volatile__ ("ljmp $0x0008,%0"
420 :
421 : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
422} 361}
423#ifdef CONFIG_APM_MODULE 362#ifdef CONFIG_APM_MODULE
424EXPORT_SYMBOL(machine_real_restart); 363EXPORT_SYMBOL(machine_real_restart);
@@ -573,7 +512,7 @@ static void native_machine_emergency_restart(void)
573 512
574#ifdef CONFIG_X86_32 513#ifdef CONFIG_X86_32
575 case BOOT_BIOS: 514 case BOOT_BIOS:
576 machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); 515 machine_real_restart(MRR_BIOS);
577 516
578 reboot_type = BOOT_KBD; 517 reboot_type = BOOT_KBD;
579 break; 518 break;
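In the rewritten machine_real_restart() above, lowmem_gdt[1] is built with GDT_ENTRY(0x009b, restart_pa, 0xffff): a present, DPL-0, 16-bit code segment with byte granularity, a 64 KiB limit and its base at the physical address of the trampoline copy, so the far jump in the stub lands on identity-mapped low memory; lowmem_gdt[0] is reused as the lgdt pseudo-descriptor (limit word plus physical base). As a rough illustration of how such a descriptor packs its fields (a hand-rolled sketch of the layout, not the kernel's GDT_ENTRY macro itself):

/* 'flags' carries the access byte in bits 0-7 and the granularity/size
 * nibble in bits 12-15, mirroring the 0x009b and 0x0093 values used here.
 */
static unsigned long long pack_seg_desc(unsigned int flags,
					unsigned int base,
					unsigned int limit)
{
	return  ((unsigned long long)(limit & 0x0000ffffu))        |  /* limit 15:0  -> bits  0-15 */
		((unsigned long long)(base  & 0x00ffffffu) << 16)  |  /* base  23:0  -> bits 16-39 */
		((unsigned long long)(flags & 0x0000f0ffu) << 40)  |  /* access+gran -> bits 40-47, 52-55 */
		((unsigned long long)(limit & 0x000f0000u) << 32)  |  /* limit 19:16 -> bits 48-51 */
		((unsigned long long)(base  & 0xff000000u) << 32);    /* base  31:24 -> bits 56-63 */
}

pack_seg_desc(0x009b, restart_pa, 0xffff) would then reproduce the value stored into lowmem_gdt[1].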
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
new file mode 100644
index 000000000000..29092b38d816
--- /dev/null
+++ b/arch/x86/kernel/reboot_32.S
@@ -0,0 +1,135 @@
1#include <linux/linkage.h>
2#include <linux/init.h>
3#include <asm/segment.h>
4#include <asm/page_types.h>
5
6/*
7 * The following code and data reboots the machine by switching to real
8 * mode and jumping to the BIOS reset entry point, as if the CPU has
9 * really been reset. The previous version asked the keyboard
10 * controller to pulse the CPU reset line, which is more thorough, but
11 * doesn't work with at least one type of 486 motherboard. It is easy
12 * to stop this code working; hence the copious comments.
13 *
14 * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
15 */
16 .section ".x86_trampoline","a"
17 .balign 16
18 .code32
19ENTRY(machine_real_restart_asm)
20r_base = .
21 /* Get our own relocated address */
22 call 1f
231: popl %ebx
24 subl $1b, %ebx
25
26 /* Compute the equivalent real-mode segment */
27 movl %ebx, %ecx
28 shrl $4, %ecx
29
30 /* Patch post-real-mode segment jump */
31 movw dispatch_table(%ebx,%eax,2),%ax
32 movw %ax, 101f(%ebx)
33 movw %cx, 102f(%ebx)
34
35 /* Set up the IDT for real mode. */
36 lidtl machine_real_restart_idt(%ebx)
37
38 /*
39 * Set up a GDT from which we can load segment descriptors for real
40 * mode. The GDT is not used in real mode; it is just needed here to
41 * prepare the descriptors.
42 */
43 lgdtl machine_real_restart_gdt(%ebx)
44
45 /*
46 * Load the data segment registers with 16-bit compatible values
47 */
48 movl $16, %ecx
49 movl %ecx, %ds
50 movl %ecx, %es
51 movl %ecx, %fs
52 movl %ecx, %gs
53 movl %ecx, %ss
54 ljmpl $8, $1f - r_base
55
56/*
57 * This is 16-bit protected mode code to disable paging and the cache,
58 * switch to real mode and jump to the BIOS reset code.
59 *
60 * The instruction that switches to real mode by writing to CR0 must be
 61 * followed immediately by a far jump instruction, which sets CS to a
62 * valid value for real mode, and flushes the prefetch queue to avoid
63 * running instructions that have already been decoded in protected
64 * mode.
65 *
66 * Clears all the flags except ET, especially PG (paging), PE
67 * (protected-mode enable) and TS (task switch for coprocessor state
68 * save). Flushes the TLB after paging has been disabled. Sets CD and
69 * NW, to disable the cache on a 486, and invalidates the cache. This
70 * is more like the state of a 486 after reset. I don't know if
71 * something else should be done for other chips.
72 *
73 * More could be done here to set up the registers as if a CPU reset had
74 * occurred; hopefully real BIOSs don't assume much. This is not the
75 * actual BIOS entry point, anyway (that is at 0xfffffff0).
76 *
77 * Most of this work is probably excessive, but it is what is tested.
78 */
79 .code16
801:
81 xorl %ecx, %ecx
82 movl %cr0, %eax
83 andl $0x00000011, %eax
84 orl $0x60000000, %eax
85 movl %eax, %cr0
86 movl %ecx, %cr3
87 movl %cr0, %edx
88 andl $0x60000000, %edx /* If no cache bits -> no wbinvd */
89 jz 2f
90 wbinvd
912:
92 andb $0x10, %al
93 movl %eax, %cr0
94 .byte 0xea /* ljmpw */
95101: .word 0 /* Offset */
96102: .word 0 /* Segment */
97
98bios:
99 ljmpw $0xf000, $0xfff0
100
101apm:
102 movw $0x1000, %ax
103 movw %ax, %ss
104 movw $0xf000, %sp
105 movw $0x5307, %ax
106 movw $0x0001, %bx
107 movw $0x0003, %cx
108 int $0x15
109
110END(machine_real_restart_asm)
111
112 .balign 16
113 /* These must match <asm/reboot.h> */
114dispatch_table:
115 .word bios - r_base
116 .word apm - r_base
117END(dispatch_table)
118
119 .balign 16
120machine_real_restart_idt:
121 .word 0xffff /* Length - real mode default value */
122 .long 0 /* Base - real mode default value */
123END(machine_real_restart_idt)
124
125 .balign 16
126ENTRY(machine_real_restart_gdt)
127 .quad 0 /* Self-pointer, filled in by PM code */
128 .quad 0 /* 16-bit code segment, filled in by PM code */
129 /*
130 * 16-bit data segment with the selector value 16 = 0x10 and
131 * base value 0x100; since this is consistent with real mode
132 * semantics we don't have to reload the segments once CR0.PE = 0.
133 */
134 .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
135END(machine_real_restart_gdt)
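The CR0 masks in the 16-bit stub are easier to read with the architectural bit names spelled out: andl $0x11 keeps only PE and ET, orl $0x60000000 sets CD and NW (cache disabled, as on a freshly reset 486), and the final andb $0x10,%al clears PE so the far jump lands in real mode. A hedged C rendering of the same constants and sequence:

#define CR0_PE	0x00000001UL	/* protection enable */
#define CR0_ET	0x00000010UL	/* extension type */
#define CR0_NW	0x20000000UL	/* not write-through */
#define CR0_CD	0x40000000UL	/* cache disable */
#define CR0_PG	0x80000000UL	/* paging (cleared by the mask below) */

/* Sketch of the value the stub ends up writing to %cr0. */
static unsigned long cr0_for_real_mode(unsigned long cr0)
{
	/* andl $0x11 / orl $0x60000000: keep PE|ET, force CD|NW, drop PG. */
	cr0 = (cr0 & (CR0_PE | CR0_ET)) | (CR0_CD | CR0_NW);
	/* andb $0x10,%al: clear PE in the low byte; CD|NW stay set. */
	cr0 &= ~CR0_PE;
	return cr0;
}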
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 6f39cab052d5..3f2ad2640d85 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -6,6 +6,7 @@
6#include <linux/acpi.h> 6#include <linux/acpi.h>
7#include <linux/bcd.h> 7#include <linux/bcd.h>
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9#include <linux/of.h>
9 10
10#include <asm/vsyscall.h> 11#include <asm/vsyscall.h>
11#include <asm/x86_init.h> 12#include <asm/x86_init.h>
@@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void)
236 } 237 }
237 } 238 }
238#endif 239#endif
240 if (of_have_populated_dt())
241 return 0;
239 242
240 platform_device_register(&rtc_device); 243 platform_device_register(&rtc_device);
241 dev_info(&rtc_device.dev, 244 dev_info(&rtc_device.dev,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d3cfe26c0252..4be9b398470e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -113,6 +113,7 @@
113#endif 113#endif
114#include <asm/mce.h> 114#include <asm/mce.h>
115#include <asm/alternative.h> 115#include <asm/alternative.h>
116#include <asm/prom.h>
116 117
117/* 118/*
118 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -297,6 +298,9 @@ static void __init init_gbpages(void)
297static inline void init_gbpages(void) 298static inline void init_gbpages(void)
298{ 299{
299} 300}
301static void __init cleanup_highmap(void)
302{
303}
300#endif 304#endif
301 305
302static void __init reserve_brk(void) 306static void __init reserve_brk(void)
@@ -429,16 +433,30 @@ static void __init parse_setup_data(void)
429 return; 433 return;
430 pa_data = boot_params.hdr.setup_data; 434 pa_data = boot_params.hdr.setup_data;
431 while (pa_data) { 435 while (pa_data) {
432 data = early_memremap(pa_data, PAGE_SIZE); 436 u32 data_len, map_len;
437
438 map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
439 (u64)sizeof(struct setup_data));
440 data = early_memremap(pa_data, map_len);
441 data_len = data->len + sizeof(struct setup_data);
442 if (data_len > map_len) {
443 early_iounmap(data, map_len);
444 data = early_memremap(pa_data, data_len);
445 map_len = data_len;
446 }
447
433 switch (data->type) { 448 switch (data->type) {
434 case SETUP_E820_EXT: 449 case SETUP_E820_EXT:
435 parse_e820_ext(data, pa_data); 450 parse_e820_ext(data);
451 break;
452 case SETUP_DTB:
453 add_dtb(pa_data);
436 break; 454 break;
437 default: 455 default:
438 break; 456 break;
439 } 457 }
440 pa_data = data->next; 458 pa_data = data->next;
441 early_iounmap(data, PAGE_SIZE); 459 early_iounmap(data, map_len);
442 } 460 }
443} 461}
444 462
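The max() in the hunk above guards the case where pa_data sits near the end of a page: with 4 KiB pages and, say, pa_data = 0x1ff8, PAGE_SIZE - (pa_data & ~PAGE_MASK) is only 8 bytes, less than the 16-byte struct setup_data header (next, type, len), so at least the header is always mapped; once data->len is known, the block is remapped at its full length. A small hedged sketch of that sizing logic (names illustrative):

/* Length to map initially so that the setup_data header is always covered. */
static unsigned long setup_data_map_len(unsigned long long pa_data,
					unsigned long page_size,
					unsigned long hdr_size)
{
	unsigned long to_page_end = page_size - (pa_data & (page_size - 1));

	return to_page_end > hdr_size ? to_page_end : hdr_size;
}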
@@ -601,28 +619,6 @@ void __init reserve_standard_io_resources(void)
601 619
602} 620}
603 621
604/*
605 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
606 * is_kdump_kernel() to determine if we are booting after a panic. Hence
607 * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
608 */
609
610#ifdef CONFIG_CRASH_DUMP
611/* elfcorehdr= specifies the location of elf core header
612 * stored by the crashed kernel. This option will be passed
613 * by kexec loader to the capture kernel.
614 */
615static int __init setup_elfcorehdr(char *arg)
616{
617 char *end;
618 if (!arg)
619 return -EINVAL;
620 elfcorehdr_addr = memparse(arg, &end);
621 return end > arg ? 0 : -EINVAL;
622}
623early_param("elfcorehdr", setup_elfcorehdr);
624#endif
625
626static __init void reserve_ibft_region(void) 622static __init void reserve_ibft_region(void)
627{ 623{
628 unsigned long addr, size = 0; 624 unsigned long addr, size = 0;
@@ -680,15 +676,6 @@ static int __init parse_reservelow(char *p)
680 676
681early_param("reservelow", parse_reservelow); 677early_param("reservelow", parse_reservelow);
682 678
683static u64 __init get_max_mapped(void)
684{
685 u64 end = max_pfn_mapped;
686
687 end <<= PAGE_SHIFT;
688
689 return end;
690}
691
692/* 679/*
693 * Determine if we were loaded by an EFI loader. If so, then we have also been 680 * Determine if we were loaded by an EFI loader. If so, then we have also been
694 * passed the efi memmap, systab, etc., so we should use these data structures 681 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -704,8 +691,6 @@ static u64 __init get_max_mapped(void)
704 691
705void __init setup_arch(char **cmdline_p) 692void __init setup_arch(char **cmdline_p)
706{ 693{
707 int acpi = 0;
708 int amd = 0;
709 unsigned long flags; 694 unsigned long flags;
710 695
711#ifdef CONFIG_X86_32 696#ifdef CONFIG_X86_32
@@ -922,6 +907,8 @@ void __init setup_arch(char **cmdline_p)
922 */ 907 */
923 reserve_brk(); 908 reserve_brk();
924 909
910 cleanup_highmap();
911
925 memblock.current_limit = get_max_mapped(); 912 memblock.current_limit = get_max_mapped();
926 memblock_x86_fill(); 913 memblock_x86_fill();
927 914
@@ -935,15 +922,8 @@ void __init setup_arch(char **cmdline_p)
935 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", 922 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
936 max_pfn_mapped<<PAGE_SHIFT); 923 max_pfn_mapped<<PAGE_SHIFT);
937 924
938 reserve_trampoline_memory(); 925 setup_trampolines();
939 926
940#ifdef CONFIG_ACPI_SLEEP
941 /*
942 * Reserve low memory region for sleep support.
943 * even before init_memory_mapping
944 */
945 acpi_reserve_wakeup_memory();
946#endif
947 init_gbpages(); 927 init_gbpages();
948 928
949 /* max_pfn_mapped is updated here */ 929 /* max_pfn_mapped is updated here */
@@ -984,19 +964,7 @@ void __init setup_arch(char **cmdline_p)
984 964
985 early_acpi_boot_init(); 965 early_acpi_boot_init();
986 966
987#ifdef CONFIG_ACPI_NUMA 967 initmem_init();
988 /*
989 * Parse SRAT to discover nodes.
990 */
991 acpi = acpi_numa_init();
992#endif
993
994#ifdef CONFIG_AMD_NUMA
995 if (!acpi)
996 amd = !amd_numa_init(0, max_pfn);
997#endif
998
999 initmem_init(0, max_pfn, acpi, amd);
1000 memblock_find_dma_reserve(); 968 memblock_find_dma_reserve();
1001 dma32_reserve_bootmem(); 969 dma32_reserve_bootmem();
1002 970
@@ -1008,6 +976,11 @@ void __init setup_arch(char **cmdline_p)
1008 paging_init(); 976 paging_init();
1009 x86_init.paging.pagetable_setup_done(swapper_pg_dir); 977 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
1010 978
979 if (boot_cpu_data.cpuid_level >= 0) {
980 /* A CPU has %cr4 if and only if it has CPUID */
981 mmu_cr4_features = read_cr4();
982 }
983
1011#ifdef CONFIG_X86_32 984#ifdef CONFIG_X86_32
1012 /* sync back kernel address range */ 985 /* sync back kernel address range */
1013 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, 986 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
@@ -1029,8 +1002,8 @@ void __init setup_arch(char **cmdline_p)
1029 * Read APIC and some other early information from ACPI tables. 1002 * Read APIC and some other early information from ACPI tables.
1030 */ 1003 */
1031 acpi_boot_init(); 1004 acpi_boot_init();
1032
1033 sfi_init(); 1005 sfi_init();
1006 x86_dtb_init();
1034 1007
1035 /* 1008 /*
1036 * get boot-time SMP configuration: 1009 * get boot-time SMP configuration:
@@ -1040,9 +1013,7 @@ void __init setup_arch(char **cmdline_p)
1040 1013
1041 prefill_possible_map(); 1014 prefill_possible_map();
1042 1015
1043#ifdef CONFIG_X86_64
1044 init_cpu_to_node(); 1016 init_cpu_to_node();
1045#endif
1046 1017
1047 init_apic_mappings(); 1018 init_apic_mappings();
1048 ioapic_and_gsi_init(); 1019 ioapic_and_gsi_init();
@@ -1066,6 +1037,8 @@ void __init setup_arch(char **cmdline_p)
1066#endif 1037#endif
1067 x86_init.oem.banner(); 1038 x86_init.oem.banner();
1068 1039
1040 x86_init.timers.wallclock_init();
1041
1069 mcheck_init(); 1042 mcheck_init();
1070 1043
1071 local_irq_save(flags); 1044 local_irq_save(flags);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 002b79685f73..71f4727da373 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -225,10 +225,15 @@ void __init setup_per_cpu_areas(void)
225 per_cpu(x86_bios_cpu_apicid, cpu) = 225 per_cpu(x86_bios_cpu_apicid, cpu) =
226 early_per_cpu_map(x86_bios_cpu_apicid, cpu); 226 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
227#endif 227#endif
228#ifdef CONFIG_X86_32
229 per_cpu(x86_cpu_to_logical_apicid, cpu) =
230 early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
231#endif
228#ifdef CONFIG_X86_64 232#ifdef CONFIG_X86_64
229 per_cpu(irq_stack_ptr, cpu) = 233 per_cpu(irq_stack_ptr, cpu) =
230 per_cpu(irq_stack_union.irq_stack, cpu) + 234 per_cpu(irq_stack_union.irq_stack, cpu) +
231 IRQ_STACK_SIZE - 64; 235 IRQ_STACK_SIZE - 64;
236#endif
232#ifdef CONFIG_NUMA 237#ifdef CONFIG_NUMA
233 per_cpu(x86_cpu_to_node_map, cpu) = 238 per_cpu(x86_cpu_to_node_map, cpu) =
234 early_per_cpu_map(x86_cpu_to_node_map, cpu); 239 early_per_cpu_map(x86_cpu_to_node_map, cpu);
@@ -242,7 +247,6 @@ void __init setup_per_cpu_areas(void)
242 */ 247 */
243 set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); 248 set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
244#endif 249#endif
245#endif
246 /* 250 /*
247 * Up to this point, the boot CPU has been using .init.data 251 * Up to this point, the boot CPU has been using .init.data
248 * area. Reload any changed state for the boot CPU. 252 * area. Reload any changed state for the boot CPU.
@@ -256,7 +260,10 @@ void __init setup_per_cpu_areas(void)
256 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 260 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
257 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; 261 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
258#endif 262#endif
259#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) 263#ifdef CONFIG_X86_32
264 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
265#endif
266#ifdef CONFIG_NUMA
260 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 267 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
261#endif 268#endif
262 269
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 763df77343dd..8ed8908cc9f7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -64,6 +64,7 @@
64#include <asm/mtrr.h> 64#include <asm/mtrr.h>
65#include <asm/mwait.h> 65#include <asm/mwait.h>
66#include <asm/apic.h> 66#include <asm/apic.h>
67#include <asm/io_apic.h>
67#include <asm/setup.h> 68#include <asm/setup.h>
68#include <asm/uv/uv.h> 69#include <asm/uv/uv.h>
69#include <linux/mc146818rtc.h> 70#include <linux/mc146818rtc.h>
@@ -71,10 +72,6 @@
71#include <asm/smpboot_hooks.h> 72#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h> 73#include <asm/i8259.h>
73 74
74#ifdef CONFIG_X86_32
75u8 apicid_2_node[MAX_APICID];
76#endif
77
78/* State of each CPU */ 75/* State of each CPU */
79DEFINE_PER_CPU(int, cpu_state) = { 0 }; 76DEFINE_PER_CPU(int, cpu_state) = { 0 };
80 77
@@ -130,68 +127,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
130DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); 127DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
131EXPORT_PER_CPU_SYMBOL(cpu_core_map); 128EXPORT_PER_CPU_SYMBOL(cpu_core_map);
132 129
130DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
131
133/* Per CPU bogomips and other parameters */ 132/* Per CPU bogomips and other parameters */
134DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 133DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
135EXPORT_PER_CPU_SYMBOL(cpu_info); 134EXPORT_PER_CPU_SYMBOL(cpu_info);
136 135
137atomic_t init_deasserted; 136atomic_t init_deasserted;
138 137
139#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
140/* which node each logical CPU is on */
141int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
142EXPORT_SYMBOL(cpu_to_node_map);
143
144/* set up a mapping between cpu and node. */
145static void map_cpu_to_node(int cpu, int node)
146{
147 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
148 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
149 cpu_to_node_map[cpu] = node;
150}
151
152/* undo a mapping between cpu and node. */
153static void unmap_cpu_to_node(int cpu)
154{
155 int node;
156
157 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
158 for (node = 0; node < MAX_NUMNODES; node++)
159 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
160 cpu_to_node_map[cpu] = 0;
161}
162#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
163#define map_cpu_to_node(cpu, node) ({})
164#define unmap_cpu_to_node(cpu) ({})
165#endif
166
167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
171 { [0 ... NR_CPUS-1] = BAD_APICID };
172
173static void map_cpu_to_logical_apicid(void)
174{
175 int cpu = smp_processor_id();
176 int apicid = logical_smp_processor_id();
177 int node = apic->apicid_to_node(apicid);
178
179 if (!node_online(node))
180 node = first_online_node;
181
182 cpu_2_logical_apicid[cpu] = apicid;
183 map_cpu_to_node(cpu, node);
184}
185
186void numa_remove_cpu(int cpu)
187{
188 cpu_2_logical_apicid[cpu] = BAD_APICID;
189 unmap_cpu_to_node(cpu);
190}
191#else
192#define map_cpu_to_logical_apicid() do {} while (0)
193#endif
194
195/* 138/*
196 * Report back to the Boot Processor. 139 * Report back to the Boot Processor.
197 * Running on AP. 140 * Running on AP.
@@ -259,7 +202,6 @@ static void __cpuinit smp_callin(void)
259 apic->smp_callin_clear_local_apic(); 202 apic->smp_callin_clear_local_apic();
260 setup_local_APIC(); 203 setup_local_APIC();
261 end_local_APIC_setup(); 204 end_local_APIC_setup();
262 map_cpu_to_logical_apicid();
263 205
264 /* 206 /*
265 * Need to setup vector mappings before we enable interrupts. 207 * Need to setup vector mappings before we enable interrupts.
@@ -355,23 +297,6 @@ notrace static void __cpuinit start_secondary(void *unused)
355 cpu_idle(); 297 cpu_idle();
356} 298}
357 299
358#ifdef CONFIG_CPUMASK_OFFSTACK
359/* In this case, llc_shared_map is a pointer to a cpumask. */
360static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
361 const struct cpuinfo_x86 *src)
362{
363 struct cpumask *llc = dst->llc_shared_map;
364 *dst = *src;
365 dst->llc_shared_map = llc;
366}
367#else
368static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
369 const struct cpuinfo_x86 *src)
370{
371 *dst = *src;
372}
373#endif /* CONFIG_CPUMASK_OFFSTACK */
374
375/* 300/*
376 * The bootstrap kernel entry code has set these up. Save them for 301 * The bootstrap kernel entry code has set these up. Save them for
377 * a given CPU 302 * a given CPU
@@ -381,23 +306,41 @@ void __cpuinit smp_store_cpu_info(int id)
381{ 306{
382 struct cpuinfo_x86 *c = &cpu_data(id); 307 struct cpuinfo_x86 *c = &cpu_data(id);
383 308
384 copy_cpuinfo_x86(c, &boot_cpu_data); 309 *c = boot_cpu_data;
385 c->cpu_index = id; 310 c->cpu_index = id;
386 if (id != 0) 311 if (id != 0)
387 identify_secondary_cpu(c); 312 identify_secondary_cpu(c);
388} 313}
389 314
390static void __cpuinit link_thread_siblings(int cpu1, int cpu2) 315static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2)
391{ 316{
392 struct cpuinfo_x86 *c1 = &cpu_data(cpu1); 317 int node1 = early_cpu_to_node(cpu1);
393 struct cpuinfo_x86 *c2 = &cpu_data(cpu2); 318 int node2 = early_cpu_to_node(cpu2);
319
320 /*
321 * Our CPU scheduler assumes all logical cpus in the same physical cpu
322 * share the same node. But, buggy ACPI or NUMA emulation might assign
323 * them to different node. Fix it.
324 */
325 if (node1 != node2) {
326 pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n",
327 cpu1, node1, cpu2, node2, node2);
328
329 numa_remove_cpu(cpu1);
330 numa_set_node(cpu1, node2);
331 numa_add_cpu(cpu1);
332 }
333}
394 334
335static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
336{
395 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); 337 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
396 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); 338 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
397 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); 339 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
398 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); 340 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
399 cpumask_set_cpu(cpu1, c2->llc_shared_map); 341 cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
400 cpumask_set_cpu(cpu2, c1->llc_shared_map); 342 cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
343 check_cpu_siblings_on_same_node(cpu1, cpu2);
401} 344}
402 345
403 346
@@ -414,6 +357,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
414 357
415 if (cpu_has(c, X86_FEATURE_TOPOEXT)) { 358 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
416 if (c->phys_proc_id == o->phys_proc_id && 359 if (c->phys_proc_id == o->phys_proc_id &&
360 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
417 c->compute_unit_id == o->compute_unit_id) 361 c->compute_unit_id == o->compute_unit_id)
418 link_thread_siblings(cpu, i); 362 link_thread_siblings(cpu, i);
419 } else if (c->phys_proc_id == o->phys_proc_id && 363 } else if (c->phys_proc_id == o->phys_proc_id &&
@@ -425,7 +369,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
425 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 369 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
426 } 370 }
427 371
428 cpumask_set_cpu(cpu, c->llc_shared_map); 372 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
429 373
430 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { 374 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
431 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); 375 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
@@ -436,12 +380,14 @@ void __cpuinit set_cpu_sibling_map(int cpu)
436 for_each_cpu(i, cpu_sibling_setup_mask) { 380 for_each_cpu(i, cpu_sibling_setup_mask) {
437 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 381 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
438 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 382 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
439 cpumask_set_cpu(i, c->llc_shared_map); 383 cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
440 cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); 384 cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
385 check_cpu_siblings_on_same_node(cpu, i);
441 } 386 }
442 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 387 if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
443 cpumask_set_cpu(i, cpu_core_mask(cpu)); 388 cpumask_set_cpu(i, cpu_core_mask(cpu));
444 cpumask_set_cpu(cpu, cpu_core_mask(i)); 389 cpumask_set_cpu(cpu, cpu_core_mask(i));
390 check_cpu_siblings_on_same_node(cpu, i);
445 /* 391 /*
446 * Does this new cpu bringup a new core? 392 * Does this new cpu bringup a new core?
447 */ 393 */
@@ -476,7 +422,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
476 !(cpu_has(c, X86_FEATURE_AMD_DCM))) 422 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
477 return cpu_core_mask(cpu); 423 return cpu_core_mask(cpu);
478 else 424 else
479 return c->llc_shared_map; 425 return cpu_llc_shared_mask(cpu);
480} 426}
481 427
482static void impress_friends(void) 428static void impress_friends(void)
@@ -638,7 +584,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
638 * target processor state. 584 * target processor state.
639 */ 585 */
640 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, 586 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
641 (unsigned long)stack_start.sp); 587 stack_start);
642 588
643 /* 589 /*
644 * Run STARTUP IPI loop. 590 * Run STARTUP IPI loop.
@@ -785,10 +731,10 @@ do_rest:
785#endif 731#endif
786 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 732 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
787 initial_code = (unsigned long)start_secondary; 733 initial_code = (unsigned long)start_secondary;
788 stack_start.sp = (void *) c_idle.idle->thread.sp; 734 stack_start = c_idle.idle->thread.sp;
789 735
790 /* start_ip had better be page-aligned! */ 736 /* start_ip had better be page-aligned! */
791 start_ip = setup_trampoline(); 737 start_ip = trampoline_address();
792 738
793 /* So we see what's up */ 739 /* So we see what's up */
794 announce_cpu(cpu, apicid); 740 announce_cpu(cpu, apicid);
@@ -798,6 +744,8 @@ do_rest:
798 * the targeted processor. 744 * the targeted processor.
799 */ 745 */
800 746
747 printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
748
801 atomic_set(&init_deasserted, 0); 749 atomic_set(&init_deasserted, 0);
802 750
803 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 751 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -851,8 +799,8 @@ do_rest:
851 pr_debug("CPU%d: has booted.\n", cpu); 799 pr_debug("CPU%d: has booted.\n", cpu);
852 else { 800 else {
853 boot_error = 1; 801 boot_error = 1;
854 if (*((volatile unsigned char *)trampoline_base) 802 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
855 == 0xA5) 803 == 0xA5A5A5A5)
856 /* trampoline started but...? */ 804 /* trampoline started but...? */
857 pr_err("CPU%d: Stuck ??\n", cpu); 805 pr_err("CPU%d: Stuck ??\n", cpu);
858 else 806 else
@@ -878,7 +826,7 @@ do_rest:
878 } 826 }
879 827
880 /* mark "stuck" area as not stuck */ 828 /* mark "stuck" area as not stuck */
881 *((volatile unsigned long *)trampoline_base) = 0; 829 *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
882 830
883 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 831 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
884 /* 832 /*
@@ -945,6 +893,14 @@ int __cpuinit native_cpu_up(unsigned int cpu)
945 return 0; 893 return 0;
946} 894}
947 895
896/**
897 * arch_disable_smp_support() - disables SMP support for x86 at runtime
898 */
899void arch_disable_smp_support(void)
900{
901 disable_ioapic_support();
902}
903
948/* 904/*
949 * Fall back to non SMP mode after errors. 905 * Fall back to non SMP mode after errors.
950 * 906 *
@@ -960,7 +916,6 @@ static __init void disable_smp(void)
960 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 916 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
961 else 917 else
962 physid_set_mask_of_physid(0, &phys_cpu_present_map); 918 physid_set_mask_of_physid(0, &phys_cpu_present_map);
963 map_cpu_to_logical_apicid();
964 cpumask_set_cpu(0, cpu_sibling_mask(0)); 919 cpumask_set_cpu(0, cpu_sibling_mask(0));
965 cpumask_set_cpu(0, cpu_core_mask(0)); 920 cpumask_set_cpu(0, cpu_core_mask(0));
966} 921}
@@ -1045,7 +1000,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1045 "(tell your hw vendor)\n"); 1000 "(tell your hw vendor)\n");
1046 } 1001 }
1047 smpboot_clear_io_apic(); 1002 smpboot_clear_io_apic();
1048 arch_disable_smp_support(); 1003 disable_ioapic_support();
1049 return -1; 1004 return -1;
1050 } 1005 }
1051 1006
@@ -1060,7 +1015,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1060 1015
1061 connect_bsp_APIC(); 1016 connect_bsp_APIC();
1062 setup_local_APIC(); 1017 setup_local_APIC();
1063 end_local_APIC_setup(); 1018 bsp_end_local_APIC_setup();
1064 return -1; 1019 return -1;
1065 } 1020 }
1066 1021
@@ -1089,21 +1044,19 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1089 1044
1090 preempt_disable(); 1045 preempt_disable();
1091 smp_cpu_index_default(); 1046 smp_cpu_index_default();
1092 memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info)); 1047
1093 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1094 mb();
1095 /* 1048 /*
1096 * Setup boot CPU information 1049 * Setup boot CPU information
1097 */ 1050 */
1098 smp_store_cpu_info(0); /* Final full version of the data */ 1051 smp_store_cpu_info(0); /* Final full version of the data */
1099#ifdef CONFIG_X86_32 1052 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1100 boot_cpu_logical_apicid = logical_smp_processor_id(); 1053 mb();
1101#endif 1054
1102 current_thread_info()->cpu = 0; /* needed? */ 1055 current_thread_info()->cpu = 0; /* needed? */
1103 for_each_possible_cpu(i) { 1056 for_each_possible_cpu(i) {
1104 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 1057 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1105 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 1058 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1106 zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); 1059 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1107 } 1060 }
1108 set_cpu_sibling_map(0); 1061 set_cpu_sibling_map(0);
1109 1062
@@ -1137,9 +1090,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1137 if (!skip_ioapic_setup && nr_ioapics) 1090 if (!skip_ioapic_setup && nr_ioapics)
1138 enable_IO_APIC(); 1091 enable_IO_APIC();
1139 1092
1140 end_local_APIC_setup(); 1093 bsp_end_local_APIC_setup();
1141
1142 map_cpu_to_logical_apicid();
1143 1094
1144 if (apic->setup_portio_remap) 1095 if (apic->setup_portio_remap)
1145 apic->setup_portio_remap(); 1096 apic->setup_portio_remap();
@@ -1402,8 +1353,9 @@ static inline void mwait_play_dead(void)
1402 unsigned int highest_subcstate = 0; 1353 unsigned int highest_subcstate = 0;
1403 int i; 1354 int i;
1404 void *mwait_ptr; 1355 void *mwait_ptr;
1356 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
1405 1357
1406 if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_MWAIT)) 1358 if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)))
1407 return; 1359 return;
1408 if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) 1360 if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
1409 return; 1361 return;
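The smpboot hunks above replace the old one-byte marker at trampoline_base with a dedicated 32-bit trampoline_status word that the boot CPU polls and then clears. A rough standalone model of that handshake (illustrative names and printf() only, not the kernel code itself):

#include <stdint.h>
#include <stdio.h>

#define TRAMPOLINE_MAGIC 0xA5A5A5A5u

static volatile uint32_t trampoline_status;      /* lives in the low-memory copy */

static void ap_trampoline_entry(void)
{
        trampoline_status = TRAMPOLINE_MAGIC;    /* "I reached the trampoline" */
        /* ...switch to protected/long mode, then jump to start_secondary()... */
}

static void bsp_check_ap(int cpu, int timed_out)
{
        if (!timed_out)
                printf("CPU%d: has booted.\n", cpu);
        else if (trampoline_status == TRAMPOLINE_MAGIC)
                printf("CPU%d: Stuck ??\n", cpu);        /* trampoline ran, C code didn't */
        else
                printf("CPU%d: Not responding.\n", cpu); /* never reached the trampoline */

        trampoline_status = 0;                           /* mark "stuck" area as not stuck */
}

int main(void)
{
        ap_trampoline_entry();                   /* pretend the AP got this far...  */
        bsp_check_ap(1, /*timed_out=*/1);        /* ...but never completed callin   */
        return 0;
}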
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 938c8e10a19a..6515733a289d 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -73,7 +73,7 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
73 */ 73 */
74void save_stack_trace(struct stack_trace *trace) 74void save_stack_trace(struct stack_trace *trace)
75{ 75{
76 dump_trace(current, NULL, NULL, &save_stack_ops, trace); 76 dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
77 if (trace->nr_entries < trace->max_entries) 77 if (trace->nr_entries < trace->max_entries)
78 trace->entries[trace->nr_entries++] = ULONG_MAX; 78 trace->entries[trace->nr_entries++] = ULONG_MAX;
79} 79}
@@ -81,14 +81,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace);
81 81
82void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) 82void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
83{ 83{
84 dump_trace(current, regs, NULL, &save_stack_ops, trace); 84 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
85 if (trace->nr_entries < trace->max_entries) 85 if (trace->nr_entries < trace->max_entries)
86 trace->entries[trace->nr_entries++] = ULONG_MAX; 86 trace->entries[trace->nr_entries++] = ULONG_MAX;
87} 87}
88 88
89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 89void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
90{ 90{
91 dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); 91 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
92 if (trace->nr_entries < trace->max_entries) 92 if (trace->nr_entries < trace->max_entries)
93 trace->entries[trace->nr_entries++] = ULONG_MAX; 93 trace->entries[trace->nr_entries++] = ULONG_MAX;
94} 94}
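The only change in this file is the extra bp argument passed to dump_trace(); the ULONG_MAX terminator convention in the surrounding context is untouched. A small userspace illustration of that convention, assuming nothing beyond what the context lines show:

#include <limits.h>
#include <stdio.h>

struct stack_trace {
        unsigned int nr_entries, max_entries;
        unsigned long *entries;
};

/* Append the ULONG_MAX end marker only if the buffer still has room. */
static void terminate_trace(struct stack_trace *trace)
{
        if (trace->nr_entries < trace->max_entries)
                trace->entries[trace->nr_entries++] = ULONG_MAX;
}

int main(void)
{
        unsigned long buf[8] = { 0x1000, 0x2000, 0x3000 };
        struct stack_trace trace = {
                .nr_entries = 3, .max_entries = 8, .entries = buf,
        };

        terminate_trace(&trace);
        for (unsigned int i = 0; i < trace.nr_entries; i++)
                printf("%#lx\n", trace.entries[i]);
        return 0;
}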
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 58de45ee08b6..7977f0cfe339 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block)
166 * Make sure block stepping (BTF) is not enabled unless it should be. 166 * Make sure block stepping (BTF) is not enabled unless it should be.
167 * Note that we don't try to worry about any is_setting_trap_flag() 167 * Note that we don't try to worry about any is_setting_trap_flag()
168 * instructions after the first when using block stepping. 168 * instructions after the first when using block stepping.
169 * So noone should try to use debugger block stepping in a program 169 * So no one should try to use debugger block stepping in a program
170 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
171 */ 171 */
172 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index b35786dc9b8f..abce34d5c79d 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -340,3 +340,7 @@ ENTRY(sys_call_table)
340 .long sys_fanotify_init 340 .long sys_fanotify_init
341 .long sys_fanotify_mark 341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */ 342 .long sys_prlimit64 /* 340 */
343 .long sys_name_to_handle_at
344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime
346 .long sys_syncfs
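With these table entries in place the new system calls become reachable from 32-bit userspace. A minimal caller for one of them, assuming the toolchain headers already provide __NR_syncfs (older headers would need the raw number from this table instead):

#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        int fd = open(".", O_RDONLY);

        /* Raw syscall invocation; a glibc syncfs() wrapper may not exist yet. */
        if (fd < 0 || syscall(__NR_syncfs, fd) < 0)
                perror("syncfs");
        else
                printf("filesystem containing \".\" synced\n");

        if (fd >= 0)
                close(fd);
        return 0;
}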
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 7e4515957a1c..8927486a4649 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -39,7 +39,7 @@ int __ref arch_register_cpu(int num)
39 /* 39 /*
40 * CPU0 cannot be offlined due to several 40 * CPU0 cannot be offlined due to several
41 * restrictions and assumptions in kernel. This basically 41 * restrictions and assumptions in kernel. This basically
42 * doesnt add a control file, one cannot attempt to offline 42 * doesn't add a control file, so one cannot attempt to offline
43 * BSP. 43 * BSP.
44 * 44 *
45 * Also certain PCI quirks require not to enable hotplug control 45 * Also certain PCI quirks require not to enable hotplug control
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index a375616d77f7..a91ae7709b49 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -2,39 +2,41 @@
2#include <linux/memblock.h> 2#include <linux/memblock.h>
3 3
4#include <asm/trampoline.h> 4#include <asm/trampoline.h>
5#include <asm/cacheflush.h>
5#include <asm/pgtable.h> 6#include <asm/pgtable.h>
6 7
7#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) 8unsigned char *x86_trampoline_base;
8#define __trampinit
9#define __trampinitdata
10#else
11#define __trampinit __cpuinit
12#define __trampinitdata __cpuinitdata
13#endif
14 9
15/* ready for x86_64 and x86 */ 10void __init setup_trampolines(void)
16unsigned char *__trampinitdata trampoline_base;
17
18void __init reserve_trampoline_memory(void)
19{ 11{
20 phys_addr_t mem; 12 phys_addr_t mem;
13 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
21 14
22 /* Has to be in very low memory so we can execute real-mode AP code. */ 15 /* Has to be in very low memory so we can execute real-mode AP code. */
23 mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); 16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
24 if (mem == MEMBLOCK_ERROR) 17 if (mem == MEMBLOCK_ERROR)
25 panic("Cannot allocate trampoline\n"); 18 panic("Cannot allocate trampoline\n");
26 19
27 trampoline_base = __va(mem); 20 x86_trampoline_base = __va(mem);
28 memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); 21 memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE");
22
23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
24 x86_trampoline_base, (unsigned long long)mem, size);
25
26 memcpy(x86_trampoline_base, x86_trampoline_start, size);
29} 27}
30 28
31/* 29/*
32 * Currently trivial. Write the real->protected mode 30 * setup_trampolines() gets called very early, to guarantee the
33 * bootstrap into the page concerned. The caller 31 * availability of low memory. This is before the proper kernel page
34 * has made sure it's suitably aligned. 32 * tables are set up, so we cannot set page permissions in that
33 * function. Thus, we use an arch_initcall instead.
35 */ 34 */
36unsigned long __trampinit setup_trampoline(void) 35static int __init configure_trampolines(void)
37{ 36{
38 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 37 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
39 return virt_to_phys(trampoline_base); 38
39 set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
40 return 0;
40} 41}
42arch_initcall(configure_trampolines);
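The rewritten trampoline.c splits the work exactly as its comment describes: copy the trampoline while low memory is still easy to reserve, then apply executable page permissions later from an arch_initcall, once kernel page tables exist. A toy userspace model of that two-phase ordering, with a simulated initcall registry standing in for the kernel's:

#include <stdio.h>

#define MAX_INITCALLS 8
static int (*initcalls[MAX_INITCALLS])(void);
static int nr_initcalls;

static void register_arch_initcall(int (*fn)(void))   /* stand-in for the macro */
{
        initcalls[nr_initcalls++] = fn;
}

static void setup_trampolines(void)                    /* early: reserve + copy  */
{
        printf("early boot: reserve low memory, memcpy() the trampoline\n");
}

static int configure_trampolines(void)                 /* late: page permissions */
{
        printf("arch_initcall: set_memory_x() on the trampoline pages\n");
        return 0;
}

int main(void)
{
        setup_trampolines();                            /* called from early setup   */
        register_arch_initcall(configure_trampolines);

        for (int i = 0; i < nr_initcalls; i++)          /* do_initcalls() equivalent */
                initcalls[i]();
        return 0;
}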
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 8508237e8e43..451c0a7ef7fd 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -32,9 +32,11 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/page_types.h> 33#include <asm/page_types.h>
34 34
35/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 35#ifdef CONFIG_SMP
36__CPUINITRODATA 36
37.code16 37 .section ".x86_trampoline","a"
38 .balign PAGE_SIZE
39 .code16
38 40
39ENTRY(trampoline_data) 41ENTRY(trampoline_data)
40r_base = . 42r_base = .
@@ -44,7 +46,7 @@ r_base = .
44 46
45 cli # We should be safe anyway 47 cli # We should be safe anyway
46 48
47 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
48 # write marker so the master knows we're running 50
49 51
50 /* GDT tables in non default location kernel can be beyond 16MB and 52 /* GDT tables in non default location kernel can be beyond 16MB and
@@ -72,5 +74,10 @@ boot_idt_descr:
72 .word 0 # idt limit = 0 74 .word 0 # idt limit = 0
73 .long 0 # idt base = 0L 75 .long 0 # idt base = 0L
74 76
77ENTRY(trampoline_status)
78 .long 0
79
75.globl trampoline_end 80.globl trampoline_end
76trampoline_end: 81trampoline_end:
82
83#endif /* CONFIG_SMP */
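The smpboot hunks earlier in this patch read and clear the status word through TRAMPOLINE_SYM(trampoline_status), whose definition is not part of this diff. One plausible shape, shown purely as an assumption, keeps a symbol's offset within the linked .x86_trampoline section and rebases it onto the low-memory copy:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the linked ".x86_trampoline" section and a symbol inside it. */
static struct {
        char     pad[256];
        uint32_t trampoline_status;
} section_image;

#define x86_trampoline_start ((char *)&section_image)
static char *x86_trampoline_base;             /* low-memory copy, set during boot */

/* Assumed shape of TRAMPOLINE_SYM(): same offset, relocated base. */
#define TRAMPOLINE_SYM(x) \
        ((void *)(x86_trampoline_base + ((char *)&(x) - x86_trampoline_start)))

int main(void)
{
        static char low_copy[sizeof(section_image)];

        x86_trampoline_base = low_copy;
        printf("linked symbol %p maps to low-memory copy %p\n",
               (void *)&section_image.trampoline_status,
               TRAMPOLINE_SYM(section_image.trampoline_status));
        return 0;
}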
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 075d130efcf9..09ff51799e96 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,13 +32,9 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34 34
35#ifdef CONFIG_ACPI_SLEEP 35 .section ".x86_trampoline","a"
36.section .rodata, "a", @progbits 36 .balign PAGE_SIZE
37#else 37 .code16
38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
39__CPUINITRODATA
40#endif
41.code16
42 38
43ENTRY(trampoline_data) 39ENTRY(trampoline_data)
44r_base = . 40r_base = .
@@ -50,7 +46,7 @@ r_base = .
50 mov %ax, %ss 46 mov %ax, %ss
51 47
52 48
53 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
54 # write marker so the master knows we're running 50
55 51
56 # Setup stack 52 # Setup stack
@@ -64,10 +60,13 @@ r_base = .
64 movzx %ax, %esi # Find the 32bit trampoline location 60 movzx %ax, %esi # Find the 32bit trampoline location
65 shll $4, %esi 61 shll $4, %esi
66 62
67 # Fixup the vectors 63 # Fixup the absolute vectors
68 addl %esi, startup_32_vector - r_base 64 leal (startup_32 - r_base)(%esi), %eax
69 addl %esi, startup_64_vector - r_base 65 movl %eax, startup_32_vector - r_base
70 addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer 66 leal (startup_64 - r_base)(%esi), %eax
67 movl %eax, startup_64_vector - r_base
68 leal (tgdt - r_base)(%esi), %eax
69 movl %eax, (tgdt + 2 - r_base)
71 70
72 /* 71 /*
73 * GDT tables in non default location kernel can be beyond 16MB and 72 * GDT tables in non default location kernel can be beyond 16MB and
@@ -129,6 +128,7 @@ no_longmode:
129 jmp no_longmode 128 jmp no_longmode
130#include "verify_cpu.S" 129#include "verify_cpu.S"
131 130
131 .balign 4
132 # Careful these need to be in the same 64K segment as the above; 132 # Careful these need to be in the same 64K segment as the above;
133tidt: 133tidt:
134 .word 0 # idt limit = 0 134 .word 0 # idt limit = 0
@@ -156,6 +156,10 @@ startup_64_vector:
156 .long startup_64 - r_base 156 .long startup_64 - r_base
157 .word __KERNEL_CS, 0 157 .word __KERNEL_CS, 0
158 158
159 .balign 4
160ENTRY(trampoline_status)
161 .long 0
162
159trampoline_stack: 163trampoline_stack:
160 .org 0x1000 164 .org 0x1000
161trampoline_stack_end: 165trampoline_stack_end:
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ffe5755caa8b..9335bf7dd2e7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -427,7 +427,7 @@ unsigned long native_calibrate_tsc(void)
427 * the delta to the previous read. We keep track of the min 427 * the delta to the previous read. We keep track of the min
428 * and max values of that delta. The delta is mostly defined 428 * and max values of that delta. The delta is mostly defined
429 * by the IO time of the PIT access, so we can detect when a 429 * by the IO time of the PIT access, so we can detect when a
430 * SMI/SMM disturbance happend between the two reads. If the 430 * SMI/SMM disturbance happened between the two reads. If the
431 * maximum time is significantly larger than the minimum time, 431 * maximum time is significantly larger than the minimum time,
432 * then we discard the result and have another try. 432 * then we discard the result and have another try.
433 * 433 *
@@ -900,7 +900,7 @@ static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
900 * timer based, instead of loop based, we don't block the boot 900 * timer based, instead of loop based, we don't block the boot
901 * process while this longer calibration is done. 901 * process while this longer calibration is done.
902 * 902 *
903 * If there are any calibration anomolies (too many SMIs, etc), 903 * If there are any calibration anomalies (too many SMIs, etc),
904 * or the refined calibration is off by 1% of the fast early 904 * or the refined calibration is off by 1% of the fast early
905 * calibration, we throw out the new calibration and use the 905 * calibration, we throw out the new calibration and use the
906 * early calibration. 906 * early calibration.
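Only spelling is fixed here, but the surrounding comment describes a concrete heuristic: track the minimum and maximum delta between consecutive PIT reads and discard the calibration run when the maximum is far above the minimum, since a long gap suggests an SMI/SMM disturbance landed between two reads. A standalone sketch with synthetic timestamps and an arbitrary threshold:

#include <stdio.h>

int main(void)
{
        unsigned long long t[] = { 100, 220, 341, 1900, 2020 }; /* one long gap */
        unsigned long long d, dmin = ~0ULL, dmax = 0;

        for (int i = 1; i < 5; i++) {
                d = t[i] - t[i - 1];
                if (d < dmin)
                        dmin = d;
                if (d > dmax)
                        dmax = d;
        }
        /* Threshold chosen only for the example: discard if max is far above min. */
        if (dmax > 5 * dmin)
                printf("calibration disturbed (min %llu, max %llu), retrying\n",
                       dmin, dmax);
        else
                printf("calibration clean (min %llu, max %llu)\n", dmin, dmax);
        return 0;
}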
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 0edefc19a113..b9242bacbe59 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -18,7 +18,7 @@
18 * This file is expected to run in 32bit code. Currently: 18 * This file is expected to run in 32bit code. Currently:
19 * 19 *
20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification 20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification
21 * arch/x86/kernel/trampoline_64.S: secondary processor verfication 21 * arch/x86/kernel/trampoline_64.S: secondary processor verification
22 * arch/x86/kernel/head_32.S: processor startup 22 * arch/x86/kernel/head_32.S: processor startup
23 * 23 *
24 * verify_cpu, returns the status of longmode and SSE in register %eax. 24 * verify_cpu, returns the status of longmode and SSE in register %eax.
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bf4700755184..624a2016198e 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -105,6 +105,7 @@ SECTIONS
105 SCHED_TEXT 105 SCHED_TEXT
106 LOCK_TEXT 106 LOCK_TEXT
107 KPROBES_TEXT 107 KPROBES_TEXT
108 ENTRY_TEXT
108 IRQENTRY_TEXT 109 IRQENTRY_TEXT
109 *(.fixup) 110 *(.fixup)
110 *(.gnu.warning) 111 *(.gnu.warning)
@@ -230,7 +231,7 @@ SECTIONS
230 * output PHDR, so the next output section - .init.text - should 231 * output PHDR, so the next output section - .init.text - should
231 * start another segment - init. 232 * start another segment - init.
232 */ 233 */
233 PERCPU_VADDR(0, :percpu) 234 PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
234#endif 235#endif
235 236
236 INIT_TEXT_SECTION(PAGE_SIZE) 237 INIT_TEXT_SECTION(PAGE_SIZE)
@@ -240,6 +241,18 @@ SECTIONS
240 241
241 INIT_DATA_SECTION(16) 242 INIT_DATA_SECTION(16)
242 243
244 /*
245 * Code and data for a variety of low-level trampolines, to be
246 * copied into base memory (< 1 MiB) during initialization.
247 * Since it is copied early, the main copy can be discarded
248 * afterwards.
249 */
250 .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
251 x86_trampoline_start = .;
252 *(.x86_trampoline)
253 x86_trampoline_end = .;
254 }
255
243 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 256 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
244 __x86_cpu_dev_start = .; 257 __x86_cpu_dev_start = .;
245 *(.x86_cpu_dev.init) 258 *(.x86_cpu_dev.init)
@@ -291,6 +304,7 @@ SECTIONS
291 *(.iommu_table) 304 *(.iommu_table)
292 __iommu_table_end = .; 305 __iommu_table_end = .;
293 } 306 }
307
294 . = ALIGN(8); 308 . = ALIGN(8);
295 /* 309 /*
296 * .exit.text is discard at runtime, not link time, to deal with 310 * .exit.text is discard at runtime, not link time, to deal with
@@ -305,7 +319,7 @@ SECTIONS
305 } 319 }
306 320
307#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 321#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
308 PERCPU(THREAD_SIZE) 322 PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE)
309#endif 323#endif
310 324
311 . = ALIGN(PAGE_SIZE); 325 . = ALIGN(PAGE_SIZE);
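The new .x86_trampoline output section brackets its contents with x86_trampoline_start/x86_trampoline_end, which trampoline.c uses to size the early copy. As a rough userspace parallel, a section whose name is a valid C identifier gets __start_/__stop_ boundary symbols from the GNU linker and can be sized the same way (section and symbol names below are illustrative):

#include <stddef.h>
#include <stdio.h>

/* Park some data in a named section; "used" keeps it without a direct reference. */
__attribute__((section("demo_blob"), used))
static const char payload[] = "real-mode code would live here";

/* Linker-provided boundary symbols, analogous to x86_trampoline_start/end. */
extern const char __start_demo_blob[], __stop_demo_blob[];

int main(void)
{
        size_t size = (size_t)(__stop_demo_blob - __start_demo_blob);

        printf("section spans %zu bytes; the kernel would memcpy() this much\n", size);
        return 0;
}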
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1b950d151e58..9796c2f3d074 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
52EXPORT_SYMBOL(memset); 52EXPORT_SYMBOL(memset);
53EXPORT_SYMBOL(memcpy); 53EXPORT_SYMBOL(memcpy);
54EXPORT_SYMBOL(__memcpy); 54EXPORT_SYMBOL(__memcpy);
55EXPORT_SYMBOL(memmove);
55 56
56EXPORT_SYMBOL(empty_zero_page); 57EXPORT_SYMBOL(empty_zero_page);
57#ifndef CONFIG_PARAVIRT 58#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ceb2911aa439..c11514e9128b 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = {
70 .setup_percpu_clockev = setup_boot_APIC_clock, 70 .setup_percpu_clockev = setup_boot_APIC_clock,
71 .tsc_pre_init = x86_init_noop, 71 .tsc_pre_init = x86_init_noop,
72 .timer_init = hpet_time_init, 72 .timer_init = hpet_time_init,
73 .wallclock_init = x86_init_noop,
73 }, 74 },
74 75
75 .iommu = { 76 .iommu = {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 547128546cc3..a3911343976b 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -53,7 +53,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
53 53
54 /* 54 /*
55 * None of the feature bits are in init state. So nothing else 55 * None of the feature bits are in init state. So nothing else
56 * to do for us, as the memory layout is upto date. 56 * to do for us, as the memory layout is up to date.
57 */ 57 */
58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask) 58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
59 return; 59 return;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index caf966781d25..0ad47b819a8b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -76,6 +76,7 @@
76#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 76#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
77#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 77#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
78/* Misc flags */ 78/* Misc flags */
79#define VendorSpecific (1<<22) /* Vendor specific instruction */
79#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ 80#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
80#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ 81#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
81#define Undefined (1<<25) /* No Such Instruction */ 82#define Undefined (1<<25) /* No Such Instruction */
@@ -877,7 +878,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
877 if (selector & 1 << 2) { 878 if (selector & 1 << 2) {
878 struct desc_struct desc; 879 struct desc_struct desc;
879 memset (dt, 0, sizeof *dt); 880 memset (dt, 0, sizeof *dt);
880 if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) 881 if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR,
882 ctxt->vcpu))
881 return; 883 return;
882 884
883 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ 885 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
@@ -929,6 +931,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
929 return ret; 931 return ret;
930} 932}
931 933
934/* Does not support long mode */
932static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 935static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
933 struct x86_emulate_ops *ops, 936 struct x86_emulate_ops *ops,
934 u16 selector, int seg) 937 u16 selector, int seg)
@@ -1040,7 +1043,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1040 } 1043 }
1041load: 1044load:
1042 ops->set_segment_selector(selector, seg, ctxt->vcpu); 1045 ops->set_segment_selector(selector, seg, ctxt->vcpu);
1043 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); 1046 ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
1044 return X86EMUL_CONTINUE; 1047 return X86EMUL_CONTINUE;
1045exception: 1048exception:
1046 emulate_exception(ctxt, err_vec, err_code, true); 1049 emulate_exception(ctxt, err_vec, err_code, true);
@@ -1560,7 +1563,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1560 struct desc_struct *ss) 1563 struct desc_struct *ss)
1561{ 1564{
1562 memset(cs, 0, sizeof(struct desc_struct)); 1565 memset(cs, 0, sizeof(struct desc_struct));
1563 ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); 1566 ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu);
1564 memset(ss, 0, sizeof(struct desc_struct)); 1567 memset(ss, 0, sizeof(struct desc_struct));
1565 1568
1566 cs->l = 0; /* will be adjusted later */ 1569 cs->l = 0; /* will be adjusted later */
@@ -1607,9 +1610,9 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1607 cs.d = 0; 1610 cs.d = 0;
1608 cs.l = 1; 1611 cs.l = 1;
1609 } 1612 }
1610 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1613 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1611 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1614 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1612 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1615 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1613 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1616 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1614 1617
1615 c->regs[VCPU_REGS_RCX] = c->eip; 1618 c->regs[VCPU_REGS_RCX] = c->eip;
@@ -1679,9 +1682,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1679 cs.l = 1; 1682 cs.l = 1;
1680 } 1683 }
1681 1684
1682 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1685 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1683 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1686 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1684 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1687 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1685 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1688 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1686 1689
1687 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 1690 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
@@ -1736,9 +1739,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1736 cs_sel |= SELECTOR_RPL_MASK; 1739 cs_sel |= SELECTOR_RPL_MASK;
1737 ss_sel |= SELECTOR_RPL_MASK; 1740 ss_sel |= SELECTOR_RPL_MASK;
1738 1741
1739 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1742 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
1740 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1743 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1741 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1744 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1742 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 1745 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1743 1746
1744 c->eip = c->regs[VCPU_REGS_RDX]; 1747 c->eip = c->regs[VCPU_REGS_RDX];
@@ -1764,24 +1767,28 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1764 u16 port, u16 len) 1767 u16 port, u16 len)
1765{ 1768{
1766 struct desc_struct tr_seg; 1769 struct desc_struct tr_seg;
1770 u32 base3;
1767 int r; 1771 int r;
1768 u16 io_bitmap_ptr; 1772 u16 io_bitmap_ptr, perm, bit_idx = port & 0x7;
1769 u8 perm, bit_idx = port & 0x7;
1770 unsigned mask = (1 << len) - 1; 1773 unsigned mask = (1 << len) - 1;
1774 unsigned long base;
1771 1775
1772 ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); 1776 ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu);
1773 if (!tr_seg.p) 1777 if (!tr_seg.p)
1774 return false; 1778 return false;
1775 if (desc_limit_scaled(&tr_seg) < 103) 1779 if (desc_limit_scaled(&tr_seg) < 103)
1776 return false; 1780 return false;
1777 r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, 1781 base = get_desc_base(&tr_seg);
1778 ctxt->vcpu, NULL); 1782#ifdef CONFIG_X86_64
1783 base |= ((u64)base3) << 32;
1784#endif
1785 r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL);
1779 if (r != X86EMUL_CONTINUE) 1786 if (r != X86EMUL_CONTINUE)
1780 return false; 1787 return false;
1781 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) 1788 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
1782 return false; 1789 return false;
1783 r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, 1790 r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu,
1784 &perm, 1, ctxt->vcpu, NULL); 1791 NULL);
1785 if (r != X86EMUL_CONTINUE) 1792 if (r != X86EMUL_CONTINUE)
1786 return false; 1793 return false;
1787 if ((perm >> bit_idx) & mask) 1794 if ((perm >> bit_idx) & mask)
@@ -2126,7 +2133,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2126 } 2133 }
2127 2134
2128 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); 2135 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
2129 ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); 2136 ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu);
2130 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); 2137 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2131 2138
2132 if (has_error_code) { 2139 if (has_error_code) {
@@ -2365,7 +2372,8 @@ static struct group_dual group7 = { {
2365 D(SrcMem16 | ModRM | Mov | Priv), 2372 D(SrcMem16 | ModRM | Mov | Priv),
2366 D(SrcMem | ModRM | ByteOp | Priv | NoAccess), 2373 D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
2367}, { 2374}, {
2368 D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), 2375 D(SrcNone | ModRM | Priv | VendorSpecific), N,
2376 N, D(SrcNone | ModRM | Priv | VendorSpecific),
2369 D(SrcNone | ModRM | DstMem | Mov), N, 2377 D(SrcNone | ModRM | DstMem | Mov), N,
2370 D(SrcMem16 | ModRM | Mov | Priv), N, 2378 D(SrcMem16 | ModRM | Mov | Priv), N,
2371} }; 2379} };
@@ -2489,7 +2497,7 @@ static struct opcode opcode_table[256] = {
2489static struct opcode twobyte_table[256] = { 2497static struct opcode twobyte_table[256] = {
2490 /* 0x00 - 0x0F */ 2498 /* 0x00 - 0x0F */
2491 N, GD(0, &group7), N, N, 2499 N, GD(0, &group7), N, N,
2492 N, D(ImplicitOps), D(ImplicitOps | Priv), N, 2500 N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N,
2493 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, 2501 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
2494 N, D(ImplicitOps | ModRM), N, N, 2502 N, D(ImplicitOps | ModRM), N, N,
2495 /* 0x10 - 0x1F */ 2503 /* 0x10 - 0x1F */
@@ -2502,7 +2510,8 @@ static struct opcode twobyte_table[256] = {
2502 /* 0x30 - 0x3F */ 2510 /* 0x30 - 0x3F */
2503 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), 2511 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
2504 D(ImplicitOps | Priv), N, 2512 D(ImplicitOps | Priv), N,
2505 D(ImplicitOps), D(ImplicitOps | Priv), N, N, 2513 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
2514 N, N,
2506 N, N, N, N, N, N, N, N, 2515 N, N, N, N, N, N, N, N,
2507 /* 0x40 - 0x4F */ 2516 /* 0x40 - 0x4F */
2508 X16(D(DstReg | SrcMem | ModRM | Mov)), 2517 X16(D(DstReg | SrcMem | ModRM | Mov)),
@@ -2741,6 +2750,9 @@ done_prefixes:
2741 if (c->d == 0 || (c->d & Undefined)) 2750 if (c->d == 0 || (c->d & Undefined))
2742 return -1; 2751 return -1;
2743 2752
2753 if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
2754 return -1;
2755
2744 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 2756 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2745 c->op_bytes = 8; 2757 c->op_bytes = 8;
2746 2758
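emulator_io_port_access_allowed() now reads two bytes of the TSS I/O permission bitmap and folds the 64-bit base3 into the descriptor base. Stripped of the TSS and segment plumbing, the bitmap test itself reduces to roughly this standalone form:

#include <stdint.h>
#include <stdio.h>

/*
 * Read 16 bits of the permission bitmap starting at port/8 and refuse the
 * access if any of the 'len' bits beginning at (port & 7) is set.  The real
 * code also validates the offsets against the TSS limit before reading.
 */
static int io_port_allowed(const uint8_t *bitmap, uint16_t port, uint16_t len)
{
        uint16_t perm = bitmap[port / 8] | (uint16_t)bitmap[port / 8 + 1] << 8;
        unsigned int bit_idx = port & 0x7;
        unsigned int mask = (1u << len) - 1;

        return !((perm >> bit_idx) & mask);      /* any set bit denies access */
}

int main(void)
{
        uint8_t bitmap[16] = { 0 };

        bitmap[0x60 / 8] = 0x01;                 /* deny port 0x60 only */
        printf("port 0x60: %s\n", io_port_allowed(bitmap, 0x60, 1) ? "ok" : "denied");
        printf("port 0x61: %s\n", io_port_allowed(bitmap, 0x61, 1) ? "ok" : "denied");
        return 0;
}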
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 3cece05e4ac4..19fe855e7953 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -62,9 +62,6 @@ static void pic_unlock(struct kvm_pic *s)
62 } 62 }
63 63
64 if (!found) 64 if (!found)
65 found = s->kvm->bsp_vcpu;
66
67 if (!found)
68 return; 65 return;
69 66
70 kvm_make_request(KVM_REQ_EVENT, found); 67 kvm_make_request(KVM_REQ_EVENT, found);
@@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s)
75static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 72static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
76{ 73{
77 s->isr &= ~(1 << irq); 74 s->isr &= ~(1 << irq);
78 s->isr_ack |= (1 << irq);
79 if (s != &s->pics_state->pics[0]) 75 if (s != &s->pics_state->pics[0])
80 irq += 8; 76 irq += 8;
81 /* 77 /*
@@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
89 pic_lock(s->pics_state); 85 pic_lock(s->pics_state);
90} 86}
91 87
92void kvm_pic_clear_isr_ack(struct kvm *kvm)
93{
94 struct kvm_pic *s = pic_irqchip(kvm);
95
96 pic_lock(s);
97 s->pics[0].isr_ack = 0xff;
98 s->pics[1].isr_ack = 0xff;
99 pic_unlock(s);
100}
101
102/* 88/*
103 * set irq level. If an edge is detected, then the IRR is set to 1 89 * set irq level. If an edge is detected, then the IRR is set to 1
104 */ 90 */
@@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
281 s->irr = 0; 267 s->irr = 0;
282 s->imr = 0; 268 s->imr = 0;
283 s->isr = 0; 269 s->isr = 0;
284 s->isr_ack = 0xff;
285 s->priority_add = 0; 270 s->priority_add = 0;
286 s->irq_base = 0; 271 s->irq_base = 0;
287 s->read_reg_select = 0; 272 s->read_reg_select = 0;
@@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this,
545 */ 530 */
546static void pic_irq_request(struct kvm *kvm, int level) 531static void pic_irq_request(struct kvm *kvm, int level)
547{ 532{
548 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
549 struct kvm_pic *s = pic_irqchip(kvm); 533 struct kvm_pic *s = pic_irqchip(kvm);
550 int irq = pic_get_irq(&s->pics[0]);
551 534
552 s->output = level; 535 if (!s->output)
553 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
554 s->pics[0].isr_ack &= ~(1 << irq);
555 s->wakeup_needed = true; 536 s->wakeup_needed = true;
556 } 537 s->output = level;
557} 538}
558 539
559static const struct kvm_io_device_ops picdev_ops = { 540static const struct kvm_io_device_ops picdev_ops = {
@@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
575 s->pics[1].elcr_mask = 0xde; 556 s->pics[1].elcr_mask = 0xde;
576 s->pics[0].pics_state = s; 557 s->pics[0].pics_state = s;
577 s->pics[1].pics_state = s; 558 s->pics[1].pics_state = s;
578 s->pics[0].isr_ack = 0xff;
579 s->pics[1].isr_ack = 0xff;
580 559
581 /* 560 /*
582 * Initialize PIO device 561 * Initialize PIO device
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93cf9d0d3653..2b2255b1f04b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
417 case APIC_DM_INIT: 417 case APIC_DM_INIT:
418 if (level) { 418 if (level) {
419 result = 1; 419 result = 1;
420 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
421 printk(KERN_DEBUG
422 "INIT on a runnable vcpu %d\n",
423 vcpu->vcpu_id);
424 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 420 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
425 kvm_make_request(KVM_REQ_EVENT, vcpu); 421 kvm_make_request(KVM_REQ_EVENT, vcpu);
426 kvm_vcpu_kick(vcpu); 422 kvm_vcpu_kick(vcpu);
@@ -875,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
875 871
876 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); 872 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
877 873
878 if (vcpu->arch.apic->regs_page) 874 if (vcpu->arch.apic->regs)
879 __free_page(vcpu->arch.apic->regs_page); 875 free_page((unsigned long)vcpu->arch.apic->regs);
880 876
881 kfree(vcpu->arch.apic); 877 kfree(vcpu->arch.apic);
882} 878}
@@ -1065,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1065 1061
1066 vcpu->arch.apic = apic; 1062 vcpu->arch.apic = apic;
1067 1063
1068 apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); 1064 apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
1069 if (apic->regs_page == NULL) { 1065 if (!apic->regs) {
1070 printk(KERN_ERR "malloc apic regs error for vcpu %x\n", 1066 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
1071 vcpu->vcpu_id); 1067 vcpu->vcpu_id);
1072 goto nomem_free_apic; 1068 goto nomem_free_apic;
1073 } 1069 }
1074 apic->regs = page_address(apic->regs_page);
1075 apic->vcpu = vcpu; 1070 apic->vcpu = vcpu;
1076 1071
1077 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, 1072 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f5fe32c5edad..52c9e6b9e725 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,7 +13,6 @@ struct kvm_lapic {
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending; 15 bool irr_pending;
16 struct page *regs_page;
17 void *regs; 16 void *regs;
18 gpa_t vapic_addr; 17 gpa_t vapic_addr;
19 struct page *vapic_page; 18 struct page *vapic_page;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f02b8edc3d44..22fae7593ee7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -111,9 +111,6 @@ module_param(oos_shadow, bool, 0644);
111#define PT64_LEVEL_SHIFT(level) \ 111#define PT64_LEVEL_SHIFT(level) \
112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) 112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
113 113
114#define PT64_LEVEL_MASK(level) \
115 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
116
117#define PT64_INDEX(address, level)\ 114#define PT64_INDEX(address, level)\
118 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) 115 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
119 116
@@ -123,8 +120,6 @@ module_param(oos_shadow, bool, 0644);
123#define PT32_LEVEL_SHIFT(level) \ 120#define PT32_LEVEL_SHIFT(level) \
124 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) 121 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
125 122
126#define PT32_LEVEL_MASK(level) \
127 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
128#define PT32_LVL_OFFSET_MASK(level) \ 123#define PT32_LVL_OFFSET_MASK(level) \
129 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ 124 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
130 * PT32_LEVEL_BITS))) - 1)) 125 * PT32_LEVEL_BITS))) - 1))
@@ -379,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
379static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 374static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
380 int min) 375 int min)
381{ 376{
382 struct page *page; 377 void *page;
383 378
384 if (cache->nobjs >= min) 379 if (cache->nobjs >= min)
385 return 0; 380 return 0;
386 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 381 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
387 page = alloc_page(GFP_KERNEL); 382 page = (void *)__get_free_page(GFP_KERNEL);
388 if (!page) 383 if (!page)
389 return -ENOMEM; 384 return -ENOMEM;
390 cache->objects[cache->nobjs++] = page_address(page); 385 cache->objects[cache->nobjs++] = page;
391 } 386 }
392 return 0; 387 return 0;
393} 388}
@@ -554,13 +549,23 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
554 return ret; 549 return ret;
555} 550}
556 551
557static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) 552static struct kvm_memory_slot *
553gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
554 bool no_dirty_log)
558{ 555{
559 struct kvm_memory_slot *slot; 556 struct kvm_memory_slot *slot;
560 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 557
561 if (slot && slot->dirty_bitmap) 558 slot = gfn_to_memslot(vcpu->kvm, gfn);
562 return true; 559 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
563 return false; 560 (no_dirty_log && slot->dirty_bitmap))
561 slot = NULL;
562
563 return slot;
564}
565
566static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
564} 569}
565 570
566static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 571static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -1032,9 +1037,9 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1032 ASSERT(is_empty_shadow_page(sp->spt)); 1037 ASSERT(is_empty_shadow_page(sp->spt));
1033 hlist_del(&sp->hash_link); 1038 hlist_del(&sp->hash_link);
1034 list_del(&sp->link); 1039 list_del(&sp->link);
1035 __free_page(virt_to_page(sp->spt)); 1040 free_page((unsigned long)sp->spt);
1036 if (!sp->role.direct) 1041 if (!sp->role.direct)
1037 __free_page(virt_to_page(sp->gfns)); 1042 free_page((unsigned long)sp->gfns);
1038 kmem_cache_free(mmu_page_header_cache, sp); 1043 kmem_cache_free(mmu_page_header_cache, sp);
1039 kvm_mod_used_mmu_pages(kvm, -1); 1044 kvm_mod_used_mmu_pages(kvm, -1);
1040} 1045}
@@ -1199,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1199{ 1204{
1200} 1205}
1201 1206
1207static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
1208 struct kvm_mmu_page *sp, u64 *spte,
1209 const void *pte, unsigned long mmu_seq)
1210{
1211 WARN_ON(1);
1212}
1213
1202#define KVM_PAGE_ARRAY_NR 16 1214#define KVM_PAGE_ARRAY_NR 16
1203 1215
1204struct kvm_mmu_pages { 1216struct kvm_mmu_pages {
@@ -2150,26 +2162,13 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2150{ 2162{
2151} 2163}
2152 2164
2153static struct kvm_memory_slot *
2154pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
2155{
2156 struct kvm_memory_slot *slot;
2157
2158 slot = gfn_to_memslot(vcpu->kvm, gfn);
2159 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
2160 (no_dirty_log && slot->dirty_bitmap))
2161 slot = NULL;
2162
2163 return slot;
2164}
2165
2166static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2165static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2167 bool no_dirty_log) 2166 bool no_dirty_log)
2168{ 2167{
2169 struct kvm_memory_slot *slot; 2168 struct kvm_memory_slot *slot;
2170 unsigned long hva; 2169 unsigned long hva;
2171 2170
2172 slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); 2171 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2173 if (!slot) { 2172 if (!slot) {
2174 get_page(bad_page); 2173 get_page(bad_page);
2175 return page_to_pfn(bad_page); 2174 return page_to_pfn(bad_page);
@@ -2190,7 +2189,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2190 gfn_t gfn; 2189 gfn_t gfn;
2191 2190
2192 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); 2191 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2193 if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) 2192 if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
2194 return -1; 2193 return -1;
2195 2194
2196 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); 2195 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
@@ -2804,6 +2803,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2804 context->prefetch_page = nonpaging_prefetch_page; 2803 context->prefetch_page = nonpaging_prefetch_page;
2805 context->sync_page = nonpaging_sync_page; 2804 context->sync_page = nonpaging_sync_page;
2806 context->invlpg = nonpaging_invlpg; 2805 context->invlpg = nonpaging_invlpg;
2806 context->update_pte = nonpaging_update_pte;
2807 context->root_level = 0; 2807 context->root_level = 0;
2808 context->shadow_root_level = PT32E_ROOT_LEVEL; 2808 context->shadow_root_level = PT32E_ROOT_LEVEL;
2809 context->root_hpa = INVALID_PAGE; 2809 context->root_hpa = INVALID_PAGE;
@@ -2933,6 +2933,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2933 context->prefetch_page = paging64_prefetch_page; 2933 context->prefetch_page = paging64_prefetch_page;
2934 context->sync_page = paging64_sync_page; 2934 context->sync_page = paging64_sync_page;
2935 context->invlpg = paging64_invlpg; 2935 context->invlpg = paging64_invlpg;
2936 context->update_pte = paging64_update_pte;
2936 context->free = paging_free; 2937 context->free = paging_free;
2937 context->root_level = level; 2938 context->root_level = level;
2938 context->shadow_root_level = level; 2939 context->shadow_root_level = level;
@@ -2961,6 +2962,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
2961 context->prefetch_page = paging32_prefetch_page; 2962 context->prefetch_page = paging32_prefetch_page;
2962 context->sync_page = paging32_sync_page; 2963 context->sync_page = paging32_sync_page;
2963 context->invlpg = paging32_invlpg; 2964 context->invlpg = paging32_invlpg;
2965 context->update_pte = paging32_update_pte;
2964 context->root_level = PT32_ROOT_LEVEL; 2966 context->root_level = PT32_ROOT_LEVEL;
2965 context->shadow_root_level = PT32E_ROOT_LEVEL; 2967 context->shadow_root_level = PT32E_ROOT_LEVEL;
2966 context->root_hpa = INVALID_PAGE; 2968 context->root_hpa = INVALID_PAGE;
@@ -2985,6 +2987,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2985 context->prefetch_page = nonpaging_prefetch_page; 2987 context->prefetch_page = nonpaging_prefetch_page;
2986 context->sync_page = nonpaging_sync_page; 2988 context->sync_page = nonpaging_sync_page;
2987 context->invlpg = nonpaging_invlpg; 2989 context->invlpg = nonpaging_invlpg;
2990 context->update_pte = nonpaging_update_pte;
2988 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2991 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2989 context->root_hpa = INVALID_PAGE; 2992 context->root_hpa = INVALID_PAGE;
2990 context->direct_map = true; 2993 context->direct_map = true;
@@ -3089,8 +3092,6 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3089 3092
3090static int init_kvm_mmu(struct kvm_vcpu *vcpu) 3093static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3091{ 3094{
3092 vcpu->arch.update_pte.pfn = bad_pfn;
3093
3094 if (mmu_is_nested(vcpu)) 3095 if (mmu_is_nested(vcpu))
3095 return init_kvm_nested_mmu(vcpu); 3096 return init_kvm_nested_mmu(vcpu);
3096 else if (tdp_enabled) 3097 else if (tdp_enabled)
@@ -3164,7 +3165,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3164static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3165 struct kvm_mmu_page *sp, 3166 struct kvm_mmu_page *sp,
3166 u64 *spte, 3167 u64 *spte,
3167 const void *new) 3168 const void *new, unsigned long mmu_seq)
3168{ 3169{
3169 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 3170 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
3170 ++vcpu->kvm->stat.mmu_pde_zapped; 3171 ++vcpu->kvm->stat.mmu_pde_zapped;
@@ -3172,10 +3173,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3172 } 3173 }
3173 3174
3174 ++vcpu->kvm->stat.mmu_pte_updated; 3175 ++vcpu->kvm->stat.mmu_pte_updated;
3175 if (!sp->role.cr4_pae) 3176 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq);
3176 paging32_update_pte(vcpu, sp, spte, new);
3177 else
3178 paging64_update_pte(vcpu, sp, spte, new);
3179} 3177}
3180 3178
3181static bool need_remote_flush(u64 old, u64 new) 3179static bool need_remote_flush(u64 old, u64 new)
@@ -3210,28 +3208,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
3210 return !!(spte && (*spte & shadow_accessed_mask)); 3208 return !!(spte && (*spte & shadow_accessed_mask));
3211} 3209}
3212 3210
3213static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3214 u64 gpte)
3215{
3216 gfn_t gfn;
3217 pfn_t pfn;
3218
3219 if (!is_present_gpte(gpte))
3220 return;
3221 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
3222
3223 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
3224 smp_rmb();
3225 pfn = gfn_to_pfn(vcpu->kvm, gfn);
3226
3227 if (is_error_pfn(pfn)) {
3228 kvm_release_pfn_clean(pfn);
3229 return;
3230 }
3231 vcpu->arch.update_pte.gfn = gfn;
3232 vcpu->arch.update_pte.pfn = pfn;
3233}
3234
3235static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) 3211static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
3236{ 3212{
3237 u64 *spte = vcpu->arch.last_pte_updated; 3213 u64 *spte = vcpu->arch.last_pte_updated;
@@ -3253,21 +3229,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3253 struct kvm_mmu_page *sp; 3229 struct kvm_mmu_page *sp;
3254 struct hlist_node *node; 3230 struct hlist_node *node;
3255 LIST_HEAD(invalid_list); 3231 LIST_HEAD(invalid_list);
3256 u64 entry, gentry; 3232 unsigned long mmu_seq;
3257 u64 *spte; 3233 u64 entry, gentry, *spte;
3258 unsigned offset = offset_in_page(gpa); 3234 unsigned pte_size, page_offset, misaligned, quadrant, offset;
3259 unsigned pte_size; 3235 int level, npte, invlpg_counter, r, flooded = 0;
3260 unsigned page_offset;
3261 unsigned misaligned;
3262 unsigned quadrant;
3263 int level;
3264 int flooded = 0;
3265 int npte;
3266 int r;
3267 int invlpg_counter;
3268 bool remote_flush, local_flush, zap_page; 3236 bool remote_flush, local_flush, zap_page;
3269 3237
3270 zap_page = remote_flush = local_flush = false; 3238 zap_page = remote_flush = local_flush = false;
3239 offset = offset_in_page(gpa);
3271 3240
3272 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 3241 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
3273 3242
@@ -3275,9 +3244,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3275 3244
3276 /* 3245 /*
3277 * Assume that the pte write on a page table of the same type 3246 * Assume that the pte write on a page table of the same type
3278 * as the current vcpu paging mode. This is nearly always true 3247 * as the current vcpu paging mode since we update the sptes only
3279 * (might be false while changing modes). Note it is verified later 3248 * when they have the same mode.
3280 * by update_pte().
3281 */ 3249 */
3282 if ((is_pae(vcpu) && bytes == 4) || !new) { 3250 if ((is_pae(vcpu) && bytes == 4) || !new) {
3283 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 3251 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
@@ -3303,15 +3271,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3303 break; 3271 break;
3304 } 3272 }
3305 3273
3306 mmu_guess_page_from_pte_write(vcpu, gpa, gentry); 3274 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3275 smp_rmb();
3276
3307 spin_lock(&vcpu->kvm->mmu_lock); 3277 spin_lock(&vcpu->kvm->mmu_lock);
3308 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) 3278 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3309 gentry = 0; 3279 gentry = 0;
3310 kvm_mmu_access_page(vcpu, gfn);
3311 kvm_mmu_free_some_pages(vcpu); 3280 kvm_mmu_free_some_pages(vcpu);
3312 ++vcpu->kvm->stat.mmu_pte_write; 3281 ++vcpu->kvm->stat.mmu_pte_write;
3313 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); 3282 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
3314 if (guest_initiated) { 3283 if (guest_initiated) {
3284 kvm_mmu_access_page(vcpu, gfn);
3315 if (gfn == vcpu->arch.last_pt_write_gfn 3285 if (gfn == vcpu->arch.last_pt_write_gfn
3316 && !last_updated_pte_accessed(vcpu)) { 3286 && !last_updated_pte_accessed(vcpu)) {
3317 ++vcpu->arch.last_pt_write_count; 3287 ++vcpu->arch.last_pt_write_count;
@@ -3375,7 +3345,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3375 if (gentry && 3345 if (gentry &&
3376 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3346 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3377 & mask.word)) 3347 & mask.word))
3378 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 3348 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry,
3349 mmu_seq);
3379 if (!remote_flush && need_remote_flush(entry, *spte)) 3350 if (!remote_flush && need_remote_flush(entry, *spte))
3380 remote_flush = true; 3351 remote_flush = true;
3381 ++spte; 3352 ++spte;
@@ -3385,10 +3356,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3385 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3356 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3386 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); 3357 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
3387 spin_unlock(&vcpu->kvm->mmu_lock); 3358 spin_unlock(&vcpu->kvm->mmu_lock);
3388 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
3389 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
3390 vcpu->arch.update_pte.pfn = bad_pfn;
3391 }
3392} 3359}
3393 3360
3394int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 3361int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3538,14 +3505,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3538 if (!test_bit(slot, sp->slot_bitmap)) 3505 if (!test_bit(slot, sp->slot_bitmap))
3539 continue; 3506 continue;
3540 3507
3541 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3542 continue;
3543
3544 pt = sp->spt; 3508 pt = sp->spt;
3545 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3509 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3510 if (!is_shadow_present_pte(pt[i]) ||
3511 !is_last_spte(pt[i], sp->role.level))
3512 continue;
3513
3514 if (is_large_pte(pt[i])) {
3515 drop_spte(kvm, &pt[i],
3516 shadow_trap_nonpresent_pte);
3517 --kvm->stat.lpages;
3518 continue;
3519 }
3520
3546 /* avoid RMW */ 3521 /* avoid RMW */
3547 if (is_writable_pte(pt[i])) 3522 if (is_writable_pte(pt[i]))
3548 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); 3523 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3524 }
3549 } 3525 }
3550 kvm_flush_remote_tlbs(kvm); 3526 kvm_flush_remote_tlbs(kvm);
3551} 3527}
@@ -3583,7 +3559,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3583 if (nr_to_scan == 0) 3559 if (nr_to_scan == 0)
3584 goto out; 3560 goto out;
3585 3561
3586 spin_lock(&kvm_lock); 3562 raw_spin_lock(&kvm_lock);
3587 3563
3588 list_for_each_entry(kvm, &vm_list, vm_list) { 3564 list_for_each_entry(kvm, &vm_list, vm_list) {
3589 int idx, freed_pages; 3565 int idx, freed_pages;
@@ -3606,7 +3582,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3606 if (kvm_freed) 3582 if (kvm_freed)
3607 list_move_tail(&kvm_freed->vm_list, &vm_list); 3583 list_move_tail(&kvm_freed->vm_list, &vm_list);
3608 3584
3609 spin_unlock(&kvm_lock); 3585 raw_spin_unlock(&kvm_lock);
3610 3586
3611out: 3587out:
3612 return percpu_counter_read_positive(&kvm_total_used_mmu_pages); 3588 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
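The pte-write hunks drop the cached update_pte.{gfn,pfn,mmu_seq} state in favour of a local mmu_seq snapshot checked with mmu_notifier_retry(), so a translation taken before an invalidation is simply discarded. A single-threaded toy of that snapshot-and-retry pattern; there are no barriers or locks here, and the real check also covers an in-progress invalidation count:

#include <stdio.h>

static unsigned long mmu_notifier_seq;

static int mmu_notifier_retry(unsigned long snap)
{
        return mmu_notifier_seq != snap;
}

static void invalidate_range(void)              /* what an MMU notifier would do */
{
        mmu_notifier_seq++;
}

int main(void)
{
        unsigned long snap = mmu_notifier_seq;  /* mmu_seq = kvm->mmu_notifier_seq */

        /* ...translate gfn to pfn here; an invalidation may race with us... */
        invalidate_range();

        if (mmu_notifier_retry(snap))
                printf("stale translation, discarding the new spte\n");
        else
                printf("translation still valid, installing the spte\n");
        return 0;
}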
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bccc24c4181..c6397795d865 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -31,7 +31,6 @@
31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) 31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) 32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
34 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
35 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
36 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
37 #define PT_MAX_FULL_LEVELS 4 36 #define PT_MAX_FULL_LEVELS 4
@@ -48,7 +47,6 @@
48 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) 47 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
49 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) 48 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
50 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
51 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
52 #define PT_LEVEL_BITS PT32_LEVEL_BITS 50 #define PT_LEVEL_BITS PT32_LEVEL_BITS
53 #define PT_MAX_FULL_LEVELS 2 51 #define PT_MAX_FULL_LEVELS 2
54 #define CMPXCHG cmpxchg 52 #define CMPXCHG cmpxchg
@@ -327,7 +325,7 @@ no_present:
327} 325}
328 326
329static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 327static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
330 u64 *spte, const void *pte) 328 u64 *spte, const void *pte, unsigned long mmu_seq)
331{ 329{
332 pt_element_t gpte; 330 pt_element_t gpte;
333 unsigned pte_access; 331 unsigned pte_access;
@@ -339,16 +337,16 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
339 337
340 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 338 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
341 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 339 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
342 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 340 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
341 if (is_error_pfn(pfn)) {
342 kvm_release_pfn_clean(pfn);
343 return; 343 return;
344 pfn = vcpu->arch.update_pte.pfn; 344 }
345 if (is_error_pfn(pfn)) 345 if (mmu_notifier_retry(vcpu, mmu_seq))
346 return;
347 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
348 return; 346 return;
349 kvm_get_pfn(pfn); 347
350 /* 348 /*
351 * we call mmu_set_spte() with host_writable = true beacuse that 349 * we call mmu_set_spte() with host_writable = true because that
352 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 350 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
353 */ 351 */
354 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 352 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -829,7 +827,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
829#undef FNAME 827#undef FNAME
830#undef PT_BASE_ADDR_MASK 828#undef PT_BASE_ADDR_MASK
831#undef PT_INDEX 829#undef PT_INDEX
832#undef PT_LEVEL_MASK
833#undef PT_LVL_ADDR_MASK 830#undef PT_LVL_ADDR_MASK
834#undef PT_LVL_OFFSET_MASK 831#undef PT_LVL_OFFSET_MASK
835#undef PT_LEVEL_BITS 832#undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 25bd1bc5aad2..6bb15d583e47 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -135,6 +135,8 @@ struct vcpu_svm {
135 135
136 u32 *msrpm; 136 u32 *msrpm;
137 137
138 ulong nmi_iret_rip;
139
138 struct nested_state nested; 140 struct nested_state nested;
139 141
140 bool nmi_singlestep; 142 bool nmi_singlestep;
@@ -1150,11 +1152,13 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1150 kvm_load_ldt(svm->host.ldt); 1152 kvm_load_ldt(svm->host.ldt);
1151#ifdef CONFIG_X86_64 1153#ifdef CONFIG_X86_64
1152 loadsegment(fs, svm->host.fs); 1154 loadsegment(fs, svm->host.fs);
1153 load_gs_index(svm->host.gs);
1154 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); 1155 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1156 load_gs_index(svm->host.gs);
1155#else 1157#else
1158#ifdef CONFIG_X86_32_LAZY_GS
1156 loadsegment(gs, svm->host.gs); 1159 loadsegment(gs, svm->host.gs);
1157#endif 1160#endif
1161#endif
1158 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1162 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1159 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1163 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1160} 1164}
@@ -2653,6 +2657,7 @@ static int iret_interception(struct vcpu_svm *svm)
2653 ++svm->vcpu.stat.nmi_window_exits; 2657 ++svm->vcpu.stat.nmi_window_exits;
2654 clr_intercept(svm, INTERCEPT_IRET); 2658 clr_intercept(svm, INTERCEPT_IRET);
2655 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2659 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2660 svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2656 return 1; 2661 return 1;
2657} 2662}
2658 2663
@@ -2777,6 +2782,8 @@ static int dr_interception(struct vcpu_svm *svm)
2777 kvm_register_write(&svm->vcpu, reg, val); 2782 kvm_register_write(&svm->vcpu, reg, val);
2778 } 2783 }
2779 2784
2785 skip_emulated_instruction(&svm->vcpu);
2786
2780 return 1; 2787 return 1;
2781} 2788}
2782 2789
@@ -3472,7 +3479,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3472 3479
3473 svm->int3_injected = 0; 3480 svm->int3_injected = 0;
3474 3481
3475 if (svm->vcpu.arch.hflags & HF_IRET_MASK) { 3482 /*
3483 * If we've made progress since setting HF_IRET_MASK, we've
3484 * executed an IRET and can allow NMI injection.
3485 */
3486 if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3487 && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3476 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3488 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3477 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3489 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3478 } 3490 }
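
A minimal user-space sketch of the idea in the hunk above (toy names, not KVM code): the IRET intercept records the guest RIP, and the NMI window only reopens once RIP has moved past that value, i.e. the IRET actually retired instead of merely being intercepted.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Toy model of the two-step NMI-window logic:
 * step 1 (IRET intercept): remember the guest RIP, set the IRET flag;
 * step 2 (interrupt completion): only treat the NMI window as open
 * once RIP differs from the remembered value. */
struct toy_vcpu {
	uint64_t rip;
	uint64_t nmi_iret_rip;
	bool iret_flag;
	bool nmi_masked;
};

static void iret_intercepted(struct toy_vcpu *v)
{
	v->iret_flag = true;
	v->nmi_iret_rip = v->rip;	/* IRET not executed yet */
}

static void complete_interrupts(struct toy_vcpu *v)
{
	if (v->iret_flag && v->rip != v->nmi_iret_rip) {
		v->iret_flag = false;
		v->nmi_masked = false;	/* safe to inject the next NMI */
	}
}

int main(void)
{
	struct toy_vcpu v = { .rip = 0x1000, .nmi_masked = true };

	iret_intercepted(&v);
	complete_interrupts(&v);
	assert(v.nmi_masked);		/* no progress yet: stay masked */

	v.rip = 0x1002;			/* guest really executed the IRET */
	complete_interrupts(&v);
	assert(!v.nmi_masked);
	return 0;
}
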
@@ -3639,19 +3651,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3639 wrmsrl(MSR_GS_BASE, svm->host.gs_base); 3651 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3640#else 3652#else
3641 loadsegment(fs, svm->host.fs); 3653 loadsegment(fs, svm->host.fs);
3654#ifndef CONFIG_X86_32_LAZY_GS
3655 loadsegment(gs, svm->host.gs);
3656#endif
3642#endif 3657#endif
3643 3658
3644 reload_tss(vcpu); 3659 reload_tss(vcpu);
3645 3660
3646 local_irq_disable(); 3661 local_irq_disable();
3647 3662
3648 stgi();
3649
3650 vcpu->arch.cr2 = svm->vmcb->save.cr2; 3663 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3651 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 3664 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3652 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 3665 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3653 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 3666 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3654 3667
3668 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3669 kvm_before_handle_nmi(&svm->vcpu);
3670
3671 stgi();
3672
3673 /* Any pending NMI will happen here */
3674
3675 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3676 kvm_after_handle_nmi(&svm->vcpu);
3677
3655 sync_cr8_to_lapic(vcpu); 3678 sync_cr8_to_lapic(vcpu);
3656 3679
3657 svm->next_rip = 0; 3680 svm->next_rip = 0;
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index fc7a101c4a35..abd86e865be3 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -25,7 +25,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
25 25
26 /* 26 /*
27 * There is a race window between reading and incrementing, but we do 27 * There is a race window between reading and incrementing, but we do
28 * not care about potentially loosing timer events in the !reinject 28 * not care about potentially losing timer events in the !reinject
29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked 29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
30 * in vcpu_enter_guest. 30 * in vcpu_enter_guest.
31 */ 31 */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 1357d7cf4ec8..db932760ea82 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -62,21 +62,21 @@ TRACE_EVENT(kvm_hv_hypercall,
62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), 62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
63 63
64 TP_STRUCT__entry( 64 TP_STRUCT__entry(
65 __field( __u16, code )
66 __field( bool, fast )
67 __field( __u16, rep_cnt ) 65 __field( __u16, rep_cnt )
68 __field( __u16, rep_idx ) 66 __field( __u16, rep_idx )
69 __field( __u64, ingpa ) 67 __field( __u64, ingpa )
70 __field( __u64, outgpa ) 68 __field( __u64, outgpa )
69 __field( __u16, code )
70 __field( bool, fast )
71 ), 71 ),
72 72
73 TP_fast_assign( 73 TP_fast_assign(
74 __entry->code = code;
75 __entry->fast = fast;
76 __entry->rep_cnt = rep_cnt; 74 __entry->rep_cnt = rep_cnt;
77 __entry->rep_idx = rep_idx; 75 __entry->rep_idx = rep_idx;
78 __entry->ingpa = ingpa; 76 __entry->ingpa = ingpa;
79 __entry->outgpa = outgpa; 77 __entry->outgpa = outgpa;
78 __entry->code = code;
79 __entry->fast = fast;
80 ), 80 ),
81 81
82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", 82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bf89ec2cfb82..5b4cdcbd154c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -93,14 +93,14 @@ module_param(yield_on_hlt, bool, S_IRUGO);
93 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 93 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
94 * ple_gap: upper bound on the amount of time between two successive 94 * ple_gap: upper bound on the amount of time between two successive
95 * executions of PAUSE in a loop. Also indicate if ple enabled. 95 * executions of PAUSE in a loop. Also indicate if ple enabled.
96 * According to test, this time is usually small than 41 cycles. 96 * According to test, this time is usually smaller than 128 cycles.
97 * ple_window: upper bound on the amount of time a guest is allowed to execute 97 * ple_window: upper bound on the amount of time a guest is allowed to execute
98 * in a PAUSE loop. Tests indicate that most spinlocks are held for 98 * in a PAUSE loop. Tests indicate that most spinlocks are held for
99 * less than 2^12 cycles 99 * less than 2^12 cycles
100 * Time is measured based on a counter that runs at the same rate as the TSC, 100 * Time is measured based on a counter that runs at the same rate as the TSC,
101 * refer SDM volume 3b section 21.6.13 & 22.1.3. 101 * refer SDM volume 3b section 21.6.13 & 22.1.3.
102 */ 102 */
103#define KVM_VMX_DEFAULT_PLE_GAP 41 103#define KVM_VMX_DEFAULT_PLE_GAP 128
104#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 104#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
105static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; 105static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
106module_param(ple_gap, int, S_IRUGO); 106module_param(ple_gap, int, S_IRUGO);
@@ -176,11 +176,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
176 return container_of(vcpu, struct vcpu_vmx, vcpu); 176 return container_of(vcpu, struct vcpu_vmx, vcpu);
177} 177}
178 178
179static int init_rmode(struct kvm *kvm);
180static u64 construct_eptp(unsigned long root_hpa); 179static u64 construct_eptp(unsigned long root_hpa);
181static void kvm_cpu_vmxon(u64 addr); 180static void kvm_cpu_vmxon(u64 addr);
182static void kvm_cpu_vmxoff(void); 181static void kvm_cpu_vmxoff(void);
183static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 182static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
183static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
184 184
185static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
186static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1333,19 +1333,25 @@ static __init int vmx_disabled_by_bios(void)
1333 1333
1334 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1334 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1335 if (msr & FEATURE_CONTROL_LOCKED) { 1335 if (msr & FEATURE_CONTROL_LOCKED) {
1336 /* launched w/ TXT and VMX disabled */
1336 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 1337 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1337 && tboot_enabled()) 1338 && tboot_enabled())
1338 return 1; 1339 return 1;
1340 /* launched w/o TXT and VMX only enabled w/ TXT */
1339 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 1341 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1342 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1340 && !tboot_enabled()) { 1343 && !tboot_enabled()) {
1341 printk(KERN_WARNING "kvm: disable TXT in the BIOS or " 1344 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1342 " activate TXT before enabling KVM\n"); 1345 "activate TXT before enabling KVM\n");
1343 return 1; 1346 return 1;
1344 } 1347 }
1348 /* launched w/o TXT and VMX disabled */
1349 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1350 && !tboot_enabled())
1351 return 1;
1345 } 1352 }
1346 1353
1347 return 0; 1354 return 0;
1348 /* locked but not enabled */
1349} 1355}
1350 1356
1351static void kvm_cpu_vmxon(u64 addr) 1357static void kvm_cpu_vmxon(u64 addr)
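
The three commented cases in the hunk above reduce to a small truth table. This toy restatement (my own helper, not the kernel function) can be compiled to double-check the corrected logic: when the feature-control MSR is locked, a tboot launch needs the inside-SMX enable bit and a plain launch needs the outside-SMX enable bit.

#include <assert.h>
#include <stdbool.h>

/* "Disabled by BIOS" means the MSR is locked and VMXON is not
 * permitted for the way we booted. */
static bool vmx_disabled(bool locked, bool inside_smx, bool outside_smx,
			 bool tboot)
{
	if (!locked)
		return false;	/* kernel may still set and lock the MSR */
	if (tboot)
		return !inside_smx;
	return !outside_smx;
}

int main(void)
{
	/* launched w/ TXT and VMX disabled */
	assert(vmx_disabled(true, false, true, true));
	/* launched w/o TXT and VMX only enabled w/ TXT */
	assert(vmx_disabled(true, true, false, false));
	/* launched w/o TXT and VMX disabled */
	assert(vmx_disabled(true, false, false, false));
	/* fully enabled either way */
	assert(!vmx_disabled(true, true, true, false));
	assert(!vmx_disabled(true, true, true, true));
	return 0;
}
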
@@ -1683,6 +1689,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1683 vmx->emulation_required = 1; 1689 vmx->emulation_required = 1;
1684 vmx->rmode.vm86_active = 0; 1690 vmx->rmode.vm86_active = 0;
1685 1691
1692 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
1686 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); 1693 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
1687 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); 1694 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
1688 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); 1695 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
@@ -1756,6 +1763,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1756 vmx->emulation_required = 1; 1763 vmx->emulation_required = 1;
1757 vmx->rmode.vm86_active = 1; 1764 vmx->rmode.vm86_active = 1;
1758 1765
1766 /*
1767 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
1768 * vcpu. Call it here with phys address pointing 16M below 4G.
1769 */
1770 if (!vcpu->kvm->arch.tss_addr) {
1771 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
1772 "called before entering vcpu\n");
1773 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
1774 vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
1775 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1776 }
1777
1778 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
1759 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1779 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1760 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1780 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1761 1781
@@ -1794,7 +1814,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1794 1814
1795continue_rmode: 1815continue_rmode:
1796 kvm_mmu_reset_context(vcpu); 1816 kvm_mmu_reset_context(vcpu);
1797 init_rmode(vcpu->kvm);
1798} 1817}
1799 1818
1800static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1819static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -2030,23 +2049,40 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2030 vmcs_writel(GUEST_CR4, hw_cr4); 2049 vmcs_writel(GUEST_CR4, hw_cr4);
2031} 2050}
2032 2051
2033static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2034{
2035 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2036
2037 return vmcs_readl(sf->base);
2038}
2039
2040static void vmx_get_segment(struct kvm_vcpu *vcpu, 2052static void vmx_get_segment(struct kvm_vcpu *vcpu,
2041 struct kvm_segment *var, int seg) 2053 struct kvm_segment *var, int seg)
2042{ 2054{
2055 struct vcpu_vmx *vmx = to_vmx(vcpu);
2043 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2056 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2057 struct kvm_save_segment *save;
2044 u32 ar; 2058 u32 ar;
2045 2059
2060 if (vmx->rmode.vm86_active
2061 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
2062 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
2063 || seg == VCPU_SREG_GS)
2064 && !emulate_invalid_guest_state) {
2065 switch (seg) {
2066 case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
2067 case VCPU_SREG_ES: save = &vmx->rmode.es; break;
2068 case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
2069 case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
2070 case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
2071 default: BUG();
2072 }
2073 var->selector = save->selector;
2074 var->base = save->base;
2075 var->limit = save->limit;
2076 ar = save->ar;
2077 if (seg == VCPU_SREG_TR
2078 || var->selector == vmcs_read16(sf->selector))
2079 goto use_saved_rmode_seg;
2080 }
2046 var->base = vmcs_readl(sf->base); 2081 var->base = vmcs_readl(sf->base);
2047 var->limit = vmcs_read32(sf->limit); 2082 var->limit = vmcs_read32(sf->limit);
2048 var->selector = vmcs_read16(sf->selector); 2083 var->selector = vmcs_read16(sf->selector);
2049 ar = vmcs_read32(sf->ar_bytes); 2084 ar = vmcs_read32(sf->ar_bytes);
2085use_saved_rmode_seg:
2050 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) 2086 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
2051 ar = 0; 2087 ar = 0;
2052 var->type = ar & 15; 2088 var->type = ar & 15;
@@ -2060,6 +2096,18 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
2060 var->unusable = (ar >> 16) & 1; 2096 var->unusable = (ar >> 16) & 1;
2061} 2097}
2062 2098
2099static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2100{
2101 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2102 struct kvm_segment s;
2103
2104 if (to_vmx(vcpu)->rmode.vm86_active) {
2105 vmx_get_segment(vcpu, &s, seg);
2106 return s.base;
2107 }
2108 return vmcs_readl(sf->base);
2109}
2110
2063static int vmx_get_cpl(struct kvm_vcpu *vcpu) 2111static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2064{ 2112{
2065 if (!is_protmode(vcpu)) 2113 if (!is_protmode(vcpu))
@@ -2101,6 +2149,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2101 u32 ar; 2149 u32 ar;
2102 2150
2103 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 2151 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
2152 vmcs_write16(sf->selector, var->selector);
2104 vmx->rmode.tr.selector = var->selector; 2153 vmx->rmode.tr.selector = var->selector;
2105 vmx->rmode.tr.base = var->base; 2154 vmx->rmode.tr.base = var->base;
2106 vmx->rmode.tr.limit = var->limit; 2155 vmx->rmode.tr.limit = var->limit;
@@ -2361,11 +2410,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
2361 2410
2362static int init_rmode_tss(struct kvm *kvm) 2411static int init_rmode_tss(struct kvm *kvm)
2363{ 2412{
2364 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 2413 gfn_t fn;
2365 u16 data = 0; 2414 u16 data = 0;
2366 int ret = 0; 2415 int r, idx, ret = 0;
2367 int r;
2368 2416
2417 idx = srcu_read_lock(&kvm->srcu);
2418 fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
2369 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 2419 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2370 if (r < 0) 2420 if (r < 0)
2371 goto out; 2421 goto out;
@@ -2389,12 +2439,13 @@ static int init_rmode_tss(struct kvm *kvm)
2389 2439
2390 ret = 1; 2440 ret = 1;
2391out: 2441out:
2442 srcu_read_unlock(&kvm->srcu, idx);
2392 return ret; 2443 return ret;
2393} 2444}
2394 2445
2395static int init_rmode_identity_map(struct kvm *kvm) 2446static int init_rmode_identity_map(struct kvm *kvm)
2396{ 2447{
2397 int i, r, ret; 2448 int i, idx, r, ret;
2398 pfn_t identity_map_pfn; 2449 pfn_t identity_map_pfn;
2399 u32 tmp; 2450 u32 tmp;
2400 2451
@@ -2409,6 +2460,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2409 return 1; 2460 return 1;
2410 ret = 0; 2461 ret = 0;
2411 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 2462 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
2463 idx = srcu_read_lock(&kvm->srcu);
2412 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 2464 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2413 if (r < 0) 2465 if (r < 0)
2414 goto out; 2466 goto out;
@@ -2424,6 +2476,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2424 kvm->arch.ept_identity_pagetable_done = true; 2476 kvm->arch.ept_identity_pagetable_done = true;
2425 ret = 1; 2477 ret = 1;
2426out: 2478out:
2479 srcu_read_unlock(&kvm->srcu, idx);
2427 return ret; 2480 return ret;
2428} 2481}
2429 2482
@@ -2699,22 +2752,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2699 return 0; 2752 return 0;
2700} 2753}
2701 2754
2702static int init_rmode(struct kvm *kvm)
2703{
2704 int idx, ret = 0;
2705
2706 idx = srcu_read_lock(&kvm->srcu);
2707 if (!init_rmode_tss(kvm))
2708 goto exit;
2709 if (!init_rmode_identity_map(kvm))
2710 goto exit;
2711
2712 ret = 1;
2713exit:
2714 srcu_read_unlock(&kvm->srcu, idx);
2715 return ret;
2716}
2717
2718static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2755static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2719{ 2756{
2720 struct vcpu_vmx *vmx = to_vmx(vcpu); 2757 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2722,10 +2759,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2722 int ret; 2759 int ret;
2723 2760
2724 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2761 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2725 if (!init_rmode(vmx->vcpu.kvm)) {
2726 ret = -ENOMEM;
2727 goto out;
2728 }
2729 2762
2730 vmx->rmode.vm86_active = 0; 2763 vmx->rmode.vm86_active = 0;
2731 2764
@@ -2805,7 +2838,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2805 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 2838 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
2806 if (vm_need_tpr_shadow(vmx->vcpu.kvm)) 2839 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
2807 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 2840 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
2808 page_to_phys(vmx->vcpu.arch.apic->regs_page)); 2841 __pa(vmx->vcpu.arch.apic->regs));
2809 vmcs_write32(TPR_THRESHOLD, 0); 2842 vmcs_write32(TPR_THRESHOLD, 0);
2810 } 2843 }
2811 2844
@@ -2971,6 +3004,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2971 if (ret) 3004 if (ret)
2972 return ret; 3005 return ret;
2973 kvm->arch.tss_addr = addr; 3006 kvm->arch.tss_addr = addr;
3007 if (!init_rmode_tss(kvm))
3008 return -ENOMEM;
3009
2974 return 0; 3010 return 0;
2975} 3011}
2976 3012
@@ -3962,7 +3998,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
3962#define Q "l" 3998#define Q "l"
3963#endif 3999#endif
3964 4000
3965static void vmx_vcpu_run(struct kvm_vcpu *vcpu) 4001static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
3966{ 4002{
3967 struct vcpu_vmx *vmx = to_vmx(vcpu); 4003 struct vcpu_vmx *vmx = to_vmx(vcpu);
3968 4004
@@ -3991,6 +4027,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3991 asm( 4027 asm(
3992 /* Store host registers */ 4028 /* Store host registers */
3993 "push %%"R"dx; push %%"R"bp;" 4029 "push %%"R"dx; push %%"R"bp;"
4030 "push %%"R"cx \n\t" /* placeholder for guest rcx */
3994 "push %%"R"cx \n\t" 4031 "push %%"R"cx \n\t"
3995 "cmp %%"R"sp, %c[host_rsp](%0) \n\t" 4032 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
3996 "je 1f \n\t" 4033 "je 1f \n\t"
@@ -4032,10 +4069,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4032 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 4069 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
4033 ".Lkvm_vmx_return: " 4070 ".Lkvm_vmx_return: "
4034 /* Save guest registers, load host registers, keep flags */ 4071 /* Save guest registers, load host registers, keep flags */
4035 "xchg %0, (%%"R"sp) \n\t" 4072 "mov %0, %c[wordsize](%%"R"sp) \n\t"
4073 "pop %0 \n\t"
4036 "mov %%"R"ax, %c[rax](%0) \n\t" 4074 "mov %%"R"ax, %c[rax](%0) \n\t"
4037 "mov %%"R"bx, %c[rbx](%0) \n\t" 4075 "mov %%"R"bx, %c[rbx](%0) \n\t"
4038 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" 4076 "pop"Q" %c[rcx](%0) \n\t"
4039 "mov %%"R"dx, %c[rdx](%0) \n\t" 4077 "mov %%"R"dx, %c[rdx](%0) \n\t"
4040 "mov %%"R"si, %c[rsi](%0) \n\t" 4078 "mov %%"R"si, %c[rsi](%0) \n\t"
4041 "mov %%"R"di, %c[rdi](%0) \n\t" 4079 "mov %%"R"di, %c[rdi](%0) \n\t"
@@ -4053,7 +4091,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4053 "mov %%cr2, %%"R"ax \n\t" 4091 "mov %%cr2, %%"R"ax \n\t"
4054 "mov %%"R"ax, %c[cr2](%0) \n\t" 4092 "mov %%"R"ax, %c[cr2](%0) \n\t"
4055 4093
4056 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" 4094 "pop %%"R"bp; pop %%"R"dx \n\t"
4057 "setbe %c[fail](%0) \n\t" 4095 "setbe %c[fail](%0) \n\t"
4058 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 4096 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
4059 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 4097 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
@@ -4076,7 +4114,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4076 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), 4114 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
4077 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), 4115 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
4078#endif 4116#endif
4079 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 4117 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
4118 [wordsize]"i"(sizeof(ulong))
4080 : "cc", "memory" 4119 : "cc", "memory"
4081 , R"ax", R"bx", R"di", R"si" 4120 , R"ax", R"bx", R"di", R"si"
4082#ifdef CONFIG_X86_64 4121#ifdef CONFIG_X86_64
@@ -4183,8 +4222,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4183 if (!kvm->arch.ept_identity_map_addr) 4222 if (!kvm->arch.ept_identity_map_addr)
4184 kvm->arch.ept_identity_map_addr = 4223 kvm->arch.ept_identity_map_addr =
4185 VMX_EPT_IDENTITY_PAGETABLE_ADDR; 4224 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4225 err = -ENOMEM;
4186 if (alloc_identity_pagetable(kvm) != 0) 4226 if (alloc_identity_pagetable(kvm) != 0)
4187 goto free_vmcs; 4227 goto free_vmcs;
4228 if (!init_rmode_identity_map(kvm))
4229 goto free_vmcs;
4188 } 4230 }
4189 4231
4190 return &vmx->vcpu; 4232 return &vmx->vcpu;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bcc0efce85bf..934b4c6b0bf9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -81,9 +81,10 @@
81 * - enable LME and LMA per default on 64 bit KVM 81 * - enable LME and LMA per default on 64 bit KVM
82 */ 82 */
83#ifdef CONFIG_X86_64 83#ifdef CONFIG_X86_64
84static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 84static
85u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
85#else 86#else
86static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 87static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
87#endif 88#endif
88 89
89#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 90#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
@@ -360,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
360 361
361void kvm_inject_nmi(struct kvm_vcpu *vcpu) 362void kvm_inject_nmi(struct kvm_vcpu *vcpu)
362{ 363{
364 kvm_make_request(KVM_REQ_NMI, vcpu);
363 kvm_make_request(KVM_REQ_EVENT, vcpu); 365 kvm_make_request(KVM_REQ_EVENT, vcpu);
364 vcpu->arch.nmi_pending = 1;
365} 366}
366EXPORT_SYMBOL_GPL(kvm_inject_nmi); 367EXPORT_SYMBOL_GPL(kvm_inject_nmi);
367 368
@@ -525,8 +526,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
525 526
526 kvm_x86_ops->set_cr0(vcpu, cr0); 527 kvm_x86_ops->set_cr0(vcpu, cr0);
527 528
528 if ((cr0 ^ old_cr0) & X86_CR0_PG) 529 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
529 kvm_clear_async_pf_completion_queue(vcpu); 530 kvm_clear_async_pf_completion_queue(vcpu);
531 kvm_async_pf_hash_reset(vcpu);
532 }
530 533
531 if ((cr0 ^ old_cr0) & update_bits) 534 if ((cr0 ^ old_cr0) & update_bits)
532 kvm_mmu_reset_context(vcpu); 535 kvm_mmu_reset_context(vcpu);
@@ -1017,7 +1020,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1017 unsigned long flags; 1020 unsigned long flags;
1018 s64 sdiff; 1021 s64 sdiff;
1019 1022
1020 spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1023 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1021 offset = data - native_read_tsc(); 1024 offset = data - native_read_tsc();
1022 ns = get_kernel_ns(); 1025 ns = get_kernel_ns();
1023 elapsed = ns - kvm->arch.last_tsc_nsec; 1026 elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -1028,7 +1031,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1028 /* 1031 /*
1029 * Special case: close write to TSC within 5 seconds of 1032 * Special case: close write to TSC within 5 seconds of
1030 * another CPU is interpreted as an attempt to synchronize 1033 * another CPU is interpreted as an attempt to synchronize
1031 * The 5 seconds is to accomodate host load / swapping as 1034 * The 5 seconds is to accommodate host load / swapping as
1032 * well as any reset of TSC during the boot process. 1035 * well as any reset of TSC during the boot process.
1033 * 1036 *
1034 * In that case, for a reliable TSC, we can match TSC offsets, 1037 * In that case, for a reliable TSC, we can match TSC offsets,
@@ -1050,7 +1053,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1050 kvm->arch.last_tsc_write = data; 1053 kvm->arch.last_tsc_write = data;
1051 kvm->arch.last_tsc_offset = offset; 1054 kvm->arch.last_tsc_offset = offset;
1052 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1055 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1053 spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1056 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1054 1057
1055 /* Reset of TSC must disable overshoot protection below */ 1058 /* Reset of TSC must disable overshoot protection below */
1056 vcpu->arch.hv_clock.tsc_timestamp = 0; 1059 vcpu->arch.hv_clock.tsc_timestamp = 0;
@@ -1453,6 +1456,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1453 return 0; 1456 return 0;
1454} 1457}
1455 1458
1459static void kvmclock_reset(struct kvm_vcpu *vcpu)
1460{
1461 if (vcpu->arch.time_page) {
1462 kvm_release_page_dirty(vcpu->arch.time_page);
1463 vcpu->arch.time_page = NULL;
1464 }
1465}
1466
1456int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1467int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1457{ 1468{
1458 switch (msr) { 1469 switch (msr) {
@@ -1510,10 +1521,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1510 break; 1521 break;
1511 case MSR_KVM_SYSTEM_TIME_NEW: 1522 case MSR_KVM_SYSTEM_TIME_NEW:
1512 case MSR_KVM_SYSTEM_TIME: { 1523 case MSR_KVM_SYSTEM_TIME: {
1513 if (vcpu->arch.time_page) { 1524 kvmclock_reset(vcpu);
1514 kvm_release_page_dirty(vcpu->arch.time_page);
1515 vcpu->arch.time_page = NULL;
1516 }
1517 1525
1518 vcpu->arch.time = data; 1526 vcpu->arch.time = data;
1519 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 1527 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -1592,6 +1600,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1592 } else 1600 } else
1593 return set_msr_hyperv(vcpu, msr, data); 1601 return set_msr_hyperv(vcpu, msr, data);
1594 break; 1602 break;
1603 case MSR_IA32_BBL_CR_CTL3:
1604 /* Drop writes to this legacy MSR -- see rdmsr
1605 * counterpart for further detail.
1606 */
1607 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1608 break;
1595 default: 1609 default:
1596 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1610 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1597 return xen_hvm_config(vcpu, data); 1611 return xen_hvm_config(vcpu, data);
@@ -1846,6 +1860,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1846 } else 1860 } else
1847 return get_msr_hyperv(vcpu, msr, pdata); 1861 return get_msr_hyperv(vcpu, msr, pdata);
1848 break; 1862 break;
1863 case MSR_IA32_BBL_CR_CTL3:
1864 /* This legacy MSR exists but isn't fully documented in current
1865 * silicon. It is however accessed by winxp in very narrow
1866 * scenarios where it sets bit #19, itself documented as
1867 * a "reserved" bit. Best effort attempt to source coherent
1868 * read data here should the balance of the register be
1869 * interpreted by the guest:
1870 *
1871 * L2 cache control register 3: 64GB range, 256KB size,
1872 * enabled, latency 0x1, configured
1873 */
1874 data = 0xbe702111;
1875 break;
1849 default: 1876 default:
1850 if (!ignore_msrs) { 1877 if (!ignore_msrs) {
1851 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1878 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
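
The two MSR_IA32_BBL_CR_CTL3 hunks follow a common pattern for legacy MSRs: reads return a fixed, self-consistent value and writes are silently dropped, so the guest never takes a #GP. A toy restatement with hypothetical helpers (0x11e is the conventional address of this P6-era MSR; the helpers are not KVM's):

#include <assert.h>
#include <stdint.h>

#define MSR_IA32_BBL_CR_CTL3 0x11e

static int toy_rdmsr(uint32_t msr, uint64_t *data)
{
	switch (msr) {
	case MSR_IA32_BBL_CR_CTL3:
		*data = 0xbe702111;	/* same canned value as above */
		return 0;
	default:
		return -1;		/* unhandled */
	}
}

static int toy_wrmsr(uint32_t msr, uint64_t data)
{
	switch (msr) {
	case MSR_IA32_BBL_CR_CTL3:
		(void)data;		/* accept and ignore the write */
		return 0;
	default:
		return -1;
	}
}

int main(void)
{
	uint64_t v = 0;

	assert(toy_wrmsr(MSR_IA32_BBL_CR_CTL3, 0) == 0);
	assert(toy_rdmsr(MSR_IA32_BBL_CR_CTL3, &v) == 0 && v == 0xbe702111);
	return 0;
}
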
@@ -2100,8 +2127,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2100 if (check_tsc_unstable()) { 2127 if (check_tsc_unstable()) {
2101 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); 2128 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
2102 vcpu->arch.tsc_catchup = 1; 2129 vcpu->arch.tsc_catchup = 1;
2103 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2104 } 2130 }
2131 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2105 if (vcpu->cpu != cpu) 2132 if (vcpu->cpu != cpu)
2106 kvm_migrate_timers(vcpu); 2133 kvm_migrate_timers(vcpu);
2107 vcpu->cpu = cpu; 2134 vcpu->cpu = cpu;
@@ -2368,9 +2395,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2368 int i; 2395 int i;
2369 2396
2370 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2397 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2371 for (i = 1; *nent < maxnent; ++i) { 2398 for (i = 1; *nent < maxnent && i < 64; ++i) {
2372 if (entry[i - 1].eax == 0 && i != 2) 2399 if (entry[i].eax == 0)
2373 break; 2400 continue;
2374 do_cpuid_1_ent(&entry[i], function, i); 2401 do_cpuid_1_ent(&entry[i], function, i);
2375 entry[i].flags |= 2402 entry[i].flags |=
2376 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2403 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
@@ -2575,9 +2602,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2575 if (mce->status & MCI_STATUS_UC) { 2602 if (mce->status & MCI_STATUS_UC) {
2576 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2603 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2577 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 2604 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2578 printk(KERN_DEBUG "kvm: set_mce: "
2579 "injects mce exception while "
2580 "previous one is in progress!\n");
2581 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2605 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2582 return 0; 2606 return 0;
2583 } 2607 }
@@ -2648,8 +2672,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2648 vcpu->arch.interrupt.pending = events->interrupt.injected; 2672 vcpu->arch.interrupt.pending = events->interrupt.injected;
2649 vcpu->arch.interrupt.nr = events->interrupt.nr; 2673 vcpu->arch.interrupt.nr = events->interrupt.nr;
2650 vcpu->arch.interrupt.soft = events->interrupt.soft; 2674 vcpu->arch.interrupt.soft = events->interrupt.soft;
2651 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2652 kvm_pic_clear_isr_ack(vcpu->kvm);
2653 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 2675 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2654 kvm_x86_ops->set_interrupt_shadow(vcpu, 2676 kvm_x86_ops->set_interrupt_shadow(vcpu,
2655 events->interrupt.shadow); 2677 events->interrupt.shadow);
@@ -4140,8 +4162,8 @@ static unsigned long emulator_get_cached_segment_base(int seg,
4140 return get_segment_base(vcpu, seg); 4162 return get_segment_base(vcpu, seg);
4141} 4163}
4142 4164
4143static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 4165static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
4144 struct kvm_vcpu *vcpu) 4166 int seg, struct kvm_vcpu *vcpu)
4145{ 4167{
4146 struct kvm_segment var; 4168 struct kvm_segment var;
4147 4169
@@ -4154,6 +4176,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
4154 var.limit >>= 12; 4176 var.limit >>= 12;
4155 set_desc_limit(desc, var.limit); 4177 set_desc_limit(desc, var.limit);
4156 set_desc_base(desc, (unsigned long)var.base); 4178 set_desc_base(desc, (unsigned long)var.base);
4179#ifdef CONFIG_X86_64
4180 if (base3)
4181 *base3 = var.base >> 32;
4182#endif
4157 desc->type = var.type; 4183 desc->type = var.type;
4158 desc->s = var.s; 4184 desc->s = var.s;
4159 desc->dpl = var.dpl; 4185 desc->dpl = var.dpl;
@@ -4166,8 +4192,8 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
4166 return true; 4192 return true;
4167} 4193}
4168 4194
4169static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, 4195static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
4170 struct kvm_vcpu *vcpu) 4196 int seg, struct kvm_vcpu *vcpu)
4171{ 4197{
4172 struct kvm_segment var; 4198 struct kvm_segment var;
4173 4199
@@ -4175,6 +4201,9 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
4175 kvm_get_segment(vcpu, &var, seg); 4201 kvm_get_segment(vcpu, &var, seg);
4176 4202
4177 var.base = get_desc_base(desc); 4203 var.base = get_desc_base(desc);
4204#ifdef CONFIG_X86_64
4205 var.base |= ((u64)base3) << 32;
4206#endif
4178 var.limit = get_desc_limit(desc); 4207 var.limit = get_desc_limit(desc);
4179 if (desc->g) 4208 if (desc->g)
4180 var.limit = (var.limit << 12) | 0xfff; 4209 var.limit = (var.limit << 12) | 0xfff;
@@ -4390,41 +4419,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4390 vcpu->arch.emulate_ctxt.have_exception = false; 4419 vcpu->arch.emulate_ctxt.have_exception = false;
4391 vcpu->arch.emulate_ctxt.perm_ok = false; 4420 vcpu->arch.emulate_ctxt.perm_ok = false;
4392 4421
4422 vcpu->arch.emulate_ctxt.only_vendor_specific_insn
4423 = emulation_type & EMULTYPE_TRAP_UD;
4424
4393 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); 4425 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4394 if (r == X86EMUL_PROPAGATE_FAULT)
4395 goto done;
4396 4426
4397 trace_kvm_emulate_insn_start(vcpu); 4427 trace_kvm_emulate_insn_start(vcpu);
4398
4399 /* Only allow emulation of specific instructions on #UD
4400 * (namely VMMCALL, sysenter, sysexit, syscall)*/
4401 if (emulation_type & EMULTYPE_TRAP_UD) {
4402 if (!c->twobyte)
4403 return EMULATE_FAIL;
4404 switch (c->b) {
4405 case 0x01: /* VMMCALL */
4406 if (c->modrm_mod != 3 || c->modrm_rm != 1)
4407 return EMULATE_FAIL;
4408 break;
4409 case 0x34: /* sysenter */
4410 case 0x35: /* sysexit */
4411 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4412 return EMULATE_FAIL;
4413 break;
4414 case 0x05: /* syscall */
4415 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4416 return EMULATE_FAIL;
4417 break;
4418 default:
4419 return EMULATE_FAIL;
4420 }
4421
4422 if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
4423 return EMULATE_FAIL;
4424 }
4425
4426 ++vcpu->stat.insn_emulation; 4428 ++vcpu->stat.insn_emulation;
4427 if (r) { 4429 if (r) {
4430 if (emulation_type & EMULTYPE_TRAP_UD)
4431 return EMULATE_FAIL;
4428 if (reexecute_instruction(vcpu, cr2)) 4432 if (reexecute_instruction(vcpu, cr2))
4429 return EMULATE_DONE; 4433 return EMULATE_DONE;
4430 if (emulation_type & EMULTYPE_SKIP) 4434 if (emulation_type & EMULTYPE_SKIP)
@@ -4452,7 +4456,6 @@ restart:
4452 return handle_emulation_failure(vcpu); 4456 return handle_emulation_failure(vcpu);
4453 } 4457 }
4454 4458
4455done:
4456 if (vcpu->arch.emulate_ctxt.have_exception) { 4459 if (vcpu->arch.emulate_ctxt.have_exception) {
4457 inject_emulated_exception(vcpu); 4460 inject_emulated_exception(vcpu);
4458 r = EMULATE_DONE; 4461 r = EMULATE_DONE;
@@ -4562,7 +4565,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4562 4565
4563 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); 4566 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4564 4567
4565 spin_lock(&kvm_lock); 4568 raw_spin_lock(&kvm_lock);
4566 list_for_each_entry(kvm, &vm_list, vm_list) { 4569 list_for_each_entry(kvm, &vm_list, vm_list) {
4567 kvm_for_each_vcpu(i, vcpu, kvm) { 4570 kvm_for_each_vcpu(i, vcpu, kvm) {
4568 if (vcpu->cpu != freq->cpu) 4571 if (vcpu->cpu != freq->cpu)
@@ -4572,7 +4575,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4572 send_ipi = 1; 4575 send_ipi = 1;
4573 } 4576 }
4574 } 4577 }
4575 spin_unlock(&kvm_lock); 4578 raw_spin_unlock(&kvm_lock);
4576 4579
4577 if (freq->old < freq->new && send_ipi) { 4580 if (freq->old < freq->new && send_ipi) {
4578 /* 4581 /*
@@ -4955,12 +4958,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
4955 best = e; 4958 best = e;
4956 break; 4959 break;
4957 } 4960 }
4958 /*
4959 * Both basic or both extended?
4960 */
4961 if (((e->function ^ function) & 0x80000000) == 0)
4962 if (!best || e->function > best->function)
4963 best = e;
4964 } 4961 }
4965 return best; 4962 return best;
4966} 4963}
@@ -4980,6 +4977,27 @@ not_found:
4980 return 36; 4977 return 36;
4981} 4978}
4982 4979
4980/*
4981 * If no match is found, check whether we exceed the vCPU's limit
4982 * and return the content of the highest valid _standard_ leaf instead.
4983 * This is to satisfy the CPUID specification.
4984 */
4985static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
4986 u32 function, u32 index)
4987{
4988 struct kvm_cpuid_entry2 *maxlevel;
4989
4990 maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
4991 if (!maxlevel || maxlevel->eax >= function)
4992 return NULL;
4993 if (function & 0x80000000) {
4994 maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
4995 if (!maxlevel)
4996 return NULL;
4997 }
4998 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
4999}
5000
4983void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 5001void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
4984{ 5002{
4985 u32 function, index; 5003 u32 function, index;
@@ -4992,6 +5010,10 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
4992 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 5010 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
4993 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 5011 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
4994 best = kvm_find_cpuid_entry(vcpu, function, index); 5012 best = kvm_find_cpuid_entry(vcpu, function, index);
5013
5014 if (!best)
5015 best = check_cpuid_limit(vcpu, function, index);
5016
4995 if (best) { 5017 if (best) {
4996 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 5018 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
4997 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 5019 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
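
The fallback added in the hunks above implements the CPUID specification's out-of-range behavior: a leaf beyond the limit of its class resolves to the highest valid standard leaf. A self-contained model over a toy leaf table (not the KVM structures) shows the same resolution logic:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Toy CPUID table: leaf 0 holds the max basic leaf in eax,
 * leaf 0x80000000 holds the max extended leaf in eax. */
struct leaf { uint32_t function, eax, ebx, ecx, edx; };

static const struct leaf table[] = {
	{ 0x00000000, 0x00000005, 0, 0, 0 },	/* max basic leaf = 5 */
	{ 0x00000005, 0x11111111, 0, 0, 0 },
	{ 0x80000000, 0x80000001, 0, 0, 0 },	/* max extended leaf */
	{ 0x80000001, 0x22222222, 0, 0, 0 },
};

static const struct leaf *find(uint32_t function)
{
	for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].function == function)
			return &table[i];
	return NULL;
}

/* Same shape as check_cpuid_limit() above: when the requested leaf is
 * beyond the limit of its class, fall back to the highest valid
 * basic leaf. */
static const struct leaf *check_limit(uint32_t function)
{
	const struct leaf *max = find(function & 0x80000000);

	if (!max || max->eax >= function)
		return NULL;		/* in range: no fallback needed */
	if (function & 0x80000000) {
		max = find(0);		/* fall back to the basic class */
		if (!max)
			return NULL;
	}
	return find(max->eax);
}

int main(void)
{
	/* 0x00000007 exceeds max basic leaf 5: resolves to leaf 5. */
	assert(check_limit(0x00000007)->eax == 0x11111111);
	/* 0x80000002 exceeds max extended leaf: resolves to basic leaf 5. */
	assert(check_limit(0x80000002)->eax == 0x11111111);
	/* In-range leaves are left alone. */
	assert(check_limit(0x80000001) == NULL);
	return 0;
}
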
@@ -5185,6 +5207,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5185 r = 1; 5207 r = 1;
5186 goto out; 5208 goto out;
5187 } 5209 }
5210 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5211 vcpu->arch.nmi_pending = true;
5188 } 5212 }
5189 5213
5190 r = kvm_mmu_reload(vcpu); 5214 r = kvm_mmu_reload(vcpu);
@@ -5213,14 +5237,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5213 kvm_load_guest_fpu(vcpu); 5237 kvm_load_guest_fpu(vcpu);
5214 kvm_load_guest_xcr0(vcpu); 5238 kvm_load_guest_xcr0(vcpu);
5215 5239
5216 atomic_set(&vcpu->guest_mode, 1); 5240 vcpu->mode = IN_GUEST_MODE;
5217 smp_wmb(); 5241
5242 /* We should set ->mode before check ->requests,
5243 * see the comment in make_all_cpus_request.
5244 */
5245 smp_mb();
5218 5246
5219 local_irq_disable(); 5247 local_irq_disable();
5220 5248
5221 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests 5249 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
5222 || need_resched() || signal_pending(current)) { 5250 || need_resched() || signal_pending(current)) {
5223 atomic_set(&vcpu->guest_mode, 0); 5251 vcpu->mode = OUTSIDE_GUEST_MODE;
5224 smp_wmb(); 5252 smp_wmb();
5225 local_irq_enable(); 5253 local_irq_enable();
5226 preempt_enable(); 5254 preempt_enable();
@@ -5256,7 +5284,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5256 5284
5257 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); 5285 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
5258 5286
5259 atomic_set(&vcpu->guest_mode, 0); 5287 vcpu->mode = OUTSIDE_GUEST_MODE;
5260 smp_wmb(); 5288 smp_wmb();
5261 local_irq_enable(); 5289 local_irq_enable();
5262 5290
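
The "set ->mode before check ->requests" comment a few hunks above relies on a store/barrier/load handshake on both sides: the vCPU publishes IN_GUEST_MODE and then checks requests, while a kicker raises a request and then checks the mode. A minimal C11-atomics sketch of that ordering (toy names, single-threaded driver; the full barrier here plays the role of smp_mb()):

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, EXITING_GUEST_MODE };

static _Atomic int mode = OUTSIDE_GUEST_MODE;
static _Atomic unsigned long requests;
static _Atomic int kicks;

static bool vcpu_try_enter(void)
{
	atomic_store_explicit(&mode, IN_GUEST_MODE, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() above */
	if (atomic_load_explicit(&requests, memory_order_relaxed)) {
		atomic_store_explicit(&mode, OUTSIDE_GUEST_MODE,
				      memory_order_relaxed);
		return false;			/* service the request first */
	}
	return true;				/* enter the guest */
}

static void kick_vcpu(void)
{
	atomic_fetch_or_explicit(&requests, 1UL, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&mode, memory_order_relaxed) == IN_GUEST_MODE)
		atomic_fetch_add_explicit(&kicks, 1, memory_order_relaxed);
}

int main(void)
{
	kick_vcpu();			/* request raised before entry... */
	assert(!vcpu_try_enter());	/* ...is noticed, guest not entered */
	return 0;
}
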
@@ -5574,7 +5602,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5574 struct kvm_sregs *sregs) 5602 struct kvm_sregs *sregs)
5575{ 5603{
5576 int mmu_reset_needed = 0; 5604 int mmu_reset_needed = 0;
5577 int pending_vec, max_bits; 5605 int pending_vec, max_bits, idx;
5578 struct desc_ptr dt; 5606 struct desc_ptr dt;
5579 5607
5580 dt.size = sregs->idt.limit; 5608 dt.size = sregs->idt.limit;
@@ -5603,10 +5631,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5603 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5631 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5604 if (sregs->cr4 & X86_CR4_OSXSAVE) 5632 if (sregs->cr4 & X86_CR4_OSXSAVE)
5605 update_cpuid(vcpu); 5633 update_cpuid(vcpu);
5634
5635 idx = srcu_read_lock(&vcpu->kvm->srcu);
5606 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5636 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5607 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 5637 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5608 mmu_reset_needed = 1; 5638 mmu_reset_needed = 1;
5609 } 5639 }
5640 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5610 5641
5611 if (mmu_reset_needed) 5642 if (mmu_reset_needed)
5612 kvm_mmu_reset_context(vcpu); 5643 kvm_mmu_reset_context(vcpu);
@@ -5617,8 +5648,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5617 if (pending_vec < max_bits) { 5648 if (pending_vec < max_bits) {
5618 kvm_queue_interrupt(vcpu, pending_vec, false); 5649 kvm_queue_interrupt(vcpu, pending_vec, false);
5619 pr_debug("Set back pending irq %d\n", pending_vec); 5650 pr_debug("Set back pending irq %d\n", pending_vec);
5620 if (irqchip_in_kernel(vcpu->kvm))
5621 kvm_pic_clear_isr_ack(vcpu->kvm);
5622 } 5651 }
5623 5652
5624 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5653 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -5814,10 +5843,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5814 5843
5815void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5844void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5816{ 5845{
5817 if (vcpu->arch.time_page) { 5846 kvmclock_reset(vcpu);
5818 kvm_release_page_dirty(vcpu->arch.time_page);
5819 vcpu->arch.time_page = NULL;
5820 }
5821 5847
5822 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 5848 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
5823 fx_free(vcpu); 5849 fx_free(vcpu);
@@ -5878,6 +5904,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5878 kvm_make_request(KVM_REQ_EVENT, vcpu); 5904 kvm_make_request(KVM_REQ_EVENT, vcpu);
5879 vcpu->arch.apf.msr_val = 0; 5905 vcpu->arch.apf.msr_val = 0;
5880 5906
5907 kvmclock_reset(vcpu);
5908
5881 kvm_clear_async_pf_completion_queue(vcpu); 5909 kvm_clear_async_pf_completion_queue(vcpu);
5882 kvm_async_pf_hash_reset(vcpu); 5910 kvm_async_pf_hash_reset(vcpu);
5883 vcpu->arch.apf.halted = false; 5911 vcpu->arch.apf.halted = false;
@@ -6005,7 +6033,7 @@ int kvm_arch_init_vm(struct kvm *kvm)
6005 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6033 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
6006 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6034 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
6007 6035
6008 spin_lock_init(&kvm->arch.tsc_write_lock); 6036 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
6009 6037
6010 return 0; 6038 return 0;
6011} 6039}
@@ -6103,7 +6131,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6103 int user_alloc) 6131 int user_alloc)
6104{ 6132{
6105 6133
6106 int npages = mem->memory_size >> PAGE_SHIFT; 6134 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6107 6135
6108 if (!user_alloc && !old.user_alloc && old.rmap && !npages) { 6136 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
6109 int ret; 6137 int ret;
@@ -6118,12 +6146,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6118 "failed to munmap memory\n"); 6146 "failed to munmap memory\n");
6119 } 6147 }
6120 6148
6149 if (!kvm->arch.n_requested_mmu_pages)
6150 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6151
6121 spin_lock(&kvm->mmu_lock); 6152 spin_lock(&kvm->mmu_lock);
6122 if (!kvm->arch.n_requested_mmu_pages) { 6153 if (nr_mmu_pages)
6123 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6124 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6154 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6125 }
6126
6127 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6155 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6128 spin_unlock(&kvm->mmu_lock); 6156 spin_unlock(&kvm->mmu_lock);
6129} 6157}
@@ -6157,7 +6185,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
6157 6185
6158 me = get_cpu(); 6186 me = get_cpu();
6159 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 6187 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
6160 if (atomic_xchg(&vcpu->guest_mode, 0)) 6188 if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
6161 smp_send_reschedule(cpu); 6189 smp_send_reschedule(cpu);
6162 put_cpu(); 6190 put_cpu();
6163} 6191}
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 38718041efc3..6e121a2a49e1 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,6 +2,7 @@ config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 select VIRTUALIZATION
5 select VIRTIO 6 select VIRTIO
6 select VIRTIO_RING 7 select VIRTIO_RING
7 select VIRTIO_CONSOLE 8 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4996cf5f73a0..1cd608973ce5 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -397,7 +397,7 @@ static void lguest_load_tr_desc(void)
397 * instead we just use the real "cpuid" instruction. Then I pretty much turned 397 * instead we just use the real "cpuid" instruction. Then I pretty much turned
398 * off feature bits until the Guest booted. (Don't say that: you'll damage 398 * off feature bits until the Guest booted. (Don't say that: you'll damage
399 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is 399 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
400 * hardly future proof.) Noone's listening! They don't like you anyway, 400 * hardly future proof.) No one's listening! They don't like you anyway,
401 * parenthetic weirdo! 401 * parenthetic weirdo!
402 * 402 *
403 * Replacing the cpuid so we can turn features off is great for the kernel, but 403 * Replacing the cpuid so we can turn features off is great for the kernel, but
@@ -824,7 +824,7 @@ static void __init lguest_init_IRQ(void)
824 824
825 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 825 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
826 /* Some systems map "vectors" to interrupts weirdly. Not us! */ 826 /* Some systems map "vectors" to interrupts weirdly. Not us! */
827 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 827 __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
828 if (i != SYSCALL_VECTOR) 828 if (i != SYSCALL_VECTOR)
829 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 829 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
830 } 830 }
@@ -847,7 +847,7 @@ static void __init lguest_init_IRQ(void)
847void lguest_setup_irq(unsigned int irq) 847void lguest_setup_irq(unsigned int irq)
848{ 848{
849 irq_alloc_desc_at(irq, 0); 849 irq_alloc_desc_at(irq, 0);
850 set_irq_chip_and_handler_name(irq, &lguest_irq_controller, 850 irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
851 handle_level_irq, "level"); 851 handle_level_irq, "level");
852} 852}
853 853
@@ -995,7 +995,7 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
995static void lguest_time_init(void) 995static void lguest_time_init(void)
996{ 996{
997 /* Set up the timer interrupt (0) to go to our simple timer routine */ 997 /* Set up the timer interrupt (0) to go to our simple timer routine */
998 set_irq_handler(0, lguest_time_irq); 998 irq_set_handler(0, lguest_time_irq);
999 999
1000 clocksource_register(&lguest_clock); 1000 clocksource_register(&lguest_clock);
1001 1001
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index e10cf070ede0..f2479f19ddde 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -42,4 +42,5 @@ else
42 lib-y += memmove_64.o memset_64.o 42 lib-y += memmove_64.o memset_64.o
43 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o 43 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
44 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o 44 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
45 lib-y += cmpxchg16b_emu.o
45endif 46endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 2cda60a06e65..e8e7e0d06f42 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -15,14 +15,12 @@
15 15
16/* if you want SMP support, implement these with real spinlocks */ 16/* if you want SMP support, implement these with real spinlocks */
17.macro LOCK reg 17.macro LOCK reg
18 pushfl 18 pushfl_cfi
19 CFI_ADJUST_CFA_OFFSET 4
20 cli 19 cli
21.endm 20.endm
22 21
23.macro UNLOCK reg 22.macro UNLOCK reg
24 popfl 23 popfl_cfi
25 CFI_ADJUST_CFA_OFFSET -4
26.endm 24.endm
27 25
28#define BEGIN(op) \ 26#define BEGIN(op) \
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 71e080de3352..391a083674b4 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -14,14 +14,12 @@
14#include <asm/dwarf2.h> 14#include <asm/dwarf2.h>
15 15
16.macro SAVE reg 16.macro SAVE reg
17 pushl %\reg 17 pushl_cfi %\reg
18 CFI_ADJUST_CFA_OFFSET 4
19 CFI_REL_OFFSET \reg, 0 18 CFI_REL_OFFSET \reg, 0
20.endm 19.endm
21 20
22.macro RESTORE reg 21.macro RESTORE reg
23 popl %\reg 22 popl_cfi %\reg
24 CFI_ADJUST_CFA_OFFSET -4
25 CFI_RESTORE \reg 23 CFI_RESTORE \reg
26.endm 24.endm
27 25
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index adbccd0bbb78..78d16a554db0 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -50,11 +50,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
50 */ 50 */
51ENTRY(csum_partial) 51ENTRY(csum_partial)
52 CFI_STARTPROC 52 CFI_STARTPROC
53 pushl %esi 53 pushl_cfi %esi
54 CFI_ADJUST_CFA_OFFSET 4
55 CFI_REL_OFFSET esi, 0 54 CFI_REL_OFFSET esi, 0
56 pushl %ebx 55 pushl_cfi %ebx
57 CFI_ADJUST_CFA_OFFSET 4
58 CFI_REL_OFFSET ebx, 0 56 CFI_REL_OFFSET ebx, 0
59 movl 20(%esp),%eax # Function arg: unsigned int sum 57 movl 20(%esp),%eax # Function arg: unsigned int sum
60 movl 16(%esp),%ecx # Function arg: int len 58 movl 16(%esp),%ecx # Function arg: int len
@@ -132,11 +130,9 @@ ENTRY(csum_partial)
132 jz 8f 130 jz 8f
133 roll $8, %eax 131 roll $8, %eax
1348: 1328:
135 popl %ebx 133 popl_cfi %ebx
136 CFI_ADJUST_CFA_OFFSET -4
137 CFI_RESTORE ebx 134 CFI_RESTORE ebx
138 popl %esi 135 popl_cfi %esi
139 CFI_ADJUST_CFA_OFFSET -4
140 CFI_RESTORE esi 136 CFI_RESTORE esi
141 ret 137 ret
142 CFI_ENDPROC 138 CFI_ENDPROC
@@ -148,11 +144,9 @@ ENDPROC(csum_partial)
148 144
149ENTRY(csum_partial) 145ENTRY(csum_partial)
150 CFI_STARTPROC 146 CFI_STARTPROC
151 pushl %esi 147 pushl_cfi %esi
152 CFI_ADJUST_CFA_OFFSET 4
153 CFI_REL_OFFSET esi, 0 148 CFI_REL_OFFSET esi, 0
154 pushl %ebx 149 pushl_cfi %ebx
155 CFI_ADJUST_CFA_OFFSET 4
156 CFI_REL_OFFSET ebx, 0 150 CFI_REL_OFFSET ebx, 0
157 movl 20(%esp),%eax # Function arg: unsigned int sum 151 movl 20(%esp),%eax # Function arg: unsigned int sum
158 movl 16(%esp),%ecx # Function arg: int len 152 movl 16(%esp),%ecx # Function arg: int len
@@ -260,11 +254,9 @@ ENTRY(csum_partial)
260 jz 90f 254 jz 90f
261 roll $8, %eax 255 roll $8, %eax
26290: 25690:
263 popl %ebx 257 popl_cfi %ebx
264 CFI_ADJUST_CFA_OFFSET -4
265 CFI_RESTORE ebx 258 CFI_RESTORE ebx
266 popl %esi 259 popl_cfi %esi
267 CFI_ADJUST_CFA_OFFSET -4
268 CFI_RESTORE esi 260 CFI_RESTORE esi
269 ret 261 ret
270 CFI_ENDPROC 262 CFI_ENDPROC
@@ -309,14 +301,11 @@ ENTRY(csum_partial_copy_generic)
309 CFI_STARTPROC 301 CFI_STARTPROC
310 subl $4,%esp 302 subl $4,%esp
311 CFI_ADJUST_CFA_OFFSET 4 303 CFI_ADJUST_CFA_OFFSET 4
312 pushl %edi 304 pushl_cfi %edi
313 CFI_ADJUST_CFA_OFFSET 4
314 CFI_REL_OFFSET edi, 0 305 CFI_REL_OFFSET edi, 0
315 pushl %esi 306 pushl_cfi %esi
316 CFI_ADJUST_CFA_OFFSET 4
317 CFI_REL_OFFSET esi, 0 307 CFI_REL_OFFSET esi, 0
318 pushl %ebx 308 pushl_cfi %ebx
319 CFI_ADJUST_CFA_OFFSET 4
320 CFI_REL_OFFSET ebx, 0 309 CFI_REL_OFFSET ebx, 0
321 movl ARGBASE+16(%esp),%eax # sum 310 movl ARGBASE+16(%esp),%eax # sum
322 movl ARGBASE+12(%esp),%ecx # len 311 movl ARGBASE+12(%esp),%ecx # len
@@ -426,17 +415,13 @@ DST( movb %cl, (%edi) )
426 415
427.previous 416.previous
428 417
429 popl %ebx 418 popl_cfi %ebx
430 CFI_ADJUST_CFA_OFFSET -4
431 CFI_RESTORE ebx 419 CFI_RESTORE ebx
432 popl %esi 420 popl_cfi %esi
433 CFI_ADJUST_CFA_OFFSET -4
434 CFI_RESTORE esi 421 CFI_RESTORE esi
435 popl %edi 422 popl_cfi %edi
436 CFI_ADJUST_CFA_OFFSET -4
437 CFI_RESTORE edi 423 CFI_RESTORE edi
438 popl %ecx # equivalent to addl $4,%esp 424 popl_cfi %ecx # equivalent to addl $4,%esp
439 CFI_ADJUST_CFA_OFFSET -4
440 ret 425 ret
441 CFI_ENDPROC 426 CFI_ENDPROC
442ENDPROC(csum_partial_copy_generic) 427ENDPROC(csum_partial_copy_generic)
@@ -459,14 +444,11 @@ ENDPROC(csum_partial_copy_generic)
459 444
460ENTRY(csum_partial_copy_generic) 445ENTRY(csum_partial_copy_generic)
461 CFI_STARTPROC 446 CFI_STARTPROC
462 pushl %ebx 447 pushl_cfi %ebx
463 CFI_ADJUST_CFA_OFFSET 4
464 CFI_REL_OFFSET ebx, 0 448 CFI_REL_OFFSET ebx, 0
465 pushl %edi 449 pushl_cfi %edi
466 CFI_ADJUST_CFA_OFFSET 4
467 CFI_REL_OFFSET edi, 0 450 CFI_REL_OFFSET edi, 0
468 pushl %esi 451 pushl_cfi %esi
469 CFI_ADJUST_CFA_OFFSET 4
470 CFI_REL_OFFSET esi, 0 452 CFI_REL_OFFSET esi, 0
471 movl ARGBASE+4(%esp),%esi #src 453 movl ARGBASE+4(%esp),%esi #src
472 movl ARGBASE+8(%esp),%edi #dst 454 movl ARGBASE+8(%esp),%edi #dst
@@ -527,14 +509,11 @@ DST( movb %dl, (%edi) )
527 jmp 7b 509 jmp 7b
528.previous 510.previous
529 511
530 popl %esi 512 popl_cfi %esi
531 CFI_ADJUST_CFA_OFFSET -4
532 CFI_RESTORE esi 513 CFI_RESTORE esi
533 popl %edi 514 popl_cfi %edi
534 CFI_ADJUST_CFA_OFFSET -4
535 CFI_RESTORE edi 515 CFI_RESTORE edi
536 popl %ebx 516 popl_cfi %ebx
537 CFI_ADJUST_CFA_OFFSET -4
538 CFI_RESTORE ebx 517 CFI_RESTORE ebx
539 ret 518 ret
540 CFI_ENDPROC 519 CFI_ENDPROC
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
new file mode 100644
index 000000000000..1e572c507d06
--- /dev/null
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -0,0 +1,65 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; version 2
5 * of the License.
6 *
7 */
8#include <linux/linkage.h>
9#include <asm/alternative-asm.h>
10#include <asm/frame.h>
11#include <asm/dwarf2.h>
12
13#ifdef CONFIG_SMP
14#define SEG_PREFIX %gs:
15#else
16#define SEG_PREFIX
17#endif
18
19.text
20
21/*
22 * Inputs:
23 * %rsi : memory location to compare
24 * %rax : low 64 bits of old value
25 * %rdx : high 64 bits of old value
26 * %rbx : low 64 bits of new value
27 * %rcx : high 64 bits of new value
28 * %al : Operation successful
29 */
30ENTRY(this_cpu_cmpxchg16b_emu)
31CFI_STARTPROC
32
33#
34# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
35# via the ZF. Caller will access %al to get result.
36#
37# Note that this is only useful for a cpuops operation. Meaning that we
38# do *not* have a fully atomic operation but just an operation that is
39# *atomic* on a single cpu (as provided by the this_cpu_xx class of
40# macros).
41#
42this_cpu_cmpxchg16b_emu:
43 pushf
44 cli
45
46 cmpq SEG_PREFIX(%rsi), %rax
47 jne not_same
48 cmpq SEG_PREFIX 8(%rsi), %rdx
49 jne not_same
50
51 movq %rbx, SEG_PREFIX(%rsi)
52 movq %rcx, SEG_PREFIX 8(%rsi)
53
54 popf
55 mov $1, %al
56 ret
57
58 not_same:
59 popf
60 xor %al,%al
61 ret
62
63CFI_ENDPROC
64
65ENDPROC(this_cpu_cmpxchg16b_emu)
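
A compact C sketch of what the new emulation computes, leaving out the %gs segment prefix and the pushf/cli/popf that make the real routine safe against interrupts on a single CPU (helper name and driver are mine, for illustration only):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Compare a two-word (16-byte) location against an expected pair and,
 * only if both words match, install the new pair; return the old
 * "success in %al" convention as a bool. */
static bool cmpxchg16b_sketch(uint64_t mem[2],
			      uint64_t old_lo, uint64_t old_hi,
			      uint64_t new_lo, uint64_t new_hi)
{
	/* In the real routine, interrupts are disabled around this block. */
	if (mem[0] != old_lo || mem[1] != old_hi)
		return false;
	mem[0] = new_lo;
	mem[1] = new_hi;
	return true;
}

int main(void)
{
	uint64_t slot[2] = { 1, 2 };

	assert(cmpxchg16b_sketch(slot, 1, 2, 10, 20));	/* matches: swapped */
	assert(slot[0] == 10 && slot[1] == 20);
	assert(!cmpxchg16b_sketch(slot, 1, 2, 0, 0));	/* stale old: refused */
	return 0;
}
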
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a460158b5ac5..99e482615195 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -117,7 +117,7 @@ ENDPROC(bad_from_user)
117 * rdx count 117 * rdx count
118 * 118 *
119 * Output: 119 * Output:
120 * eax uncopied bytes or 0 if successfull. 120 * eax uncopied bytes or 0 if successful.
121 */ 121 */
122ENTRY(copy_user_generic_unrolled) 122ENTRY(copy_user_generic_unrolled)
123 CFI_STARTPROC 123 CFI_STARTPROC
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index f0dba36578ea..fb903b758da8 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 2 * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
3 * 3 *
4 * This file is subject to the terms and conditions of the GNU General Public 4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file COPYING in the main directory of this archive 5 * License. See the file COPYING in the main directory of this archive
6 * for more details. No warranty for anything given at all. 6 * for more details. No warranty for anything given at all.
@@ -11,82 +11,82 @@
11 11
12/* 12/*
13 * Checksum copy with exception handling. 13 * Checksum copy with exception handling.
14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
15 * destination is zeroed. 15 * destination is zeroed.
16 * 16 *
17 * Input 17 * Input
18 * rdi source 18 * rdi source
19 * rsi destination 19 * rsi destination
20 * edx len (32bit) 20 * edx len (32bit)
21 * ecx sum (32bit) 21 * ecx sum (32bit)
22 * r8 src_err_ptr (int) 22 * r8 src_err_ptr (int)
23 * r9 dst_err_ptr (int) 23 * r9 dst_err_ptr (int)
24 * 24 *
25 * Output 25 * Output
26 * eax 64bit sum. undefined in case of exception. 26 * eax 64bit sum. undefined in case of exception.
27 * 27 *
28 * Wrappers need to take care of valid exception sum and zeroing. 28 * Wrappers need to take care of valid exception sum and zeroing.
29 * They also should align source or destination to 8 bytes. 29 * They also should align source or destination to 8 bytes.
30 */ 30 */
31 31
32 .macro source 32 .macro source
3310: 3310:
34 .section __ex_table,"a" 34 .section __ex_table, "a"
35 .align 8 35 .align 8
36 .quad 10b,.Lbad_source 36 .quad 10b, .Lbad_source
37 .previous 37 .previous
38 .endm 38 .endm
39 39
40 .macro dest 40 .macro dest
4120: 4120:
42 .section __ex_table,"a" 42 .section __ex_table, "a"
43 .align 8 43 .align 8
44 .quad 20b,.Lbad_dest 44 .quad 20b, .Lbad_dest
45 .previous 45 .previous
46 .endm 46 .endm
47 47
48 .macro ignore L=.Lignore 48 .macro ignore L=.Lignore
4930: 4930:
50 .section __ex_table,"a" 50 .section __ex_table, "a"
51 .align 8 51 .align 8
52 .quad 30b,\L 52 .quad 30b, \L
53 .previous 53 .previous
54 .endm 54 .endm
55 55
56 56
57ENTRY(csum_partial_copy_generic) 57ENTRY(csum_partial_copy_generic)
58 CFI_STARTPROC 58 CFI_STARTPROC
59 cmpl $3*64,%edx 59 cmpl $3*64, %edx
60 jle .Lignore 60 jle .Lignore
61 61
62.Lignore: 62.Lignore:
63 subq $7*8,%rsp 63 subq $7*8, %rsp
64 CFI_ADJUST_CFA_OFFSET 7*8 64 CFI_ADJUST_CFA_OFFSET 7*8
65 movq %rbx,2*8(%rsp) 65 movq %rbx, 2*8(%rsp)
66 CFI_REL_OFFSET rbx, 2*8 66 CFI_REL_OFFSET rbx, 2*8
67 movq %r12,3*8(%rsp) 67 movq %r12, 3*8(%rsp)
68 CFI_REL_OFFSET r12, 3*8 68 CFI_REL_OFFSET r12, 3*8
69 movq %r14,4*8(%rsp) 69 movq %r14, 4*8(%rsp)
70 CFI_REL_OFFSET r14, 4*8 70 CFI_REL_OFFSET r14, 4*8
71 movq %r13,5*8(%rsp) 71 movq %r13, 5*8(%rsp)
72 CFI_REL_OFFSET r13, 5*8 72 CFI_REL_OFFSET r13, 5*8
73 movq %rbp,6*8(%rsp) 73 movq %rbp, 6*8(%rsp)
74 CFI_REL_OFFSET rbp, 6*8 74 CFI_REL_OFFSET rbp, 6*8
75 75
76 movq %r8,(%rsp) 76 movq %r8, (%rsp)
77 movq %r9,1*8(%rsp) 77 movq %r9, 1*8(%rsp)
78
79 movl %ecx,%eax
80 movl %edx,%ecx
81 78
82 xorl %r9d,%r9d 79 movl %ecx, %eax
83 movq %rcx,%r12 80 movl %edx, %ecx
84 81
85 shrq $6,%r12 82 xorl %r9d, %r9d
86 jz .Lhandle_tail /* < 64 */ 83 movq %rcx, %r12
84
85 shrq $6, %r12
86 jz .Lhandle_tail /* < 64 */
87 87
88 clc 88 clc
89 89
90 /* main loop. clear in 64 byte blocks */ 90 /* main loop. clear in 64 byte blocks */
91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ 91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
92 /* r11: temp3, rdx: temp4, r12 loopcnt */ 92 /* r11: temp3, rdx: temp4, r12 loopcnt */
@@ -94,156 +94,156 @@ ENTRY(csum_partial_copy_generic)
94 .p2align 4 94 .p2align 4
95.Lloop: 95.Lloop:
96 source 96 source
97 movq (%rdi),%rbx 97 movq (%rdi), %rbx
98 source 98 source
99 movq 8(%rdi),%r8 99 movq 8(%rdi), %r8
100 source 100 source
101 movq 16(%rdi),%r11 101 movq 16(%rdi), %r11
102 source 102 source
103 movq 24(%rdi),%rdx 103 movq 24(%rdi), %rdx
104 104
105 source 105 source
106 movq 32(%rdi),%r10 106 movq 32(%rdi), %r10
107 source 107 source
108 movq 40(%rdi),%rbp 108 movq 40(%rdi), %rbp
109 source 109 source
110 movq 48(%rdi),%r14 110 movq 48(%rdi), %r14
111 source 111 source
112 movq 56(%rdi),%r13 112 movq 56(%rdi), %r13
113 113
114 ignore 2f 114 ignore 2f
115 prefetcht0 5*64(%rdi) 115 prefetcht0 5*64(%rdi)
1162: 1162:
117 adcq %rbx,%rax 117 adcq %rbx, %rax
118 adcq %r8,%rax 118 adcq %r8, %rax
119 adcq %r11,%rax 119 adcq %r11, %rax
120 adcq %rdx,%rax 120 adcq %rdx, %rax
121 adcq %r10,%rax 121 adcq %r10, %rax
122 adcq %rbp,%rax 122 adcq %rbp, %rax
123 adcq %r14,%rax 123 adcq %r14, %rax
124 adcq %r13,%rax 124 adcq %r13, %rax
125 125
126 decl %r12d 126 decl %r12d
127 127
128 dest 128 dest
129 movq %rbx,(%rsi) 129 movq %rbx, (%rsi)
130 dest 130 dest
131 movq %r8,8(%rsi) 131 movq %r8, 8(%rsi)
132 dest 132 dest
133 movq %r11,16(%rsi) 133 movq %r11, 16(%rsi)
134 dest 134 dest
135 movq %rdx,24(%rsi) 135 movq %rdx, 24(%rsi)
136 136
137 dest 137 dest
138 movq %r10,32(%rsi) 138 movq %r10, 32(%rsi)
139 dest 139 dest
140 movq %rbp,40(%rsi) 140 movq %rbp, 40(%rsi)
141 dest 141 dest
142 movq %r14,48(%rsi) 142 movq %r14, 48(%rsi)
143 dest 143 dest
144 movq %r13,56(%rsi) 144 movq %r13, 56(%rsi)
145 145
1463: 1463:
147
148 leaq 64(%rdi),%rdi
149 leaq 64(%rsi),%rsi
150 147
151 jnz .Lloop 148 leaq 64(%rdi), %rdi
149 leaq 64(%rsi), %rsi
152 150
153 adcq %r9,%rax 151 jnz .Lloop
154 152
155 /* do last upto 56 bytes */ 153 adcq %r9, %rax
154
155 /* do last up to 56 bytes */
156.Lhandle_tail: 156.Lhandle_tail:
157 /* ecx: count */ 157 /* ecx: count */
158 movl %ecx,%r10d 158 movl %ecx, %r10d
159 andl $63,%ecx 159 andl $63, %ecx
160 shrl $3,%ecx 160 shrl $3, %ecx
161 jz .Lfold 161 jz .Lfold
162 clc 162 clc
163 .p2align 4 163 .p2align 4
164.Lloop_8: 164.Lloop_8:
165 source 165 source
166 movq (%rdi),%rbx 166 movq (%rdi), %rbx
167 adcq %rbx,%rax 167 adcq %rbx, %rax
168 decl %ecx 168 decl %ecx
169 dest 169 dest
170 movq %rbx,(%rsi) 170 movq %rbx, (%rsi)
171 leaq 8(%rsi),%rsi /* preserve carry */ 171 leaq 8(%rsi), %rsi /* preserve carry */
172 leaq 8(%rdi),%rdi 172 leaq 8(%rdi), %rdi
173 jnz .Lloop_8 173 jnz .Lloop_8
174 adcq %r9,%rax /* add in carry */ 174 adcq %r9, %rax /* add in carry */
175 175
176.Lfold: 176.Lfold:
177 /* reduce checksum to 32bits */ 177 /* reduce checksum to 32bits */
178 movl %eax,%ebx 178 movl %eax, %ebx
179 shrq $32,%rax 179 shrq $32, %rax
180 addl %ebx,%eax 180 addl %ebx, %eax
181 adcl %r9d,%eax 181 adcl %r9d, %eax
182 182
183 /* do last upto 6 bytes */ 183 /* do last up to 6 bytes */
184.Lhandle_7: 184.Lhandle_7:
185 movl %r10d,%ecx 185 movl %r10d, %ecx
186 andl $7,%ecx 186 andl $7, %ecx
187 shrl $1,%ecx 187 shrl $1, %ecx
188 jz .Lhandle_1 188 jz .Lhandle_1
189 movl $2,%edx 189 movl $2, %edx
190 xorl %ebx,%ebx 190 xorl %ebx, %ebx
191 clc 191 clc
192 .p2align 4 192 .p2align 4
193.Lloop_1: 193.Lloop_1:
194 source 194 source
195 movw (%rdi),%bx 195 movw (%rdi), %bx
196 adcl %ebx,%eax 196 adcl %ebx, %eax
197 decl %ecx 197 decl %ecx
198 dest 198 dest
199 movw %bx,(%rsi) 199 movw %bx, (%rsi)
200 leaq 2(%rdi),%rdi 200 leaq 2(%rdi), %rdi
201 leaq 2(%rsi),%rsi 201 leaq 2(%rsi), %rsi
202 jnz .Lloop_1 202 jnz .Lloop_1
203 adcl %r9d,%eax /* add in carry */ 203 adcl %r9d, %eax /* add in carry */
204 204
205 /* handle last odd byte */ 205 /* handle last odd byte */
206.Lhandle_1: 206.Lhandle_1:
207 testl $1,%r10d 207 testl $1, %r10d
208 jz .Lende 208 jz .Lende
209 xorl %ebx,%ebx 209 xorl %ebx, %ebx
210 source 210 source
211 movb (%rdi),%bl 211 movb (%rdi), %bl
212 dest 212 dest
213 movb %bl,(%rsi) 213 movb %bl, (%rsi)
214 addl %ebx,%eax 214 addl %ebx, %eax
215 adcl %r9d,%eax /* carry */ 215 adcl %r9d, %eax /* carry */
216 216
217 CFI_REMEMBER_STATE 217 CFI_REMEMBER_STATE
218.Lende: 218.Lende:
219 movq 2*8(%rsp),%rbx 219 movq 2*8(%rsp), %rbx
220 CFI_RESTORE rbx 220 CFI_RESTORE rbx
221 movq 3*8(%rsp),%r12 221 movq 3*8(%rsp), %r12
222 CFI_RESTORE r12 222 CFI_RESTORE r12
223 movq 4*8(%rsp),%r14 223 movq 4*8(%rsp), %r14
224 CFI_RESTORE r14 224 CFI_RESTORE r14
225 movq 5*8(%rsp),%r13 225 movq 5*8(%rsp), %r13
226 CFI_RESTORE r13 226 CFI_RESTORE r13
227 movq 6*8(%rsp),%rbp 227 movq 6*8(%rsp), %rbp
228 CFI_RESTORE rbp 228 CFI_RESTORE rbp
229 addq $7*8,%rsp 229 addq $7*8, %rsp
230 CFI_ADJUST_CFA_OFFSET -7*8 230 CFI_ADJUST_CFA_OFFSET -7*8
231 ret 231 ret
232 CFI_RESTORE_STATE 232 CFI_RESTORE_STATE
233 233
234 /* Exception handlers. Very simple, zeroing is done in the wrappers */ 234 /* Exception handlers. Very simple, zeroing is done in the wrappers */
235.Lbad_source: 235.Lbad_source:
236 movq (%rsp),%rax 236 movq (%rsp), %rax
237 testq %rax,%rax 237 testq %rax, %rax
238 jz .Lende 238 jz .Lende
239 movl $-EFAULT,(%rax) 239 movl $-EFAULT, (%rax)
240 jmp .Lende 240 jmp .Lende
241 241
242.Lbad_dest: 242.Lbad_dest:
243 movq 8(%rsp),%rax 243 movq 8(%rsp), %rax
244 testq %rax,%rax 244 testq %rax, %rax
245 jz .Lende 245 jz .Lende
246 movl $-EFAULT,(%rax) 246 movl $-EFAULT, (%rax)
247 jmp .Lende 247 jmp .Lende
248 CFI_ENDPROC 248 CFI_ENDPROC
249ENDPROC(csum_partial_copy_generic) 249ENDPROC(csum_partial_copy_generic)
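The .Lfold block above is the step that squeezes the 64-bit running sum back into 32 bits before the tail bytes are handled. A small C sketch of just that reduction, under a hypothetical helper name (the final 16-bit fold and complement are left to the wrappers, as the header comment says):

#include <stdint.h>

/* Fold a 64-bit partial checksum into 32 bits, feeding the carry back in. */
static uint32_t csum_fold64_sketch(uint64_t sum)
{
	uint32_t lo = (uint32_t)sum;
	uint32_t hi = (uint32_t)(sum >> 32);
	uint64_t t  = (uint64_t)lo + hi;	/* at most a 33-bit value */

	return (uint32_t)t + (uint32_t)(t >> 32);	/* addl + adcl in the asm */
}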
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index bf51144d97e1..9845371c5c36 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
84 count64--; 84 count64--;
85 } 85 }
86 86
87 /* last upto 7 8byte blocks */ 87 /* last up to 7 8byte blocks */
88 count %= 8; 88 count %= 8;
89 while (count) { 89 while (count) {
90 asm("addq %1,%0\n\t" 90 asm("addq %1,%0\n\t"
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 000000000000..0ecb8433e5a8
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,197 @@
1/*
2 * Normally compiler builtins are used, but sometimes the compiler calls
3 * out-of-line code. Based on asm-i386/string.h.
4 *
5 * This assembly file is rewritten from the memmove_64.c file.
6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
7 */
8#define _STRING_C
9#include <linux/linkage.h>
10#include <asm/dwarf2.h>
11
12#undef memmove
13
14/*
15 * Implement memmove(). This can handle overlap between src and dst.
16 *
17 * Input:
18 * rdi: dest
19 * rsi: src
20 * rdx: count
21 *
22 * Output:
23 * rax: dest
24 */
25ENTRY(memmove)
26 CFI_STARTPROC
27	/* Handle copies of 32 bytes or more with the loops below */
28 mov %rdi, %rax
29 cmp $0x20, %rdx
30 jb 1f
31
32 /* Decide forward/backward copy mode */
33 cmp %rdi, %rsi
34 jb 2f
35
36 /*
37	 * The movsq instruction has a high startup latency,
38	 * so we handle small sizes with general-purpose registers.
39 */
40 cmp $680, %rdx
41 jb 3f
42 /*
43	 * The movsq instruction is only good for the aligned case.
44 */
45
46 cmpb %dil, %sil
47 je 4f
483:
49 sub $0x20, %rdx
50 /*
51	 * We gobble 32 bytes forward in each loop iteration.
52 */
535:
54 sub $0x20, %rdx
55 movq 0*8(%rsi), %r11
56 movq 1*8(%rsi), %r10
57 movq 2*8(%rsi), %r9
58 movq 3*8(%rsi), %r8
59 leaq 4*8(%rsi), %rsi
60
61 movq %r11, 0*8(%rdi)
62 movq %r10, 1*8(%rdi)
63 movq %r9, 2*8(%rdi)
64 movq %r8, 3*8(%rdi)
65 leaq 4*8(%rdi), %rdi
66 jae 5b
67 addq $0x20, %rdx
68 jmp 1f
69 /*
70 * Handle data forward by movsq.
71 */
72 .p2align 4
734:
74 movq %rdx, %rcx
75 movq -8(%rsi, %rdx), %r11
76 lea -8(%rdi, %rdx), %r10
77 shrq $3, %rcx
78 rep movsq
79 movq %r11, (%r10)
80 jmp 13f
81 /*
82 * Handle data backward by movsq.
83 */
84 .p2align 4
857:
86 movq %rdx, %rcx
87 movq (%rsi), %r11
88 movq %rdi, %r10
89 leaq -8(%rsi, %rdx), %rsi
90 leaq -8(%rdi, %rdx), %rdi
91 shrq $3, %rcx
92 std
93 rep movsq
94 cld
95 movq %r11, (%r10)
96 jmp 13f
97
98 /*
99 * Start to prepare for backward copy.
100 */
101 .p2align 4
1022:
103 cmp $680, %rdx
104 jb 6f
105 cmp %dil, %sil
106 je 7b
1076:
108 /*
109 * Calculate copy position to tail.
110 */
111 addq %rdx, %rsi
112 addq %rdx, %rdi
113 subq $0x20, %rdx
114 /*
115	 * We gobble 32 bytes backward in each loop iteration.
116 */
1178:
118 subq $0x20, %rdx
119 movq -1*8(%rsi), %r11
120 movq -2*8(%rsi), %r10
121 movq -3*8(%rsi), %r9
122 movq -4*8(%rsi), %r8
123 leaq -4*8(%rsi), %rsi
124
125 movq %r11, -1*8(%rdi)
126 movq %r10, -2*8(%rdi)
127 movq %r9, -3*8(%rdi)
128 movq %r8, -4*8(%rdi)
129 leaq -4*8(%rdi), %rdi
130 jae 8b
131 /*
132 * Calculate copy position to head.
133 */
134 addq $0x20, %rdx
135 subq %rdx, %rsi
136 subq %rdx, %rdi
1371:
138 cmpq $16, %rdx
139 jb 9f
140 /*
141 * Move data from 16 bytes to 31 bytes.
142 */
143 movq 0*8(%rsi), %r11
144 movq 1*8(%rsi), %r10
145 movq -2*8(%rsi, %rdx), %r9
146 movq -1*8(%rsi, %rdx), %r8
147 movq %r11, 0*8(%rdi)
148 movq %r10, 1*8(%rdi)
149 movq %r9, -2*8(%rdi, %rdx)
150 movq %r8, -1*8(%rdi, %rdx)
151 jmp 13f
152 .p2align 4
1539:
154 cmpq $8, %rdx
155 jb 10f
156 /*
157 * Move data from 8 bytes to 15 bytes.
158 */
159 movq 0*8(%rsi), %r11
160 movq -1*8(%rsi, %rdx), %r10
161 movq %r11, 0*8(%rdi)
162 movq %r10, -1*8(%rdi, %rdx)
163 jmp 13f
16410:
165 cmpq $4, %rdx
166 jb 11f
167 /*
168 * Move data from 4 bytes to 7 bytes.
169 */
170 movl (%rsi), %r11d
171 movl -4(%rsi, %rdx), %r10d
172 movl %r11d, (%rdi)
173 movl %r10d, -4(%rdi, %rdx)
174 jmp 13f
17511:
176 cmp $2, %rdx
177 jb 12f
178 /*
179 * Move data from 2 bytes to 3 bytes.
180 */
181 movw (%rsi), %r11w
182 movw -2(%rsi, %rdx), %r10w
183 movw %r11w, (%rdi)
184 movw %r10w, -2(%rdi, %rdx)
185 jmp 13f
18612:
187 cmp $1, %rdx
188 jb 13f
189 /*
190 * Move data for 1 byte.
191 */
192 movb (%rsi), %r11b
193 movb %r11b, (%rdi)
19413:
195 retq
196 CFI_ENDPROC
197ENDPROC(memmove)
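The control flow of the new memmove() is easier to see as a decision tree: very small copies fall through to the tail code, the direction is picked from the relative position of src and dest, and rep movsq is reserved for large copies whose source and destination share the same low address byte. A C sketch of that dispatch, with hypothetical names and the constants copied from the assembly:

#include <stddef.h>
#include <stdint.h>

/*
 * Sketch of the dispatch logic in the new memmove():
 *  - copies under 32 bytes go straight to the small-size tail,
 *  - otherwise the direction is chosen by comparing src and dest,
 *  - rep movsq is used only for large (>= 680 byte) copies whose
 *    src and dest share the same low address byte.
 */
enum memmove_path {
	PATH_SMALL_TAIL,
	PATH_FORWARD_REGS,
	PATH_FORWARD_MOVSQ,
	PATH_BACKWARD_REGS,
	PATH_BACKWARD_MOVSQ,
};

static enum memmove_path memmove_pick_path(const void *dest, const void *src,
					    size_t count)
{
	uintptr_t d = (uintptr_t)dest, s = (uintptr_t)src;
	int same_low_byte = (d & 0xff) == (s & 0xff);

	if (count < 0x20)
		return PATH_SMALL_TAIL;
	if (s >= d)	/* no harmful overlap: copy forward */
		return (count >= 680 && same_low_byte) ? PATH_FORWARD_MOVSQ
						       : PATH_FORWARD_REGS;
	/* src < dest: copy backward to handle overlap */
	return (count >= 680 && same_low_byte) ? PATH_BACKWARD_MOVSQ
					       : PATH_BACKWARD_REGS;
}

The 680-byte threshold mirrors the cmp $680, %rdx checks in both directions; below it the unrolled 32-byte register loops are expected to beat the startup cost of rep movsq.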
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 6d0f0ec41b34..000000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,192 +0,0 @@
1/* Normally compiler builtins are used, but sometimes the compiler calls out
2 of line code. Based on asm-i386/string.h.
3 */
4#define _STRING_C
5#include <linux/string.h>
6#include <linux/module.h>
7
8#undef memmove
9void *memmove(void *dest, const void *src, size_t count)
10{
11 unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
12 char *ret;
13
14 __asm__ __volatile__(
15 /* Handle more 32bytes in loop */
16 "mov %2, %3\n\t"
17 "cmp $0x20, %0\n\t"
18 "jb 1f\n\t"
19
20 /* Decide forward/backward copy mode */
21 "cmp %2, %1\n\t"
22 "jb 2f\n\t"
23
24 /*
25 * movsq instruction have many startup latency
26 * so we handle small size by general register.
27 */
28 "cmp $680, %0\n\t"
29 "jb 3f\n\t"
30 /*
31 * movsq instruction is only good for aligned case.
32 */
33 "cmpb %%dil, %%sil\n\t"
34 "je 4f\n\t"
35 "3:\n\t"
36 "sub $0x20, %0\n\t"
37 /*
38 * We gobble 32byts forward in each loop.
39 */
40 "5:\n\t"
41 "sub $0x20, %0\n\t"
42 "movq 0*8(%1), %4\n\t"
43 "movq 1*8(%1), %5\n\t"
44 "movq 2*8(%1), %6\n\t"
45 "movq 3*8(%1), %7\n\t"
46 "leaq 4*8(%1), %1\n\t"
47
48 "movq %4, 0*8(%2)\n\t"
49 "movq %5, 1*8(%2)\n\t"
50 "movq %6, 2*8(%2)\n\t"
51 "movq %7, 3*8(%2)\n\t"
52 "leaq 4*8(%2), %2\n\t"
53 "jae 5b\n\t"
54 "addq $0x20, %0\n\t"
55 "jmp 1f\n\t"
56 /*
57 * Handle data forward by movsq.
58 */
59 ".p2align 4\n\t"
60 "4:\n\t"
61 "movq %0, %8\n\t"
62 "movq -8(%1, %0), %4\n\t"
63 "lea -8(%2, %0), %5\n\t"
64 "shrq $3, %8\n\t"
65 "rep movsq\n\t"
66 "movq %4, (%5)\n\t"
67 "jmp 13f\n\t"
68 /*
69 * Handle data backward by movsq.
70 */
71 ".p2align 4\n\t"
72 "7:\n\t"
73 "movq %0, %8\n\t"
74 "movq (%1), %4\n\t"
75 "movq %2, %5\n\t"
76 "leaq -8(%1, %0), %1\n\t"
77 "leaq -8(%2, %0), %2\n\t"
78 "shrq $3, %8\n\t"
79 "std\n\t"
80 "rep movsq\n\t"
81 "cld\n\t"
82 "movq %4, (%5)\n\t"
83 "jmp 13f\n\t"
84
85 /*
86 * Start to prepare for backward copy.
87 */
88 ".p2align 4\n\t"
89 "2:\n\t"
90 "cmp $680, %0\n\t"
91 "jb 6f \n\t"
92 "cmp %%dil, %%sil\n\t"
93 "je 7b \n\t"
94 "6:\n\t"
95 /*
96 * Calculate copy position to tail.
97 */
98 "addq %0, %1\n\t"
99 "addq %0, %2\n\t"
100 "subq $0x20, %0\n\t"
101 /*
102 * We gobble 32byts backward in each loop.
103 */
104 "8:\n\t"
105 "subq $0x20, %0\n\t"
106 "movq -1*8(%1), %4\n\t"
107 "movq -2*8(%1), %5\n\t"
108 "movq -3*8(%1), %6\n\t"
109 "movq -4*8(%1), %7\n\t"
110 "leaq -4*8(%1), %1\n\t"
111
112 "movq %4, -1*8(%2)\n\t"
113 "movq %5, -2*8(%2)\n\t"
114 "movq %6, -3*8(%2)\n\t"
115 "movq %7, -4*8(%2)\n\t"
116 "leaq -4*8(%2), %2\n\t"
117 "jae 8b\n\t"
118 /*
119 * Calculate copy position to head.
120 */
121 "addq $0x20, %0\n\t"
122 "subq %0, %1\n\t"
123 "subq %0, %2\n\t"
124 "1:\n\t"
125 "cmpq $16, %0\n\t"
126 "jb 9f\n\t"
127 /*
128 * Move data from 16 bytes to 31 bytes.
129 */
130 "movq 0*8(%1), %4\n\t"
131 "movq 1*8(%1), %5\n\t"
132 "movq -2*8(%1, %0), %6\n\t"
133 "movq -1*8(%1, %0), %7\n\t"
134 "movq %4, 0*8(%2)\n\t"
135 "movq %5, 1*8(%2)\n\t"
136 "movq %6, -2*8(%2, %0)\n\t"
137 "movq %7, -1*8(%2, %0)\n\t"
138 "jmp 13f\n\t"
139 ".p2align 4\n\t"
140 "9:\n\t"
141 "cmpq $8, %0\n\t"
142 "jb 10f\n\t"
143 /*
144 * Move data from 8 bytes to 15 bytes.
145 */
146 "movq 0*8(%1), %4\n\t"
147 "movq -1*8(%1, %0), %5\n\t"
148 "movq %4, 0*8(%2)\n\t"
149 "movq %5, -1*8(%2, %0)\n\t"
150 "jmp 13f\n\t"
151 "10:\n\t"
152 "cmpq $4, %0\n\t"
153 "jb 11f\n\t"
154 /*
155 * Move data from 4 bytes to 7 bytes.
156 */
157 "movl (%1), %4d\n\t"
158 "movl -4(%1, %0), %5d\n\t"
159 "movl %4d, (%2)\n\t"
160 "movl %5d, -4(%2, %0)\n\t"
161 "jmp 13f\n\t"
162 "11:\n\t"
163 "cmp $2, %0\n\t"
164 "jb 12f\n\t"
165 /*
166 * Move data from 2 bytes to 3 bytes.
167 */
168 "movw (%1), %4w\n\t"
169 "movw -2(%1, %0), %5w\n\t"
170 "movw %4w, (%2)\n\t"
171 "movw %5w, -2(%2, %0)\n\t"
172 "jmp 13f\n\t"
173 "12:\n\t"
174 "cmp $1, %0\n\t"
175 "jb 13f\n\t"
176 /*
177 * Move data for 1 byte.
178 */
179 "movb (%1), %4b\n\t"
180 "movb %4b, (%2)\n\t"
181 "13:\n\t"
182 : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
183 "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
184 :"0" (count),
185 "1" (src),
186 "2" (dest)
187 :"memory");
188
189 return ret;
190
191}
192EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
index 41fcf00e49df..67743977398b 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem_64.S
@@ -23,43 +23,50 @@
23#include <asm/dwarf2.h> 23#include <asm/dwarf2.h>
24 24
25#define save_common_regs \ 25#define save_common_regs \
26 pushq %rdi; \ 26 pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
27 pushq %rsi; \ 27 pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
28 pushq %rcx; \ 28 pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
29 pushq %r8; \ 29 pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \
30 pushq %r9; \ 30 pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \
31 pushq %r10; \ 31 pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
32 pushq %r11 32 pushq_cfi %r11; CFI_REL_OFFSET r11, 0
33 33
34#define restore_common_regs \ 34#define restore_common_regs \
35 popq %r11; \ 35 popq_cfi %r11; CFI_RESTORE r11; \
36 popq %r10; \ 36 popq_cfi %r10; CFI_RESTORE r10; \
37 popq %r9; \ 37 popq_cfi %r9; CFI_RESTORE r9; \
38 popq %r8; \ 38 popq_cfi %r8; CFI_RESTORE r8; \
39 popq %rcx; \ 39 popq_cfi %rcx; CFI_RESTORE rcx; \
40 popq %rsi; \ 40 popq_cfi %rsi; CFI_RESTORE rsi; \
41 popq %rdi 41 popq_cfi %rdi; CFI_RESTORE rdi
42 42
43/* Fix up special calling conventions */ 43/* Fix up special calling conventions */
44ENTRY(call_rwsem_down_read_failed) 44ENTRY(call_rwsem_down_read_failed)
45 CFI_STARTPROC
45 save_common_regs 46 save_common_regs
46 pushq %rdx 47 pushq_cfi %rdx
48 CFI_REL_OFFSET rdx, 0
47 movq %rax,%rdi 49 movq %rax,%rdi
48 call rwsem_down_read_failed 50 call rwsem_down_read_failed
49 popq %rdx 51 popq_cfi %rdx
52 CFI_RESTORE rdx
50 restore_common_regs 53 restore_common_regs
51 ret 54 ret
52 ENDPROC(call_rwsem_down_read_failed) 55 CFI_ENDPROC
56ENDPROC(call_rwsem_down_read_failed)
53 57
54ENTRY(call_rwsem_down_write_failed) 58ENTRY(call_rwsem_down_write_failed)
59 CFI_STARTPROC
55 save_common_regs 60 save_common_regs
56 movq %rax,%rdi 61 movq %rax,%rdi
57 call rwsem_down_write_failed 62 call rwsem_down_write_failed
58 restore_common_regs 63 restore_common_regs
59 ret 64 ret
60 ENDPROC(call_rwsem_down_write_failed) 65 CFI_ENDPROC
66ENDPROC(call_rwsem_down_write_failed)
61 67
62ENTRY(call_rwsem_wake) 68ENTRY(call_rwsem_wake)
69 CFI_STARTPROC
63 decl %edx /* do nothing if still outstanding active readers */ 70 decl %edx /* do nothing if still outstanding active readers */
64 jnz 1f 71 jnz 1f
65 save_common_regs 72 save_common_regs
@@ -67,15 +74,20 @@ ENTRY(call_rwsem_wake)
67 call rwsem_wake 74 call rwsem_wake
68 restore_common_regs 75 restore_common_regs
691: ret 761: ret
70 ENDPROC(call_rwsem_wake) 77 CFI_ENDPROC
78ENDPROC(call_rwsem_wake)
71 79
72/* Fix up special calling conventions */ 80/* Fix up special calling conventions */
73ENTRY(call_rwsem_downgrade_wake) 81ENTRY(call_rwsem_downgrade_wake)
82 CFI_STARTPROC
74 save_common_regs 83 save_common_regs
75 pushq %rdx 84 pushq_cfi %rdx
85 CFI_REL_OFFSET rdx, 0
76 movq %rax,%rdi 86 movq %rax,%rdi
77 call rwsem_downgrade_wake 87 call rwsem_downgrade_wake
78 popq %rdx 88 popq_cfi %rdx
89 CFI_RESTORE rdx
79 restore_common_regs 90 restore_common_regs
80 ret 91 ret
81 ENDPROC(call_rwsem_downgrade_wake) 92 CFI_ENDPROC
93ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 648fe4741782..06691daa4108 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -36,7 +36,7 @@
36 */ 36 */
37#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
38ENTRY(__write_lock_failed) 38ENTRY(__write_lock_failed)
39 CFI_STARTPROC simple 39 CFI_STARTPROC
40 FRAME 40 FRAME
412: LOCK_PREFIX 412: LOCK_PREFIX
42 addl $ RW_LOCK_BIAS,(%eax) 42 addl $ RW_LOCK_BIAS,(%eax)
@@ -74,29 +74,23 @@ ENTRY(__read_lock_failed)
74/* Fix up special calling conventions */ 74/* Fix up special calling conventions */
75ENTRY(call_rwsem_down_read_failed) 75ENTRY(call_rwsem_down_read_failed)
76 CFI_STARTPROC 76 CFI_STARTPROC
77 push %ecx 77 pushl_cfi %ecx
78 CFI_ADJUST_CFA_OFFSET 4
79 CFI_REL_OFFSET ecx,0 78 CFI_REL_OFFSET ecx,0
80 push %edx 79 pushl_cfi %edx
81 CFI_ADJUST_CFA_OFFSET 4
82 CFI_REL_OFFSET edx,0 80 CFI_REL_OFFSET edx,0
83 call rwsem_down_read_failed 81 call rwsem_down_read_failed
84 pop %edx 82 popl_cfi %edx
85 CFI_ADJUST_CFA_OFFSET -4 83 popl_cfi %ecx
86 pop %ecx
87 CFI_ADJUST_CFA_OFFSET -4
88 ret 84 ret
89 CFI_ENDPROC 85 CFI_ENDPROC
90 ENDPROC(call_rwsem_down_read_failed) 86 ENDPROC(call_rwsem_down_read_failed)
91 87
92ENTRY(call_rwsem_down_write_failed) 88ENTRY(call_rwsem_down_write_failed)
93 CFI_STARTPROC 89 CFI_STARTPROC
94 push %ecx 90 pushl_cfi %ecx
95 CFI_ADJUST_CFA_OFFSET 4
96 CFI_REL_OFFSET ecx,0 91 CFI_REL_OFFSET ecx,0
97 calll rwsem_down_write_failed 92 calll rwsem_down_write_failed
98 pop %ecx 93 popl_cfi %ecx
99 CFI_ADJUST_CFA_OFFSET -4
100 ret 94 ret
101 CFI_ENDPROC 95 CFI_ENDPROC
102 ENDPROC(call_rwsem_down_write_failed) 96 ENDPROC(call_rwsem_down_write_failed)
@@ -105,12 +99,10 @@ ENTRY(call_rwsem_wake)
105 CFI_STARTPROC 99 CFI_STARTPROC
106 decw %dx /* do nothing if still outstanding active readers */ 100 decw %dx /* do nothing if still outstanding active readers */
107 jnz 1f 101 jnz 1f
108 push %ecx 102 pushl_cfi %ecx
109 CFI_ADJUST_CFA_OFFSET 4
110 CFI_REL_OFFSET ecx,0 103 CFI_REL_OFFSET ecx,0
111 call rwsem_wake 104 call rwsem_wake
112 pop %ecx 105 popl_cfi %ecx
113 CFI_ADJUST_CFA_OFFSET -4
1141: ret 1061: ret
115 CFI_ENDPROC 107 CFI_ENDPROC
116 ENDPROC(call_rwsem_wake) 108 ENDPROC(call_rwsem_wake)
@@ -118,17 +110,13 @@ ENTRY(call_rwsem_wake)
118/* Fix up special calling conventions */ 110/* Fix up special calling conventions */
119ENTRY(call_rwsem_downgrade_wake) 111ENTRY(call_rwsem_downgrade_wake)
120 CFI_STARTPROC 112 CFI_STARTPROC
121 push %ecx 113 pushl_cfi %ecx
122 CFI_ADJUST_CFA_OFFSET 4
123 CFI_REL_OFFSET ecx,0 114 CFI_REL_OFFSET ecx,0
124 push %edx 115 pushl_cfi %edx
125 CFI_ADJUST_CFA_OFFSET 4
126 CFI_REL_OFFSET edx,0 116 CFI_REL_OFFSET edx,0
127 call rwsem_downgrade_wake 117 call rwsem_downgrade_wake
128 pop %edx 118 popl_cfi %edx
129 CFI_ADJUST_CFA_OFFSET -4 119 popl_cfi %ecx
130 pop %ecx
131 CFI_ADJUST_CFA_OFFSET -4
132 ret 120 ret
133 CFI_ENDPROC 121 CFI_ENDPROC
134 ENDPROC(call_rwsem_downgrade_wake) 122 ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 650b11e00ecc..2930ae05d773 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -7,24 +7,6 @@
7 7
8 #include <linux/linkage.h> 8 #include <linux/linkage.h>
9 9
10#define ARCH_TRACE_IRQS_ON \
11 pushl %eax; \
12 pushl %ecx; \
13 pushl %edx; \
14 call trace_hardirqs_on; \
15 popl %edx; \
16 popl %ecx; \
17 popl %eax;
18
19#define ARCH_TRACE_IRQS_OFF \
20 pushl %eax; \
21 pushl %ecx; \
22 pushl %edx; \
23 call trace_hardirqs_off; \
24 popl %edx; \
25 popl %ecx; \
26 popl %eax;
27
28#ifdef CONFIG_TRACE_IRQFLAGS 10#ifdef CONFIG_TRACE_IRQFLAGS
29 /* put return address in eax (arg1) */ 11 /* put return address in eax (arg1) */
30 .macro thunk_ra name,func 12 .macro thunk_ra name,func
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index bf9a7d5a5428..782b082c9ff7 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -22,26 +22,6 @@
22 CFI_ENDPROC 22 CFI_ENDPROC
23 .endm 23 .endm
24 24
25 /* rdi: arg1 ... normal C conventions. rax is passed from C. */
26 .macro thunk_retrax name,func
27 .globl \name
28\name:
29 CFI_STARTPROC
30 SAVE_ARGS
31 call \func
32 jmp restore_norax
33 CFI_ENDPROC
34 .endm
35
36
37 .section .sched.text, "ax"
38#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
39 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
40 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
41 thunk rwsem_wake_thunk,rwsem_wake
42 thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
43#endif
44
45#ifdef CONFIG_TRACE_IRQFLAGS 25#ifdef CONFIG_TRACE_IRQFLAGS
46 /* put return address in rdi (arg1) */ 26 /* put return address in rdi (arg1) */
47 .macro thunk_ra name,func 27 .macro thunk_ra name,func
@@ -72,10 +52,3 @@ restore:
72 RESTORE_ARGS 52 RESTORE_ARGS
73 ret 53 ret
74 CFI_ENDPROC 54 CFI_ENDPROC
75
76 CFI_STARTPROC
77 SAVE_ARGS
78restore_norax:
79 RESTORE_ARGS 1
80 ret
81 CFI_ENDPROC
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 09df2f9a3d69..3e608edf9958 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
28 29
29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
30 31
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index f21962c435ed..0919c26820d4 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -26,9 +26,7 @@
26#include <asm/apic.h> 26#include <asm/apic.h>
27#include <asm/amd_nb.h> 27#include <asm/amd_nb.h>
28 28
29static struct bootnode __initdata nodes[8];
30static unsigned char __initdata nodeids[8]; 29static unsigned char __initdata nodeids[8];
31static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
32 30
33static __init int find_northbridge(void) 31static __init int find_northbridge(void)
34{ 32{
@@ -51,7 +49,7 @@ static __init int find_northbridge(void)
51 return num; 49 return num;
52 } 50 }
53 51
54 return -1; 52 return -ENOENT;
55} 53}
56 54
57static __init void early_get_boot_cpu_id(void) 55static __init void early_get_boot_cpu_id(void)
@@ -69,17 +67,18 @@ static __init void early_get_boot_cpu_id(void)
69#endif 67#endif
70} 68}
71 69
72int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) 70int __init amd_numa_init(void)
73{ 71{
74 unsigned long start = PFN_PHYS(start_pfn); 72 unsigned long start = PFN_PHYS(0);
75 unsigned long end = PFN_PHYS(end_pfn); 73 unsigned long end = PFN_PHYS(max_pfn);
76 unsigned numnodes; 74 unsigned numnodes;
77 unsigned long prevbase; 75 unsigned long prevbase;
78 int i, nb, found = 0; 76 int i, j, nb;
79 u32 nodeid, reg; 77 u32 nodeid, reg;
78 unsigned int bits, cores, apicid_base;
80 79
81 if (!early_pci_allowed()) 80 if (!early_pci_allowed())
82 return -1; 81 return -EINVAL;
83 82
84 nb = find_northbridge(); 83 nb = find_northbridge();
85 if (nb < 0) 84 if (nb < 0)
@@ -90,7 +89,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
90 reg = read_pci_config(0, nb, 0, 0x60); 89 reg = read_pci_config(0, nb, 0, 0x60);
91 numnodes = ((reg >> 4) & 0xF) + 1; 90 numnodes = ((reg >> 4) & 0xF) + 1;
92 if (numnodes <= 1) 91 if (numnodes <= 1)
93 return -1; 92 return -ENOENT;
94 93
95 pr_info("Number of physical nodes %d\n", numnodes); 94 pr_info("Number of physical nodes %d\n", numnodes);
96 95
@@ -121,9 +120,9 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
121 if ((base >> 8) & 3 || (limit >> 8) & 3) { 120 if ((base >> 8) & 3 || (limit >> 8) & 3) {
122 pr_err("Node %d using interleaving mode %lx/%lx\n", 121 pr_err("Node %d using interleaving mode %lx/%lx\n",
123 nodeid, (base >> 8) & 3, (limit >> 8) & 3); 122 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
124 return -1; 123 return -EINVAL;
125 } 124 }
126 if (node_isset(nodeid, nodes_parsed)) { 125 if (node_isset(nodeid, numa_nodes_parsed)) {
127 pr_info("Node %d already present, skipping\n", 126 pr_info("Node %d already present, skipping\n",
128 nodeid); 127 nodeid);
129 continue; 128 continue;
@@ -160,117 +159,28 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
160 if (prevbase > base) { 159 if (prevbase > base) {
161 pr_err("Node map not sorted %lx,%lx\n", 160 pr_err("Node map not sorted %lx,%lx\n",
162 prevbase, base); 161 prevbase, base);
163 return -1; 162 return -EINVAL;
164 } 163 }
165 164
166 pr_info("Node %d MemBase %016lx Limit %016lx\n", 165 pr_info("Node %d MemBase %016lx Limit %016lx\n",
167 nodeid, base, limit); 166 nodeid, base, limit);
168 167
169 found++;
170
171 nodes[nodeid].start = base;
172 nodes[nodeid].end = limit;
173
174 prevbase = base; 168 prevbase = base;
175 169 numa_add_memblk(nodeid, base, limit);
176 node_set(nodeid, nodes_parsed); 170 node_set(nodeid, numa_nodes_parsed);
177 }
178
179 if (!found)
180 return -1;
181 return 0;
182}
183
184#ifdef CONFIG_NUMA_EMU
185static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
186 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
187};
188
189void __init amd_get_nodes(struct bootnode *physnodes)
190{
191 int i;
192
193 for_each_node_mask(i, nodes_parsed) {
194 physnodes[i].start = nodes[i].start;
195 physnodes[i].end = nodes[i].end;
196 } 171 }
197}
198
199static int __init find_node_by_addr(unsigned long addr)
200{
201 int ret = NUMA_NO_NODE;
202 int i;
203
204 for (i = 0; i < 8; i++)
205 if (addr >= nodes[i].start && addr < nodes[i].end) {
206 ret = i;
207 break;
208 }
209 return ret;
210}
211 172
212/* 173 if (!nodes_weight(numa_nodes_parsed))
213 * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be 174 return -ENOENT;
214 * setup to represent the physical topology but reflect the emulated
215 * environment. For each emulated node, the real node which it appears on is
216 * found and a fake pxm to nid mapping is created which mirrors the actual
217 * locality. node_distance() then represents the correct distances between
218 * emulated nodes by using the fake acpi mappings to pxms.
219 */
220void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
221{
222 unsigned int bits;
223 unsigned int cores;
224 unsigned int apicid_base = 0;
225 int i;
226 175
176 /*
177 * We seem to have a valid NUMA configuration. Map apicids to nodes
178 * using the coreid bits from early_identify_cpu.
179 */
227 bits = boot_cpu_data.x86_coreid_bits; 180 bits = boot_cpu_data.x86_coreid_bits;
228 cores = 1 << bits; 181 cores = 1 << bits;
229 early_get_boot_cpu_id();
230 if (boot_cpu_physical_apicid > 0)
231 apicid_base = boot_cpu_physical_apicid;
232
233 for (i = 0; i < nr_nodes; i++) {
234 int index;
235 int nid;
236 int j;
237
238 nid = find_node_by_addr(nodes[i].start);
239 if (nid == NUMA_NO_NODE)
240 continue;
241
242 index = nodeids[nid] << bits;
243 if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
244 for (j = apicid_base; j < cores + apicid_base; j++)
245 fake_apicid_to_node[index + j] = i;
246#ifdef CONFIG_ACPI_NUMA
247 __acpi_map_pxm_to_node(nid, i);
248#endif
249 }
250 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
251}
252#endif /* CONFIG_NUMA_EMU */
253
254int __init amd_scan_nodes(void)
255{
256 unsigned int bits;
257 unsigned int cores;
258 unsigned int apicid_base;
259 int i;
260
261 BUG_ON(nodes_empty(nodes_parsed));
262 node_possible_map = nodes_parsed;
263 memnode_shift = compute_hash_shift(nodes, 8, NULL);
264 if (memnode_shift < 0) {
265 pr_err("No NUMA node hash function found. Contact maintainer\n");
266 return -1;
267 }
268 pr_info("Using node hash shift of %d\n", memnode_shift);
269
270 /* use the coreid bits from early_identify_cpu */
271 bits = boot_cpu_data.x86_coreid_bits;
272 cores = (1<<bits);
273 apicid_base = 0; 182 apicid_base = 0;
183
274 /* get the APIC ID of the BSP early for systems with apicid lifting */ 184 /* get the APIC ID of the BSP early for systems with apicid lifting */
275 early_get_boot_cpu_id(); 185 early_get_boot_cpu_id();
276 if (boot_cpu_physical_apicid > 0) { 186 if (boot_cpu_physical_apicid > 0) {
@@ -278,17 +188,9 @@ int __init amd_scan_nodes(void)
278 apicid_base = boot_cpu_physical_apicid; 188 apicid_base = boot_cpu_physical_apicid;
279 } 189 }
280 190
281 for_each_node_mask(i, node_possible_map) { 191 for_each_node_mask(i, numa_nodes_parsed)
282 int j;
283
284 memblock_x86_register_active_regions(i,
285 nodes[i].start >> PAGE_SHIFT,
286 nodes[i].end >> PAGE_SHIFT);
287 for (j = apicid_base; j < cores + apicid_base; j++) 192 for (j = apicid_base; j < cores + apicid_base; j++)
288 apicid_to_node[(i << bits) + j] = i; 193 set_apicid_to_node((i << bits) + j, i);
289 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
290 }
291 194
292 numa_init_array();
293 return 0; 195 return 0;
294} 196}
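The tail of the reworked amd_numa_init() maps APIC IDs to nodes by packing the node number into the bits above x86_coreid_bits. A small sketch of that enumeration, with printf standing in for set_apicid_to_node() and the parameters passed in explicitly:

#include <stdio.h>

/*
 * Sketch of the apicid -> node enumeration at the end of amd_numa_init():
 * each node owns a contiguous block of (1 << coreid_bits) APIC IDs starting
 * at (node << coreid_bits), shifted up by the boot CPU's APIC ID when
 * apicid lifting is in use.
 */
static void map_node_apicids(int node, unsigned int coreid_bits,
			     unsigned int apicid_base)
{
	unsigned int cores = 1u << coreid_bits;
	unsigned int j;

	for (j = apicid_base; j < cores + apicid_base; j++)
		printf("apicid %u -> node %d\n",
		       (node << coreid_bits) + j, node);
}

With coreid_bits = 2 and apicid_base = 0, for example, node 1 owns APIC IDs 4..7, matching the (i << bits) + j loop in the patch.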
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7d90ceb882a4..20e3f8702d1e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,15 +229,14 @@ void vmalloc_sync_all(void)
229 for (address = VMALLOC_START & PMD_MASK; 229 for (address = VMALLOC_START & PMD_MASK;
230 address >= TASK_SIZE && address < FIXADDR_TOP; 230 address >= TASK_SIZE && address < FIXADDR_TOP;
231 address += PMD_SIZE) { 231 address += PMD_SIZE) {
232
233 unsigned long flags;
234 struct page *page; 232 struct page *page;
235 233
236 spin_lock_irqsave(&pgd_lock, flags); 234 spin_lock(&pgd_lock);
237 list_for_each_entry(page, &pgd_list, lru) { 235 list_for_each_entry(page, &pgd_list, lru) {
238 spinlock_t *pgt_lock; 236 spinlock_t *pgt_lock;
239 pmd_t *ret; 237 pmd_t *ret;
240 238
239 /* the pgt_lock only for Xen */
241 pgt_lock = &pgd_page_get_mm(page)->page_table_lock; 240 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
242 241
243 spin_lock(pgt_lock); 242 spin_lock(pgt_lock);
@@ -247,7 +246,7 @@ void vmalloc_sync_all(void)
247 if (!ret) 246 if (!ret)
248 break; 247 break;
249 } 248 }
250 spin_unlock_irqrestore(&pgd_lock, flags); 249 spin_unlock(&pgd_lock);
251 } 250 }
252} 251}
253 252
@@ -828,6 +827,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
828 unsigned long address, unsigned int fault) 827 unsigned long address, unsigned int fault)
829{ 828{
830 if (fault & VM_FAULT_OOM) { 829 if (fault & VM_FAULT_OOM) {
830 /* Kernel mode? Handle exceptions or die: */
831 if (!(error_code & PF_USER)) {
832 up_read(&current->mm->mmap_sem);
833 no_context(regs, error_code, address);
834 return;
835 }
836
831 out_of_memory(regs, error_code, address); 837 out_of_memory(regs, error_code, address);
832 } else { 838 } else {
833 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| 839 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 069ce7c37c01..d4203988504a 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -326,7 +326,7 @@ try_again:
326 if (mm->free_area_cache < len) 326 if (mm->free_area_cache < len)
327 goto fail; 327 goto fail;
328 328
329 /* either no address requested or cant fit in requested address hole */ 329 /* either no address requested or can't fit in requested address hole */
330 addr = (mm->free_area_cache - len) & huge_page_mask(h); 330 addr = (mm->free_area_cache - len) & huge_page_mask(h);
331 do { 331 do {
332 /* 332 /*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 947f42abe820..286d289b039b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,9 +18,9 @@
18 18
19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
20 20
21unsigned long __initdata e820_table_start; 21unsigned long __initdata pgt_buf_start;
22unsigned long __meminitdata e820_table_end; 22unsigned long __meminitdata pgt_buf_end;
23unsigned long __meminitdata e820_table_top; 23unsigned long __meminitdata pgt_buf_top;
24 24
25int after_bootmem; 25int after_bootmem;
26 26
@@ -33,7 +33,7 @@ int direct_gbpages
33static void __init find_early_table_space(unsigned long end, int use_pse, 33static void __init find_early_table_space(unsigned long end, int use_pse,
34 int use_gbpages) 34 int use_gbpages)
35{ 35{
36 unsigned long puds, pmds, ptes, tables, start; 36 unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
37 phys_addr_t base; 37 phys_addr_t base;
38 38
39 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 39 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
@@ -65,29 +65,20 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
65#ifdef CONFIG_X86_32 65#ifdef CONFIG_X86_32
66 /* for fixmap */ 66 /* for fixmap */
67 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 67 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
68#endif
69 68
70 /* 69 good_end = max_pfn_mapped << PAGE_SHIFT;
71 * RED-PEN putting page tables only on node 0 could
72 * cause a hotspot and fill up ZONE_DMA. The page tables
73 * need roughly 0.5KB per GB.
74 */
75#ifdef CONFIG_X86_32
76 start = 0x7000;
77#else
78 start = 0x8000;
79#endif 70#endif
80 base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, 71
81 tables, PAGE_SIZE); 72 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
82 if (base == MEMBLOCK_ERROR) 73 if (base == MEMBLOCK_ERROR)
83 panic("Cannot find space for the kernel page tables"); 74 panic("Cannot find space for the kernel page tables");
84 75
85 e820_table_start = base >> PAGE_SHIFT; 76 pgt_buf_start = base >> PAGE_SHIFT;
86 e820_table_end = e820_table_start; 77 pgt_buf_end = pgt_buf_start;
87 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); 78 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
88 79
89 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", 80 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
90 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); 81 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
91} 82}
92 83
93struct map_range { 84struct map_range {
@@ -279,30 +270,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
279 load_cr3(swapper_pg_dir); 270 load_cr3(swapper_pg_dir);
280#endif 271#endif
281 272
282#ifdef CONFIG_X86_64
283 if (!after_bootmem && !start) {
284 pud_t *pud;
285 pmd_t *pmd;
286
287 mmu_cr4_features = read_cr4();
288
289 /*
290 * _brk_end cannot change anymore, but it and _end may be
291 * located on different 2M pages. cleanup_highmap(), however,
292 * can only consider _end when it runs, so destroy any
293 * mappings beyond _brk_end here.
294 */
295 pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
296 pmd = pmd_offset(pud, _brk_end - 1);
297 while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
298 pmd_clear(pmd);
299 }
300#endif
301 __flush_tlb_all(); 273 __flush_tlb_all();
302 274
303 if (!after_bootmem && e820_table_end > e820_table_start) 275 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
304 memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, 276 memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
305 e820_table_end << PAGE_SHIFT, "PGTABLE"); 277 pgt_buf_end << PAGE_SHIFT, "PGTABLE");
306 278
307 if (!after_bootmem) 279 if (!after_bootmem)
308 early_memtest(start, end); 280 early_memtest(start, end);
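find_early_table_space() sizes the pgt_buf_* window by estimating how many page-table pages the direct mapping will need before any allocator is up. A rough C sketch of that estimate for the no-PSE, no-gbpages case (constants are the usual x86-64 values; the real code also rounds each level to PAGE_SIZE and adds the 32-bit fixmap overhead):

#define PAGE_SHIFT	12
#define PMD_SHIFT	21
#define PUD_SHIFT	30
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/*
 * Rough sketch of the estimate in find_early_table_space(): one page of
 * PUD entries covers 512 GB, one page of PMD entries covers 1 GB, and one
 * page of PTE entries covers 2 MB, so the pgt_buf window scales with the
 * range being mapped.  use_pse/use_gbpages would drop the lower levels;
 * both are assumed off here.
 */
static unsigned long early_table_bytes_sketch(unsigned long end)
{
	unsigned long puds = (end + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT;
	unsigned long pmds = (end + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT;
	unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;

	return ((puds + 511) / 512) * PAGE_SIZE +	/* PUD pages */
	       ((pmds + 511) / 512) * PAGE_SIZE +	/* PMD pages */
	       ((ptes + 511) / 512) * PAGE_SIZE;	/* PTE pages */
}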
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c821074b7f0b..80088f994193 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false;
62 62
63static __init void *alloc_low_page(void) 63static __init void *alloc_low_page(void)
64{ 64{
65 unsigned long pfn = e820_table_end++; 65 unsigned long pfn = pgt_buf_end++;
66 void *adr; 66 void *adr;
67 67
68 if (pfn >= e820_table_top) 68 if (pfn >= pgt_buf_top)
69 panic("alloc_low_page: ran out of memory"); 69 panic("alloc_low_page: ran out of memory");
70 70
71 adr = __va(pfn * PAGE_SIZE); 71 adr = __va(pfn * PAGE_SIZE);
@@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
166 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start 166 && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
167 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { 167 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
168 pte_t *newpte; 168 pte_t *newpte;
169 int i; 169 int i;
170 170
@@ -644,8 +644,7 @@ void __init find_low_pfn_range(void)
644} 644}
645 645
646#ifndef CONFIG_NEED_MULTIPLE_NODES 646#ifndef CONFIG_NEED_MULTIPLE_NODES
647void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 647void __init initmem_init(void)
648 int acpi, int k8)
649{ 648{
650#ifdef CONFIG_HIGHMEM 649#ifdef CONFIG_HIGHMEM
651 highstart_pfn = highend_pfn = max_pfn; 650 highstart_pfn = highend_pfn = max_pfn;
@@ -918,7 +917,7 @@ static void mark_nxdata_nx(void)
918{ 917{
919 /* 918 /*
920 * When this called, init has already been executed and released, 919 * When this called, init has already been executed and released,
921 * so everything past _etext sould be NX. 920 * so everything past _etext should be NX.
922 */ 921 */
923 unsigned long start = PFN_ALIGN(_etext); 922 unsigned long start = PFN_ALIGN(_etext);
924 /* 923 /*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 71a59296af80..794233587287 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -51,6 +51,8 @@
51#include <asm/numa.h> 51#include <asm/numa.h>
52#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
53#include <asm/init.h> 53#include <asm/init.h>
54#include <asm/uv/uv.h>
55#include <asm/setup.h>
54 56
55static int __init parse_direct_gbpages_off(char *arg) 57static int __init parse_direct_gbpages_off(char *arg)
56{ 58{
@@ -105,18 +107,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)
105 107
106 for (address = start; address <= end; address += PGDIR_SIZE) { 108 for (address = start; address <= end; address += PGDIR_SIZE) {
107 const pgd_t *pgd_ref = pgd_offset_k(address); 109 const pgd_t *pgd_ref = pgd_offset_k(address);
108 unsigned long flags;
109 struct page *page; 110 struct page *page;
110 111
111 if (pgd_none(*pgd_ref)) 112 if (pgd_none(*pgd_ref))
112 continue; 113 continue;
113 114
114 spin_lock_irqsave(&pgd_lock, flags); 115 spin_lock(&pgd_lock);
115 list_for_each_entry(page, &pgd_list, lru) { 116 list_for_each_entry(page, &pgd_list, lru) {
116 pgd_t *pgd; 117 pgd_t *pgd;
117 spinlock_t *pgt_lock; 118 spinlock_t *pgt_lock;
118 119
119 pgd = (pgd_t *)page_address(page) + pgd_index(address); 120 pgd = (pgd_t *)page_address(page) + pgd_index(address);
121 /* the pgt_lock only for Xen */
120 pgt_lock = &pgd_page_get_mm(page)->page_table_lock; 122 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
121 spin_lock(pgt_lock); 123 spin_lock(pgt_lock);
122 124
@@ -128,7 +130,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
128 130
129 spin_unlock(pgt_lock); 131 spin_unlock(pgt_lock);
130 } 132 }
131 spin_unlock_irqrestore(&pgd_lock, flags); 133 spin_unlock(&pgd_lock);
132 } 134 }
133} 135}
134 136
@@ -293,18 +295,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
293 * to the compile time generated pmds. This results in invalid pmds up 295 * to the compile time generated pmds. This results in invalid pmds up
294 * to the point where we hit the physaddr 0 mapping. 296 * to the point where we hit the physaddr 0 mapping.
295 * 297 *
296 * We limit the mappings to the region from _text to _end. _end is 298 * We limit the mappings to the region from _text to _brk_end. _brk_end
297 * rounded up to the 2MB boundary. This catches the invalid pmds as 299 * is rounded up to the 2MB boundary. This catches the invalid pmds as
298 * well, as they are located before _text: 300 * well, as they are located before _text:
299 */ 301 */
300void __init cleanup_highmap(void) 302void __init cleanup_highmap(void)
301{ 303{
302 unsigned long vaddr = __START_KERNEL_map; 304 unsigned long vaddr = __START_KERNEL_map;
303 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; 305 unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
306 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
304 pmd_t *pmd = level2_kernel_pgt; 307 pmd_t *pmd = level2_kernel_pgt;
305 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
306 308
307 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { 309 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
308 if (pmd_none(*pmd)) 310 if (pmd_none(*pmd))
309 continue; 311 continue;
310 if (vaddr < (unsigned long) _text || vaddr > end) 312 if (vaddr < (unsigned long) _text || vaddr > end)
@@ -314,7 +316,7 @@ void __init cleanup_highmap(void)
314 316
315static __ref void *alloc_low_page(unsigned long *phys) 317static __ref void *alloc_low_page(unsigned long *phys)
316{ 318{
317 unsigned long pfn = e820_table_end++; 319 unsigned long pfn = pgt_buf_end++;
318 void *adr; 320 void *adr;
319 321
320 if (after_bootmem) { 322 if (after_bootmem) {
@@ -324,7 +326,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
324 return adr; 326 return adr;
325 } 327 }
326 328
327 if (pfn >= e820_table_top) 329 if (pfn >= pgt_buf_top)
328 panic("alloc_low_page: ran out of memory"); 330 panic("alloc_low_page: ran out of memory");
329 331
330 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 332 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -333,12 +335,28 @@ static __ref void *alloc_low_page(unsigned long *phys)
333 return adr; 335 return adr;
334} 336}
335 337
338static __ref void *map_low_page(void *virt)
339{
340 void *adr;
341 unsigned long phys, left;
342
343 if (after_bootmem)
344 return virt;
345
346 phys = __pa(virt);
347 left = phys & (PAGE_SIZE - 1);
348 adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
349 adr = (void *)(((unsigned long)adr) | left);
350
351 return adr;
352}
353
336static __ref void unmap_low_page(void *adr) 354static __ref void unmap_low_page(void *adr)
337{ 355{
338 if (after_bootmem) 356 if (after_bootmem)
339 return; 357 return;
340 358
341 early_iounmap(adr, PAGE_SIZE); 359 early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
342} 360}
343 361
344static unsigned long __meminit 362static unsigned long __meminit
@@ -386,15 +404,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
386} 404}
387 405
388static unsigned long __meminit 406static unsigned long __meminit
389phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
390 pgprot_t prot)
391{
392 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
393
394 return phys_pte_init(pte, address, end, prot);
395}
396
397static unsigned long __meminit
398phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 407phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
399 unsigned long page_size_mask, pgprot_t prot) 408 unsigned long page_size_mask, pgprot_t prot)
400{ 409{
@@ -420,8 +429,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
420 if (pmd_val(*pmd)) { 429 if (pmd_val(*pmd)) {
421 if (!pmd_large(*pmd)) { 430 if (!pmd_large(*pmd)) {
422 spin_lock(&init_mm.page_table_lock); 431 spin_lock(&init_mm.page_table_lock);
423 last_map_addr = phys_pte_update(pmd, address, 432 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
433 last_map_addr = phys_pte_init(pte, address,
424 end, prot); 434 end, prot);
435 unmap_low_page(pte);
425 spin_unlock(&init_mm.page_table_lock); 436 spin_unlock(&init_mm.page_table_lock);
426 continue; 437 continue;
427 } 438 }
@@ -468,18 +479,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
468} 479}
469 480
470static unsigned long __meminit 481static unsigned long __meminit
471phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
472 unsigned long page_size_mask, pgprot_t prot)
473{
474 pmd_t *pmd = pmd_offset(pud, 0);
475 unsigned long last_map_addr;
476
477 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
478 __flush_tlb_all();
479 return last_map_addr;
480}
481
482static unsigned long __meminit
483phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, 482phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
484 unsigned long page_size_mask) 483 unsigned long page_size_mask)
485{ 484{
@@ -504,8 +503,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
504 503
505 if (pud_val(*pud)) { 504 if (pud_val(*pud)) {
506 if (!pud_large(*pud)) { 505 if (!pud_large(*pud)) {
507 last_map_addr = phys_pmd_update(pud, addr, end, 506 pmd = map_low_page(pmd_offset(pud, 0));
507 last_map_addr = phys_pmd_init(pmd, addr, end,
508 page_size_mask, prot); 508 page_size_mask, prot);
509 unmap_low_page(pmd);
510 __flush_tlb_all();
509 continue; 511 continue;
510 } 512 }
511 /* 513 /*
@@ -553,17 +555,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
553 return last_map_addr; 555 return last_map_addr;
554} 556}
555 557
556static unsigned long __meminit
557phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
558 unsigned long page_size_mask)
559{
560 pud_t *pud;
561
562 pud = (pud_t *)pgd_page_vaddr(*pgd);
563
564 return phys_pud_init(pud, addr, end, page_size_mask);
565}
566
567unsigned long __meminit 558unsigned long __meminit
568kernel_physical_mapping_init(unsigned long start, 559kernel_physical_mapping_init(unsigned long start,
569 unsigned long end, 560 unsigned long end,
@@ -587,8 +578,10 @@ kernel_physical_mapping_init(unsigned long start,
587 next = end; 578 next = end;
588 579
589 if (pgd_val(*pgd)) { 580 if (pgd_val(*pgd)) {
590 last_map_addr = phys_pud_update(pgd, __pa(start), 581 pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
582 last_map_addr = phys_pud_init(pud, __pa(start),
591 __pa(end), page_size_mask); 583 __pa(end), page_size_mask);
584 unmap_low_page(pud);
592 continue; 585 continue;
593 } 586 }
594 587
@@ -612,10 +605,9 @@ kernel_physical_mapping_init(unsigned long start,
612} 605}
613 606
614#ifndef CONFIG_NUMA 607#ifndef CONFIG_NUMA
615void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 608void __init initmem_init(void)
616 int acpi, int k8)
617{ 609{
618 memblock_x86_register_active_regions(0, start_pfn, end_pfn); 610 memblock_x86_register_active_regions(0, 0, max_pfn);
619} 611}
620#endif 612#endif
621 613
@@ -870,18 +862,18 @@ static struct vm_area_struct gate_vma = {
870 .vm_flags = VM_READ | VM_EXEC 862 .vm_flags = VM_READ | VM_EXEC
871}; 863};
872 864
873struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 865struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
874{ 866{
875#ifdef CONFIG_IA32_EMULATION 867#ifdef CONFIG_IA32_EMULATION
876 if (test_tsk_thread_flag(tsk, TIF_IA32)) 868 if (!mm || mm->context.ia32_compat)
877 return NULL; 869 return NULL;
878#endif 870#endif
879 return &gate_vma; 871 return &gate_vma;
880} 872}
881 873
882int in_gate_area(struct task_struct *task, unsigned long addr) 874int in_gate_area(struct mm_struct *mm, unsigned long addr)
883{ 875{
884 struct vm_area_struct *vma = get_gate_vma(task); 876 struct vm_area_struct *vma = get_gate_vma(mm);
885 877
886 if (!vma) 878 if (!vma)
887 return 0; 879 return 0;
@@ -890,11 +882,11 @@ int in_gate_area(struct task_struct *task, unsigned long addr)
890} 882}
891 883
892/* 884/*
893 * Use this when you have no reliable task/vma, typically from interrupt 885 * Use this when you have no reliable mm, typically from interrupt
894 * context. It is less reliable than using the task's vma and may give 886 * context. It is less reliable than using a task's mm and may give
895 * false positives: 887 * false positives.
896 */ 888 */
897int in_gate_area_no_task(unsigned long addr) 889int in_gate_area_no_mm(unsigned long addr)
898{ 890{
899 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); 891 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
900} 892}
@@ -908,6 +900,19 @@ const char *arch_vma_name(struct vm_area_struct *vma)
908 return NULL; 900 return NULL;
909} 901}
910 902
903#ifdef CONFIG_X86_UV
904#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS)
905
906unsigned long memory_block_size_bytes(void)
907{
908 if (is_uv_system()) {
909 printk(KERN_INFO "UV: memory block size 2GB\n");
910 return 2UL * 1024 * 1024 * 1024;
911 }
912 return MIN_MEMORY_BLOCK_SIZE;
913}
914#endif
915
911#ifdef CONFIG_SPARSEMEM_VMEMMAP 916#ifdef CONFIG_SPARSEMEM_VMEMMAP
912/* 917/*
913 * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 918 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c3..9559d360fde7 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,14 +2,74 @@
2#include <linux/topology.h> 2#include <linux/topology.h>
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <asm/numa.h>
6#include <asm/acpi.h>
7
8int __initdata numa_off;
9
10static __init int numa_setup(char *opt)
11{
12 if (!opt)
13 return -EINVAL;
14 if (!strncmp(opt, "off", 3))
15 numa_off = 1;
16#ifdef CONFIG_NUMA_EMU
17 if (!strncmp(opt, "fake=", 5))
18 numa_emu_cmdline(opt + 5);
19#endif
20#ifdef CONFIG_ACPI_NUMA
21 if (!strncmp(opt, "noacpi", 6))
22 acpi_numa = -1;
23#endif
24 return 0;
25}
26early_param("numa", numa_setup);
5 27
6/* 28/*
7 * Which logical CPUs are on which nodes 29 * apicid, cpu, node mappings
8 */ 30 */
31s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
32 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
33};
34
9cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 35cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
10EXPORT_SYMBOL(node_to_cpumask_map); 36EXPORT_SYMBOL(node_to_cpumask_map);
11 37
12/* 38/*
39 * Map cpu index to node index
40 */
41DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
42EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44void __cpuinit numa_set_node(int cpu, int node)
45{
46 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
47
48 /* early setting, no percpu area yet */
49 if (cpu_to_node_map) {
50 cpu_to_node_map[cpu] = node;
51 return;
52 }
53
54#ifdef CONFIG_DEBUG_PER_CPU_MAPS
55 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
56 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
57 dump_stack();
58 return;
59 }
60#endif
61 per_cpu(x86_cpu_to_node_map, cpu) = node;
62
63 if (node != NUMA_NO_NODE)
64 set_cpu_numa_node(cpu, node);
65}
66
67void __cpuinit numa_clear_node(int cpu)
68{
69 numa_set_node(cpu, NUMA_NO_NODE);
70}
71
72/*
13 * Allocate node_to_cpumask_map based on number of available nodes 73 * Allocate node_to_cpumask_map based on number of available nodes
14 * Requires node_possible_map to be valid. 74 * Requires node_possible_map to be valid.
15 * 75 *
@@ -35,7 +95,174 @@ void __init setup_node_to_cpumask_map(void)
35 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 95 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
36} 96}
37 97
38#ifdef CONFIG_DEBUG_PER_CPU_MAPS 98/*
99 * There are unfortunately some poorly designed mainboards around that
100 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
101 * mapping. To avoid this fill in the mapping for all possible CPUs,
102 * as the number of CPUs is not known yet. We round robin the existing
103 * nodes.
104 */
105void __init numa_init_array(void)
106{
107 int rr, i;
108
109 rr = first_node(node_online_map);
110 for (i = 0; i < nr_cpu_ids; i++) {
111 if (early_cpu_to_node(i) != NUMA_NO_NODE)
112 continue;
113 numa_set_node(i, rr);
114 rr = next_node(rr, node_online_map);
115 if (rr == MAX_NUMNODES)
116 rr = first_node(node_online_map);
117 }
118}
119
120static __init int find_near_online_node(int node)
121{
122 int n, val;
123 int min_val = INT_MAX;
124 int best_node = -1;
125
126 for_each_online_node(n) {
127 val = node_distance(node, n);
128
129 if (val < min_val) {
130 min_val = val;
131 best_node = n;
132 }
133 }
134
135 return best_node;
136}
137
138/*
139 * Setup early cpu_to_node.
140 *
141 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
142 * and apicid_to_node[] tables have valid entries for a CPU.
143 * This means we skip cpu_to_node[] initialisation for NUMA
144 * emulation and faking node case (when running a kernel compiled
145 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
146 * is already initialized in a round robin manner at numa_init_array,
147 * prior to this call, and this initialization is good enough
148 * for the fake NUMA cases.
149 *
150 * Called before the per_cpu areas are setup.
151 */
152void __init init_cpu_to_node(void)
153{
154 int cpu;
155 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
156
157 BUG_ON(cpu_to_apicid == NULL);
158
159 for_each_possible_cpu(cpu) {
160 int node = numa_cpu_node(cpu);
161
162 if (node == NUMA_NO_NODE)
163 continue;
164 if (!node_online(node))
165 node = find_near_online_node(node);
166 numa_set_node(cpu, node);
167 }
168}
169
170#ifndef CONFIG_DEBUG_PER_CPU_MAPS
171
172# ifndef CONFIG_NUMA_EMU
173void __cpuinit numa_add_cpu(int cpu)
174{
175 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
176}
177
178void __cpuinit numa_remove_cpu(int cpu)
179{
180 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
181}
182# endif /* !CONFIG_NUMA_EMU */
183
184#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
185
186int __cpu_to_node(int cpu)
187{
188 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
189 printk(KERN_WARNING
190 "cpu_to_node(%d): usage too early!\n", cpu);
191 dump_stack();
192 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
193 }
194 return per_cpu(x86_cpu_to_node_map, cpu);
195}
196EXPORT_SYMBOL(__cpu_to_node);
197
198/*
199 * Same function as cpu_to_node() but used if called before the
200 * per_cpu areas are setup.
201 */
202int early_cpu_to_node(int cpu)
203{
204 if (early_per_cpu_ptr(x86_cpu_to_node_map))
205 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
206
207 if (!cpu_possible(cpu)) {
208 printk(KERN_WARNING
209 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
210 dump_stack();
211 return NUMA_NO_NODE;
212 }
213 return per_cpu(x86_cpu_to_node_map, cpu);
214}
215
216struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
217{
218 int node = early_cpu_to_node(cpu);
219 struct cpumask *mask;
220 char buf[64];
221
222 if (node == NUMA_NO_NODE) {
223 /* early_cpu_to_node() already emits a warning and trace */
224 return NULL;
225 }
226 mask = node_to_cpumask_map[node];
227 if (!mask) {
228 pr_err("node_to_cpumask_map[%i] NULL\n", node);
229 dump_stack();
230 return NULL;
231 }
232
233 cpulist_scnprintf(buf, sizeof(buf), mask);
234 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
235 enable ? "numa_add_cpu" : "numa_remove_cpu",
236 cpu, node, buf);
237 return mask;
238}
239
240# ifndef CONFIG_NUMA_EMU
241static void __cpuinit numa_set_cpumask(int cpu, int enable)
242{
243 struct cpumask *mask;
244
245 mask = debug_cpumask_set_cpu(cpu, enable);
246 if (!mask)
247 return;
248
249 if (enable)
250 cpumask_set_cpu(cpu, mask);
251 else
252 cpumask_clear_cpu(cpu, mask);
253}
254
255void __cpuinit numa_add_cpu(int cpu)
256{
257 numa_set_cpumask(cpu, 1);
258}
259
260void __cpuinit numa_remove_cpu(int cpu)
261{
262 numa_set_cpumask(cpu, 0);
263}
264# endif /* !CONFIG_NUMA_EMU */
265
39/* 266/*
40 * Returns a pointer to the bitmask of CPUs on Node 'node'. 267 * Returns a pointer to the bitmask of CPUs on Node 'node'.
41 */ 268 */
@@ -58,4 +285,5 @@ const struct cpumask *cpumask_of_node(int node)
58 return node_to_cpumask_map[node]; 285 return node_to_cpumask_map[node];
59} 286}
60EXPORT_SYMBOL(cpumask_of_node); 287EXPORT_SYMBOL(cpumask_of_node);
61#endif 288
289#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
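Note: the numa_init_array() fallback consolidated into numa.c above assigns a node to every possible CPU that the firmware tables left unmapped, cycling over the online nodes. The following standalone user-space sketch is illustrative only and not part of the patch; the CPU/node counts are made up and NO_NODE stands in for NUMA_NO_NODE.

#include <stdio.h>

#define NR_CPUS  8                      /* made-up sizes for the example */
#define NR_NODES 3
#define NO_NODE  (-1)

int main(void)
{
        /* some CPUs already mapped by firmware tables, the rest unmapped */
        int cpu_to_node[NR_CPUS] = { 0, NO_NODE, 1, NO_NODE, NO_NODE, 2, NO_NODE, NO_NODE };
        int rr = 0;                     /* start at the first online node */

        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (cpu_to_node[cpu] != NO_NODE)
                        continue;               /* keep the firmware mapping */
                cpu_to_node[cpu] = rr;          /* round robin the rest */
                rr = (rr + 1) % NR_NODES;
        }

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu %d -> node %d\n", cpu, cpu_to_node[cpu]);
        return 0;
}

The kernel version walks node_online_map rather than a dense 0..NR_NODES range, but the assignment policy is the same.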
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 84a3e4c9f277..bde3906420df 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
110 110
111static unsigned long kva_start_pfn; 111static unsigned long kva_start_pfn;
112static unsigned long kva_pages; 112static unsigned long kva_pages;
113
114int __cpuinit numa_cpu_node(int cpu)
115{
116 return apic->x86_32_numa_cpu_node(cpu);
117}
118
113/* 119/*
114 * FLAT - support for basic PC memory model with discontig enabled, essentially 120 * FLAT - support for basic PC memory model with discontig enabled, essentially
115 * a single node with all available processors in it with a flat 121 * a single node with all available processors in it with a flat
@@ -346,8 +352,7 @@ static void init_remap_allocator(int nid)
346 (ulong) node_remap_end_vaddr[nid]); 352 (ulong) node_remap_end_vaddr[nid]);
347} 353}
348 354
349void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 355void __init initmem_init(void)
350 int acpi, int k8)
351{ 356{
352 int nid; 357 int nid;
353 long kva_target_pfn; 358 long kva_target_pfn;
@@ -361,6 +366,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
361 */ 366 */
362 367
363 get_memcfg_numa(); 368 get_memcfg_numa();
369 numa_init_array();
364 370
365 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); 371 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
366 372
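Note: on 32-bit, the new numa_cpu_node() simply defers to the APIC driver via the x86_32_numa_cpu_node callback named in the hunk above. The sketch below is a hedged, standalone model of that delegation pattern; struct apic_driver and the "flat" driver returning node 0 are invented here purely for illustration.

#include <stdio.h>

/* Stand-in for the APIC driver; only the callback modelled here matters. */
struct apic_driver {
        int (*x86_32_numa_cpu_node)(int cpu);
};

/* Made-up "flat" driver that places every CPU on node 0 (illustrative). */
static int flat_x86_32_numa_cpu_node(int cpu)
{
        (void)cpu;
        return 0;
}

static const struct apic_driver flat_apic = {
        .x86_32_numa_cpu_node = flat_x86_32_numa_cpu_node,
};
static const struct apic_driver *apic = &flat_apic;

/* Same shape as the numa_cpu_node() added to numa_32.c: defer to the driver. */
static int numa_cpu_node(int cpu)
{
        return apic->x86_32_numa_cpu_node(cpu);
}

int main(void)
{
        printf("cpu 3 -> node %d\n", numa_cpu_node(3));
        return 0;
}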
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 1e72102e80c9..e8c00cc72033 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -13,32 +13,30 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/nodemask.h> 14#include <linux/nodemask.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/acpi.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/proto.h> 19#include <asm/proto.h>
19#include <asm/dma.h> 20#include <asm/dma.h>
20#include <asm/numa.h>
21#include <asm/acpi.h> 21#include <asm/acpi.h>
22#include <asm/amd_nb.h> 22#include <asm/amd_nb.h>
23 23
24#include "numa_internal.h"
25
24struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
25EXPORT_SYMBOL(node_data); 27EXPORT_SYMBOL(node_data);
26 28
27struct memnode memnode; 29nodemask_t numa_nodes_parsed __initdata;
28 30
29s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 31struct memnode memnode;
30 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
31};
32 32
33int numa_off __initdata;
34static unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
35static unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
36 35
37/* 36static struct numa_meminfo numa_meminfo __initdata;
38 * Map cpu index to node index 37
39 */ 38static int numa_distance_cnt;
40DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 39static u8 *numa_distance;
41EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
42 40
43/* 41/*
44 * Given a shift value, try to populate memnodemap[] 42 * Given a shift value, try to populate memnodemap[]
@@ -47,16 +45,15 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
47 * 0 if memnodmap[] too small (of shift too small) 45 * 0 if memnodmap[] too small (of shift too small)
48 * -1 if node overlap or lost ram (shift too big) 46 * -1 if node overlap or lost ram (shift too big)
49 */ 47 */
50static int __init populate_memnodemap(const struct bootnode *nodes, 48static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
51 int numnodes, int shift, int *nodeids)
52{ 49{
53 unsigned long addr, end; 50 unsigned long addr, end;
54 int i, res = -1; 51 int i, res = -1;
55 52
56 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); 53 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
57 for (i = 0; i < numnodes; i++) { 54 for (i = 0; i < mi->nr_blks; i++) {
58 addr = nodes[i].start; 55 addr = mi->blk[i].start;
59 end = nodes[i].end; 56 end = mi->blk[i].end;
60 if (addr >= end) 57 if (addr >= end)
61 continue; 58 continue;
62 if ((end >> shift) >= memnodemapsize) 59 if ((end >> shift) >= memnodemapsize)
@@ -64,12 +61,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
64 do { 61 do {
65 if (memnodemap[addr >> shift] != NUMA_NO_NODE) 62 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
66 return -1; 63 return -1;
67 64 memnodemap[addr >> shift] = mi->blk[i].nid;
68 if (!nodeids)
69 memnodemap[addr >> shift] = i;
70 else
71 memnodemap[addr >> shift] = nodeids[i];
72
73 addr += (1UL << shift); 65 addr += (1UL << shift);
74 } while (addr < end); 66 } while (addr < end);
75 res = 1; 67 res = 1;
@@ -87,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
87 79
88 addr = 0x8000; 80 addr = 0x8000;
89 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 81 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
90 nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, 82 nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
91 nodemap_size, L1_CACHE_BYTES); 83 nodemap_size, L1_CACHE_BYTES);
92 if (nodemap_addr == MEMBLOCK_ERROR) { 84 if (nodemap_addr == MEMBLOCK_ERROR) {
93 printk(KERN_ERR 85 printk(KERN_ERR
@@ -107,16 +99,15 @@ static int __init allocate_cachealigned_memnodemap(void)
107 * The LSB of all start and end addresses in the node map is the value of the 99 * The LSB of all start and end addresses in the node map is the value of the
108 * maximum possible shift. 100 * maximum possible shift.
109 */ 101 */
110static int __init extract_lsb_from_nodes(const struct bootnode *nodes, 102static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
111 int numnodes)
112{ 103{
113 int i, nodes_used = 0; 104 int i, nodes_used = 0;
114 unsigned long start, end; 105 unsigned long start, end;
115 unsigned long bitfield = 0, memtop = 0; 106 unsigned long bitfield = 0, memtop = 0;
116 107
117 for (i = 0; i < numnodes; i++) { 108 for (i = 0; i < mi->nr_blks; i++) {
118 start = nodes[i].start; 109 start = mi->blk[i].start;
119 end = nodes[i].end; 110 end = mi->blk[i].end;
120 if (start >= end) 111 if (start >= end)
121 continue; 112 continue;
122 bitfield |= start; 113 bitfield |= start;
@@ -132,18 +123,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
132 return i; 123 return i;
133} 124}
134 125
135int __init compute_hash_shift(struct bootnode *nodes, int numnodes, 126static int __init compute_hash_shift(const struct numa_meminfo *mi)
136 int *nodeids)
137{ 127{
138 int shift; 128 int shift;
139 129
140 shift = extract_lsb_from_nodes(nodes, numnodes); 130 shift = extract_lsb_from_nodes(mi);
141 if (allocate_cachealigned_memnodemap()) 131 if (allocate_cachealigned_memnodemap())
142 return -1; 132 return -1;
143 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", 133 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
144 shift); 134 shift);
145 135
146 if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { 136 if (populate_memnodemap(mi, shift) != 1) {
147 printk(KERN_INFO "Your memory is not aligned you need to " 137 printk(KERN_INFO "Your memory is not aligned you need to "
148 "rebuild your kernel with a bigger NODEMAPSIZE " 138 "rebuild your kernel with a bigger NODEMAPSIZE "
149 "shift=%d\n", shift); 139 "shift=%d\n", shift);
@@ -189,6 +179,63 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
189 return NULL; 179 return NULL;
190} 180}
191 181
182static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
183 struct numa_meminfo *mi)
184{
185 /* ignore zero length blks */
186 if (start == end)
187 return 0;
188
189 /* whine about and ignore invalid blks */
190 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
191 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
192 nid, start, end);
193 return 0;
194 }
195
196 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
197 pr_err("NUMA: too many memblk ranges\n");
198 return -EINVAL;
199 }
200
201 mi->blk[mi->nr_blks].start = start;
202 mi->blk[mi->nr_blks].end = end;
203 mi->blk[mi->nr_blks].nid = nid;
204 mi->nr_blks++;
205 return 0;
206}
207
208/**
209 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
210 * @idx: Index of memblk to remove
211 * @mi: numa_meminfo to remove memblk from
212 *
213 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
214 * decrementing @mi->nr_blks.
215 */
216void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
217{
218 mi->nr_blks--;
219 memmove(&mi->blk[idx], &mi->blk[idx + 1],
220 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
221}
222
223/**
224 * numa_add_memblk - Add one numa_memblk to numa_meminfo
225 * @nid: NUMA node ID of the new memblk
226 * @start: Start address of the new memblk
227 * @end: End address of the new memblk
228 *
229 * Add a new memblk to the default numa_meminfo.
230 *
231 * RETURNS:
232 * 0 on success, -errno on failure.
233 */
234int __init numa_add_memblk(int nid, u64 start, u64 end)
235{
236 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
237}
238
192/* Initialize bootmem allocator for a node */ 239/* Initialize bootmem allocator for a node */
193void __init 240void __init
194setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 241setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
@@ -235,709 +282,386 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
235 node_set_online(nodeid); 282 node_set_online(nodeid);
236} 283}
237 284
238/* 285/**
239 * There are unfortunately some poorly designed mainboards around that 286 * numa_cleanup_meminfo - Cleanup a numa_meminfo
240 * only connect memory to a single CPU. This breaks the 1:1 cpu->node 287 * @mi: numa_meminfo to clean up
241 * mapping. To avoid this fill in the mapping for all possible CPUs, 288 *
 242 * as the number of CPUs is not known yet. We round robin the existing 289 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
243 * nodes. 290 * conflicts and clear unused memblks.
291 *
292 * RETURNS:
293 * 0 on success, -errno on failure.
244 */ 294 */
245void __init numa_init_array(void) 295int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
246{ 296{
247 int rr, i; 297 const u64 low = 0;
298 const u64 high = (u64)max_pfn << PAGE_SHIFT;
299 int i, j, k;
248 300
249 rr = first_node(node_online_map); 301 for (i = 0; i < mi->nr_blks; i++) {
250 for (i = 0; i < nr_cpu_ids; i++) { 302 struct numa_memblk *bi = &mi->blk[i];
251 if (early_cpu_to_node(i) != NUMA_NO_NODE)
252 continue;
253 numa_set_node(i, rr);
254 rr = next_node(rr, node_online_map);
255 if (rr == MAX_NUMNODES)
256 rr = first_node(node_online_map);
257 }
258}
259
260#ifdef CONFIG_NUMA_EMU
261/* Numa emulation */
262static struct bootnode nodes[MAX_NUMNODES] __initdata;
263static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
264static char *cmdline __initdata;
265 303
266static int __init setup_physnodes(unsigned long start, unsigned long end, 304 /* make sure all blocks are inside the limits */
267 int acpi, int amd) 305 bi->start = max(bi->start, low);
268{ 306 bi->end = min(bi->end, high);
269 int ret = 0;
270 int i;
271 307
272 memset(physnodes, 0, sizeof(physnodes)); 308 /* and there's no empty block */
273#ifdef CONFIG_ACPI_NUMA 309 if (bi->start == bi->end) {
274 if (acpi) 310 numa_remove_memblk_from(i--, mi);
275 acpi_get_nodes(physnodes, start, end);
276#endif
277#ifdef CONFIG_AMD_NUMA
278 if (amd)
279 amd_get_nodes(physnodes);
280#endif
281 /*
282 * Basic sanity checking on the physical node map: there may be errors
283 * if the SRAT or AMD code incorrectly reported the topology or the mem=
284 * kernel parameter is used.
285 */
286 for (i = 0; i < MAX_NUMNODES; i++) {
287 if (physnodes[i].start == physnodes[i].end)
288 continue;
289 if (physnodes[i].start > end) {
290 physnodes[i].end = physnodes[i].start;
291 continue;
292 }
293 if (physnodes[i].end < start) {
294 physnodes[i].start = physnodes[i].end;
295 continue; 311 continue;
296 } 312 }
297 if (physnodes[i].start < start)
298 physnodes[i].start = start;
299 if (physnodes[i].end > end)
300 physnodes[i].end = end;
301 ret++;
302 }
303 313
304 /* 314 for (j = i + 1; j < mi->nr_blks; j++) {
305 * If no physical topology was detected, a single node is faked to cover 315 struct numa_memblk *bj = &mi->blk[j];
306 * the entire address space. 316 unsigned long start, end;
307 */
308 if (!ret) {
309 physnodes[ret].start = start;
310 physnodes[ret].end = end;
311 ret = 1;
312 }
313 return ret;
314}
315
316static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
317{
318 int i;
319
320 BUG_ON(acpi && amd);
321#ifdef CONFIG_ACPI_NUMA
322 if (acpi)
323 acpi_fake_nodes(nodes, nr_nodes);
324#endif
325#ifdef CONFIG_AMD_NUMA
326 if (amd)
327 amd_fake_nodes(nodes, nr_nodes);
328#endif
329 if (!acpi && !amd)
330 for (i = 0; i < nr_cpu_ids; i++)
331 numa_set_node(i, 0);
332}
333
334/*
335 * Setups up nid to range from addr to addr + size. If the end
336 * boundary is greater than max_addr, then max_addr is used instead.
337 * The return value is 0 if there is additional memory left for
338 * allocation past addr and -1 otherwise. addr is adjusted to be at
339 * the end of the node.
340 */
341static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
342{
343 int ret = 0;
344 nodes[nid].start = *addr;
345 *addr += size;
346 if (*addr >= max_addr) {
347 *addr = max_addr;
348 ret = -1;
349 }
350 nodes[nid].end = *addr;
351 node_set(nid, node_possible_map);
352 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
353 nodes[nid].start, nodes[nid].end,
354 (nodes[nid].end - nodes[nid].start) >> 20);
355 return ret;
356}
357
358/*
359 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
360 * to max_addr. The return value is the number of nodes allocated.
361 */
362static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
363{
364 nodemask_t physnode_mask = NODE_MASK_NONE;
365 u64 size;
366 int big;
367 int ret = 0;
368 int i;
369
370 if (nr_nodes <= 0)
371 return -1;
372 if (nr_nodes > MAX_NUMNODES) {
373 pr_info("numa=fake=%d too large, reducing to %d\n",
374 nr_nodes, MAX_NUMNODES);
375 nr_nodes = MAX_NUMNODES;
376 }
377
378 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
379 /*
380 * Calculate the number of big nodes that can be allocated as a result
381 * of consolidating the remainder.
382 */
383 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
384 FAKE_NODE_MIN_SIZE;
385
386 size &= FAKE_NODE_MIN_HASH_MASK;
387 if (!size) {
388 pr_err("Not enough memory for each node. "
389 "NUMA emulation disabled.\n");
390 return -1;
391 }
392
393 for (i = 0; i < MAX_NUMNODES; i++)
394 if (physnodes[i].start != physnodes[i].end)
395 node_set(i, physnode_mask);
396
397 /*
398 * Continue to fill physical nodes with fake nodes until there is no
399 * memory left on any of them.
400 */
401 while (nodes_weight(physnode_mask)) {
402 for_each_node_mask(i, physnode_mask) {
403 u64 end = physnodes[i].start + size;
404 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
405
406 if (ret < big)
407 end += FAKE_NODE_MIN_SIZE;
408 317
409 /* 318 /*
410 * Continue to add memory to this fake node if its 319 * See whether there are overlapping blocks. Whine
411 * non-reserved memory is less than the per-node size. 320 * about but allow overlaps of the same nid. They
321 * will be merged below.
412 */ 322 */
413 while (end - physnodes[i].start - 323 if (bi->end > bj->start && bi->start < bj->end) {
414 memblock_x86_hole_size(physnodes[i].start, end) < size) { 324 if (bi->nid != bj->nid) {
415 end += FAKE_NODE_MIN_SIZE; 325 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
416 if (end > physnodes[i].end) { 326 bi->nid, bi->start, bi->end,
417 end = physnodes[i].end; 327 bj->nid, bj->start, bj->end);
418 break; 328 return -EINVAL;
419 } 329 }
330 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
331 bi->nid, bi->start, bi->end,
332 bj->start, bj->end);
420 } 333 }
421 334
422 /* 335 /*
423 * If there won't be at least FAKE_NODE_MIN_SIZE of 336 * Join together blocks on the same node, holes
424 * non-reserved memory in ZONE_DMA32 for the next node, 337 * between which don't overlap with memory on other
425 * this one must extend to the boundary. 338 * nodes.
426 */ 339 */
427 if (end < dma32_end && dma32_end - end - 340 if (bi->nid != bj->nid)
428 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 341 continue;
429 end = dma32_end; 342 start = max(min(bi->start, bj->start), low);
430 343 end = min(max(bi->end, bj->end), high);
431 /* 344 for (k = 0; k < mi->nr_blks; k++) {
432 * If there won't be enough non-reserved memory for the 345 struct numa_memblk *bk = &mi->blk[k];
433 * next node, this one must extend to the end of the 346
434 * physical node. 347 if (bi->nid == bk->nid)
435 */ 348 continue;
436 if (physnodes[i].end - end - 349 if (start < bk->end && end > bk->start)
437 memblock_x86_hole_size(end, physnodes[i].end) < size) 350 break;
438 end = physnodes[i].end; 351 }
439 352 if (k < mi->nr_blks)
440 /* 353 continue;
441 * Avoid allocating more nodes than requested, which can 354 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
442 * happen as a result of rounding down each node's size 355 bi->nid, bi->start, bi->end, bj->start, bj->end,
443 * to FAKE_NODE_MIN_SIZE. 356 start, end);
444 */ 357 bi->start = start;
445 if (nodes_weight(physnode_mask) + ret >= nr_nodes) 358 bi->end = end;
446 end = physnodes[i].end; 359 numa_remove_memblk_from(j--, mi);
447
448 if (setup_node_range(ret++, &physnodes[i].start,
449 end - physnodes[i].start,
450 physnodes[i].end) < 0)
451 node_clear(i, physnode_mask);
452 } 360 }
453 } 361 }
454 return ret;
455}
456
457/*
458 * Returns the end address of a node so that there is at least `size' amount of
459 * non-reserved memory or `max_addr' is reached.
460 */
461static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
462{
463 u64 end = start + size;
464 362
465 while (end - start - memblock_x86_hole_size(start, end) < size) { 363 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
466 end += FAKE_NODE_MIN_SIZE; 364 mi->blk[i].start = mi->blk[i].end = 0;
467 if (end > max_addr) { 365 mi->blk[i].nid = NUMA_NO_NODE;
468 end = max_addr;
469 break;
470 }
471 } 366 }
472 return end; 367
368 return 0;
473} 369}
474 370
475/* 371/*
476 * Sets up fake nodes of `size' interleaved over physical nodes ranging from 372 * Set nodes, which have memory in @mi, in *@nodemask.
477 * `addr' to `max_addr'. The return value is the number of nodes allocated.
478 */ 373 */
479static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) 374static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
375 const struct numa_meminfo *mi)
480{ 376{
481 nodemask_t physnode_mask = NODE_MASK_NONE;
482 u64 min_size;
483 int ret = 0;
484 int i; 377 int i;
485 378
486 if (!size) 379 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
487 return -1; 380 if (mi->blk[i].start != mi->blk[i].end &&
488 /* 381 mi->blk[i].nid != NUMA_NO_NODE)
489 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is 382 node_set(mi->blk[i].nid, *nodemask);
490 * increased accordingly if the requested size is too small. This 383}
491 * creates a uniform distribution of node sizes across the entire
492 * machine (but not necessarily over physical nodes).
493 */
494 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
495 MAX_NUMNODES;
496 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
497 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
498 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
499 FAKE_NODE_MIN_HASH_MASK;
500 if (size < min_size) {
501 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
502 size >> 20, min_size >> 20);
503 size = min_size;
504 }
505 size &= FAKE_NODE_MIN_HASH_MASK;
506
507 for (i = 0; i < MAX_NUMNODES; i++)
508 if (physnodes[i].start != physnodes[i].end)
509 node_set(i, physnode_mask);
510 /*
511 * Fill physical nodes with fake nodes of size until there is no memory
512 * left on any of them.
513 */
514 while (nodes_weight(physnode_mask)) {
515 for_each_node_mask(i, physnode_mask) {
516 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
517 u64 end;
518
519 end = find_end_of_node(physnodes[i].start,
520 physnodes[i].end, size);
521 /*
522 * If there won't be at least FAKE_NODE_MIN_SIZE of
523 * non-reserved memory in ZONE_DMA32 for the next node,
524 * this one must extend to the boundary.
525 */
526 if (end < dma32_end && dma32_end - end -
527 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
528 end = dma32_end;
529 384
530 /* 385/**
531 * If there won't be enough non-reserved memory for the 386 * numa_reset_distance - Reset NUMA distance table
532 * next node, this one must extend to the end of the 387 *
533 * physical node. 388 * The current table is freed. The next numa_set_distance() call will
534 */ 389 * create a new one.
535 if (physnodes[i].end - end - 390 */
536 memblock_x86_hole_size(end, physnodes[i].end) < size) 391void __init numa_reset_distance(void)
537 end = physnodes[i].end; 392{
393 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
538 394
539 /* 395 /* numa_distance could be 1LU marking allocation failure, test cnt */
540 * Setup the fake node that will be allocated as bootmem 396 if (numa_distance_cnt)
541 * later. If setup_node_range() returns non-zero, there 397 memblock_x86_free_range(__pa(numa_distance),
542 * is no more memory available on this physical node. 398 __pa(numa_distance) + size);
543 */ 399 numa_distance_cnt = 0;
544 if (setup_node_range(ret++, &physnodes[i].start, 400 numa_distance = NULL; /* enable table creation */
545 end - physnodes[i].start,
546 physnodes[i].end) < 0)
547 node_clear(i, physnode_mask);
548 }
549 }
550 return ret;
551} 401}
552 402
553/* 403static int __init numa_alloc_distance(void)
554 * Sets up the system RAM area from start_pfn to last_pfn according to the
555 * numa=fake command-line option.
556 */
557static int __init numa_emulation(unsigned long start_pfn,
558 unsigned long last_pfn, int acpi, int amd)
559{ 404{
560 u64 addr = start_pfn << PAGE_SHIFT; 405 nodemask_t nodes_parsed;
561 u64 max_addr = last_pfn << PAGE_SHIFT; 406 size_t size;
562 int num_nodes; 407 int i, j, cnt = 0;
563 int i; 408 u64 phys;
564 409
565 /* 410 /* size the new table and allocate it */
566 * If the numa=fake command-line contains a 'M' or 'G', it represents 411 nodes_parsed = numa_nodes_parsed;
567 * the fixed node size. Otherwise, if it is just a single number N, 412 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
568 * split the system RAM into N fake nodes.
569 */
570 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
571 u64 size;
572 413
573 size = memparse(cmdline, &cmdline); 414 for_each_node_mask(i, nodes_parsed)
574 num_nodes = split_nodes_size_interleave(addr, max_addr, size); 415 cnt = i;
575 } else { 416 cnt++;
576 unsigned long n; 417 size = cnt * cnt * sizeof(numa_distance[0]);
577 418
578 n = simple_strtoul(cmdline, NULL, 0); 419 phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
579 num_nodes = split_nodes_interleave(addr, max_addr, n); 420 size, PAGE_SIZE);
421 if (phys == MEMBLOCK_ERROR) {
422 pr_warning("NUMA: Warning: can't allocate distance table!\n");
423 /* don't retry until explicitly reset */
424 numa_distance = (void *)1LU;
425 return -ENOMEM;
580 } 426 }
427 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
581 428
582 if (num_nodes < 0) 429 numa_distance = __va(phys);
583 return num_nodes; 430 numa_distance_cnt = cnt;
584 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); 431
585 if (memnode_shift < 0) { 432 /* fill with the default distances */
586 memnode_shift = 0; 433 for (i = 0; i < cnt; i++)
587 printk(KERN_ERR "No NUMA hash function found. NUMA emulation " 434 for (j = 0; j < cnt; j++)
588 "disabled.\n"); 435 numa_distance[i * cnt + j] = i == j ?
589 return -1; 436 LOCAL_DISTANCE : REMOTE_DISTANCE;
590 } 437 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
591 438
592 /*
593 * We need to vacate all active ranges that may have been registered for
594 * the e820 memory map.
595 */
596 remove_all_active_ranges();
597 for_each_node_mask(i, node_possible_map) {
598 memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
599 nodes[i].end >> PAGE_SHIFT);
600 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
601 }
602 setup_physnodes(addr, max_addr, acpi, amd);
603 fake_physnodes(acpi, amd, num_nodes);
604 numa_init_array();
605 return 0; 439 return 0;
606} 440}
607#endif /* CONFIG_NUMA_EMU */
608 441
609void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, 442/**
 610 int acpi, int amd) 443 * numa_set_distance - Set NUMA distance from one NUMA node to another
444 * @from: the 'from' node to set distance
445 * @to: the 'to' node to set distance
446 * @distance: NUMA distance
447 *
 448 * Set the distance from node @from to @to to @distance. If the distance table
449 * doesn't exist, one which is large enough to accommodate all the currently
450 * known nodes will be created.
451 *
 452 * If such a table cannot be allocated, a warning is printed and further
453 * calls are ignored until the distance table is reset with
454 * numa_reset_distance().
455 *
456 * If @from or @to is higher than the highest known node at the time of
457 * table creation or @distance doesn't make sense, the call is ignored.
458 * This is to allow simplification of specific NUMA config implementations.
459 */
460void __init numa_set_distance(int from, int to, int distance)
611{ 461{
612 int i; 462 if (!numa_distance && numa_alloc_distance() < 0)
613
614 nodes_clear(node_possible_map);
615 nodes_clear(node_online_map);
616
617#ifdef CONFIG_NUMA_EMU
618 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
619 acpi, amd);
620 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
621 return; 463 return;
622 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
623 acpi, amd);
624 nodes_clear(node_possible_map);
625 nodes_clear(node_online_map);
626#endif
627 464
628#ifdef CONFIG_ACPI_NUMA 465 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
629 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 466 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
630 last_pfn << PAGE_SHIFT)) 467 from, to, distance);
631 return; 468 return;
632 nodes_clear(node_possible_map); 469 }
633 nodes_clear(node_online_map);
634#endif
635 470
636#ifdef CONFIG_AMD_NUMA 471 if ((u8)distance != distance ||
637 if (!numa_off && amd && !amd_scan_nodes()) 472 (from == to && distance != LOCAL_DISTANCE)) {
473 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
474 from, to, distance);
638 return; 475 return;
639 nodes_clear(node_possible_map); 476 }
640 nodes_clear(node_online_map);
641#endif
642 printk(KERN_INFO "%s\n",
643 numa_off ? "NUMA turned off" : "No NUMA configuration found");
644 477
645 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 478 numa_distance[from * numa_distance_cnt + to] = distance;
646 start_pfn << PAGE_SHIFT,
647 last_pfn << PAGE_SHIFT);
648 /* setup dummy node covering all memory */
649 memnode_shift = 63;
650 memnodemap = memnode.embedded_map;
651 memnodemap[0] = 0;
652 node_set_online(0);
653 node_set(0, node_possible_map);
654 for (i = 0; i < nr_cpu_ids; i++)
655 numa_set_node(i, 0);
656 memblock_x86_register_active_regions(0, start_pfn, last_pfn);
657 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
658} 479}
659 480
660unsigned long __init numa_free_all_bootmem(void) 481int __node_distance(int from, int to)
661{ 482{
662 unsigned long pages = 0; 483 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
663 int i; 484 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
485 return numa_distance[from * numa_distance_cnt + to];
486}
487EXPORT_SYMBOL(__node_distance);
664 488
665 for_each_online_node(i) 489/*
666 pages += free_all_bootmem_node(NODE_DATA(i)); 490 * Sanity check to catch more bad NUMA configurations (they are amazingly
491 * common). Make sure the nodes cover all memory.
492 */
493static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
494{
495 unsigned long numaram, e820ram;
496 int i;
667 497
668 pages += free_all_memory_core_early(MAX_NUMNODES); 498 numaram = 0;
499 for (i = 0; i < mi->nr_blks; i++) {
500 unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
501 unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
502 numaram += e - s;
503 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
504 if ((long)numaram < 0)
505 numaram = 0;
506 }
669 507
670 return pages; 508 e820ram = max_pfn - (memblock_x86_hole_size(0,
509 max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
510 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
511 if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
512 printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
513 (numaram << PAGE_SHIFT) >> 20,
514 (e820ram << PAGE_SHIFT) >> 20);
515 return false;
516 }
517 return true;
671} 518}
672 519
673static __init int numa_setup(char *opt) 520static int __init numa_register_memblks(struct numa_meminfo *mi)
674{ 521{
675 if (!opt) 522 int i, nid;
523
524 /* Account for nodes with cpus and no memory */
525 node_possible_map = numa_nodes_parsed;
526 numa_nodemask_from_meminfo(&node_possible_map, mi);
527 if (WARN_ON(nodes_empty(node_possible_map)))
676 return -EINVAL; 528 return -EINVAL;
677 if (!strncmp(opt, "off", 3))
678 numa_off = 1;
679#ifdef CONFIG_NUMA_EMU
680 if (!strncmp(opt, "fake=", 5))
681 cmdline = opt + 5;
682#endif
683#ifdef CONFIG_ACPI_NUMA
684 if (!strncmp(opt, "noacpi", 6))
685 acpi_numa = -1;
686#endif
687 return 0;
688}
689early_param("numa", numa_setup);
690 529
691#ifdef CONFIG_NUMA 530 memnode_shift = compute_hash_shift(mi);
531 if (memnode_shift < 0) {
532 printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
533 return -EINVAL;
534 }
692 535
693static __init int find_near_online_node(int node) 536 for (i = 0; i < mi->nr_blks; i++)
694{ 537 memblock_x86_register_active_regions(mi->blk[i].nid,
695 int n, val; 538 mi->blk[i].start >> PAGE_SHIFT,
696 int min_val = INT_MAX; 539 mi->blk[i].end >> PAGE_SHIFT);
697 int best_node = -1; 540
541 /* for out of order entries */
542 sort_node_map();
543 if (!numa_meminfo_cover_memory(mi))
544 return -EINVAL;
698 545
699 for_each_online_node(n) { 546 /* Finally register nodes. */
700 val = node_distance(node, n); 547 for_each_node_mask(nid, node_possible_map) {
548 u64 start = (u64)max_pfn << PAGE_SHIFT;
549 u64 end = 0;
701 550
702 if (val < min_val) { 551 for (i = 0; i < mi->nr_blks; i++) {
703 min_val = val; 552 if (nid != mi->blk[i].nid)
704 best_node = n; 553 continue;
554 start = min(mi->blk[i].start, start);
555 end = max(mi->blk[i].end, end);
705 } 556 }
557
558 if (start < end)
559 setup_node_bootmem(nid, start, end);
706 } 560 }
707 561
708 return best_node; 562 return 0;
709} 563}
710 564
711/* 565/**
 712 * Setup early cpu_to_node. 566 * dummy_numa_init - Fallback dummy NUMA init
713 * 567 *
714 * Populate cpu_to_node[] only if x86_cpu_to_apicid[], 568 * Used if there's no underlying NUMA architecture, NUMA initialization
715 * and apicid_to_node[] tables have valid entries for a CPU. 569 * fails, or NUMA is disabled on the command line.
716 * This means we skip cpu_to_node[] initialisation for NUMA
717 * emulation and faking node case (when running a kernel compiled
718 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
719 * is already initialized in a round robin manner at numa_init_array,
720 * prior to this call, and this initialization is good enough
721 * for the fake NUMA cases.
722 * 570 *
723 * Called before the per_cpu areas are setup. 571 * Must online at least one node and add memory blocks that cover all
572 * allowed memory. This function must not fail.
724 */ 573 */
725void __init init_cpu_to_node(void) 574static int __init dummy_numa_init(void)
726{ 575{
727 int cpu; 576 printk(KERN_INFO "%s\n",
728 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 577 numa_off ? "NUMA turned off" : "No NUMA configuration found");
729 578 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
730 BUG_ON(cpu_to_apicid == NULL); 579 0LU, max_pfn << PAGE_SHIFT);
731 580
732 for_each_possible_cpu(cpu) { 581 node_set(0, numa_nodes_parsed);
733 int node; 582 numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
734 u16 apicid = cpu_to_apicid[cpu];
735 583
736 if (apicid == BAD_APICID) 584 return 0;
737 continue;
738 node = apicid_to_node[apicid];
739 if (node == NUMA_NO_NODE)
740 continue;
741 if (!node_online(node))
742 node = find_near_online_node(node);
743 numa_set_node(cpu, node);
744 }
745} 585}
746#endif
747 586
748 587static int __init numa_init(int (*init_func)(void))
749void __cpuinit numa_set_node(int cpu, int node)
750{ 588{
751 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 589 int i;
752 590 int ret;
753 /* early setting, no percpu area yet */
754 if (cpu_to_node_map) {
755 cpu_to_node_map[cpu] = node;
756 return;
757 }
758
759#ifdef CONFIG_DEBUG_PER_CPU_MAPS
760 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
761 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
762 dump_stack();
763 return;
764 }
765#endif
766 per_cpu(x86_cpu_to_node_map, cpu) = node;
767 591
768 if (node != NUMA_NO_NODE) 592 for (i = 0; i < MAX_LOCAL_APIC; i++)
769 set_cpu_numa_node(cpu, node); 593 set_apicid_to_node(i, NUMA_NO_NODE);
770}
771 594
772void __cpuinit numa_clear_node(int cpu) 595 nodes_clear(numa_nodes_parsed);
773{ 596 nodes_clear(node_possible_map);
774 numa_set_node(cpu, NUMA_NO_NODE); 597 nodes_clear(node_online_map);
775} 598 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
776 599 remove_all_active_ranges();
777#ifndef CONFIG_DEBUG_PER_CPU_MAPS 600 numa_reset_distance();
778 601
779#ifndef CONFIG_NUMA_EMU 602 ret = init_func();
780void __cpuinit numa_add_cpu(int cpu) 603 if (ret < 0)
781{ 604 return ret;
782 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 605 ret = numa_cleanup_meminfo(&numa_meminfo);
783} 606 if (ret < 0)
607 return ret;
784 608
785void __cpuinit numa_remove_cpu(int cpu) 609 numa_emulation(&numa_meminfo, numa_distance_cnt);
786{
787 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
788}
789#else
790void __cpuinit numa_add_cpu(int cpu)
791{
792 unsigned long addr;
793 u16 apicid;
794 int physnid;
795 int nid = NUMA_NO_NODE;
796 610
797 apicid = early_per_cpu(x86_cpu_to_apicid, cpu); 611 ret = numa_register_memblks(&numa_meminfo);
798 if (apicid != BAD_APICID) 612 if (ret < 0)
799 nid = apicid_to_node[apicid]; 613 return ret;
800 if (nid == NUMA_NO_NODE)
801 nid = early_cpu_to_node(cpu);
802 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
803 614
804 /* 615 for (i = 0; i < nr_cpu_ids; i++) {
805 * Use the starting address of the emulated node to find which physical 616 int nid = early_cpu_to_node(i);
806 * node it is allocated on.
807 */
808 addr = node_start_pfn(nid) << PAGE_SHIFT;
809 for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
810 if (addr >= physnodes[physnid].start &&
811 addr < physnodes[physnid].end)
812 break;
813 617
814 /* 618 if (nid == NUMA_NO_NODE)
815 * Map the cpu to each emulated node that is allocated on the physical 619 continue;
816 * node of the cpu's apic id. 620 if (!node_online(nid))
817 */ 621 numa_clear_node(i);
818 for_each_online_node(nid) {
819 addr = node_start_pfn(nid) << PAGE_SHIFT;
820 if (addr >= physnodes[physnid].start &&
821 addr < physnodes[physnid].end)
822 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
823 } 622 }
623 numa_init_array();
624 return 0;
824} 625}
825 626
826void __cpuinit numa_remove_cpu(int cpu) 627void __init initmem_init(void)
827{ 628{
828 int i; 629 int ret;
829 630
830 for_each_online_node(i) 631 if (!numa_off) {
831 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 632#ifdef CONFIG_ACPI_NUMA
832} 633 ret = numa_init(x86_acpi_numa_init);
833#endif /* !CONFIG_NUMA_EMU */ 634 if (!ret)
834 635 return;
835#else /* CONFIG_DEBUG_PER_CPU_MAPS */ 636#endif
836static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) 637#ifdef CONFIG_AMD_NUMA
837{ 638 ret = numa_init(amd_numa_init);
838 int node = early_cpu_to_node(cpu); 639 if (!ret)
839 struct cpumask *mask; 640 return;
840 char buf[64]; 641#endif
841
842 mask = node_to_cpumask_map[node];
843 if (!mask) {
844 pr_err("node_to_cpumask_map[%i] NULL\n", node);
845 dump_stack();
846 return NULL;
847 } 642 }
848 643
849 cpulist_scnprintf(buf, sizeof(buf), mask); 644 numa_init(dummy_numa_init);
850 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
851 enable ? "numa_add_cpu" : "numa_remove_cpu",
852 cpu, node, buf);
853 return mask;
854} 645}
855 646
856/* 647unsigned long __init numa_free_all_bootmem(void)
857 * --------- debug versions of the numa functions ---------
858 */
859#ifndef CONFIG_NUMA_EMU
860static void __cpuinit numa_set_cpumask(int cpu, int enable)
861{
862 struct cpumask *mask;
863
864 mask = debug_cpumask_set_cpu(cpu, enable);
865 if (!mask)
866 return;
867
868 if (enable)
869 cpumask_set_cpu(cpu, mask);
870 else
871 cpumask_clear_cpu(cpu, mask);
872}
873#else
874static void __cpuinit numa_set_cpumask(int cpu, int enable)
875{ 648{
876 int node = early_cpu_to_node(cpu); 649 unsigned long pages = 0;
877 struct cpumask *mask;
878 int i; 650 int i;
879 651
880 for_each_online_node(i) { 652 for_each_online_node(i)
881 unsigned long addr; 653 pages += free_all_bootmem_node(NODE_DATA(i));
882
883 addr = node_start_pfn(i) << PAGE_SHIFT;
884 if (addr < physnodes[node].start ||
885 addr >= physnodes[node].end)
886 continue;
887 mask = debug_cpumask_set_cpu(cpu, enable);
888 if (!mask)
889 return;
890
891 if (enable)
892 cpumask_set_cpu(cpu, mask);
893 else
894 cpumask_clear_cpu(cpu, mask);
895 }
896}
897#endif /* CONFIG_NUMA_EMU */
898 654
899void __cpuinit numa_add_cpu(int cpu) 655 pages += free_all_memory_core_early(MAX_NUMNODES);
900{
901 numa_set_cpumask(cpu, 1);
902}
903 656
904void __cpuinit numa_remove_cpu(int cpu) 657 return pages;
905{
906 numa_set_cpumask(cpu, 0);
907} 658}
908 659
909int __cpu_to_node(int cpu) 660int __cpuinit numa_cpu_node(int cpu)
910{ 661{
911 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 662 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
912 printk(KERN_WARNING
913 "cpu_to_node(%d): usage too early!\n", cpu);
914 dump_stack();
915 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
916 }
917 return per_cpu(x86_cpu_to_node_map, cpu);
918}
919EXPORT_SYMBOL(__cpu_to_node);
920 663
921/* 664 if (apicid != BAD_APICID)
922 * Same function as cpu_to_node() but used if called before the 665 return __apicid_to_node[apicid];
923 * per_cpu areas are setup. 666 return NUMA_NO_NODE;
924 */
925int early_cpu_to_node(int cpu)
926{
927 if (early_per_cpu_ptr(x86_cpu_to_node_map))
928 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
929
930 if (!cpu_possible(cpu)) {
931 printk(KERN_WARNING
932 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
933 dump_stack();
934 return NUMA_NO_NODE;
935 }
936 return per_cpu(x86_cpu_to_node_map, cpu);
937} 667}
938
939/*
940 * --------- end of debug versions of the numa functions ---------
941 */
942
943#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
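Note: numa_set_distance() and __node_distance() introduced above keep the node distance matrix as a flat cnt*cnt byte array indexed as from * cnt + to, pre-filled with LOCAL_DISTANCE on the diagonal and REMOTE_DISTANCE elsewhere. The self-contained sketch below models that indexing and the bounds/sanity checks; NODE_CNT, set_distance() and node_distance() are stand-ins for the kernel symbols, and the 10/20 distance values follow the usual ACPI convention but are assumptions here.

#include <stdio.h>

#define LOCAL_DISTANCE  10
#define REMOTE_DISTANCE 20
#define NODE_CNT        4               /* made-up node count */

static unsigned char dist[NODE_CNT * NODE_CNT];

/* Same shape as numa_set_distance(): ignore out-of-range or nonsense input. */
static void set_distance(int from, int to, int d)
{
        if (from >= NODE_CNT || to >= NODE_CNT)
                return;
        if ((unsigned char)d != d || (from == to && d != LOCAL_DISTANCE))
                return;
        dist[from * NODE_CNT + to] = (unsigned char)d;
}

/* Same shape as __node_distance(): fall back to defaults when out of range. */
static int node_distance(int from, int to)
{
        if (from >= NODE_CNT || to >= NODE_CNT)
                return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
        return dist[from * NODE_CNT + to];
}

int main(void)
{
        /* fill with defaults, as numa_alloc_distance() does */
        for (int i = 0; i < NODE_CNT; i++)
                for (int j = 0; j < NODE_CNT; j++)
                        dist[i * NODE_CNT + j] = (i == j) ? LOCAL_DISTANCE
                                                          : REMOTE_DISTANCE;

        set_distance(0, 3, 32);         /* e.g. a firmware-provided distance */
        printf("d(0,3)=%d d(3,0)=%d d(2,2)=%d\n",
               node_distance(0, 3), node_distance(3, 0), node_distance(2, 2));
        return 0;
}

As in the kernel, each direction is stored separately, so setting d(0,3) leaves d(3,0) at the default until the caller sets it too.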
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..ad091e4cff17
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,494 @@
1/*
2 * NUMA emulation
3 */
4#include <linux/kernel.h>
5#include <linux/errno.h>
6#include <linux/topology.h>
7#include <linux/memblock.h>
8#include <asm/dma.h>
9
10#include "numa_internal.h"
11
12static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
13static char *emu_cmdline __initdata;
14
15void __init numa_emu_cmdline(char *str)
16{
17 emu_cmdline = str;
18}
19
20static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
21{
22 int i;
23
24 for (i = 0; i < mi->nr_blks; i++)
25 if (mi->blk[i].nid == nid)
26 return i;
27 return -ENOENT;
28}
29
30/*
31 * Sets up nid to range from @start to @end. The return value is -errno if
32 * something went wrong, 0 otherwise.
33 */
34static int __init emu_setup_memblk(struct numa_meminfo *ei,
35 struct numa_meminfo *pi,
36 int nid, int phys_blk, u64 size)
37{
38 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
39 struct numa_memblk *pb = &pi->blk[phys_blk];
40
41 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
42 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
43 return -EINVAL;
44 }
45
46 ei->nr_blks++;
47 eb->start = pb->start;
48 eb->end = pb->start + size;
49 eb->nid = nid;
50
51 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
52 emu_nid_to_phys[nid] = pb->nid;
53
54 pb->start += size;
55 if (pb->start >= pb->end) {
56 WARN_ON_ONCE(pb->start > pb->end);
57 numa_remove_memblk_from(phys_blk, pi);
58 }
59
60 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
61 eb->start, eb->end, (eb->end - eb->start) >> 20);
62 return 0;
63}
64
65/*
66 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
67 * to max_addr. The return value is the number of nodes allocated.
68 */
69static int __init split_nodes_interleave(struct numa_meminfo *ei,
70 struct numa_meminfo *pi,
71 u64 addr, u64 max_addr, int nr_nodes)
72{
73 nodemask_t physnode_mask = NODE_MASK_NONE;
74 u64 size;
75 int big;
76 int nid = 0;
77 int i, ret;
78
79 if (nr_nodes <= 0)
80 return -1;
81 if (nr_nodes > MAX_NUMNODES) {
82 pr_info("numa=fake=%d too large, reducing to %d\n",
83 nr_nodes, MAX_NUMNODES);
84 nr_nodes = MAX_NUMNODES;
85 }
86
87 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
88 /*
89 * Calculate the number of big nodes that can be allocated as a result
90 * of consolidating the remainder.
91 */
92 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
93 FAKE_NODE_MIN_SIZE;
94
95 size &= FAKE_NODE_MIN_HASH_MASK;
96 if (!size) {
97 pr_err("Not enough memory for each node. "
98 "NUMA emulation disabled.\n");
99 return -1;
100 }
101
102 for (i = 0; i < pi->nr_blks; i++)
103 node_set(pi->blk[i].nid, physnode_mask);
104
105 /*
106 * Continue to fill physical nodes with fake nodes until there is no
107 * memory left on any of them.
108 */
109 while (nodes_weight(physnode_mask)) {
110 for_each_node_mask(i, physnode_mask) {
111 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
112 u64 start, limit, end;
113 int phys_blk;
114
115 phys_blk = emu_find_memblk_by_nid(i, pi);
116 if (phys_blk < 0) {
117 node_clear(i, physnode_mask);
118 continue;
119 }
120 start = pi->blk[phys_blk].start;
121 limit = pi->blk[phys_blk].end;
122 end = start + size;
123
124 if (nid < big)
125 end += FAKE_NODE_MIN_SIZE;
126
127 /*
128 * Continue to add memory to this fake node if its
129 * non-reserved memory is less than the per-node size.
130 */
131 while (end - start -
132 memblock_x86_hole_size(start, end) < size) {
133 end += FAKE_NODE_MIN_SIZE;
134 if (end > limit) {
135 end = limit;
136 break;
137 }
138 }
139
140 /*
141 * If there won't be at least FAKE_NODE_MIN_SIZE of
142 * non-reserved memory in ZONE_DMA32 for the next node,
143 * this one must extend to the boundary.
144 */
145 if (end < dma32_end && dma32_end - end -
146 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
147 end = dma32_end;
148
149 /*
150 * If there won't be enough non-reserved memory for the
151 * next node, this one must extend to the end of the
152 * physical node.
153 */
154 if (limit - end -
155 memblock_x86_hole_size(end, limit) < size)
156 end = limit;
157
158 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
159 phys_blk,
160 min(end, limit) - start);
161 if (ret < 0)
162 return ret;
163 }
164 }
165 return 0;
166}
167
168/*
169 * Returns the end address of a node so that there is at least `size' amount of
170 * non-reserved memory or `max_addr' is reached.
171 */
172static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
173{
174 u64 end = start + size;
175
176 while (end - start - memblock_x86_hole_size(start, end) < size) {
177 end += FAKE_NODE_MIN_SIZE;
178 if (end > max_addr) {
179 end = max_addr;
180 break;
181 }
182 }
183 return end;
184}
185
186/*
187 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
188 * `addr' to `max_addr'. The return value is the number of nodes allocated.
189 */
190static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
191 struct numa_meminfo *pi,
192 u64 addr, u64 max_addr, u64 size)
193{
194 nodemask_t physnode_mask = NODE_MASK_NONE;
195 u64 min_size;
196 int nid = 0;
197 int i, ret;
198
199 if (!size)
200 return -1;
201 /*
202 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
203 * increased accordingly if the requested size is too small. This
204 * creates a uniform distribution of node sizes across the entire
205 * machine (but not necessarily over physical nodes).
206 */
207 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
208 MAX_NUMNODES;
209 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
210 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
211 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
212 FAKE_NODE_MIN_HASH_MASK;
213 if (size < min_size) {
214 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
215 size >> 20, min_size >> 20);
216 size = min_size;
217 }
218 size &= FAKE_NODE_MIN_HASH_MASK;
219
220 for (i = 0; i < pi->nr_blks; i++)
221 node_set(pi->blk[i].nid, physnode_mask);
222
223 /*
224 * Fill physical nodes with fake nodes of size until there is no memory
225 * left on any of them.
226 */
227 while (nodes_weight(physnode_mask)) {
228 for_each_node_mask(i, physnode_mask) {
229 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
230 u64 start, limit, end;
231 int phys_blk;
232
233 phys_blk = emu_find_memblk_by_nid(i, pi);
234 if (phys_blk < 0) {
235 node_clear(i, physnode_mask);
236 continue;
237 }
238 start = pi->blk[phys_blk].start;
239 limit = pi->blk[phys_blk].end;
240
241 end = find_end_of_node(start, limit, size);
242 /*
243 * If there won't be at least FAKE_NODE_MIN_SIZE of
244 * non-reserved memory in ZONE_DMA32 for the next node,
245 * this one must extend to the boundary.
246 */
247 if (end < dma32_end && dma32_end - end -
248 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
249 end = dma32_end;
250
251 /*
252 * If there won't be enough non-reserved memory for the
253 * next node, this one must extend to the end of the
254 * physical node.
255 */
256 if (limit - end -
257 memblock_x86_hole_size(end, limit) < size)
258 end = limit;
259
260 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
261 phys_blk,
262 min(end, limit) - start);
263 if (ret < 0)
264 return ret;
265 }
266 }
267 return 0;
268}
269
270/**
271 * numa_emulation - Emulate NUMA nodes
272 * @numa_meminfo: NUMA configuration to massage
273 * @numa_dist_cnt: The size of the physical NUMA distance table
274 *
275 * Emulate NUMA nodes according to the numa=fake kernel parameter.
276 * @numa_meminfo contains the physical memory configuration and is modified
277 * to reflect the emulated configuration on success. @numa_dist_cnt is
278 * used to determine the size of the physical distance table.
279 *
280 * On success, the following modifications are made.
281 *
282 * - @numa_meminfo is updated to reflect the emulated nodes.
283 *
284 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
285 * emulated nodes.
286 *
287 * - NUMA distance table is rebuilt to represent distances between emulated
288 * nodes. The distances are determined considering how emulated nodes
289 * are mapped to physical nodes and match the actual distances.
290 *
291 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
292 * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
293 *
294 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
295 * identity mapping and no other modification is made.
296 */
297void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
298{
299 static struct numa_meminfo ei __initdata;
300 static struct numa_meminfo pi __initdata;
301 const u64 max_addr = max_pfn << PAGE_SHIFT;
302 u8 *phys_dist = NULL;
303 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
304 int max_emu_nid, dfl_phys_nid;
305 int i, j, ret;
306
307 if (!emu_cmdline)
308 goto no_emu;
309
310 memset(&ei, 0, sizeof(ei));
311 pi = *numa_meminfo;
312
313 for (i = 0; i < MAX_NUMNODES; i++)
314 emu_nid_to_phys[i] = NUMA_NO_NODE;
315
316 /*
317 * If the numa=fake command-line contains a 'M' or 'G', it represents
318 * the fixed node size. Otherwise, if it is just a single number N,
319 * split the system RAM into N fake nodes.
320 */
321 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
322 u64 size;
323
324 size = memparse(emu_cmdline, &emu_cmdline);
325 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
326 } else {
327 unsigned long n;
328
329 n = simple_strtoul(emu_cmdline, NULL, 0);
330 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
331 }
332
333 if (ret < 0)
334 goto no_emu;
335
336 if (numa_cleanup_meminfo(&ei) < 0) {
337 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
338 goto no_emu;
339 }
340
341 /* copy the physical distance table */
342 if (numa_dist_cnt) {
343 u64 phys;
344
345 phys = memblock_find_in_range(0,
346 (u64)max_pfn_mapped << PAGE_SHIFT,
347 phys_size, PAGE_SIZE);
348 if (phys == MEMBLOCK_ERROR) {
349 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
350 goto no_emu;
351 }
352 memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
353 phys_dist = __va(phys);
354
355 for (i = 0; i < numa_dist_cnt; i++)
356 for (j = 0; j < numa_dist_cnt; j++)
357 phys_dist[i * numa_dist_cnt + j] =
358 node_distance(i, j);
359 }
360
361 /*
362 * Determine the max emulated nid and the default phys nid to use
363 * for unmapped nodes.
364 */
365 max_emu_nid = 0;
366 dfl_phys_nid = NUMA_NO_NODE;
367 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
368 if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
369 max_emu_nid = i;
370 if (dfl_phys_nid == NUMA_NO_NODE)
371 dfl_phys_nid = emu_nid_to_phys[i];
372 }
373 }
374 if (dfl_phys_nid == NUMA_NO_NODE) {
375 pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
376 goto no_emu;
377 }
378
379 /* commit */
380 *numa_meminfo = ei;
381
382 /*
383 * Transform __apicid_to_node table to use emulated nids by
384 * reverse-mapping phys_nid. The maps should always exist but fall
385 * back to zero just in case.
386 */
387 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
388 if (__apicid_to_node[i] == NUMA_NO_NODE)
389 continue;
390 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
391 if (__apicid_to_node[i] == emu_nid_to_phys[j])
392 break;
393 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
394 }
395
396 /* make sure all emulated nodes are mapped to a physical node */
397 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
398 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
399 emu_nid_to_phys[i] = dfl_phys_nid;
400
401 /* transform distance table */
402 numa_reset_distance();
403 for (i = 0; i < max_emu_nid + 1; i++) {
404 for (j = 0; j < max_emu_nid + 1; j++) {
405 int physi = emu_nid_to_phys[i];
406 int physj = emu_nid_to_phys[j];
407 int dist;
408
409 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
410 dist = physi == physj ?
411 LOCAL_DISTANCE : REMOTE_DISTANCE;
412 else
413 dist = phys_dist[physi * numa_dist_cnt + physj];
414
415 numa_set_distance(i, j, dist);
416 }
417 }
418
419 /* free the copied physical distance table */
420 if (phys_dist)
421 memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
422 return;
423
424no_emu:
425 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
426 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
427 emu_nid_to_phys[i] = i;
428}
429
430#ifndef CONFIG_DEBUG_PER_CPU_MAPS
431void __cpuinit numa_add_cpu(int cpu)
432{
433 int physnid, nid;
434
435 nid = early_cpu_to_node(cpu);
436 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
437
438 physnid = emu_nid_to_phys[nid];
439
440 /*
441 * Map the cpu to each emulated node that is allocated on the physical
442 * node of the cpu's apic id.
443 */
444 for_each_online_node(nid)
445 if (emu_nid_to_phys[nid] == physnid)
446 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
447}
448
449void __cpuinit numa_remove_cpu(int cpu)
450{
451 int i;
452
453 for_each_online_node(i)
454 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
455}
456#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
457static void __cpuinit numa_set_cpumask(int cpu, int enable)
458{
459 struct cpumask *mask;
460 int nid, physnid, i;
461
462 nid = early_cpu_to_node(cpu);
463 if (nid == NUMA_NO_NODE) {
464 /* early_cpu_to_node() already emits a warning and trace */
465 return;
466 }
467
468 physnid = emu_nid_to_phys[nid];
469
470 for_each_online_node(i) {
471 if (emu_nid_to_phys[nid] != physnid)
472 continue;
473
474 mask = debug_cpumask_set_cpu(cpu, enable);
475 if (!mask)
476 return;
477
478 if (enable)
479 cpumask_set_cpu(cpu, mask);
480 else
481 cpumask_clear_cpu(cpu, mask);
482 }
483}
484
485void __cpuinit numa_add_cpu(int cpu)
486{
487 numa_set_cpumask(cpu, 1);
488}
489
490void __cpuinit numa_remove_cpu(int cpu)
491{
492 numa_set_cpumask(cpu, 0);
493}
494#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
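
The two transformation loops above (reverse-mapping __apicid_to_node[] onto emulated nids, then rebuilding the node distance table through emu_nid_to_phys[]) can be modelled outside the kernel. Below is a minimal, self-contained userspace sketch of that logic; the array sizes, the sample two-socket topology and the 10/20 local/remote distance values are assumptions chosen for the example, not values taken from this patch.

#include <stdio.h>

#define NUMA_NO_NODE    (-1)
#define LOCAL_DISTANCE  10
#define REMOTE_DISTANCE 20
#define MAX_APICS       8
#define MAX_EMU_NODES   4

int main(void)
{
        /* sample topology: 2 physical nodes, 4 emulated nodes (2 per phys node) */
        int apicid_to_node[MAX_APICS] = { 0, 0, 1, 1, NUMA_NO_NODE,
                                          NUMA_NO_NODE, NUMA_NO_NODE, NUMA_NO_NODE };
        int emu_nid_to_phys[MAX_EMU_NODES] = { 0, 0, 1, 1 };
        int phys_dist[2][2] = { { LOCAL_DISTANCE,  REMOTE_DISTANCE },
                                { REMOTE_DISTANCE, LOCAL_DISTANCE } };
        int numa_dist_cnt = 2;
        int i, j;

        /* reverse-map each apicid to the first emulated nid on its phys node */
        for (i = 0; i < MAX_APICS; i++) {
                if (apicid_to_node[i] == NUMA_NO_NODE)
                        continue;
                for (j = 0; j < MAX_EMU_NODES; j++)
                        if (apicid_to_node[i] == emu_nid_to_phys[j])
                                break;
                apicid_to_node[i] = j < MAX_EMU_NODES ? j : 0;
        }

        /* rebuild the emulated distance table through emu_nid_to_phys[] */
        for (i = 0; i < MAX_EMU_NODES; i++)
                for (j = 0; j < MAX_EMU_NODES; j++) {
                        int pi = emu_nid_to_phys[i], pj = emu_nid_to_phys[j];
                        int d = (pi >= numa_dist_cnt || pj >= numa_dist_cnt) ?
                                (pi == pj ? LOCAL_DISTANCE : REMOTE_DISTANCE) :
                                phys_dist[pi][pj];
                        printf("dist(%d,%d) = %d\n", i, j, d);
                }

        for (i = 0; i < MAX_APICS; i++)
                printf("apicid %d -> emulated nid %d\n", i, apicid_to_node[i]);
        return 0;
}

With the sample data this prints two emulated nodes per physical node, with intra-socket pairs at the local distance and cross-socket pairs at the remote distance, mirroring what the code above commits into the real tables.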
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000000..ef2d97377d7c
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,31 @@
1#ifndef __X86_MM_NUMA_INTERNAL_H
2#define __X86_MM_NUMA_INTERNAL_H
3
4#include <linux/types.h>
5#include <asm/numa.h>
6
7struct numa_memblk {
8 u64 start;
9 u64 end;
10 int nid;
11};
12
13struct numa_meminfo {
14 int nr_blks;
15 struct numa_memblk blk[NR_NODE_MEMBLKS];
16};
17
18void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void);
21
22#ifdef CONFIG_NUMA_EMU
23void __init numa_emulation(struct numa_meminfo *numa_meminfo,
24 int numa_dist_cnt);
25#else
26static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
27 int numa_dist_cnt)
28{ }
29#endif
30
31#endif /* __X86_MM_NUMA_INTERNAL_H */
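
As a rough illustration of the interface declared in this header, the sketch below mimics a numa_meminfo table in userspace and shows one plausible way a remove-by-index helper such as numa_remove_memblk_from() can keep blk[] densely packed. This is a simplified model under that assumption, not the kernel implementation, and the sample address ranges are invented.

#include <stdio.h>
#include <string.h>

#define NR_NODE_MEMBLKS 16

struct numa_memblk { unsigned long long start, end; int nid; };
struct numa_meminfo { int nr_blks; struct numa_memblk blk[NR_NODE_MEMBLKS]; };

/* drop blk[idx] and keep the remaining entries densely packed */
static void remove_memblk_from(int idx, struct numa_meminfo *mi)
{
        mi->nr_blks--;
        memmove(&mi->blk[idx], &mi->blk[idx + 1],
                (mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

int main(void)
{
        struct numa_meminfo mi = {
                .nr_blks = 3,
                .blk = {
                        { 0x00000000ULL, 0x40000000ULL, 0 },
                        { 0x40000000ULL, 0x40000000ULL, 1 },   /* empty block */
                        { 0x40000000ULL, 0x80000000ULL, 1 },
                },
        };
        int i;

        remove_memblk_from(1, &mi);
        for (i = 0; i < mi.nr_blks; i++)
                printf("nid %d: [%#llx, %#llx)\n",
                       mi.blk[i].nid, mi.blk[i].start, mi.blk[i].end);
        return 0;
}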
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8b830ca14ac4..f9e526742fa1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -57,12 +57,10 @@ static unsigned long direct_pages_count[PG_LEVEL_NUM];
57 57
58void update_page_count(int level, unsigned long pages) 58void update_page_count(int level, unsigned long pages)
59{ 59{
60 unsigned long flags;
61
62 /* Protect against CPA */ 60 /* Protect against CPA */
63 spin_lock_irqsave(&pgd_lock, flags); 61 spin_lock(&pgd_lock);
64 direct_pages_count[level] += pages; 62 direct_pages_count[level] += pages;
65 spin_unlock_irqrestore(&pgd_lock, flags); 63 spin_unlock(&pgd_lock);
66} 64}
67 65
68static void split_page_count(int level) 66static void split_page_count(int level)
@@ -256,7 +254,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
256 unsigned long pfn) 254 unsigned long pfn)
257{ 255{
258 pgprot_t forbidden = __pgprot(0); 256 pgprot_t forbidden = __pgprot(0);
259 pgprot_t required = __pgprot(0);
260 257
261 /* 258 /*
262 * The BIOS area between 640k and 1Mb needs to be executable for 259 * The BIOS area between 640k and 1Mb needs to be executable for
@@ -282,12 +279,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
282 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, 279 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
283 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 280 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
284 pgprot_val(forbidden) |= _PAGE_RW; 281 pgprot_val(forbidden) |= _PAGE_RW;
285 /*
286 * .data and .bss should always be writable.
287 */
288 if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
289 within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
290 pgprot_val(required) |= _PAGE_RW;
291 282
292#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) 283#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
293 /* 284 /*
@@ -319,7 +310,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
319 * these shared mappings are made of small page mappings. 310 * these shared mappings are made of small page mappings.
320 * Thus this don't enforce !RW mapping for small page kernel 311 * Thus this don't enforce !RW mapping for small page kernel
321 * text mapping logic will help Linux Xen parvirt guest boot 312 * text mapping logic will help Linux Xen parvirt guest boot
322 * aswell. 313 * as well.
323 */ 314 */
324 if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) 315 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
325 pgprot_val(forbidden) |= _PAGE_RW; 316 pgprot_val(forbidden) |= _PAGE_RW;
@@ -327,7 +318,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
327#endif 318#endif
328 319
329 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 320 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
330 prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
331 321
332 return prot; 322 return prot;
333} 323}
@@ -402,7 +392,7 @@ static int
402try_preserve_large_page(pte_t *kpte, unsigned long address, 392try_preserve_large_page(pte_t *kpte, unsigned long address,
403 struct cpa_data *cpa) 393 struct cpa_data *cpa)
404{ 394{
405 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; 395 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
406 pte_t new_pte, old_pte, *tmp; 396 pte_t new_pte, old_pte, *tmp;
407 pgprot_t old_prot, new_prot, req_prot; 397 pgprot_t old_prot, new_prot, req_prot;
408 int i, do_split = 1; 398 int i, do_split = 1;
@@ -411,7 +401,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
411 if (cpa->force_split) 401 if (cpa->force_split)
412 return 1; 402 return 1;
413 403
414 spin_lock_irqsave(&pgd_lock, flags); 404 spin_lock(&pgd_lock);
415 /* 405 /*
416 * Check for races, another CPU might have split this page 406 * Check for races, another CPU might have split this page
417 * up already: 407 * up already:
@@ -506,14 +496,14 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
506 } 496 }
507 497
508out_unlock: 498out_unlock:
509 spin_unlock_irqrestore(&pgd_lock, flags); 499 spin_unlock(&pgd_lock);
510 500
511 return do_split; 501 return do_split;
512} 502}
513 503
514static int split_large_page(pte_t *kpte, unsigned long address) 504static int split_large_page(pte_t *kpte, unsigned long address)
515{ 505{
516 unsigned long flags, pfn, pfninc = 1; 506 unsigned long pfn, pfninc = 1;
517 unsigned int i, level; 507 unsigned int i, level;
518 pte_t *pbase, *tmp; 508 pte_t *pbase, *tmp;
519 pgprot_t ref_prot; 509 pgprot_t ref_prot;
@@ -527,7 +517,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
527 if (!base) 517 if (!base)
528 return -ENOMEM; 518 return -ENOMEM;
529 519
530 spin_lock_irqsave(&pgd_lock, flags); 520 spin_lock(&pgd_lock);
531 /* 521 /*
532 * Check for races, another CPU might have split this page 522 * Check for races, another CPU might have split this page
533 * up for us already: 523 * up for us already:
@@ -599,7 +589,7 @@ out_unlock:
599 */ 589 */
600 if (base) 590 if (base)
601 __free_page(base); 591 __free_page(base);
602 spin_unlock_irqrestore(&pgd_lock, flags); 592 spin_unlock(&pgd_lock);
603 593
604 return 0; 594 return 0;
605} 595}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 500242d3c96d..8573b83a63d0 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -121,14 +121,12 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
121 121
122static void pgd_dtor(pgd_t *pgd) 122static void pgd_dtor(pgd_t *pgd)
123{ 123{
124 unsigned long flags; /* can be called from interrupt context */
125
126 if (SHARED_KERNEL_PMD) 124 if (SHARED_KERNEL_PMD)
127 return; 125 return;
128 126
129 spin_lock_irqsave(&pgd_lock, flags); 127 spin_lock(&pgd_lock);
130 pgd_list_del(pgd); 128 pgd_list_del(pgd);
131 spin_unlock_irqrestore(&pgd_lock, flags); 129 spin_unlock(&pgd_lock);
132} 130}
133 131
134/* 132/*
@@ -170,8 +168,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
170 * section 8.1: in PAE mode we explicitly have to flush the 168 * section 8.1: in PAE mode we explicitly have to flush the
171 * TLB via cr3 if the top-level pgd is changed... 169 * TLB via cr3 if the top-level pgd is changed...
172 */ 170 */
173 if (mm == current->active_mm) 171 flush_tlb_mm(mm);
174 write_cr3(read_cr3());
175} 172}
176#else /* !CONFIG_X86_PAE */ 173#else /* !CONFIG_X86_PAE */
177 174
@@ -260,7 +257,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
260{ 257{
261 pgd_t *pgd; 258 pgd_t *pgd;
262 pmd_t *pmds[PREALLOCATED_PMDS]; 259 pmd_t *pmds[PREALLOCATED_PMDS];
263 unsigned long flags;
264 260
265 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); 261 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
266 262
@@ -280,12 +276,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
280 * respect to anything walking the pgd_list, so that they 276 * respect to anything walking the pgd_list, so that they
281 * never see a partially populated pgd. 277 * never see a partially populated pgd.
282 */ 278 */
283 spin_lock_irqsave(&pgd_lock, flags); 279 spin_lock(&pgd_lock);
284 280
285 pgd_ctor(mm, pgd); 281 pgd_ctor(mm, pgd);
286 pgd_prepopulate_pmd(mm, pgd, pmds); 282 pgd_prepopulate_pmd(mm, pgd, pmds);
287 283
288 spin_unlock_irqrestore(&pgd_lock, flags); 284 spin_unlock(&pgd_lock);
289 285
290 return pgd; 286 return pgd;
291 287
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index f16434568a51..364f36bdfad8 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -57,9 +57,8 @@ struct node_memory_chunk_s {
57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; 57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
58 58
59static int __initdata num_memory_chunks; /* total number of memory chunks */ 59static int __initdata num_memory_chunks; /* total number of memory chunks */
60static u8 __initdata apicid_to_pxm[MAX_APICID]; 60static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
61 61
62int numa_off __initdata;
63int acpi_numa __initdata; 62int acpi_numa __initdata;
64 63
65static __init void bad_srat(void) 64static __init void bad_srat(void)
@@ -212,10 +211,12 @@ int __init get_memcfg_from_srat(void)
212{ 211{
213 int i, j, nid; 212 int i, j, nid;
214 213
215
216 if (srat_disabled()) 214 if (srat_disabled())
217 goto out_fail; 215 goto out_fail;
218 216
217 if (acpi_numa_init() < 0)
218 goto out_fail;
219
219 if (num_memory_chunks == 0) { 220 if (num_memory_chunks == 0) {
220 printk(KERN_DEBUG 221 printk(KERN_DEBUG
221 "could not find any ACPI SRAT memory areas.\n"); 222 "could not find any ACPI SRAT memory areas.\n");
@@ -255,8 +256,8 @@ int __init get_memcfg_from_srat(void)
255 printk(KERN_DEBUG "Number of memory chunks in system = %d\n", 256 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
256 num_memory_chunks); 257 num_memory_chunks);
257 258
258 for (i = 0; i < MAX_APICID; i++) 259 for (i = 0; i < MAX_LOCAL_APIC; i++)
259 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); 260 set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
260 261
261 for (j = 0; j < num_memory_chunks; j++){ 262 for (j = 0; j < num_memory_chunks; j++){
262 struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; 263 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 603d285d1daa..8e9d3394f6d4 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -26,88 +26,34 @@
26 26
27int acpi_numa __initdata; 27int acpi_numa __initdata;
28 28
29static struct acpi_table_slit *acpi_slit;
30
31static nodemask_t nodes_parsed __initdata;
32static nodemask_t cpu_nodes_parsed __initdata;
33static struct bootnode nodes[MAX_NUMNODES] __initdata;
34static struct bootnode nodes_add[MAX_NUMNODES]; 29static struct bootnode nodes_add[MAX_NUMNODES];
35 30
36static int num_node_memblks __initdata;
37static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
38static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
39
40static __init int setup_node(int pxm) 31static __init int setup_node(int pxm)
41{ 32{
42 return acpi_map_pxm_to_node(pxm); 33 return acpi_map_pxm_to_node(pxm);
43} 34}
44 35
45static __init int conflicting_memblks(unsigned long start, unsigned long end)
46{
47 int i;
48 for (i = 0; i < num_node_memblks; i++) {
49 struct bootnode *nd = &node_memblk_range[i];
50 if (nd->start == nd->end)
51 continue;
52 if (nd->end > start && nd->start < end)
53 return memblk_nodeid[i];
54 if (nd->end == end && nd->start == start)
55 return memblk_nodeid[i];
56 }
57 return -1;
58}
59
60static __init void cutoff_node(int i, unsigned long start, unsigned long end)
61{
62 struct bootnode *nd = &nodes[i];
63
64 if (nd->start < start) {
65 nd->start = start;
66 if (nd->end < nd->start)
67 nd->start = nd->end;
68 }
69 if (nd->end > end) {
70 nd->end = end;
71 if (nd->start > nd->end)
72 nd->start = nd->end;
73 }
74}
75
76static __init void bad_srat(void) 36static __init void bad_srat(void)
77{ 37{
78 int i;
79 printk(KERN_ERR "SRAT: SRAT not used.\n"); 38 printk(KERN_ERR "SRAT: SRAT not used.\n");
80 acpi_numa = -1; 39 acpi_numa = -1;
81 for (i = 0; i < MAX_LOCAL_APIC; i++) 40 memset(nodes_add, 0, sizeof(nodes_add));
82 apicid_to_node[i] = NUMA_NO_NODE;
83 for (i = 0; i < MAX_NUMNODES; i++) {
84 nodes[i].start = nodes[i].end = 0;
85 nodes_add[i].start = nodes_add[i].end = 0;
86 }
87 remove_all_active_ranges();
88} 41}
89 42
90static __init inline int srat_disabled(void) 43static __init inline int srat_disabled(void)
91{ 44{
92 return numa_off || acpi_numa < 0; 45 return acpi_numa < 0;
93} 46}
94 47
95/* Callback for SLIT parsing */ 48/* Callback for SLIT parsing */
96void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 49void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
97{ 50{
98 unsigned length; 51 int i, j;
99 unsigned long phys;
100
101 length = slit->header.length;
102 phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
103 PAGE_SIZE);
104
105 if (phys == MEMBLOCK_ERROR)
106 panic(" Can not save slit!\n");
107 52
108 acpi_slit = __va(phys); 53 for (i = 0; i < slit->locality_count; i++)
109 memcpy(acpi_slit, slit, length); 54 for (j = 0; j < slit->locality_count; j++)
110 memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); 55 numa_set_distance(pxm_to_node(i), pxm_to_node(j),
56 slit->entry[slit->locality_count * i + j]);
111} 57}
112 58
113/* Callback for Proximity Domain -> x2APIC mapping */ 59/* Callback for Proximity Domain -> x2APIC mapping */
@@ -138,8 +84,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
138 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); 84 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
139 return; 85 return;
140 } 86 }
141 apicid_to_node[apic_id] = node; 87 set_apicid_to_node(apic_id, node);
142 node_set(node, cpu_nodes_parsed); 88 node_set(node, numa_nodes_parsed);
143 acpi_numa = 1; 89 acpi_numa = 1;
144 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", 90 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
145 pxm, apic_id, node); 91 pxm, apic_id, node);
@@ -178,8 +124,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
178 return; 124 return;
179 } 125 }
180 126
181 apicid_to_node[apic_id] = node; 127 set_apicid_to_node(apic_id, node);
182 node_set(node, cpu_nodes_parsed); 128 node_set(node, numa_nodes_parsed);
183 acpi_numa = 1; 129 acpi_numa = 1;
184 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", 130 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
185 pxm, apic_id, node); 131 pxm, apic_id, node);
@@ -241,7 +187,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
241 } 187 }
242 188
243 if (changed) { 189 if (changed) {
244 node_set(node, cpu_nodes_parsed); 190 node_set(node, numa_nodes_parsed);
245 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", 191 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
246 nd->start, nd->end); 192 nd->start, nd->end);
247 } 193 }
@@ -251,10 +197,8 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
251void __init 197void __init
252acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 198acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
253{ 199{
254 struct bootnode *nd, oldnode;
255 unsigned long start, end; 200 unsigned long start, end;
256 int node, pxm; 201 int node, pxm;
257 int i;
258 202
259 if (srat_disabled()) 203 if (srat_disabled())
260 return; 204 return;
@@ -276,300 +220,31 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
276 bad_srat(); 220 bad_srat();
277 return; 221 return;
278 } 222 }
279 i = conflicting_memblks(start, end); 223
280 if (i == node) { 224 if (numa_add_memblk(node, start, end) < 0) {
281 printk(KERN_WARNING
282 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
283 pxm, start, end, nodes[i].start, nodes[i].end);
284 } else if (i >= 0) {
285 printk(KERN_ERR
286 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
287 pxm, start, end, node_to_pxm(i),
288 nodes[i].start, nodes[i].end);
289 bad_srat(); 225 bad_srat();
290 return; 226 return;
291 } 227 }
292 nd = &nodes[node];
293 oldnode = *nd;
294 if (!node_test_and_set(node, nodes_parsed)) {
295 nd->start = start;
296 nd->end = end;
297 } else {
298 if (start < nd->start)
299 nd->start = start;
300 if (nd->end < end)
301 nd->end = end;
302 }
303 228
304 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 229 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
305 start, end); 230 start, end);
306 231
307 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { 232 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
308 update_nodes_add(node, start, end); 233 update_nodes_add(node, start, end);
309 /* restore nodes[node] */
310 *nd = oldnode;
311 if ((nd->start | nd->end) == 0)
312 node_clear(node, nodes_parsed);
313 }
314
315 node_memblk_range[num_node_memblks].start = start;
316 node_memblk_range[num_node_memblks].end = end;
317 memblk_nodeid[num_node_memblks] = node;
318 num_node_memblks++;
319}
320
321/* Sanity check to catch more bad SRATs (they are amazingly common).
322 Make sure the PXMs cover all memory. */
323static int __init nodes_cover_memory(const struct bootnode *nodes)
324{
325 int i;
326 unsigned long pxmram, e820ram;
327
328 pxmram = 0;
329 for_each_node_mask(i, nodes_parsed) {
330 unsigned long s = nodes[i].start >> PAGE_SHIFT;
331 unsigned long e = nodes[i].end >> PAGE_SHIFT;
332 pxmram += e - s;
333 pxmram -= __absent_pages_in_range(i, s, e);
334 if ((long)pxmram < 0)
335 pxmram = 0;
336 }
337
338 e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
339 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
340 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
341 printk(KERN_ERR
342 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
343 (pxmram << PAGE_SHIFT) >> 20,
344 (e820ram << PAGE_SHIFT) >> 20);
345 return 0;
346 }
347 return 1;
348} 234}
349 235
350void __init acpi_numa_arch_fixup(void) {} 236void __init acpi_numa_arch_fixup(void) {}
351 237
352#ifdef CONFIG_NUMA_EMU 238int __init x86_acpi_numa_init(void)
353void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
354 unsigned long end)
355{
356 int i;
357
358 for_each_node_mask(i, nodes_parsed) {
359 cutoff_node(i, start, end);
360 physnodes[i].start = nodes[i].start;
361 physnodes[i].end = nodes[i].end;
362 }
363}
364#endif /* CONFIG_NUMA_EMU */
365
366/* Use the information discovered above to actually set up the nodes. */
367int __init acpi_scan_nodes(unsigned long start, unsigned long end)
368{ 239{
369 int i; 240 int ret;
370
371 if (acpi_numa <= 0)
372 return -1;
373
374 /* First clean up the node list */
375 for (i = 0; i < MAX_NUMNODES; i++)
376 cutoff_node(i, start, end);
377
378 /*
379 * Join together blocks on the same node, holes between
380 * which don't overlap with memory on other nodes.
381 */
382 for (i = 0; i < num_node_memblks; ++i) {
383 int j, k;
384
385 for (j = i + 1; j < num_node_memblks; ++j) {
386 unsigned long start, end;
387
388 if (memblk_nodeid[i] != memblk_nodeid[j])
389 continue;
390 start = min(node_memblk_range[i].end,
391 node_memblk_range[j].end);
392 end = max(node_memblk_range[i].start,
393 node_memblk_range[j].start);
394 for (k = 0; k < num_node_memblks; ++k) {
395 if (memblk_nodeid[i] == memblk_nodeid[k])
396 continue;
397 if (start < node_memblk_range[k].end &&
398 end > node_memblk_range[k].start)
399 break;
400 }
401 if (k < num_node_memblks)
402 continue;
403 start = min(node_memblk_range[i].start,
404 node_memblk_range[j].start);
405 end = max(node_memblk_range[i].end,
406 node_memblk_range[j].end);
407 printk(KERN_INFO "SRAT: Node %d "
408 "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
409 memblk_nodeid[i],
410 node_memblk_range[i].start,
411 node_memblk_range[i].end,
412 node_memblk_range[j].start,
413 node_memblk_range[j].end,
414 start, end);
415 node_memblk_range[i].start = start;
416 node_memblk_range[i].end = end;
417 k = --num_node_memblks - j;
418 memmove(memblk_nodeid + j, memblk_nodeid + j+1,
419 k * sizeof(*memblk_nodeid));
420 memmove(node_memblk_range + j, node_memblk_range + j+1,
421 k * sizeof(*node_memblk_range));
422 --j;
423 }
424 }
425
426 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
427 memblk_nodeid);
428 if (memnode_shift < 0) {
429 printk(KERN_ERR
430 "SRAT: No NUMA node hash function found. Contact maintainer\n");
431 bad_srat();
432 return -1;
433 }
434
435 for (i = 0; i < num_node_memblks; i++)
436 memblock_x86_register_active_regions(memblk_nodeid[i],
437 node_memblk_range[i].start >> PAGE_SHIFT,
438 node_memblk_range[i].end >> PAGE_SHIFT);
439
440 /* for out of order entries in SRAT */
441 sort_node_map();
442 if (!nodes_cover_memory(nodes)) {
443 bad_srat();
444 return -1;
445 }
446 241
447 /* Account for nodes with cpus and no memory */ 242 ret = acpi_numa_init();
448 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); 243 if (ret < 0)
449 244 return ret;
450 /* Finally register nodes */ 245 return srat_disabled() ? -EINVAL : 0;
451 for_each_node_mask(i, node_possible_map)
452 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
453 /* Try again in case setup_node_bootmem missed one due
454 to missing bootmem */
455 for_each_node_mask(i, node_possible_map)
456 if (!node_online(i))
457 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
458
459 for (i = 0; i < nr_cpu_ids; i++) {
460 int node = early_cpu_to_node(i);
461
462 if (node == NUMA_NO_NODE)
463 continue;
464 if (!node_online(node))
465 numa_clear_node(i);
466 }
467 numa_init_array();
468 return 0;
469}
470
471#ifdef CONFIG_NUMA_EMU
472static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
473 [0 ... MAX_NUMNODES-1] = PXM_INVAL
474};
475static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
476 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
477};
478static int __init find_node_by_addr(unsigned long addr)
479{
480 int ret = NUMA_NO_NODE;
481 int i;
482
483 for_each_node_mask(i, nodes_parsed) {
484 /*
485 * Find the real node that this emulated node appears on. For
486 * the sake of simplicity, we only use a real node's starting
487 * address to determine which emulated node it appears on.
488 */
489 if (addr >= nodes[i].start && addr < nodes[i].end) {
490 ret = i;
491 break;
492 }
493 }
494 return ret;
495} 246}
496 247
497/*
498 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
499 * mappings that respect the real ACPI topology but reflect our emulated
500 * environment. For each emulated node, we find which real node it appears on
501 * and create PXM to NID mappings for those fake nodes which mirror that
502 * locality. SLIT will now represent the correct distances between emulated
503 * nodes as a result of the real topology.
504 */
505void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
506{
507 int i, j;
508
509 for (i = 0; i < num_nodes; i++) {
510 int nid, pxm;
511
512 nid = find_node_by_addr(fake_nodes[i].start);
513 if (nid == NUMA_NO_NODE)
514 continue;
515 pxm = node_to_pxm(nid);
516 if (pxm == PXM_INVAL)
517 continue;
518 fake_node_to_pxm_map[i] = pxm;
519 /*
520 * For each apicid_to_node mapping that exists for this real
521 * node, it must now point to the fake node ID.
522 */
523 for (j = 0; j < MAX_LOCAL_APIC; j++)
524 if (apicid_to_node[j] == nid &&
525 fake_apicid_to_node[j] == NUMA_NO_NODE)
526 fake_apicid_to_node[j] = i;
527 }
528
529 /*
530 * If there are apicid-to-node mappings for physical nodes that do not
531 * have a corresponding emulated node, it should default to a guaranteed
532 * value.
533 */
534 for (i = 0; i < MAX_LOCAL_APIC; i++)
535 if (apicid_to_node[i] != NUMA_NO_NODE &&
536 fake_apicid_to_node[i] == NUMA_NO_NODE)
537 fake_apicid_to_node[i] = 0;
538
539 for (i = 0; i < num_nodes; i++)
540 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
541 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
542
543 nodes_clear(nodes_parsed);
544 for (i = 0; i < num_nodes; i++)
545 if (fake_nodes[i].start != fake_nodes[i].end)
546 node_set(i, nodes_parsed);
547}
548
549static int null_slit_node_compare(int a, int b)
550{
551 return node_to_pxm(a) == node_to_pxm(b);
552}
553#else
554static int null_slit_node_compare(int a, int b)
555{
556 return a == b;
557}
558#endif /* CONFIG_NUMA_EMU */
559
560int __node_distance(int a, int b)
561{
562 int index;
563
564 if (!acpi_slit)
565 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
566 REMOTE_DISTANCE;
567 index = acpi_slit->locality_count * node_to_pxm(a);
568 return acpi_slit->entry[index + node_to_pxm(b)];
569}
570
571EXPORT_SYMBOL(__node_distance);
572
573#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) 248#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
574int memory_add_physaddr_to_nid(u64 start) 249int memory_add_physaddr_to_nid(u64 start)
575{ 250{
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6acc724d5d8f..d6c0418c3e47 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
179 sender = this_cpu_read(tlb_vector_offset); 179 sender = this_cpu_read(tlb_vector_offset);
180 f = &flush_state[sender]; 180 f = &flush_state[sender];
181 181
182 /* 182 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
183 * Could avoid this lock when 183 raw_spin_lock(&f->tlbstate_lock);
184 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
185 * probably not worth checking this for a cache-hot lock.
186 */
187 raw_spin_lock(&f->tlbstate_lock);
188 184
189 f->flush_mm = mm; 185 f->flush_mm = mm;
190 f->flush_va = va; 186 f->flush_va = va;
@@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
202 198
203 f->flush_mm = NULL; 199 f->flush_mm = NULL;
204 f->flush_va = 0; 200 f->flush_va = 0;
205 raw_spin_unlock(&f->tlbstate_lock); 201 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
202 raw_spin_unlock(&f->tlbstate_lock);
206} 203}
207 204
208void native_flush_tlb_others(const struct cpumask *cpumask, 205void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -211,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
211 if (is_uv_system()) { 208 if (is_uv_system()) {
212 unsigned int cpu; 209 unsigned int cpu;
213 210
214 cpu = get_cpu(); 211 cpu = smp_processor_id();
215 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 212 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
216 if (cpumask) 213 if (cpumask)
217 flush_tlb_others_ipi(cpumask, mm, va); 214 flush_tlb_others_ipi(cpumask, mm, va);
218 put_cpu();
219 return; 215 return;
220 } 216 }
221 flush_tlb_others_ipi(cpumask, mm, va); 217 flush_tlb_others_ipi(cpumask, mm, va);
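
The hunk above only takes f->tlbstate_lock when nr_cpu_ids exceeds NUM_INVALIDATE_TLB_VECTORS, i.e. only when two senders can end up hashed onto the same flush-state slot. The sketch below is an invented userspace analogue of that pattern: worker threads are hashed onto a fixed number of slots, and the per-slot lock is skipped when every worker is guaranteed a slot of its own. Names and sizes are made up for the example; build with cc -pthread.

#include <pthread.h>
#include <stdio.h>

#define NSLOTS   8
#define NWORKERS 32

static struct { pthread_mutex_t lock; long hits; } slot[NSLOTS];

static void *worker(void *arg)
{
        long id = (long)arg;
        int s = id % NSLOTS;                    /* like tlb_vector_offset */
        int shared = NWORKERS > NSLOTS;         /* like nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS */

        if (shared)
                pthread_mutex_lock(&slot[s].lock);
        slot[s].hits++;                         /* the per-slot state update */
        if (shared)
                pthread_mutex_unlock(&slot[s].lock);
        return NULL;
}

int main(void)
{
        pthread_t t[NWORKERS];
        long i;

        for (i = 0; i < NSLOTS; i++)
                pthread_mutex_init(&slot[i].lock, NULL);
        for (i = 0; i < NWORKERS; i++)
                pthread_create(&t[i], NULL, worker, (void *)i);
        for (i = 0; i < NWORKERS; i++)
                pthread_join(t[i], NULL);
        for (i = 0; i < NSLOTS; i++)
                printf("slot %ld: %ld hits\n", i, slot[i].hits);
        return 0;
}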
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 72cbec14d783..2d49d4e19a36 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
126 if (!user_mode_vm(regs)) { 126 if (!user_mode_vm(regs)) {
127 unsigned long stack = kernel_stack_pointer(regs); 127 unsigned long stack = kernel_stack_pointer(regs);
128 if (depth) 128 if (depth)
129 dump_trace(NULL, regs, (unsigned long *)stack, 129 dump_trace(NULL, regs, (unsigned long *)stack, 0,
130 &backtrace_ops, &depth); 130 &backtrace_ops, &depth);
131 return; 131 return;
132 } 132 }
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index e2b7b0c06cdf..cf9750004a08 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -15,7 +15,7 @@
15#include <linux/notifier.h> 15#include <linux/notifier.h>
16#include <linux/smp.h> 16#include <linux/smp.h>
17#include <linux/oprofile.h> 17#include <linux/oprofile.h>
18#include <linux/sysdev.h> 18#include <linux/syscore_ops.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/moduleparam.h> 20#include <linux/moduleparam.h>
21#include <linux/kdebug.h> 21#include <linux/kdebug.h>
@@ -49,6 +49,10 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
49 val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; 49 val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
50 val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; 50 val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
51 val |= (counter_config->unit_mask & 0xFF) << 8; 51 val |= (counter_config->unit_mask & 0xFF) << 8;
52 counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV |
53 ARCH_PERFMON_EVENTSEL_EDGE |
54 ARCH_PERFMON_EVENTSEL_CMASK);
55 val |= counter_config->extra;
52 event &= model->event_mask ? model->event_mask : 0xFF; 56 event &= model->event_mask ? model->event_mask : 0xFF;
53 val |= event & 0xFF; 57 val |= event & 0xFF;
54 val |= (event & 0x0F00) << 24; 58 val |= (event & 0x0F00) << 24;
@@ -440,6 +444,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
440 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); 444 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
441 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); 445 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
442 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); 446 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
447 oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra);
443 } 448 }
444 449
445 return 0; 450 return 0;
@@ -536,7 +541,7 @@ static void nmi_shutdown(void)
536 541
537#ifdef CONFIG_PM 542#ifdef CONFIG_PM
538 543
539static int nmi_suspend(struct sys_device *dev, pm_message_t state) 544static int nmi_suspend(void)
540{ 545{
541 /* Only one CPU left, just stop that one */ 546 /* Only one CPU left, just stop that one */
542 if (nmi_enabled == 1) 547 if (nmi_enabled == 1)
@@ -544,49 +549,31 @@ static int nmi_suspend(struct sys_device *dev, pm_message_t state)
544 return 0; 549 return 0;
545} 550}
546 551
547static int nmi_resume(struct sys_device *dev) 552static void nmi_resume(void)
548{ 553{
549 if (nmi_enabled == 1) 554 if (nmi_enabled == 1)
550 nmi_cpu_start(NULL); 555 nmi_cpu_start(NULL);
551 return 0;
552} 556}
553 557
554static struct sysdev_class oprofile_sysclass = { 558static struct syscore_ops oprofile_syscore_ops = {
555 .name = "oprofile",
556 .resume = nmi_resume, 559 .resume = nmi_resume,
557 .suspend = nmi_suspend, 560 .suspend = nmi_suspend,
558}; 561};
559 562
560static struct sys_device device_oprofile = { 563static void __init init_suspend_resume(void)
561 .id = 0,
562 .cls = &oprofile_sysclass,
563};
564
565static int __init init_sysfs(void)
566{ 564{
567 int error; 565 register_syscore_ops(&oprofile_syscore_ops);
568
569 error = sysdev_class_register(&oprofile_sysclass);
570 if (error)
571 return error;
572
573 error = sysdev_register(&device_oprofile);
574 if (error)
575 sysdev_class_unregister(&oprofile_sysclass);
576
577 return error;
578} 566}
579 567
580static void exit_sysfs(void) 568static void exit_suspend_resume(void)
581{ 569{
582 sysdev_unregister(&device_oprofile); 570 unregister_syscore_ops(&oprofile_syscore_ops);
583 sysdev_class_unregister(&oprofile_sysclass);
584} 571}
585 572
586#else 573#else
587 574
588static inline int init_sysfs(void) { return 0; } 575static inline void init_suspend_resume(void) { }
589static inline void exit_sysfs(void) { } 576static inline void exit_suspend_resume(void) { }
590 577
591#endif /* CONFIG_PM */ 578#endif /* CONFIG_PM */
592 579
@@ -789,9 +776,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
789 776
790 mux_init(ops); 777 mux_init(ops);
791 778
792 ret = init_sysfs(); 779 init_suspend_resume();
793 if (ret)
794 return ret;
795 780
796 printk(KERN_INFO "oprofile: using NMI interrupt.\n"); 781 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
797 return 0; 782 return 0;
@@ -799,5 +784,5 @@ int __init op_nmi_init(struct oprofile_operations *ops)
799 784
800void op_nmi_exit(void) 785void op_nmi_exit(void)
801{ 786{
802 exit_sysfs(); 787 exit_suspend_resume();
803} 788}
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index e28398df0df2..0b7b7b179cbe 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -22,6 +22,7 @@ struct op_counter_config {
22 unsigned long kernel; 22 unsigned long kernel;
23 unsigned long user; 23 unsigned long user;
24 unsigned long unit_mask; 24 unsigned long unit_mask;
25 unsigned long extra;
25}; 26};
26 27
27extern struct op_counter_config counter_config[]; 28extern struct op_counter_config counter_config[];
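
The extra field added above is masked down to the invert, edge-detect and counter-mask bits before op_x86_get_ctrl() merges it into the event-select value. As a rough, self-contained illustration of how such an event-select word is assembled, the sketch below builds one in userspace; the bit positions follow the architectural perfmon layout (USR bit 16, OS bit 17, EDGE bit 18, INV bit 23, CMASK bits 24-31), while the helper name and the sample event are invented for the example.

#include <stdio.h>
#include <stdint.h>

#define EVTSEL_USR      (1ULL << 16)
#define EVTSEL_OS       (1ULL << 17)
#define EVTSEL_EDGE     (1ULL << 18)
#define EVTSEL_INV      (1ULL << 23)
#define EVTSEL_CMASK    (0xffULL << 24)

static uint64_t build_evtsel(unsigned event, unsigned umask,
                             int usr, int os, uint64_t extra)
{
        uint64_t val = 0;

        val |= usr ? EVTSEL_USR : 0;
        val |= os ? EVTSEL_OS : 0;
        val |= (uint64_t)(umask & 0xff) << 8;
        /* only the inv/edge/cmask bits of "extra" are honoured */
        extra &= EVTSEL_INV | EVTSEL_EDGE | EVTSEL_CMASK;
        val |= extra;
        val |= event & 0xff;
        return val;
}

int main(void)
{
        /* e.g. unhalted core cycles (event 0x3c, umask 0x00), user+kernel */
        printf("evtsel = %#llx\n",
               (unsigned long long)build_evtsel(0x3c, 0x00, 1, 1, EVTSEL_EDGE));
        return 0;
}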
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 9fadec074142..98ab13058f89 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -50,7 +50,7 @@ static inline void setup_num_counters(void)
50#endif 50#endif
51} 51}
52 52
53static int inline addr_increment(void) 53static inline int addr_increment(void)
54{ 54{
55#ifdef CONFIG_SMP 55#ifdef CONFIG_SMP
56 return smp_num_siblings == 2 ? 2 : 1; 56 return smp_num_siblings == 2 ? 2 : 1;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index e27dffbbb1a7..026e4931d162 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -350,7 +350,7 @@ static int __init early_fill_mp_bus_info(void)
350 350
351#define ENABLE_CF8_EXT_CFG (1ULL << 46) 351#define ENABLE_CF8_EXT_CFG (1ULL << 46)
352 352
353static void enable_pci_io_ecs(void *unused) 353static void __cpuinit enable_pci_io_ecs(void *unused)
354{ 354{
355 u64 reg; 355 u64 reg;
356 rdmsrl(MSR_AMD64_NB_CFG, reg); 356 rdmsrl(MSR_AMD64_NB_CFG, reg);
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
index 85b68ef5e809..67858be4b52b 100644
--- a/arch/x86/pci/ce4100.c
+++ b/arch/x86/pci/ce4100.c
@@ -34,6 +34,7 @@
34#include <linux/pci.h> 34#include <linux/pci.h>
35#include <linux/init.h> 35#include <linux/init.h>
36 36
37#include <asm/ce4100.h>
37#include <asm/pci_x86.h> 38#include <asm/pci_x86.h>
38 39
39struct sim_reg { 40struct sim_reg {
@@ -254,7 +255,7 @@ int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
254static int ce4100_conf_read(unsigned int seg, unsigned int bus, 255static int ce4100_conf_read(unsigned int seg, unsigned int bus,
255 unsigned int devfn, int reg, int len, u32 *value) 256 unsigned int devfn, int reg, int len, u32 *value)
256{ 257{
257 int i, retval = 1; 258 int i;
258 259
259 if (bus == 1) { 260 if (bus == 1) {
260 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { 261 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
@@ -306,10 +307,10 @@ struct pci_raw_ops ce4100_pci_conf = {
306 .write = ce4100_conf_write, 307 .write = ce4100_conf_write,
307}; 308};
308 309
309static int __init ce4100_pci_init(void) 310int __init ce4100_pci_init(void)
310{ 311{
311 init_sim_regs(); 312 init_sim_regs();
312 raw_pci_ops = &ce4100_pci_conf; 313 raw_pci_ops = &ce4100_pci_conf;
313 return 0; 314 /* Indicate caller that it should invoke pci_legacy_init() */
315 return 1;
314} 316}
315subsys_initcall(ce4100_pci_init);
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index b1805b78842f..494f2e7ea2b4 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -241,7 +241,7 @@ void __init pcibios_resource_survey(void)
241 e820_reserve_resources_late(); 241 e820_reserve_resources_late();
242 /* 242 /*
243 * Insert the IO APIC resources after PCI initialization has 243 * Insert the IO APIC resources after PCI initialization has
244 * occured to handle IO APICS that are mapped in on a BAR in 244 * occurred to handle IO APICS that are mapped in on a BAR in
245 * PCI space, but before trying to assign unassigned pci res. 245 * PCI space, but before trying to assign unassigned pci res.
246 */ 246 */
247 ioapic_insert_resources(); 247 ioapic_insert_resources();
@@ -304,7 +304,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
304 /* 304 /*
305 * ioremap() and ioremap_nocache() defaults to UC MINUS for now. 305 * ioremap() and ioremap_nocache() defaults to UC MINUS for now.
306 * To avoid attribute conflicts, request UC MINUS here 306 * To avoid attribute conflicts, request UC MINUS here
307 * aswell. 307 * as well.
308 */ 308 */
309 prot |= _PAGE_CACHE_UC_MINUS; 309 prot |= _PAGE_CACHE_UC_MINUS;
310 310
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 87e6c8323117..8201165bae28 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -597,21 +597,18 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
597 return 1; 597 return 1;
598 } 598 }
599 599
600 if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) && 600 if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN &&
601 (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) { 601 device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)
602 || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN &&
603 device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)
604 || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN &&
605 device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)) {
602 r->name = "PIIX/ICH"; 606 r->name = "PIIX/ICH";
603 r->get = pirq_piix_get; 607 r->get = pirq_piix_get;
604 r->set = pirq_piix_set; 608 r->set = pirq_piix_set;
605 return 1; 609 return 1;
606 } 610 }
607 611
608 if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) &&
609 (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) {
610 r->name = "PIIX/ICH";
611 r->get = pirq_piix_get;
612 r->set = pirq_piix_set;
613 return 1;
614 }
615 return 0; 612 return 0;
616} 613}
617 614
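
The rewritten intel_router_probe() check above folds three LPC device-ID windows into one conditional. The same range test in isolation looks like the following self-contained sketch; the helper name and the sample ID windows are invented for illustration and are not the real PCI_DEVICE_ID_INTEL_* values.

#include <stdbool.h>
#include <stdio.h>

struct id_range { unsigned min, max; };

static bool id_in_ranges(unsigned id, const struct id_range *r, int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (id >= r[i].min && id <= r[i].max)
                        return true;
        return false;
}

int main(void)
{
        /* invented sample windows standing in for the LPC device-ID ranges */
        static const struct id_range lpc[] = {
                { 0x3b00, 0x3b1f }, { 0x1c40, 0x1c5f }, { 0x2310, 0x231f },
        };

        printf("match: %d\n",
               id_in_ranges(0x1c42, lpc, (int)(sizeof(lpc) / sizeof(lpc[0]))));
        return 0;
}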
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 25cd4a07d09f..e37b407a0ee8 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -20,7 +20,8 @@
20#include <asm/xen/pci.h> 20#include <asm/xen/pci.h>
21 21
22#ifdef CONFIG_ACPI 22#ifdef CONFIG_ACPI
23static int xen_hvm_register_pirq(u32 gsi, int triggering) 23static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
24 int trigger, int polarity)
24{ 25{
25 int rc, irq; 26 int rc, irq;
26 struct physdev_map_pirq map_irq; 27 struct physdev_map_pirq map_irq;
@@ -41,7 +42,7 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
41 return -1; 42 return -1;
42 } 43 }
43 44
44 if (triggering == ACPI_EDGE_SENSITIVE) { 45 if (trigger == ACPI_EDGE_SENSITIVE) {
45 shareable = 0; 46 shareable = 0;
46 name = "ioapic-edge"; 47 name = "ioapic-edge";
47 } else { 48 } else {
@@ -49,18 +50,12 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
49 name = "ioapic-level"; 50 name = "ioapic-level";
50 } 51 }
51 52
52 irq = xen_map_pirq_gsi(map_irq.pirq, gsi, shareable, name); 53 irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
53 54
54 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); 55 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
55 56
56 return irq; 57 return irq;
57} 58}
58
59static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
60 int trigger, int polarity)
61{
62 return xen_hvm_register_pirq(gsi, trigger);
63}
64#endif 59#endif
65 60
66#if defined(CONFIG_PCI_MSI) 61#if defined(CONFIG_PCI_MSI)
@@ -91,7 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
91 86
92static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 87static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
93{ 88{
94 int irq, pirq, ret = 0; 89 int irq, pirq;
95 struct msi_desc *msidesc; 90 struct msi_desc *msidesc;
96 struct msi_msg msg; 91 struct msi_msg msg;
97 92
@@ -99,39 +94,32 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
99 __read_msi_msg(msidesc, &msg); 94 __read_msi_msg(msidesc, &msg);
100 pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | 95 pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
101 ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); 96 ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
102 if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { 97 if (msg.data != XEN_PIRQ_MSI_DATA ||
103 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 98 xen_irq_from_pirq(pirq) < 0) {
104 "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ); 99 pirq = xen_allocate_pirq_msi(dev, msidesc);
105 if (irq < 0) 100 if (pirq < 0)
106 goto error; 101 goto error;
107 ret = set_irq_msi(irq, msidesc); 102 xen_msi_compose_msg(dev, pirq, &msg);
108 if (ret < 0) 103 __write_msi_msg(msidesc, &msg);
109 goto error_while; 104 dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq);
110 printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d" 105 } else {
111 " pirq=%d\n", irq, pirq); 106 dev_dbg(&dev->dev,
112 return 0; 107 "xen: msi already bound to pirq=%d\n", pirq);
113 } 108 }
114 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 109 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
115 "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ)); 110 (type == PCI_CAP_ID_MSIX) ?
116 if (irq < 0 || pirq < 0) 111 "msi-x" : "msi");
112 if (irq < 0)
117 goto error; 113 goto error;
118 printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); 114 dev_dbg(&dev->dev,
119 xen_msi_compose_msg(dev, pirq, &msg); 115 "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq);
120 ret = set_irq_msi(irq, msidesc);
121 if (ret < 0)
122 goto error_while;
123 write_msi_msg(irq, &msg);
124 } 116 }
125 return 0; 117 return 0;
126 118
127error_while:
128 unbind_from_irqhandler(irq, NULL);
129error: 119error:
130 if (ret == -ENODEV) 120 dev_err(&dev->dev,
131 dev_err(&dev->dev, "Xen PCI frontend has not registered" \ 121 "Xen PCI frontend has not registered MSI/MSI-X support!\n");
132 " MSI/MSI-X support!\n"); 122 return -ENODEV;
133
134 return ret;
135} 123}
136 124
137/* 125/*
@@ -150,35 +138,26 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
150 return -ENOMEM; 138 return -ENOMEM;
151 139
152 if (type == PCI_CAP_ID_MSIX) 140 if (type == PCI_CAP_ID_MSIX)
153 ret = xen_pci_frontend_enable_msix(dev, &v, nvec); 141 ret = xen_pci_frontend_enable_msix(dev, v, nvec);
154 else 142 else
155 ret = xen_pci_frontend_enable_msi(dev, &v); 143 ret = xen_pci_frontend_enable_msi(dev, v);
156 if (ret) 144 if (ret)
157 goto error; 145 goto error;
158 i = 0; 146 i = 0;
159 list_for_each_entry(msidesc, &dev->msi_list, list) { 147 list_for_each_entry(msidesc, &dev->msi_list, list) {
160 irq = xen_allocate_pirq(v[i], 0, /* not sharable */ 148 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
161 (type == PCI_CAP_ID_MSIX) ? 149 (type == PCI_CAP_ID_MSIX) ?
162 "pcifront-msi-x" : "pcifront-msi"); 150 "pcifront-msi-x" :
163 if (irq < 0) { 151 "pcifront-msi");
164 ret = -1; 152 if (irq < 0)
165 goto free; 153 goto free;
166 }
167
168 ret = set_irq_msi(irq, msidesc);
169 if (ret)
170 goto error_while;
171 i++; 154 i++;
172 } 155 }
173 kfree(v); 156 kfree(v);
174 return 0; 157 return 0;
175 158
176error_while:
177 unbind_from_irqhandler(irq, NULL);
178error: 159error:
179 if (ret == -ENODEV) 160 dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
180 dev_err(&dev->dev, "Xen PCI frontend has not registered" \
181 " MSI/MSI-X support!\n");
182free: 161free:
183 kfree(v); 162 kfree(v);
184 return ret; 163 return ret;
@@ -193,6 +172,9 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
193 xen_pci_frontend_disable_msix(dev); 172 xen_pci_frontend_disable_msix(dev);
194 else 173 else
195 xen_pci_frontend_disable_msi(dev); 174 xen_pci_frontend_disable_msi(dev);
175
176 /* Free the IRQ's and the msidesc using the generic code. */
177 default_teardown_msi_irqs(dev);
196} 178}
197 179
198static void xen_teardown_msi_irq(unsigned int irq) 180static void xen_teardown_msi_irq(unsigned int irq)
@@ -200,47 +182,91 @@ static void xen_teardown_msi_irq(unsigned int irq)
200 xen_destroy_irq(irq); 182 xen_destroy_irq(irq);
201} 183}
202 184
185#ifdef CONFIG_XEN_DOM0
203static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 186static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
204{ 187{
205 int irq, ret; 188 int ret = 0;
206 struct msi_desc *msidesc; 189 struct msi_desc *msidesc;
207 190
208 list_for_each_entry(msidesc, &dev->msi_list, list) { 191 list_for_each_entry(msidesc, &dev->msi_list, list) {
209 irq = xen_create_msi_irq(dev, msidesc, type); 192 struct physdev_map_pirq map_irq;
210 if (irq < 0)
211 return -1;
212 193
213 ret = set_irq_msi(irq, msidesc); 194 memset(&map_irq, 0, sizeof(map_irq));
214 if (ret) 195 map_irq.domid = DOMID_SELF;
215 goto error; 196 map_irq.type = MAP_PIRQ_TYPE_MSI;
216 } 197 map_irq.index = -1;
217 return 0; 198 map_irq.pirq = -1;
199 map_irq.bus = dev->bus->number;
200 map_irq.devfn = dev->devfn;
218 201
219error: 202 if (type == PCI_CAP_ID_MSIX) {
220 xen_destroy_irq(irq); 203 int pos;
204 u32 table_offset, bir;
205
206 pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
207
208 pci_read_config_dword(dev, pos + PCI_MSIX_TABLE,
209 &table_offset);
210 bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
211
212 map_irq.table_base = pci_resource_start(dev, bir);
213 map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
214 }
215
216 ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
217 if (ret) {
218 dev_warn(&dev->dev, "xen map irq failed %d\n", ret);
219 goto out;
220 }
221
222 ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
223 map_irq.pirq, map_irq.index,
224 (type == PCI_CAP_ID_MSIX) ?
225 "msi-x" : "msi");
226 if (ret < 0)
227 goto out;
228 }
229 ret = 0;
230out:
221 return ret; 231 return ret;
222} 232}
223#endif 233#endif
234#endif
224 235
225static int xen_pcifront_enable_irq(struct pci_dev *dev) 236static int xen_pcifront_enable_irq(struct pci_dev *dev)
226{ 237{
227 int rc; 238 int rc;
228 int share = 1; 239 int share = 1;
240 int pirq;
241 u8 gsi;
229 242
230 dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq); 243 rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
244 if (rc < 0) {
245 dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
246 rc);
247 return rc;
248 }
231 249
232 if (dev->irq < 0) 250 rc = xen_allocate_pirq_gsi(gsi);
233 return -EINVAL; 251 if (rc < 0) {
252 dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n",
253 gsi, rc);
254 return rc;
255 }
256 pirq = rc;
234 257
235 if (dev->irq < NR_IRQS_LEGACY) 258 if (gsi < NR_IRQS_LEGACY)
236 share = 0; 259 share = 0;
237 260
238 rc = xen_allocate_pirq(dev->irq, share, "pcifront"); 261 rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
239 if (rc < 0) { 262 if (rc < 0) {
240 dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n", 263 dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
241 dev->irq, rc); 264 gsi, pirq, rc);
242 return rc; 265 return rc;
243 } 266 }
267
268 dev->irq = rc;
269 dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
244 return 0; 270 return 0;
245} 271}
246 272
@@ -292,7 +318,7 @@ int __init pci_xen_hvm_init(void)
292#ifdef CONFIG_XEN_DOM0 318#ifdef CONFIG_XEN_DOM0
293static int xen_register_pirq(u32 gsi, int triggering) 319static int xen_register_pirq(u32 gsi, int triggering)
294{ 320{
295 int rc, irq; 321 int rc, pirq, irq = -1;
296 struct physdev_map_pirq map_irq; 322 struct physdev_map_pirq map_irq;
297 int shareable = 0; 323 int shareable = 0;
298 char *name; 324 char *name;
@@ -308,17 +334,20 @@ static int xen_register_pirq(u32 gsi, int triggering)
308 name = "ioapic-level"; 334 name = "ioapic-level";
309 } 335 }
310 336
311 irq = xen_allocate_pirq(gsi, shareable, name); 337 pirq = xen_allocate_pirq_gsi(gsi);
312 338 if (pirq < 0)
313 printk(KERN_DEBUG "xen: --> irq=%d\n", irq); 339 goto out;
314 340
341 irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
315 if (irq < 0) 342 if (irq < 0)
316 goto out; 343 goto out;
317 344
345 printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d\n", pirq, irq);
346
318 map_irq.domid = DOMID_SELF; 347 map_irq.domid = DOMID_SELF;
319 map_irq.type = MAP_PIRQ_TYPE_GSI; 348 map_irq.type = MAP_PIRQ_TYPE_GSI;
320 map_irq.index = gsi; 349 map_irq.index = gsi;
321 map_irq.pirq = irq; 350 map_irq.pirq = pirq;
322 351
323 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); 352 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
324 if (rc) { 353 if (rc) {
@@ -405,13 +434,18 @@ static int __init pci_xen_initial_domain(void)
405 434
406void __init xen_setup_pirqs(void) 435void __init xen_setup_pirqs(void)
407{ 436{
408 int irq; 437 int pirq, irq;
409 438
410 pci_xen_initial_domain(); 439 pci_xen_initial_domain();
411 440
412 if (0 == nr_ioapics) { 441 if (0 == nr_ioapics) {
413 for (irq = 0; irq < NR_IRQS_LEGACY; irq++) 442 for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
414 xen_allocate_pirq(irq, 0, "xt-pic"); 443 pirq = xen_allocate_pirq_gsi(irq);
444 if (WARN(pirq < 0,
445 "Could not allocate PIRQ for legacy interrupt\n"))
446 break;
447 irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic");
448 }
415 return; 449 return;
416 } 450 }
417 451
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index d2c0d51a7178..28071bb31db7 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -15,21 +15,20 @@
15#include <linux/serial_reg.h> 15#include <linux/serial_reg.h>
16#include <linux/serial_8250.h> 16#include <linux/serial_8250.h>
17 17
18#include <asm/ce4100.h>
19#include <asm/prom.h>
18#include <asm/setup.h> 20#include <asm/setup.h>
21#include <asm/i8259.h>
19#include <asm/io.h> 22#include <asm/io.h>
23#include <asm/io_apic.h>
20 24
21static int ce4100_i8042_detect(void) 25static int ce4100_i8042_detect(void)
22{ 26{
23 return 0; 27 return 0;
24} 28}
25 29
26static void __init sdv_find_smp_config(void)
27{
28}
29
30#ifdef CONFIG_SERIAL_8250 30#ifdef CONFIG_SERIAL_8250
31 31
32
33static unsigned int mem_serial_in(struct uart_port *p, int offset) 32static unsigned int mem_serial_in(struct uart_port *p, int offset)
34{ 33{
35 offset = offset << p->regshift; 34 offset = offset << p->regshift;
@@ -118,6 +117,15 @@ static void __init sdv_arch_setup(void)
118 sdv_serial_fixup(); 117 sdv_serial_fixup();
119} 118}
120 119
120#ifdef CONFIG_X86_IO_APIC
121static void __cpuinit sdv_pci_init(void)
122{
123 x86_of_pci_init();
124 /* We can't set this earlier, because we need to calibrate the timer */
125 legacy_pic = &null_legacy_pic;
126}
127#endif
128
121/* 129/*
122 * CE4100 specific x86_init function overrides and early setup 130 * CE4100 specific x86_init function overrides and early setup
123 * calls. 131 * calls.
@@ -128,5 +136,11 @@ void __init x86_ce4100_early_setup(void)
128 x86_platform.i8042_detect = ce4100_i8042_detect; 136 x86_platform.i8042_detect = ce4100_i8042_detect;
129 x86_init.resources.probe_roms = x86_init_noop; 137 x86_init.resources.probe_roms = x86_init_noop;
130 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 138 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
131 x86_init.mpparse.find_smp_config = sdv_find_smp_config; 139 x86_init.mpparse.find_smp_config = x86_init_noop;
140 x86_init.pci.init = ce4100_pci_init;
141
142#ifdef CONFIG_X86_IO_APIC
143 x86_init.pci.init_irq = sdv_pci_init;
144 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
145#endif
132} 146}
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
new file mode 100644
index 000000000000..2d6d226f2b10
--- /dev/null
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -0,0 +1,430 @@
1/*
2 * CE4100 on Falcon Falls
3 *
4 * (c) Copyright 2010 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; version 2 of the License.
9 */
10/dts-v1/;
11/ {
12 model = "intel,falconfalls";
13 compatible = "intel,falconfalls";
14 #address-cells = <1>;
15 #size-cells = <1>;
16
17 cpus {
18 #address-cells = <1>;
19 #size-cells = <0>;
20
21 cpu@0 {
22 device_type = "cpu";
23 compatible = "intel,ce4100";
24 reg = <0>;
25 lapic = <&lapic0>;
26 };
27 };
28
29 soc@0 {
30 #address-cells = <1>;
31 #size-cells = <1>;
32 compatible = "intel,ce4100-cp";
33 ranges;
34
35 ioapic1: interrupt-controller@fec00000 {
36 #interrupt-cells = <2>;
37 compatible = "intel,ce4100-ioapic";
38 interrupt-controller;
39 reg = <0xfec00000 0x1000>;
40 };
41
42 timer@fed00000 {
43 compatible = "intel,ce4100-hpet";
44 reg = <0xfed00000 0x200>;
45 };
46
47 lapic0: interrupt-controller@fee00000 {
48 compatible = "intel,ce4100-lapic";
49 reg = <0xfee00000 0x1000>;
50 };
51
52 pci@3fc {
53 #address-cells = <3>;
54 #size-cells = <2>;
55 compatible = "intel,ce4100-pci", "pci";
56 device_type = "pci";
57 bus-range = <0 0>;
58 ranges = <0x2000000 0 0xbffff000 0xbffff000 0 0x1000
59 0x2000000 0 0xdffe0000 0xdffe0000 0 0x1000
60 0x0000000 0 0x0 0x0 0 0x100>;
61
62 /* Secondary IO-APIC */
63 ioapic2: interrupt-controller@0,1 {
64 #interrupt-cells = <2>;
65 compatible = "intel,ce4100-ioapic";
66 interrupt-controller;
67 reg = <0x100 0x0 0x0 0x0 0x0>;
68 assigned-addresses = <0x02000000 0x0 0xbffff000 0x0 0x1000>;
69 };
70
71 pci@1,0 {
72 #address-cells = <3>;
73 #size-cells = <2>;
74 compatible = "intel,ce4100-pci", "pci";
75 device_type = "pci";
76 bus-range = <1 1>;
77 reg = <0x0800 0x0 0x0 0x0 0x0>;
78 ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>;
79
80 interrupt-parent = <&ioapic2>;
81
82 display@2,0 {
83 compatible = "pci8086,2e5b.2",
84 "pci8086,2e5b",
85 "pciclass038000",
86 "pciclass0380";
87
88 reg = <0x11000 0x0 0x0 0x0 0x0>;
89 interrupts = <0 1>;
90 };
91
92 multimedia@3,0 {
93 compatible = "pci8086,2e5c.2",
94 "pci8086,2e5c",
95 "pciclass048000",
96 "pciclass0480";
97
98 reg = <0x11800 0x0 0x0 0x0 0x0>;
99 interrupts = <2 1>;
100 };
101
102 multimedia@4,0 {
103 compatible = "pci8086,2e5d.2",
104 "pci8086,2e5d",
105 "pciclass048000",
106 "pciclass0480";
107
108 reg = <0x12000 0x0 0x0 0x0 0x0>;
109 interrupts = <4 1>;
110 };
111
112 multimedia@4,1 {
113 compatible = "pci8086,2e5e.2",
114 "pci8086,2e5e",
115 "pciclass048000",
116 "pciclass0480";
117
118 reg = <0x12100 0x0 0x0 0x0 0x0>;
119 interrupts = <5 1>;
120 };
121
122 sound@6,0 {
123 compatible = "pci8086,2e5f.2",
124 "pci8086,2e5f",
125 "pciclass040100",
126 "pciclass0401";
127
128 reg = <0x13000 0x0 0x0 0x0 0x0>;
129 interrupts = <6 1>;
130 };
131
132 sound@6,1 {
133 compatible = "pci8086,2e5f.2",
134 "pci8086,2e5f",
135 "pciclass040100",
136 "pciclass0401";
137
138 reg = <0x13100 0x0 0x0 0x0 0x0>;
139 interrupts = <7 1>;
140 };
141
142 sound@6,2 {
143 compatible = "pci8086,2e60.2",
144 "pci8086,2e60",
145 "pciclass040100",
146 "pciclass0401";
147
148 reg = <0x13200 0x0 0x0 0x0 0x0>;
149 interrupts = <8 1>;
150 };
151
152 display@8,0 {
153 compatible = "pci8086,2e61.2",
154 "pci8086,2e61",
155 "pciclass038000",
156 "pciclass0380";
157
158 reg = <0x14000 0x0 0x0 0x0 0x0>;
159 interrupts = <9 1>;
160 };
161
162 display@8,1 {
163 compatible = "pci8086,2e62.2",
164 "pci8086,2e62",
165 "pciclass038000",
166 "pciclass0380";
167
168 reg = <0x14100 0x0 0x0 0x0 0x0>;
169 interrupts = <10 1>;
170 };
171
172 multimedia@8,2 {
173 compatible = "pci8086,2e63.2",
174 "pci8086,2e63",
175 "pciclass048000",
176 "pciclass0480";
177
178 reg = <0x14200 0x0 0x0 0x0 0x0>;
179 interrupts = <11 1>;
180 };
181
182 entertainment-encryption@9,0 {
183 compatible = "pci8086,2e64.2",
184 "pci8086,2e64",
185 "pciclass101000",
186 "pciclass1010";
187
188 reg = <0x14800 0x0 0x0 0x0 0x0>;
189 interrupts = <12 1>;
190 };
191
192 localbus@a,0 {
193 compatible = "pci8086,2e65.2",
194 "pci8086,2e65",
195 "pciclassff0000",
196 "pciclassff00";
197
198 reg = <0x15000 0x0 0x0 0x0 0x0>;
199 };
200
201 serial@b,0 {
202 compatible = "pci8086,2e66.2",
203 "pci8086,2e66",
204 "pciclass070003",
205 "pciclass0700";
206
207 reg = <0x15800 0x0 0x0 0x0 0x0>;
208 interrupts = <14 1>;
209 };
210
211 gpio@b,1 {
212 compatible = "pci8086,2e67.2",
213 "pci8086,2e67",
214 "pciclassff0000",
215 "pciclassff00";
216
217 #gpio-cells = <2>;
218 reg = <0x15900 0x0 0x0 0x0 0x0>;
219 interrupts = <15 1>;
220 gpio-controller;
221 };
222
223 i2c-controller@b,2 {
224 #address-cells = <2>;
225 #size-cells = <1>;
226 compatible = "pci8086,2e68.2",
227 "pci8086,2e68",
228 "pciclass,ff0000",
229 "pciclass,ff00";
230
231 reg = <0x15a00 0x0 0x0 0x0 0x0>;
232 interrupts = <16 1>;
233 ranges = <0 0 0x02000000 0 0xdffe0500 0x100
234 1 0 0x02000000 0 0xdffe0600 0x100
235 2 0 0x02000000 0 0xdffe0700 0x100>;
236
237 i2c@0 {
238 #address-cells = <1>;
239 #size-cells = <0>;
240 compatible = "intel,ce4100-i2c-controller";
241 reg = <0 0 0x100>;
242 };
243
244 i2c@1 {
245 #address-cells = <1>;
246 #size-cells = <0>;
247 compatible = "intel,ce4100-i2c-controller";
248 reg = <1 0 0x100>;
249
250 gpio@26 {
251 #gpio-cells = <2>;
252 compatible = "ti,pcf8575";
253 reg = <0x26>;
254 gpio-controller;
255 };
256 };
257
258 i2c@2 {
259 #address-cells = <1>;
260 #size-cells = <0>;
261 compatible = "intel,ce4100-i2c-controller";
262 reg = <2 0 0x100>;
263
264 gpio@26 {
265 #gpio-cells = <2>;
266 compatible = "ti,pcf8575";
267 reg = <0x26>;
268 gpio-controller;
269 };
270 };
271 };
272
273                smart-card@b,3 {
274 compatible = "pci8086,2e69.2",
275 "pci8086,2e69",
276 "pciclass070500",
277 "pciclass0705";
278
279 reg = <0x15b00 0x0 0x0 0x0 0x0>;
280 interrupts = <15 1>;
281 };
282
283 spi-controller@b,4 {
284 #address-cells = <1>;
285 #size-cells = <0>;
286 compatible =
287 "pci8086,2e6a.2",
288 "pci8086,2e6a",
289 "pciclass,ff0000",
290 "pciclass,ff00";
291
292 reg = <0x15c00 0x0 0x0 0x0 0x0>;
293 interrupts = <15 1>;
294
295 dac@0 {
296 compatible = "ti,pcm1755";
297 reg = <0>;
298 spi-max-frequency = <115200>;
299 };
300
301 dac@1 {
302 compatible = "ti,pcm1609a";
303 reg = <1>;
304 spi-max-frequency = <115200>;
305 };
306
307 eeprom@2 {
308 compatible = "atmel,at93c46";
309 reg = <2>;
310 spi-max-frequency = <115200>;
311 };
312 };
313
314 multimedia@b,7 {
315 compatible = "pci8086,2e6d.2",
316 "pci8086,2e6d",
317 "pciclassff0000",
318 "pciclassff00";
319
320 reg = <0x15f00 0x0 0x0 0x0 0x0>;
321 };
322
323 ethernet@c,0 {
324 compatible = "pci8086,2e6e.2",
325 "pci8086,2e6e",
326 "pciclass020000",
327 "pciclass0200";
328
329 reg = <0x16000 0x0 0x0 0x0 0x0>;
330 interrupts = <21 1>;
331 };
332
333 clock@c,1 {
334 compatible = "pci8086,2e6f.2",
335 "pci8086,2e6f",
336 "pciclassff0000",
337 "pciclassff00";
338
339 reg = <0x16100 0x0 0x0 0x0 0x0>;
340 interrupts = <3 1>;
341 };
342
343 usb@d,0 {
344 compatible = "pci8086,2e70.2",
345 "pci8086,2e70",
346 "pciclass0c0320",
347 "pciclass0c03";
348
349 reg = <0x16800 0x0 0x0 0x0 0x0>;
350 interrupts = <22 3>;
351 };
352
353 usb@d,1 {
354 compatible = "pci8086,2e70.2",
355 "pci8086,2e70",
356 "pciclass0c0320",
357 "pciclass0c03";
358
359 reg = <0x16900 0x0 0x0 0x0 0x0>;
360 interrupts = <22 3>;
361 };
362
363 sata@e,0 {
364 compatible = "pci8086,2e71.0",
365 "pci8086,2e71",
366 "pciclass010601",
367 "pciclass0106";
368
369 reg = <0x17000 0x0 0x0 0x0 0x0>;
370 interrupts = <23 3>;
371 };
372
373 flash@f,0 {
374 compatible = "pci8086,701.1",
375 "pci8086,701",
376 "pciclass050100",
377 "pciclass0501";
378
379 reg = <0x17800 0x0 0x0 0x0 0x0>;
380 interrupts = <13 1>;
381 };
382
383 entertainment-encryption@10,0 {
384 compatible = "pci8086,702.1",
385 "pci8086,702",
386 "pciclass101000",
387 "pciclass1010";
388
389 reg = <0x18000 0x0 0x0 0x0 0x0>;
390 };
391
392 co-processor@11,0 {
393 compatible = "pci8086,703.1",
394 "pci8086,703",
395 "pciclass0b4000",
396 "pciclass0b40";
397
398 reg = <0x18800 0x0 0x0 0x0 0x0>;
399 interrupts = <1 1>;
400 };
401
402 multimedia@12,0 {
403 compatible = "pci8086,704.0",
404 "pci8086,704",
405 "pciclass048000",
406 "pciclass0480";
407
408 reg = <0x19000 0x0 0x0 0x0 0x0>;
409 };
410 };
411
412 isa@1f,0 {
413 #address-cells = <2>;
414 #size-cells = <1>;
415 compatible = "isa";
416 reg = <0xf800 0x0 0x0 0x0 0x0>;
417 ranges = <1 0 0 0 0 0x100>;
418
419 rtc@70 {
420 compatible = "intel,ce4100-rtc", "motorola,mc146818";
421 interrupts = <8 3>;
422 interrupt-parent = <&ioapic1>;
423 ctrl-reg = <2>;
424 freq-reg = <0x26>;
425 reg = <1 0x70 2>;
426 };
427 };
428 };
429 };
430};
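
The PCI child nodes above follow the standard PCI device tree binding, so the first cell of each "reg" property packs the bus, device and function numbers. Below is a minimal decoding sketch (not part of the file; the phys.hi bit layout is the usual binding convention), using the display@2,0 node as the worked example:

#include <stdio.h>

int main(void)
{
	unsigned int phys_hi = 0x11000;              /* first reg cell of display@2,0 */
	unsigned int bus  = (phys_hi >> 16) & 0xff;  /* 1, matching bus-range <1 1> of pci@1,0 */
	unsigned int dev  = (phys_hi >> 11) & 0x1f;  /* 2, the "2" in display@2,0 */
	unsigned int func = (phys_hi >>  8) & 0x07;  /* 0, the ",0" in display@2,0 */

	printf("%02x:%02x.%x\n", bus, dev, func);    /* prints 01:02.0 */
	return 0;
}
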
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index ea6529e93c6f..275dbc19e2cf 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -31,6 +31,7 @@
31#include <asm/apic.h> 31#include <asm/apic.h>
32#include <asm/io_apic.h> 32#include <asm/io_apic.h>
33#include <asm/mrst.h> 33#include <asm/mrst.h>
34#include <asm/mrst-vrtc.h>
34#include <asm/io.h> 35#include <asm/io.h>
35#include <asm/i8259.h> 36#include <asm/i8259.h>
36#include <asm/intel_scu_ipc.h> 37#include <asm/intel_scu_ipc.h>
@@ -96,11 +97,11 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table)
96 pentry->freq_hz, pentry->irq); 97 pentry->freq_hz, pentry->irq);
97 if (!pentry->irq) 98 if (!pentry->irq)
98 continue; 99 continue;
99 mp_irq.type = MP_IOAPIC; 100 mp_irq.type = MP_INTSRC;
100 mp_irq.irqtype = mp_INT; 101 mp_irq.irqtype = mp_INT;
101/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ 102/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
102 mp_irq.irqflag = 5; 103 mp_irq.irqflag = 5;
103 mp_irq.srcbus = 0; 104 mp_irq.srcbus = MP_BUS_ISA;
104 mp_irq.srcbusirq = pentry->irq; /* IRQ */ 105 mp_irq.srcbusirq = pentry->irq; /* IRQ */
105 mp_irq.dstapic = MP_APIC_ALL; 106 mp_irq.dstapic = MP_APIC_ALL;
106 mp_irq.dstirq = pentry->irq; 107 mp_irq.dstirq = pentry->irq;
@@ -167,10 +168,10 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
167 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { 168 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
168 pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", 169 pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
169 totallen, (u32)pentry->phys_addr, pentry->irq); 170 totallen, (u32)pentry->phys_addr, pentry->irq);
170 mp_irq.type = MP_IOAPIC; 171 mp_irq.type = MP_INTSRC;
171 mp_irq.irqtype = mp_INT; 172 mp_irq.irqtype = mp_INT;
172 mp_irq.irqflag = 0xf; /* level trigger and active low */ 173 mp_irq.irqflag = 0xf; /* level trigger and active low */
173 mp_irq.srcbus = 0; 174 mp_irq.srcbus = MP_BUS_ISA;
174 mp_irq.srcbusirq = pentry->irq; /* IRQ */ 175 mp_irq.srcbusirq = pentry->irq; /* IRQ */
175 mp_irq.dstapic = MP_APIC_ALL; 176 mp_irq.dstapic = MP_APIC_ALL;
176 mp_irq.dstirq = pentry->irq; 177 mp_irq.dstirq = pentry->irq;
@@ -268,6 +269,7 @@ void __init x86_mrst_early_setup(void)
268 269
269 x86_platform.calibrate_tsc = mrst_calibrate_tsc; 270 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
270 x86_platform.i8042_detect = mrst_i8042_detect; 271 x86_platform.i8042_detect = mrst_i8042_detect;
272 x86_init.timers.wallclock_init = mrst_rtc_init;
271 x86_init.pci.init = pci_mrst_init; 273 x86_init.pci.init = pci_mrst_init;
272 x86_init.pci.fixup_irqs = x86_init_noop; 274 x86_init.pci.fixup_irqs = x86_init_noop;
273 275
@@ -280,7 +282,7 @@ void __init x86_mrst_early_setup(void)
280 /* Avoid searching for BIOS MP tables */ 282 /* Avoid searching for BIOS MP tables */
281 x86_init.mpparse.find_smp_config = x86_init_noop; 283 x86_init.mpparse.find_smp_config = x86_init_noop;
282 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 284 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
283 285 set_bit(MP_BUS_ISA, mp_bus_not_pci);
284} 286}
285 287
286/* 288/*
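
A side note on the irqflag values in the hunks above: per the MP-specification encoding the comments refer to, bits 0-1 carry the polarity and bits 2-3 the trigger mode. A small standalone sketch of the decoding (illustrative only, not part of the patch):

#include <stdio.h>

static void decode_irqflag(unsigned int irqflag)
{
	unsigned int polarity = irqflag & 0x3;        /* 1 = active high, 3 = active low */
	unsigned int trigger  = (irqflag >> 2) & 0x3; /* 1 = edge, 3 = level */

	printf("irqflag 0x%x: polarity %u, trigger %u\n", irqflag, polarity, trigger);
}

int main(void)
{
	decode_irqflag(0x5);	/* SFI timers above: edge triggered, active high */
	decode_irqflag(0xf);	/* vRTC above: level triggered, active low */
	return 0;
}
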
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
index 32cd7edd71a0..73d70d65e76e 100644
--- a/arch/x86/platform/mrst/vrtc.c
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -100,22 +100,16 @@ int vrtc_set_mmss(unsigned long nowtime)
100 100
101void __init mrst_rtc_init(void) 101void __init mrst_rtc_init(void)
102{ 102{
103 unsigned long rtc_paddr; 103 unsigned long vrtc_paddr;
104 void __iomem *virt_base;
105 104
106 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); 105 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
107 if (!sfi_mrtc_num)
108 return;
109
110 rtc_paddr = sfi_mrtc_array[0].phys_addr;
111 106
112 /* vRTC's register address may not be page aligned */ 107 vrtc_paddr = sfi_mrtc_array[0].phys_addr;
113 set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr); 108 if (!sfi_mrtc_num || !vrtc_paddr)
114 109 return;
115 virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC);
116 virt_base += rtc_paddr & ~PAGE_MASK;
117 vrtc_virt_base = virt_base;
118 110
111 vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC,
112 vrtc_paddr);
119 x86_platform.get_wallclock = vrtc_get_time; 113 x86_platform.get_wallclock = vrtc_get_time;
120 x86_platform.set_wallclock = vrtc_set_mmss; 114 x86_platform.set_wallclock = vrtc_set_mmss;
121} 115}
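
The rewrite above leans on set_fixmap_offset_nocache() keeping the sub-page offset that the removed code carried over by hand, since the vRTC register block reported by SFI need not be page aligned. A minimal sketch of that offset arithmetic, with hypothetical (made-up) addresses:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long vrtc_paddr  = 0xff102e40UL;   /* hypothetical, not page aligned */
	unsigned long fixmap_virt = 0xffffd000UL;   /* hypothetical page-aligned fixmap slot */

	/* what the old code open-coded and the helper is assumed to fold in */
	unsigned long virt = fixmap_virt + (vrtc_paddr & ~PAGE_MASK);

	printf("vRTC mapped at 0x%lx (offset 0x%lx)\n", virt, vrtc_paddr & ~PAGE_MASK);
	return 0;
}
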
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index e797428b163b..c2a8cab65e5d 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_OLPC) += olpc.o 1obj-$(CONFIG_OLPC) += olpc.o
2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o 2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
3obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o 3obj-$(CONFIG_OLPC) += olpc_ofw.o
4obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o 4obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index 127775696d6c..ab81fb271760 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/platform_device.h> 16#include <linux/platform_device.h>
17#include <linux/pm.h> 17#include <linux/pm.h>
18#include <linux/mfd/core.h>
18 19
19#include <asm/io.h> 20#include <asm/io.h>
20#include <asm/olpc.h> 21#include <asm/olpc.h>
@@ -56,25 +57,24 @@ static void xo1_power_off(void)
56static int __devinit olpc_xo1_probe(struct platform_device *pdev) 57static int __devinit olpc_xo1_probe(struct platform_device *pdev)
57{ 58{
58 struct resource *res; 59 struct resource *res;
60 int err;
59 61
60 /* don't run on non-XOs */ 62 /* don't run on non-XOs */
61 if (!machine_is_olpc()) 63 if (!machine_is_olpc())
62 return -ENODEV; 64 return -ENODEV;
63 65
66 err = mfd_cell_enable(pdev);
67 if (err)
68 return err;
69
64 res = platform_get_resource(pdev, IORESOURCE_IO, 0); 70 res = platform_get_resource(pdev, IORESOURCE_IO, 0);
65 if (!res) { 71 if (!res) {
66 dev_err(&pdev->dev, "can't fetch device resource info\n"); 72 dev_err(&pdev->dev, "can't fetch device resource info\n");
67 return -EIO; 73 return -EIO;
68 } 74 }
69
70 if (!request_region(res->start, resource_size(res), DRV_NAME)) {
71 dev_err(&pdev->dev, "can't request region\n");
72 return -EIO;
73 }
74
75 if (strcmp(pdev->name, "cs5535-pms") == 0) 75 if (strcmp(pdev->name, "cs5535-pms") == 0)
76 pms_base = res->start; 76 pms_base = res->start;
77 else if (strcmp(pdev->name, "cs5535-acpi") == 0) 77 else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
78 acpi_base = res->start; 78 acpi_base = res->start;
79 79
80 /* If we have both addresses, we can override the poweroff hook */ 80 /* If we have both addresses, we can override the poweroff hook */
@@ -88,14 +88,11 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev)
88 88
89static int __devexit olpc_xo1_remove(struct platform_device *pdev) 89static int __devexit olpc_xo1_remove(struct platform_device *pdev)
90{ 90{
91 struct resource *r; 91 mfd_cell_disable(pdev);
92
93 r = platform_get_resource(pdev, IORESOURCE_IO, 0);
94 release_region(r->start, resource_size(r));
95 92
96 if (strcmp(pdev->name, "cs5535-pms") == 0) 93 if (strcmp(pdev->name, "cs5535-pms") == 0)
97 pms_base = 0; 94 pms_base = 0;
98 else if (strcmp(pdev->name, "cs5535-acpi") == 0) 95 else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
99 acpi_base = 0; 96 acpi_base = 0;
100 97
101 pm_power_off = NULL; 98 pm_power_off = NULL;
@@ -113,7 +110,7 @@ static struct platform_driver cs5535_pms_drv = {
113 110
114static struct platform_driver cs5535_acpi_drv = { 111static struct platform_driver cs5535_acpi_drv = {
115 .driver = { 112 .driver = {
116 .name = "cs5535-acpi", 113 .name = "olpc-xo1-pm-acpi",
117 .owner = THIS_MODULE, 114 .owner = THIS_MODULE,
118 }, 115 },
119 .probe = olpc_xo1_probe, 116 .probe = olpc_xo1_probe,
@@ -143,7 +140,7 @@ static void __exit olpc_xo1_exit(void)
143 140
144MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); 141MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
145MODULE_LICENSE("GPL"); 142MODULE_LICENSE("GPL");
146MODULE_ALIAS("platform:olpc-xo1"); 143MODULE_ALIAS("platform:cs5535-pms");
147 144
148module_init(olpc_xo1_init); 145module_init(olpc_xo1_init);
149module_exit(olpc_xo1_exit); 146module_exit(olpc_xo1_exit);
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index dab874647530..044bda5b3174 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -140,8 +140,7 @@ void * __init prom_early_alloc(unsigned long size)
140 * wasted bootmem) and hand off chunks of it to callers. 140 * wasted bootmem) and hand off chunks of it to callers.
141 */ 141 */
142 res = alloc_bootmem(chunk_size); 142 res = alloc_bootmem(chunk_size);
143 if (!res) 143 BUG_ON(!res);
144 return NULL;
145 prom_early_allocated += chunk_size; 144 prom_early_allocated += chunk_size;
146 memset(res, 0, chunk_size); 145 memset(res, 0, chunk_size);
147 free_mem = chunk_size; 146 free_mem = chunk_size;
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index df58e9cad96a..7cb6424317f6 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -11,6 +11,7 @@
11#include <linux/debugfs.h> 11#include <linux/debugfs.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/delay.h>
14 15
15#include <asm/mmu_context.h> 16#include <asm/mmu_context.h>
16#include <asm/uv/uv.h> 17#include <asm/uv/uv.h>
@@ -1364,11 +1365,11 @@ uv_activation_descriptor_init(int node, int pnode)
1364 memset(bd2, 0, sizeof(struct bau_desc)); 1365 memset(bd2, 0, sizeof(struct bau_desc));
1365 bd2->header.sw_ack_flag = 1; 1366 bd2->header.sw_ack_flag = 1;
1366 /* 1367 /*
1367 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub 1368 * base_dest_nodeid is the nasid of the first uvhub
1368 * in the partition. The bit map will indicate uvhub numbers, 1369 * in the partition. The bit map will indicate uvhub numbers,
1369 * which are 0-N in a partition. Pnodes are unique system-wide. 1370 * which are 0-N in a partition. Pnodes are unique system-wide.
1370 */ 1371 */
1371 bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; 1372 bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode);
1372 bd2->header.dest_subnodeid = 0x10; /* the LB */ 1373 bd2->header.dest_subnodeid = 0x10; /* the LB */
1373 bd2->header.command = UV_NET_ENDPOINT_INTD; 1374 bd2->header.command = UV_NET_ENDPOINT_INTD;
1374 bd2->header.int_both = 1; 1375 bd2->header.int_both = 1;
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 7b24460917d5..374a05d8ad22 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -131,7 +131,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
131 unsigned long mmr_offset, int limit) 131 unsigned long mmr_offset, int limit)
132{ 132{
133 const struct cpumask *eligible_cpu = cpumask_of(cpu); 133 const struct cpumask *eligible_cpu = cpumask_of(cpu);
134 struct irq_cfg *cfg = get_irq_chip_data(irq); 134 struct irq_cfg *cfg = irq_get_chip_data(irq);
135 unsigned long mmr_value; 135 unsigned long mmr_value;
136 struct uv_IO_APIC_route_entry *entry; 136 struct uv_IO_APIC_route_entry *entry;
137 int mmr_pnode, err; 137 int mmr_pnode, err;
@@ -148,7 +148,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
148 else 148 else
149 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 149 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
150 150
151 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, 151 irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
152 irq_name); 152 irq_name);
153 153
154 mmr_value = 0; 154 mmr_value = 0;
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index 632037671746..c7abf13a213f 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -471,15 +471,7 @@ static unsigned int startup_piix4_master_irq(struct irq_data *data)
471{ 471{
472 legacy_pic->init(0); 472 legacy_pic->init(0);
473 enable_cobalt_irq(data); 473 enable_cobalt_irq(data);
474} 474 return 0;
475
476static void end_piix4_master_irq(struct irq_data *data)
477{
478 unsigned long flags;
479
480 spin_lock_irqsave(&cobalt_lock, flags);
481 enable_cobalt_irq(data);
482 spin_unlock_irqrestore(&cobalt_lock, flags);
483} 475}
484 476
485static struct irq_chip piix4_master_irq_type = { 477static struct irq_chip piix4_master_irq_type = {
@@ -492,7 +484,7 @@ static void pii4_mask(struct irq_data *data) { }
492 484
493static struct irq_chip piix4_virtual_irq_type = { 485static struct irq_chip piix4_virtual_irq_type = {
494 .name = "PIIX4-virtual", 486 .name = "PIIX4-virtual",
495 .mask = pii4_mask, 487 .irq_mask = pii4_mask,
496}; 488};
497 489
498/* 490/*
@@ -569,18 +561,20 @@ out_unlock:
569static struct irqaction master_action = { 561static struct irqaction master_action = {
570 .handler = piix4_master_intr, 562 .handler = piix4_master_intr,
571 .name = "PIIX4-8259", 563 .name = "PIIX4-8259",
564 .flags = IRQF_NO_THREAD,
572}; 565};
573 566
574static struct irqaction cascade_action = { 567static struct irqaction cascade_action = {
575 .handler = no_action, 568 .handler = no_action,
576 .name = "cascade", 569 .name = "cascade",
570 .flags = IRQF_NO_THREAD,
577}; 571};
578 572
579static inline void set_piix4_virtual_irq_type(void) 573static inline void set_piix4_virtual_irq_type(void)
580{ 574{
581 piix4_virtual_irq_type.enable = i8259A_chip.unmask; 575 piix4_virtual_irq_type.irq_enable = i8259A_chip.irq_unmask;
582 piix4_virtual_irq_type.disable = i8259A_chip.mask; 576 piix4_virtual_irq_type.irq_disable = i8259A_chip.irq_mask;
583 piix4_virtual_irq_type.unmask = i8259A_chip.unmask; 577 piix4_virtual_irq_type.irq_unmask = i8259A_chip.irq_unmask;
584} 578}
585 579
586static void __init visws_pre_intr_init(void) 580static void __init visws_pre_intr_init(void)
@@ -597,7 +591,7 @@ static void __init visws_pre_intr_init(void)
597 else if (i == CO_IRQ_IDE0) 591 else if (i == CO_IRQ_IDE0)
598 chip = &cobalt_irq_type; 592 chip = &cobalt_irq_type;
599 else if (i == CO_IRQ_IDE1) 593 else if (i == CO_IRQ_IDE1)
600 >chip = &cobalt_irq_type; 594 chip = &cobalt_irq_type;
601 else if (i == CO_IRQ_8259) 595 else if (i == CO_IRQ_8259)
602 chip = &piix4_master_irq_type; 596 chip = &piix4_master_irq_type;
603 else if (i < CO_IRQ_APIC0) 597 else if (i < CO_IRQ_APIC0)
@@ -606,7 +600,7 @@ static void __init visws_pre_intr_init(void)
606 chip = &cobalt_irq_type; 600 chip = &cobalt_irq_type;
607 601
608 if (chip) 602 if (chip)
609 set_irq_chip(i, chip); 603 irq_set_chip(i, chip);
610 } 604 }
611 605
612 setup_irq(CO_IRQ_8259, &master_action); 606 setup_irq(CO_IRQ_8259, &master_action);
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 36df991985b2..468d591dde31 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -417,24 +417,25 @@ const char *arch_vma_name(struct vm_area_struct *vma)
417 return NULL; 417 return NULL;
418} 418}
419 419
420struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 420struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
421{ 421{
422 struct mm_struct *mm = tsk->mm; 422 /*
423 423 * Check to see if the corresponding task was created in compat vdso
424 /* Check to see if this task was created in compat vdso mode */ 424 * mode.
425 */
425 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) 426 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
426 return &gate_vma; 427 return &gate_vma;
427 return NULL; 428 return NULL;
428} 429}
429 430
430int in_gate_area(struct task_struct *task, unsigned long addr) 431int in_gate_area(struct mm_struct *mm, unsigned long addr)
431{ 432{
432 const struct vm_area_struct *vma = get_gate_vma(task); 433 const struct vm_area_struct *vma = get_gate_vma(mm);
433 434
434 return vma && addr >= vma->vm_start && addr < vma->vm_end; 435 return vma && addr >= vma->vm_start && addr < vma->vm_end;
435} 436}
436 437
437int in_gate_area_no_task(unsigned long addr) 438int in_gate_area_no_mm(unsigned long addr)
438{ 439{
439 return 0; 440 return 0;
440} 441}
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 5b54892e4bc3..5cc821cb2e09 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -38,7 +38,8 @@ config XEN_MAX_DOMAIN_MEMORY
38 38
39config XEN_SAVE_RESTORE 39config XEN_SAVE_RESTORE
40 bool 40 bool
41 depends on XEN && PM 41 depends on XEN
42 select HIBERNATE_CALLBACKS
42 default y 43 default y
43 44
44config XEN_DEBUG_FS 45config XEN_DEBUG_FS
@@ -48,3 +49,11 @@ config XEN_DEBUG_FS
48 help 49 help
49 Enable statistics output and various tuning options in debugfs. 50 Enable statistics output and various tuning options in debugfs.
50 Enabling this option may incur a significant performance overhead. 51 Enabling this option may incur a significant performance overhead.
52
53config XEN_DEBUG
54 bool "Enable Xen debug checks"
55 depends on XEN
56 default n
57 help
58 Enable various WARN_ON checks in the Xen MMU code.
59 Enabling this option WILL incur a significant performance overhead.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7e8d3bc80af6..e3c6a06cf725 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -238,6 +238,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
238static __init void xen_init_cpuid_mask(void) 238static __init void xen_init_cpuid_mask(void)
239{ 239{
240 unsigned int ax, bx, cx, dx; 240 unsigned int ax, bx, cx, dx;
241 unsigned int xsave_mask;
241 242
242 cpuid_leaf1_edx_mask = 243 cpuid_leaf1_edx_mask =
243 ~((1 << X86_FEATURE_MCE) | /* disable MCE */ 244 ~((1 << X86_FEATURE_MCE) | /* disable MCE */
@@ -249,24 +250,16 @@ static __init void xen_init_cpuid_mask(void)
249 cpuid_leaf1_edx_mask &= 250 cpuid_leaf1_edx_mask &=
250 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ 251 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
251 (1 << X86_FEATURE_ACPI)); /* disable ACPI */ 252 (1 << X86_FEATURE_ACPI)); /* disable ACPI */
252
253 ax = 1; 253 ax = 1;
254 cx = 0;
255 xen_cpuid(&ax, &bx, &cx, &dx); 254 xen_cpuid(&ax, &bx, &cx, &dx);
256 255
257 /* cpuid claims we support xsave; try enabling it to see what happens */ 256 xsave_mask =
258 if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { 257 (1 << (X86_FEATURE_XSAVE % 32)) |
259 unsigned long cr4; 258 (1 << (X86_FEATURE_OSXSAVE % 32));
260
261 set_in_cr4(X86_CR4_OSXSAVE);
262
263 cr4 = read_cr4();
264
265 if ((cr4 & X86_CR4_OSXSAVE) == 0)
266 cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
267 259
268 clear_in_cr4(X86_CR4_OSXSAVE); 260 /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
269 } 261 if ((cx & xsave_mask) != xsave_mask)
262 cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
270} 263}
271 264
272static void xen_set_debugreg(int reg, unsigned long val) 265static void xen_set_debugreg(int reg, unsigned long val)
@@ -1194,7 +1187,7 @@ asmlinkage void __init xen_start_kernel(void)
1194 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1187 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1195 1188
1196 local_irq_disable(); 1189 local_irq_disable();
1197 early_boot_irqs_off(); 1190 early_boot_irqs_disabled = true;
1198 1191
1199 memblock_init(); 1192 memblock_init();
1200 1193
@@ -1284,15 +1277,14 @@ static int init_hvm_pv_info(int *major, int *minor)
1284 1277
1285 xen_setup_features(); 1278 xen_setup_features();
1286 1279
1287 pv_info = xen_info; 1280 pv_info.name = "Xen HVM";
1288 pv_info.kernel_rpl = 0;
1289 1281
1290 xen_domain_type = XEN_HVM_DOMAIN; 1282 xen_domain_type = XEN_HVM_DOMAIN;
1291 1283
1292 return 0; 1284 return 0;
1293} 1285}
1294 1286
1295void xen_hvm_init_shared_info(void) 1287void __ref xen_hvm_init_shared_info(void)
1296{ 1288{
1297 int cpu; 1289 int cpu;
1298 struct xen_add_to_physmap xatp; 1290 struct xen_add_to_physmap xatp;
@@ -1331,6 +1323,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1331 switch (action) { 1323 switch (action) {
1332 case CPU_UP_PREPARE: 1324 case CPU_UP_PREPARE:
1333 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1325 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1326 if (xen_have_vector_callback)
1327 xen_init_lock_cpu(cpu);
1334 break; 1328 break;
1335 default: 1329 default:
1336 break; 1330 break;
@@ -1355,6 +1349,7 @@ static void __init xen_hvm_guest_init(void)
1355 1349
1356 if (xen_feature(XENFEAT_hvm_callback_vector)) 1350 if (xen_feature(XENFEAT_hvm_callback_vector))
1357 xen_have_vector_callback = 1; 1351 xen_have_vector_callback = 1;
1352 xen_hvm_smp_init();
1358 register_cpu_notifier(&xen_hvm_cpu_notifier); 1353 register_cpu_notifier(&xen_hvm_cpu_notifier);
1359 xen_unplug_emulated_devices(); 1354 xen_unplug_emulated_devices();
1360 have_vcpu_info_placement = 0; 1355 have_vcpu_info_placement = 0;
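
The reworked xen_init_cpuid_mask() above treats XSAVE and OSXSAVE as a pair: unless the CPUID leaf 1 result reports both bits, both are hidden from the guest by clearing them in the ECX mask. A standalone sketch of that check (bit positions 26 and 27 correspond to X86_FEATURE_XSAVE and X86_FEATURE_OSXSAVE modulo 32):

#include <stdio.h>

int main(void)
{
	unsigned int xsave_mask = (1u << 26) | (1u << 27);  /* XSAVE | OSXSAVE */
	unsigned int cx = 1u << 26;                         /* hypothetical: XSAVE set, OSXSAVE clear */
	unsigned int cpuid_leaf1_ecx_mask = ~0u;

	if ((cx & xsave_mask) != xsave_mask)                /* both present, or both masked out */
		cpuid_leaf1_ecx_mask &= ~xsave_mask;

	printf("leaf1 ecx mask = 0x%08x\n", cpuid_leaf1_ecx_mask);
	return 0;
}
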
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 9d30105a0c4a..6a6fe8939645 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -126,7 +126,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
126#endif 126#endif
127}; 127};
128 128
129void __init xen_init_irq_ops() 129void __init xen_init_irq_ops(void)
130{ 130{
131 pv_irq_ops = xen_irq_ops; 131 pv_irq_ops = xen_irq_ops;
132 x86_init.irqs.intr_init = xen_init_IRQ; 132 x86_init.irqs.intr_init = xen_init_IRQ;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5e92b61ad574..aef7af92b28b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -46,6 +46,7 @@
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/gfp.h> 47#include <linux/gfp.h>
48#include <linux/memblock.h> 48#include <linux/memblock.h>
49#include <linux/seq_file.h>
49 50
50#include <asm/pgtable.h> 51#include <asm/pgtable.h>
51#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
@@ -78,8 +79,7 @@
78 79
79/* 80/*
80 * Protects atomic reservation decrease/increase against concurrent increases. 81 * Protects atomic reservation decrease/increase against concurrent increases.
81 * Also protects non-atomic updates of current_pages and driver_pages, and 82 * Also protects non-atomic updates of current_pages and balloon lists.
82 * balloon lists.
83 */ 83 */
84DEFINE_SPINLOCK(xen_reservation_lock); 84DEFINE_SPINLOCK(xen_reservation_lock);
85 85
@@ -416,8 +416,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
416 if (val & _PAGE_PRESENT) { 416 if (val & _PAGE_PRESENT) {
417 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 417 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
418 pteval_t flags = val & PTE_FLAGS_MASK; 418 pteval_t flags = val & PTE_FLAGS_MASK;
419 unsigned long mfn = pfn_to_mfn(pfn); 419 unsigned long mfn;
420 420
421 if (!xen_feature(XENFEAT_auto_translated_physmap))
422 mfn = get_phys_to_machine(pfn);
423 else
424 mfn = pfn;
421 /* 425 /*
422 * If there's no mfn for the pfn, then just create an 426 * If there's no mfn for the pfn, then just create an
423 * empty non-present pte. Unfortunately this loses 427 * empty non-present pte. Unfortunately this loses
@@ -427,8 +431,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
427 if (unlikely(mfn == INVALID_P2M_ENTRY)) { 431 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
428 mfn = 0; 432 mfn = 0;
429 flags = 0; 433 flags = 0;
434 } else {
435 /*
436 * Paramount to do this test _after_ the
437 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
438 * IDENTITY_FRAME_BIT resolves to true.
439 */
440 mfn &= ~FOREIGN_FRAME_BIT;
441 if (mfn & IDENTITY_FRAME_BIT) {
442 mfn &= ~IDENTITY_FRAME_BIT;
443 flags |= _PAGE_IOMAP;
444 }
430 } 445 }
431
432 val = ((pteval_t)mfn << PAGE_SHIFT) | flags; 446 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
433 } 447 }
434 448
@@ -532,6 +546,41 @@ pte_t xen_make_pte(pteval_t pte)
532} 546}
533PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 547PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
534 548
549#ifdef CONFIG_XEN_DEBUG
550pte_t xen_make_pte_debug(pteval_t pte)
551{
552 phys_addr_t addr = (pte & PTE_PFN_MASK);
553 phys_addr_t other_addr;
554 bool io_page = false;
555 pte_t _pte;
556
557 if (pte & _PAGE_IOMAP)
558 io_page = true;
559
560 _pte = xen_make_pte(pte);
561
562 if (!addr)
563 return _pte;
564
565 if (io_page &&
566 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
567 other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
568 WARN_ONCE(addr != other_addr,
569 "0x%lx is using VM_IO, but it is 0x%lx!\n",
570 (unsigned long)addr, (unsigned long)other_addr);
571 } else {
572 pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
573 other_addr = (_pte.pte & PTE_PFN_MASK);
574 WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
575 "0x%lx is missing VM_IO (and wasn't fixed)!\n",
576 (unsigned long)addr);
577 }
578
579 return _pte;
580}
581PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
582#endif
583
535pgd_t xen_make_pgd(pgdval_t pgd) 584pgd_t xen_make_pgd(pgdval_t pgd)
536{ 585{
537 pgd = pte_pfn_to_mfn(pgd); 586 pgd = pte_pfn_to_mfn(pgd);
@@ -986,10 +1035,9 @@ static void xen_pgd_pin(struct mm_struct *mm)
986 */ 1035 */
987void xen_mm_pin_all(void) 1036void xen_mm_pin_all(void)
988{ 1037{
989 unsigned long flags;
990 struct page *page; 1038 struct page *page;
991 1039
992 spin_lock_irqsave(&pgd_lock, flags); 1040 spin_lock(&pgd_lock);
993 1041
994 list_for_each_entry(page, &pgd_list, lru) { 1042 list_for_each_entry(page, &pgd_list, lru) {
995 if (!PagePinned(page)) { 1043 if (!PagePinned(page)) {
@@ -998,7 +1046,7 @@ void xen_mm_pin_all(void)
998 } 1046 }
999 } 1047 }
1000 1048
1001 spin_unlock_irqrestore(&pgd_lock, flags); 1049 spin_unlock(&pgd_lock);
1002} 1050}
1003 1051
1004/* 1052/*
@@ -1099,10 +1147,9 @@ static void xen_pgd_unpin(struct mm_struct *mm)
1099 */ 1147 */
1100void xen_mm_unpin_all(void) 1148void xen_mm_unpin_all(void)
1101{ 1149{
1102 unsigned long flags;
1103 struct page *page; 1150 struct page *page;
1104 1151
1105 spin_lock_irqsave(&pgd_lock, flags); 1152 spin_lock(&pgd_lock);
1106 1153
1107 list_for_each_entry(page, &pgd_list, lru) { 1154 list_for_each_entry(page, &pgd_list, lru) {
1108 if (PageSavePinned(page)) { 1155 if (PageSavePinned(page)) {
@@ -1112,7 +1159,7 @@ void xen_mm_unpin_all(void)
1112 } 1159 }
1113 } 1160 }
1114 1161
1115 spin_unlock_irqrestore(&pgd_lock, flags); 1162 spin_unlock(&pgd_lock);
1116} 1163}
1117 1164
1118void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1165void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
@@ -1426,28 +1473,35 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1426#endif 1473#endif
1427} 1474}
1428 1475
1476#ifdef CONFIG_X86_32
1429static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 1477static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1430{ 1478{
1431 unsigned long pfn = pte_pfn(pte);
1432
1433#ifdef CONFIG_X86_32
1434 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 1479 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1435 if (pte_val_ma(*ptep) & _PAGE_PRESENT) 1480 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1436 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & 1481 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1437 pte_val_ma(pte)); 1482 pte_val_ma(pte));
1438#endif 1483
1484 return pte;
1485}
1486#else /* CONFIG_X86_64 */
1487static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1488{
1489 unsigned long pfn = pte_pfn(pte);
1439 1490
1440 /* 1491 /*
1441 * If the new pfn is within the range of the newly allocated 1492 * If the new pfn is within the range of the newly allocated
1442 * kernel pagetable, and it isn't being mapped into an 1493 * kernel pagetable, and it isn't being mapped into an
1443 * early_ioremap fixmap slot, make sure it is RO. 1494 * early_ioremap fixmap slot as a freshly allocated page, make sure
1495 * it is RO.
1444 */ 1496 */
1445 if (!is_early_ioremap_ptep(ptep) && 1497 if (((!is_early_ioremap_ptep(ptep) &&
1446 pfn >= e820_table_start && pfn < e820_table_end) 1498 pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
1499 (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
1447 pte = pte_wrprotect(pte); 1500 pte = pte_wrprotect(pte);
1448 1501
1449 return pte; 1502 return pte;
1450} 1503}
1504#endif /* CONFIG_X86_64 */
1451 1505
1452/* Init-time set_pte while constructing initial pagetables, which 1506/* Init-time set_pte while constructing initial pagetables, which
1453 doesn't allow RO pagetable pages to be remapped RW */ 1507 doesn't allow RO pagetable pages to be remapped RW */
@@ -1653,9 +1707,6 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1653 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { 1707 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1654 pte_t pte; 1708 pte_t pte;
1655 1709
1656 if (pfn > max_pfn_mapped)
1657 max_pfn_mapped = pfn;
1658
1659 if (!pte_none(pte_page[pteidx])) 1710 if (!pte_none(pte_page[pteidx]))
1660 continue; 1711 continue;
1661 1712
@@ -1697,7 +1748,7 @@ static void convert_pfn_mfn(void *v)
1697} 1748}
1698 1749
1699/* 1750/*
1700 * Set up the inital kernel pagetable. 1751 * Set up the initial kernel pagetable.
1701 * 1752 *
1702 * We can construct this by grafting the Xen provided pagetable into 1753 * We can construct this by grafting the Xen provided pagetable into
1703 * head_64.S's preconstructed pagetables. We copy the Xen L2's into 1754 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
@@ -1713,6 +1764,12 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1713 pud_t *l3; 1764 pud_t *l3;
1714 pmd_t *l2; 1765 pmd_t *l2;
1715 1766
1767 /* max_pfn_mapped is the last pfn mapped in the initial memory
1768 * mappings. Considering that on Xen after the kernel mappings we
1769 * have the mappings of some pages that don't exist in pfn space, we
1770 * set max_pfn_mapped to the last real pfn mapped. */
1771 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1772
1716 /* Zap identity mapping */ 1773 /* Zap identity mapping */
1717 init_level4_pgt[0] = __pgd(0); 1774 init_level4_pgt[0] = __pgd(0);
1718 1775
@@ -1817,9 +1874,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1817 initial_kernel_pmd = 1874 initial_kernel_pmd =
1818 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 1875 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1819 1876
1820 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 1877 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1821 xen_start_info->nr_pt_frames * PAGE_SIZE +
1822 512*1024);
1823 1878
1824 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1879 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1825 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); 1880 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
@@ -1942,6 +1997,9 @@ __init void xen_ident_map_ISA(void)
1942 1997
1943static __init void xen_post_allocator_init(void) 1998static __init void xen_post_allocator_init(void)
1944{ 1999{
2000#ifdef CONFIG_XEN_DEBUG
2001 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
2002#endif
1945 pv_mmu_ops.set_pte = xen_set_pte; 2003 pv_mmu_ops.set_pte = xen_set_pte;
1946 pv_mmu_ops.set_pmd = xen_set_pmd; 2004 pv_mmu_ops.set_pmd = xen_set_pmd;
1947 pv_mmu_ops.set_pud = xen_set_pud; 2005 pv_mmu_ops.set_pud = xen_set_pud;
@@ -2074,7 +2132,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2074 in_frames[i] = virt_to_mfn(vaddr); 2132 in_frames[i] = virt_to_mfn(vaddr);
2075 2133
2076 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); 2134 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2077 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); 2135 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2078 2136
2079 if (out_frames) 2137 if (out_frames)
2080 out_frames[i] = virt_to_pfn(vaddr); 2138 out_frames[i] = virt_to_pfn(vaddr);
@@ -2353,6 +2411,18 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2353 2411
2354#ifdef CONFIG_XEN_DEBUG_FS 2412#ifdef CONFIG_XEN_DEBUG_FS
2355 2413
2414static int p2m_dump_open(struct inode *inode, struct file *filp)
2415{
2416 return single_open(filp, p2m_dump_show, NULL);
2417}
2418
2419static const struct file_operations p2m_dump_fops = {
2420 .open = p2m_dump_open,
2421 .read = seq_read,
2422 .llseek = seq_lseek,
2423 .release = single_release,
2424};
2425
2356static struct dentry *d_mmu_debug; 2426static struct dentry *d_mmu_debug;
2357 2427
2358static int __init xen_mmu_debugfs(void) 2428static int __init xen_mmu_debugfs(void)
@@ -2408,6 +2478,7 @@ static int __init xen_mmu_debugfs(void)
2408 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, 2478 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2409 &mmu_stats.prot_commit_batched); 2479 &mmu_stats.prot_commit_batched);
2410 2480
2481 debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
2411 return 0; 2482 return 0;
2412} 2483}
2413fs_initcall(xen_mmu_debugfs); 2484fs_initcall(xen_mmu_debugfs);
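
The 32-bit mask_rw_pte() variant in the hunks above uses a small bit trick: OR-ing the existing pte's RW bit with the complement of _PAGE_RW yields an all-ones mask except possibly the RW bit, so AND-ing it into the new value clears _PAGE_RW exactly when the existing pte did not have it. A standalone sketch of the same arithmetic (_PAGE_RW assumed to be bit 1, as on x86; the pte values are made up):

#include <stdio.h>

#define PAGE_RW 0x2UL	/* _PAGE_RW */

static unsigned long mask_rw(unsigned long old_pte, unsigned long new_pte)
{
	return ((old_pte & PAGE_RW) | ~PAGE_RW) & new_pte;
}

int main(void)
{
	printf("old RW set:   0x%lx\n", mask_rw(0x67, 0x67));	/* RW kept:     0x67 */
	printf("old RW clear: 0x%lx\n", mask_rw(0x65, 0x67));	/* RW stripped: 0x65 */
	return 0;
}
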
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 8f2251d2a3f8..141eb0de8b06 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -23,6 +23,129 @@
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always 23 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to 24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively. 25 * 512 and 1024 entries respectively.
26 *
27 * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
28 *
29 * However not all entries are filled with MFNs. Specifically for all other
30 * leaf entries, or for the top root, or middle one, for which there is a void
31 * entry, we assume it is "missing". So (for example)
32 * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
33 *
34 * We also have the possibility of setting 1-1 mappings on certain regions, so
35 * that:
36 * pfn_to_mfn(0xc0000)=0xc0000
37 *
 38 * The benefit of this is that for non-RAM regions (think
 39 * PCI BARs, or ACPI spaces) we can create mappings easily because we
 40 * get the PFN value to match the MFN.
41 *
42 * For this to work efficiently we have one new page p2m_identity and
43 * allocate (via reserved_brk) any other pages we need to cover the sides
44 * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
45 * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
46 * no other fancy value).
47 *
48 * On lookup we spot that the entry points to p2m_identity and return the
49 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
50 * If the entry points to an allocated page, we just proceed as before and
51 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
52 * appropriate functions (pfn_to_mfn).
53 *
54 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
55 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
 56 * non-identity pfn. To protect ourselves against that, we elect to set (and get) the
57 * IDENTITY_FRAME_BIT on all identity mapped PFNs.
58 *
59 * This simplistic diagram is used to explain the more subtle piece of code.
 60 * There is also a diagram of the P2M at the end that can help.
61 * Imagine your E820 looking as so:
62 *
63 * 1GB 2GB
64 * /-------------------+---------\/----\ /----------\ /---+-----\
65 * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
66 * \-------------------+---------/\----/ \----------/ \---+-----/
67 * ^- 1029MB ^- 2001MB
68 *
69 * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
70 * 2048MB = 524288 (0x80000)]
71 *
72 * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
73 * is actually not present (would have to kick the balloon driver to put it in).
74 *
75 * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
76 * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
77 * of the PFN and the end PFN (263424 and 512256 respectively). The first step
78 * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
79 * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
80 * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
81 * to end pfn. We reserve_brk top leaf pages if they are missing (means they
82 * point to p2m_mid_missing).
83 *
84 * With the E820 example above, 263424 is not 1GB aligned so we allocate a
85 * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
 86 * Each entry in the allocated page is "missing" (points to p2m_missing).
87 *
88 * Next stage is to determine if we need to do a more granular boundary check
89 * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
90 * We check if the start pfn and end pfn violate that boundary check, and if
91 * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
92 * granularity of setting which PFNs are missing and which ones are identity.
93 * In our example 263424 and 512256 both fail the check so we reserve_brk two
94 * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
95 * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
96 *
97 * At this point we would at minimum reserve_brk one page, but could be up to
98 * three. Each call to set_phys_range_identity has at maximum a three page
99 * cost. If we were to query the P2M at this stage, all those entries from
100 * start PFN through end PFN (so 1029MB -> 2001MB) would return
101 * INVALID_P2M_ENTRY ("missing").
102 *
103 * The next step is to walk from the start pfn to the end pfn setting
104 * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
105 * If we find that the middle leaf is pointing to p2m_missing we can swap it
106 * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this
 107 * point we do not need to worry about boundary alignment (so no need to
108 * reserve_brk a middle page, figure out which PFNs are "missing" and which
109 * ones are identity), as that has been done earlier. If we find that the
110 * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
111 * that page (which covers 512 PFNs) and set the appropriate PFN with
112 * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
113 * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
114 * IDENTITY_FRAME_BIT set.
115 *
116 * All other regions that are void (or not filled) either point to p2m_missing
117 * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
118 * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
119 * contain the INVALID_P2M_ENTRY value and are considered "missing."
120 *
121 * This is what the p2m ends up looking (for the E820 above) with this
122 * fabulous drawing:
123 *
124 * p2m /--------------\
125 * /-----\ | &mfn_list[0],| /-----------------\
126 * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
127 * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
128 * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
129 * |-----| \ | [p2m_identity]+\\ | .... |
130 * | 2 |--\ \-------------------->| ... | \\ \----------------/
131 * |-----| \ \---------------/ \\
132 * | 3 |\ \ \\ p2m_identity
133 * |-----| \ \-------------------->/---------------\ /-----------------\
134 * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... |
135 * \-----/ / | [p2m_identity]+-->| ..., ~0 |
136 * / /---------------\ | .... | \-----------------/
137 * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. |
138 * / | IDENTITY[@256]|<----/ \---------------/
139 * / | ~0, ~0, .... |
140 * | \---------------/
141 * |
142 * p2m_missing p2m_missing
143 * /------------------\ /------------\
144 * | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
145 * | [p2m_mid_missing]+---->| ..., ~0 |
146 * \------------------/ \------------/
147 *
148 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
26 */ 149 */
27 150
28#include <linux/init.h> 151#include <linux/init.h>
@@ -30,6 +153,7 @@
30#include <linux/list.h> 153#include <linux/list.h>
31#include <linux/hash.h> 154#include <linux/hash.h>
32#include <linux/sched.h> 155#include <linux/sched.h>
156#include <linux/seq_file.h>
33 157
34#include <asm/cache.h> 158#include <asm/cache.h>
35#include <asm/setup.h> 159#include <asm/setup.h>
@@ -59,9 +183,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
59static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); 183static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
60static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); 184static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
61 185
186static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
187
62RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); 188RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
63RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); 189RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
64 190
191/* We might hit two boundary violations at the start and end, at max each
192 * boundary violation will require three middle nodes. */
193RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
194
65static inline unsigned p2m_top_index(unsigned long pfn) 195static inline unsigned p2m_top_index(unsigned long pfn)
66{ 196{
67 BUG_ON(pfn >= MAX_P2M_PFN); 197 BUG_ON(pfn >= MAX_P2M_PFN);
@@ -136,7 +266,7 @@ static void p2m_init(unsigned long *p2m)
136 * - After resume we're called from within stop_machine, but the mfn 266 * - After resume we're called from within stop_machine, but the mfn
137 * tree should alreay be completely allocated. 267 * tree should alreay be completely allocated.
138 */ 268 */
139void xen_build_mfn_list_list(void) 269void __ref xen_build_mfn_list_list(void)
140{ 270{
141 unsigned long pfn; 271 unsigned long pfn;
142 272
@@ -221,6 +351,9 @@ void __init xen_build_dynamic_phys_to_machine(void)
221 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); 351 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
222 p2m_top_init(p2m_top); 352 p2m_top_init(p2m_top);
223 353
354 p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
355 p2m_init(p2m_identity);
356
224 /* 357 /*
225 * The domain builder gives us a pre-constructed p2m array in 358 * The domain builder gives us a pre-constructed p2m array in
226 * mfn_list for all the pages initially given to us, so we just 359 * mfn_list for all the pages initially given to us, so we just
@@ -237,6 +370,18 @@ void __init xen_build_dynamic_phys_to_machine(void)
237 p2m_top[topidx] = mid; 370 p2m_top[topidx] = mid;
238 } 371 }
239 372
373 /*
374 * As long as the mfn_list has enough entries to completely
375 * fill a p2m page, pointing into the array is ok. But if
 376 * not, the entries beyond the last pfn will be undefined.
377 */
378 if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
379 unsigned long p2midx;
380
381 p2midx = max_pfn % P2M_PER_PAGE;
382 for ( ; p2midx < P2M_PER_PAGE; p2midx++)
383 mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
384 }
240 p2m_top[topidx][mididx] = &mfn_list[pfn]; 385 p2m_top[topidx][mididx] = &mfn_list[pfn];
241 } 386 }
242 387
@@ -254,6 +399,14 @@ unsigned long get_phys_to_machine(unsigned long pfn)
254 mididx = p2m_mid_index(pfn); 399 mididx = p2m_mid_index(pfn);
255 idx = p2m_index(pfn); 400 idx = p2m_index(pfn);
256 401
402 /*
403 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
404 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
405 * would be wrong.
406 */
407 if (p2m_top[topidx][mididx] == p2m_identity)
408 return IDENTITY_FRAME(pfn);
409
257 return p2m_top[topidx][mididx][idx]; 410 return p2m_top[topidx][mididx][idx];
258} 411}
259EXPORT_SYMBOL_GPL(get_phys_to_machine); 412EXPORT_SYMBOL_GPL(get_phys_to_machine);
@@ -323,9 +476,11 @@ static bool alloc_p2m(unsigned long pfn)
323 p2m_top_mfn_p[topidx] = mid_mfn; 476 p2m_top_mfn_p[topidx] = mid_mfn;
324 } 477 }
325 478
326 if (p2m_top[topidx][mididx] == p2m_missing) { 479 if (p2m_top[topidx][mididx] == p2m_identity ||
480 p2m_top[topidx][mididx] == p2m_missing) {
327 /* p2m leaf page is missing */ 481 /* p2m leaf page is missing */
328 unsigned long *p2m; 482 unsigned long *p2m;
483 unsigned long *p2m_orig = p2m_top[topidx][mididx];
329 484
330 p2m = alloc_p2m_page(); 485 p2m = alloc_p2m_page();
331 if (!p2m) 486 if (!p2m)
@@ -333,7 +488,7 @@ static bool alloc_p2m(unsigned long pfn)
333 488
334 p2m_init(p2m); 489 p2m_init(p2m);
335 490
336 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) 491 if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
337 free_p2m_page(p2m); 492 free_p2m_page(p2m);
338 else 493 else
339 mid_mfn[mididx] = virt_to_mfn(p2m); 494 mid_mfn[mididx] = virt_to_mfn(p2m);
@@ -342,11 +497,91 @@ static bool alloc_p2m(unsigned long pfn)
342 return true; 497 return true;
343} 498}
344 499
500static bool __init __early_alloc_p2m(unsigned long pfn)
501{
502 unsigned topidx, mididx, idx;
503
504 topidx = p2m_top_index(pfn);
505 mididx = p2m_mid_index(pfn);
506 idx = p2m_index(pfn);
507
508 /* Pfff.. No boundary cross-over, lets get out. */
509 if (!idx)
510 return false;
511
512 WARN(p2m_top[topidx][mididx] == p2m_identity,
513 "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
514 topidx, mididx);
515
516 /*
517 * Could be done by xen_build_dynamic_phys_to_machine..
518 */
519 if (p2m_top[topidx][mididx] != p2m_missing)
520 return false;
521
522 /* Boundary cross-over for the edges: */
523 if (idx) {
524 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
525
526 p2m_init(p2m);
527
528 p2m_top[topidx][mididx] = p2m;
529
530 }
531 return idx != 0;
532}
533unsigned long __init set_phys_range_identity(unsigned long pfn_s,
534 unsigned long pfn_e)
535{
536 unsigned long pfn;
537
538 if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
539 return 0;
540
541 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
542 return pfn_e - pfn_s;
543
544 if (pfn_s > pfn_e)
545 return 0;
546
547 for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
548 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
549 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
550 {
551 unsigned topidx = p2m_top_index(pfn);
552 if (p2m_top[topidx] == p2m_mid_missing) {
553 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
554
555 p2m_mid_init(mid);
556
557 p2m_top[topidx] = mid;
558 }
559 }
560
561 __early_alloc_p2m(pfn_s);
562 __early_alloc_p2m(pfn_e);
563
564 for (pfn = pfn_s; pfn < pfn_e; pfn++)
565 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
566 break;
567
568 if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
569 "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
570 (pfn_e - pfn_s) - (pfn - pfn_s)))
571 printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
572
573 return pfn - pfn_s;
574}
575
345/* Try to install p2m mapping; fail if intermediate bits missing */ 576/* Try to install p2m mapping; fail if intermediate bits missing */
346bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) 577bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
347{ 578{
348 unsigned topidx, mididx, idx; 579 unsigned topidx, mididx, idx;
349 580
581 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
582 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
583 return true;
584 }
350 if (unlikely(pfn >= MAX_P2M_PFN)) { 585 if (unlikely(pfn >= MAX_P2M_PFN)) {
351 BUG_ON(mfn != INVALID_P2M_ENTRY); 586 BUG_ON(mfn != INVALID_P2M_ENTRY);
352 return true; 587 return true;
@@ -356,6 +591,21 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
356 mididx = p2m_mid_index(pfn); 591 mididx = p2m_mid_index(pfn);
357 idx = p2m_index(pfn); 592 idx = p2m_index(pfn);
358 593
 594 /* For sparse holes where the p2m leaf has a real PFN along with
595 * PCI holes, stick in the PFN as the MFN value.
596 */
597 if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
598 if (p2m_top[topidx][mididx] == p2m_identity)
599 return true;
600
601 /* Swap over from MISSING to IDENTITY if needed. */
602 if (p2m_top[topidx][mididx] == p2m_missing) {
603 WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
604 p2m_identity) != p2m_missing);
605 return true;
606 }
607 }
608
359 if (p2m_top[topidx][mididx] == p2m_missing) 609 if (p2m_top[topidx][mididx] == p2m_missing)
360 return mfn == INVALID_P2M_ENTRY; 610 return mfn == INVALID_P2M_ENTRY;
361 611
@@ -366,11 +616,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
366 616
367bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) 617bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
368{ 618{
369 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
370 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
371 return true;
372 }
373
374 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 619 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
375 if (!alloc_p2m(pfn)) 620 if (!alloc_p2m(pfn))
376 return false; 621 return false;
@@ -409,7 +654,7 @@ int m2p_add_override(unsigned long mfn, struct page *page)
409{ 654{
410 unsigned long flags; 655 unsigned long flags;
411 unsigned long pfn; 656 unsigned long pfn;
412 unsigned long address; 657 unsigned long uninitialized_var(address);
413 unsigned level; 658 unsigned level;
414 pte_t *ptep = NULL; 659 pte_t *ptep = NULL;
415 660
@@ -426,7 +671,9 @@ int m2p_add_override(unsigned long mfn, struct page *page)
426 page->private = mfn; 671 page->private = mfn;
427 page->index = pfn_to_mfn(pfn); 672 page->index = pfn_to_mfn(pfn);
428 673
429 __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); 674 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
675 return -ENOMEM;
676
430 if (!PageHighMem(page)) 677 if (!PageHighMem(page))
431 /* Just zap old mapping for now */ 678 /* Just zap old mapping for now */
432 pte_clear(&init_mm, address, ptep); 679 pte_clear(&init_mm, address, ptep);
@@ -443,7 +690,7 @@ int m2p_remove_override(struct page *page)
443 unsigned long flags; 690 unsigned long flags;
444 unsigned long mfn; 691 unsigned long mfn;
445 unsigned long pfn; 692 unsigned long pfn;
446 unsigned long address; 693 unsigned long uninitialized_var(address);
447 unsigned level; 694 unsigned level;
448 pte_t *ptep = NULL; 695 pte_t *ptep = NULL;
449 696
@@ -464,7 +711,7 @@ int m2p_remove_override(struct page *page)
464 spin_lock_irqsave(&m2p_override_lock, flags); 711 spin_lock_irqsave(&m2p_override_lock, flags);
465 list_del(&page->lru); 712 list_del(&page->lru);
466 spin_unlock_irqrestore(&m2p_override_lock, flags); 713 spin_unlock_irqrestore(&m2p_override_lock, flags);
467 __set_phys_to_machine(pfn, page->index); 714 set_phys_to_machine(pfn, page->index);
468 715
469 if (!PageHighMem(page)) 716 if (!PageHighMem(page))
470 set_pte_at(&init_mm, address, ptep, 717 set_pte_at(&init_mm, address, ptep,
@@ -508,3 +755,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+int p2m_dump_show(struct seq_file *m, void *v)
+{
+	static const char * const level_name[] = { "top", "middle",
+						"entry", "abnormal" };
+	static const char * const type_name[] = { "identity", "missing",
+						"pfn", "abnormal"};
+#define TYPE_IDENTITY 0
+#define TYPE_MISSING 1
+#define TYPE_PFN 2
+#define TYPE_UNKNOWN 3
+	unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
+	unsigned int uninitialized_var(prev_level);
+	unsigned int uninitialized_var(prev_type);
+
+	if (!p2m_top)
+		return 0;
+
+	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
+		unsigned topidx = p2m_top_index(pfn);
+		unsigned mididx = p2m_mid_index(pfn);
+		unsigned idx = p2m_index(pfn);
+		unsigned lvl, type;
+
+		lvl = 4;
+		type = TYPE_UNKNOWN;
+		if (p2m_top[topidx] == p2m_mid_missing) {
+			lvl = 0; type = TYPE_MISSING;
+		} else if (p2m_top[topidx] == NULL) {
+			lvl = 0; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx] == NULL) {
+			lvl = 1; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx] == p2m_identity) {
+			lvl = 1; type = TYPE_IDENTITY;
+		} else if (p2m_top[topidx][mididx] == p2m_missing) {
+			lvl = 1; type = TYPE_MISSING;
+		} else if (p2m_top[topidx][mididx][idx] == 0) {
+			lvl = 2; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
+			lvl = 2; type = TYPE_IDENTITY;
+		} else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
+			lvl = 2; type = TYPE_MISSING;
+		} else if (p2m_top[topidx][mididx][idx] == pfn) {
+			lvl = 2; type = TYPE_PFN;
+		} else if (p2m_top[topidx][mididx][idx] != pfn) {
+			lvl = 2; type = TYPE_PFN;
+		}
+		if (pfn == 0) {
+			prev_level = lvl;
+			prev_type = type;
+		}
+		if (pfn == MAX_DOMAIN_PAGES-1) {
+			lvl = 3;
+			type = TYPE_UNKNOWN;
+		}
+		if (prev_type != type) {
+			seq_printf(m, " [0x%lx->0x%lx] %s\n",
+				prev_pfn_type, pfn, type_name[prev_type]);
+			prev_pfn_type = pfn;
+			prev_type = type;
+		}
+		if (prev_level != lvl) {
+			seq_printf(m, " [0x%lx->0x%lx] level %s\n",
+				prev_pfn_level, pfn, level_name[prev_level]);
+			prev_pfn_level = pfn;
+			prev_level = lvl;
+		}
+	}
+	return 0;
+#undef TYPE_IDENTITY
+#undef TYPE_MISSING
+#undef TYPE_PFN
+#undef TYPE_UNKNOWN
+}
+#endif
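
The p2m_dump_show() walk added above relies on the three-level P2M tree: a pfn is split into a top index, a middle index and a leaf index, and the sentinel pages (p2m_mid_missing, p2m_missing, p2m_identity) classify whole subtrees at once. As a rough, stand-alone illustration of the index split performed by p2m_top_index()/p2m_mid_index()/p2m_index(), consider the sketch below; the per-page entry counts are assumptions for the example (the kernel derives them from PAGE_SIZE and sizeof(unsigned long)), not values taken from this patch.

/*
 * Minimal sketch (not kernel code) of how a pfn maps onto the three
 * P2M levels walked by p2m_dump_show() above.  Entry counts assumed.
 */
#include <stdio.h>

#define P2M_PER_PAGE     512UL	/* assumed entries per leaf page */
#define P2M_MID_PER_PAGE 512UL	/* assumed leaf pointers per mid page */

int main(void)
{
	unsigned long pfn = 0x12345;
	unsigned long topidx = pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
	unsigned long mididx = (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
	unsigned long idx    = pfn % P2M_PER_PAGE;

	/* p2m_top[topidx][mididx][idx] would hold the mfn for this pfn. */
	printf("pfn 0x%lx -> top %lu, mid %lu, entry %lu\n",
	       pfn, topidx, mididx, idx);
	return 0;
}
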
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b5a7f928234b..90bac0aac3a5 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -52,6 +52,8 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
 
 static __init void xen_add_extra_mem(unsigned long pages)
 {
+	unsigned long pfn;
+
 	u64 size = (u64)pages * PAGE_SIZE;
 	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
 
@@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages)
 	xen_extra_mem_size += size;
 
 	xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+
+	for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
+		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 }
 
 static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
@@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
 		WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
 		     start, end, ret);
 		if (ret == 1) {
-			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 			len++;
 		}
 	}
@@ -138,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
 	return released;
 }
 
+static unsigned long __init xen_set_identity(const struct e820entry *list,
+					     ssize_t map_size)
+{
+	phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
+	phys_addr_t start_pci = last;
+	const struct e820entry *entry;
+	unsigned long identity = 0;
+	int i;
+
+	for (i = 0, entry = list; i < map_size; i++, entry++) {
+		phys_addr_t start = entry->addr;
+		phys_addr_t end = start + entry->size;
+
+		if (start < last)
+			start = last;
+
+		if (end <= start)
+			continue;
+
+		/* Skip over the 1MB region. */
+		if (last > end)
+			continue;
+
+		if (entry->type == E820_RAM) {
+			if (start > start_pci)
+				identity += set_phys_range_identity(
+						PFN_UP(start_pci), PFN_DOWN(start));
+
+			/* Without saving 'last' we would gobble RAM too
+			 * at the end of the loop. */
+			last = end;
+			start_pci = end;
+			continue;
+		}
+		start_pci = min(start, start_pci);
+		last = end;
+	}
+	if (last > start_pci)
+		identity += set_phys_range_identity(
+				PFN_UP(start_pci), PFN_DOWN(last));
+	return identity;
+}
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
 	static struct e820entry map[E820MAX] __initdata;
+	static struct e820entry map_raw[E820MAX] __initdata;
 
 	unsigned long max_pfn = xen_start_info->nr_pages;
 	unsigned long long mem_end;
@@ -151,6 +199,7 @@ char * __init xen_memory_setup(void)
 	struct xen_memory_map memmap;
 	unsigned long extra_pages = 0;
 	unsigned long extra_limit;
+	unsigned long identity_pages = 0;
 	int i;
 	int op;
 
@@ -176,11 +225,17 @@ char * __init xen_memory_setup(void)
 	}
 	BUG_ON(rc);
 
+	memcpy(map_raw, map, sizeof(map));
 	e820.nr_map = 0;
-	xen_extra_mem_start = mem_end;
+	xen_extra_mem_start = max((1ULL << 32), mem_end);
 	for (i = 0; i < memmap.nr_entries; i++) {
-		unsigned long long end = map[i].addr + map[i].size;
+		unsigned long long end;
+
+		/* Guard against non-page aligned E820 entries. */
+		if (map[i].type == E820_RAM)
+			map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
 
+		end = map[i].addr + map[i].size;
 		if (map[i].type == E820_RAM && end > mem_end) {
 			/* RAM off the end - may be partially included */
 			u64 delta = min(map[i].size, end - mem_end);
@@ -189,6 +244,15 @@ char * __init xen_memory_setup(void)
 			end -= delta;
 
 			extra_pages += PFN_DOWN(delta);
+			/*
+			 * Set RAM below 4GB that is not for us to be unusable.
+			 * This prevents "System RAM" address space from being
+			 * used as potential resource for I/O address (happens
+			 * when 'allocate_resource' is called).
+			 */
+			if (delta &&
+				(xen_initial_domain() && end < 0x100000000ULL))
+				e820_add_region(end, delta, E820_UNUSABLE);
 		}
 
 		if (map[i].size > 0 && end > xen_extra_mem_start)
@@ -246,6 +310,13 @@ char * __init xen_memory_setup(void)
 
 	xen_add_extra_mem(extra_pages);
 
+	/*
+	 * Set P2M for all non-RAM pages and E820 gaps to be identity
+	 * type PFNs. We supply it with the non-sanitized version
+	 * of the E820.
+	 */
+	identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
+	printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
 	return "Xen";
 }
 
@@ -350,6 +421,7 @@ void __init xen_arch_setup(void)
 	boot_cpu_data.hlt_works_ok = 1;
 #endif
 	pm_idle = default_idle;
+	boot_option_idle_override = IDLE_HALT;
 
 	fiddle_vdso();
 }
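
For the non-RAM E820 ranges marked by xen_set_identity() above, a 1-1 ("identity") P2M entry stores the pfn itself, tagged so it cannot be confused with an ordinary translated mfn; that is what IDENTITY_FRAME(pfn) expresses in the hunks above. The stand-alone sketch below illustrates the idea only; the bit position and helper names are assumptions for the example, not the kernel's definitions.

/*
 * Illustrative sketch of tagging a pfn as an identity P2M entry.
 * The tag bit chosen here is an assumption, not taken from this patch.
 */
#include <stdio.h>

#define IDENTITY_FRAME_BIT  (1UL << (sizeof(unsigned long) * 8 - 2))
#define IDENTITY_FRAME(pfn) ((pfn) | IDENTITY_FRAME_BIT)

static int entry_is_identity(unsigned long entry, unsigned long pfn)
{
	return entry == IDENTITY_FRAME(pfn);
}

int main(void)
{
	unsigned long pfn = 0xfee00;	/* e.g. a pfn inside an MMIO hole */
	unsigned long entry = IDENTITY_FRAME(pfn);

	printf("entry 0x%lx is identity for pfn 0x%lx: %d\n",
	       entry, pfn, entry_is_identity(entry, pfn));
	return 0;
}
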
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 72a4c7959045..30612441ed99 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -509,3 +509,41 @@ void __init xen_smp_init(void)
 	xen_fill_possible_map();
 	xen_init_spinlocks();
 }
+
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+	native_smp_prepare_cpus(max_cpus);
+	WARN_ON(xen_smp_intr_init(0));
+
+	if (!xen_have_vector_callback)
+		return;
+	xen_init_lock_cpu(0);
+	xen_init_spinlocks();
+}
+
+static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
+{
+	int rc;
+	rc = native_cpu_up(cpu);
+	WARN_ON (xen_smp_intr_init(cpu));
+	return rc;
+}
+
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+	unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
+	native_cpu_die(cpu);
+}
+
+void __init xen_hvm_smp_init(void)
+{
+	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+	smp_ops.cpu_up = xen_hvm_cpu_up;
+	smp_ops.cpu_die = xen_hvm_cpu_die;
+	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+}
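
xen_hvm_smp_init() above follows the usual ops-table pattern: start from the native smp_ops and replace only the callbacks that need Xen-specific behaviour (CPU bring-up, teardown, IPIs). A minimal stand-alone sketch of that pattern, with invented names rather than the kernel's smp_ops definition, is:

/*
 * Illustrative sketch of overriding selected entries in a table of
 * function pointers; all names here are invented for the example.
 */
#include <stdio.h>

struct cpu_ops {
	int  (*cpu_up)(unsigned int cpu);
	void (*cpu_die)(unsigned int cpu);
};

static int native_up(unsigned int cpu)   { printf("native up %u\n", cpu); return 0; }
static void native_die(unsigned int cpu) { printf("native die %u\n", cpu); }

/* Wrapper: do the native work, then add the hypervisor-specific part. */
static int hvm_up(unsigned int cpu)
{
	int rc = native_up(cpu);
	printf("bind per-cpu event channels for %u\n", cpu);
	return rc;
}

static struct cpu_ops ops = { native_up, native_die };

static void hvm_ops_init(void)
{
	ops.cpu_up = hvm_up;	/* override only what differs */
}

int main(void)
{
	hvm_ops_init();
	ops.cpu_up(1);
	ops.cpu_die(1);
	return 0;
}
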
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 9bbd63a129b5..45329c8c226e 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
-void xen_pre_suspend(void)
+void xen_arch_pre_suspend(void)
 {
 	xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
 	xen_start_info->console.domU.mfn =
@@ -26,8 +26,9 @@ void xen_pre_suspend(void)
 		BUG();
 }
 
-void xen_hvm_post_suspend(int suspend_cancelled)
+void xen_arch_hvm_post_suspend(int suspend_cancelled)
 {
+#ifdef CONFIG_XEN_PVHVM
 	int cpu;
 	xen_hvm_init_shared_info();
 	xen_callback_vector();
@@ -37,9 +38,10 @@ void xen_hvm_post_suspend(int suspend_cancelled)
 			xen_setup_runstate_info(cpu);
 		}
 	}
+#endif
 }
 
-void xen_post_suspend(int suspend_cancelled)
+void xen_arch_post_suspend(int suspend_cancelled)
 {
 	xen_build_mfn_list_list();
 
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 067759e3d6a5..2e2d370a47b1 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -397,7 +397,9 @@ void xen_setup_timer(int cpu)
 		name = "<timer kasprintf failed>";
 
 	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
-				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
+				      IRQF_DISABLED|IRQF_PERCPU|
+				      IRQF_NOBALANCING|IRQF_TIMER|
+				      IRQF_FORCE_RESUME,
 				      name, NULL);
 
 	evt = &per_cpu(xen_clock_events, cpu);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 1a5ff24e29c0..aaa7291c9259 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -28,9 +28,9 @@ ENTRY(startup_xen)
 	__FINIT
 
 .pushsection .text
-	.align PAGE_SIZE_asm
+	.align PAGE_SIZE
 ENTRY(hypercall_page)
-	.skip PAGE_SIZE_asm
+	.skip PAGE_SIZE
 .popsection
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9d41bf985757..3112f55638c4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -64,10 +64,12 @@ void xen_setup_vcpu_info_placement(void);
 
 #ifdef CONFIG_SMP
 void xen_smp_init(void);
+void __init xen_hvm_smp_init(void);
 
 extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
 #endif
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS