Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/Kbuild | 1
-rw-r--r-- arch/x86/Kconfig | 66
-rw-r--r-- arch/x86/Kconfig.cpu | 16
-rw-r--r-- arch/x86/Kconfig.debug | 20
-rw-r--r-- arch/x86/Makefile_32.cpu | 2
-rw-r--r-- arch/x86/boot/memory.c | 2
-rw-r--r-- arch/x86/configs/i386_defconfig | 1
-rw-r--r-- arch/x86/configs/x86_64_defconfig | 1
-rw-r--r-- arch/x86/crypto/Makefile | 4
-rw-r--r-- arch/x86/crypto/aesni-intel_glue.c | 9
-rw-r--r-- arch/x86/crypto/fpu.c | 10
-rw-r--r-- arch/x86/ia32/ia32entry.S | 1
-rw-r--r-- arch/x86/include/asm/acpi.h | 2
-rw-r--r-- arch/x86/include/asm/alternative-asm.h | 9
-rw-r--r-- arch/x86/include/asm/alternative.h | 9
-rw-r--r-- arch/x86/include/asm/amd_iommu_proto.h | 13
-rw-r--r-- arch/x86/include/asm/amd_iommu_types.h | 28
-rw-r--r-- arch/x86/include/asm/amd_nb.h | 1
-rw-r--r-- arch/x86/include/asm/apic.h | 38
-rw-r--r-- arch/x86/include/asm/apicdef.h | 1
-rw-r--r-- arch/x86/include/asm/bios_ebda.h | 28
-rw-r--r-- arch/x86/include/asm/cpufeature.h | 15
-rw-r--r-- arch/x86/include/asm/dma.h | 12
-rw-r--r-- arch/x86/include/asm/efi.h | 1
-rw-r--r-- arch/x86/include/asm/ftrace.h | 7
-rw-r--r-- arch/x86/include/asm/gart.h | 24
-rw-r--r-- arch/x86/include/asm/i8253.h | 2
-rw-r--r-- arch/x86/include/asm/io.h | 24
-rw-r--r-- arch/x86/include/asm/io_apic.h | 30
-rw-r--r-- arch/x86/include/asm/jump_label.h | 27
-rw-r--r-- arch/x86/include/asm/kgdb.h | 1
-rw-r--r-- arch/x86/include/asm/kvm_emulate.h | 193
-rw-r--r-- arch/x86/include/asm/kvm_host.h | 55
-rw-r--r-- arch/x86/include/asm/linkage.h | 5
-rw-r--r-- arch/x86/include/asm/mce.h | 2
-rw-r--r-- arch/x86/include/asm/mmzone_32.h | 20
-rw-r--r-- arch/x86/include/asm/mmzone_64.h | 23
-rw-r--r-- arch/x86/include/asm/module.h | 2
-rw-r--r-- arch/x86/include/asm/msr-index.h | 5
-rw-r--r-- arch/x86/include/asm/nops.h | 146
-rw-r--r-- arch/x86/include/asm/numa.h | 34
-rw-r--r-- arch/x86/include/asm/numa_32.h | 10
-rw-r--r-- arch/x86/include/asm/numa_64.h | 36
-rw-r--r-- arch/x86/include/asm/numaq.h | 7
-rw-r--r-- arch/x86/include/asm/olpc_ofw.h | 9
-rw-r--r-- arch/x86/include/asm/pci.h | 2
-rw-r--r-- arch/x86/include/asm/percpu.h | 34
-rw-r--r-- arch/x86/include/asm/pgtable_types.h | 1
-rw-r--r-- arch/x86/include/asm/probe_roms.h | 8
-rw-r--r-- arch/x86/include/asm/processor-flags.h | 1
-rw-r--r-- arch/x86/include/asm/ptrace.h | 18
-rw-r--r-- arch/x86/include/asm/setup.h | 4
-rw-r--r-- arch/x86/include/asm/srat.h | 39
-rw-r--r-- arch/x86/include/asm/stacktrace.h | 3
-rw-r--r-- arch/x86/include/asm/suspend_32.h | 2
-rw-r--r-- arch/x86/include/asm/suspend_64.h | 5
-rw-r--r-- arch/x86/include/asm/system.h | 85
-rw-r--r-- arch/x86/include/asm/topology.h | 8
-rw-r--r-- arch/x86/include/asm/tsc.h | 4
-rw-r--r-- arch/x86/include/asm/uaccess.h | 3
-rw-r--r-- arch/x86/include/asm/uaccess_32.h | 1
-rw-r--r-- arch/x86/include/asm/uaccess_64.h | 1
-rw-r--r-- arch/x86/include/asm/unistd_32.h | 3
-rw-r--r-- arch/x86/include/asm/unistd_64.h | 2
-rw-r--r-- arch/x86/include/asm/uv/uv_bau.h | 17
-rw-r--r-- arch/x86/include/asm/uv/uv_hub.h | 2
-rw-r--r-- arch/x86/include/asm/uv/uv_mmrs.h | 16
-rw-r--r-- arch/x86/include/asm/vdso.h | 14
-rw-r--r-- arch/x86/include/asm/vgtod.h | 2
-rw-r--r-- arch/x86/include/asm/vsyscall.h | 12
-rw-r--r-- arch/x86/include/asm/vvar.h | 52
-rw-r--r-- arch/x86/include/asm/x2apic.h | 62
-rw-r--r-- arch/x86/include/asm/x86_init.h | 12
-rw-r--r-- arch/x86/include/asm/xen/hypercall.h | 7
-rw-r--r-- arch/x86/include/asm/xen/page.h | 5
-rw-r--r-- arch/x86/include/asm/xen/pci.h | 16
-rw-r--r-- arch/x86/kernel/Makefile | 12
-rw-r--r-- arch/x86/kernel/acpi/boot.c | 8
-rw-r--r-- arch/x86/kernel/acpi/sleep.c | 5
-rw-r--r-- arch/x86/kernel/alternative.c | 203
-rw-r--r-- arch/x86/kernel/amd_gart_64.c (renamed from arch/x86/kernel/pci-gart_64.c) | 9
-rw-r--r-- arch/x86/kernel/amd_iommu.c | 527
-rw-r--r-- arch/x86/kernel/amd_iommu_init.c | 48
-rw-r--r-- arch/x86/kernel/apb_timer.c | 10
-rw-r--r-- arch/x86/kernel/aperture_64.c | 36
-rw-r--r-- arch/x86/kernel/apic/Makefile | 17
-rw-r--r-- arch/x86/kernel/apic/apic.c | 117
-rw-r--r-- arch/x86/kernel/apic/apic_flat_64.c | 26
-rw-r--r-- arch/x86/kernel/apic/apic_noop.c | 9
-rw-r--r-- arch/x86/kernel/apic/bigsmp_32.c | 13
-rw-r--r-- arch/x86/kernel/apic/es7000_32.c | 17
-rw-r--r-- arch/x86/kernel/apic/hw_nmi.c | 4
-rw-r--r-- arch/x86/kernel/apic/io_apic.c | 319
-rw-r--r-- arch/x86/kernel/apic/numaq_32.c | 40
-rw-r--r-- arch/x86/kernel/apic/probe_32.c | 118
-rw-r--r-- arch/x86/kernel/apic/probe_64.c | 61
-rw-r--r-- arch/x86/kernel/apic/summit_32.c | 5
-rw-r--r-- arch/x86/kernel/apic/x2apic_cluster.c | 222
-rw-r--r-- arch/x86/kernel/apic/x2apic_phys.c | 115
-rw-r--r-- arch/x86/kernel/apic/x2apic_uv_x.c | 61
-rw-r--r-- arch/x86/kernel/apm_32.c | 9
-rw-r--r-- arch/x86/kernel/cpu/Makefile | 1
-rw-r--r-- arch/x86/kernel/cpu/amd.c | 24
-rw-r--r-- arch/x86/kernel/cpu/common.c | 26
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/Kconfig | 266
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/Makefile | 21
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 776
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | 446
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/e_powersaver.c | 367
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/elanfreq.c | 309
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 517
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/longhaul.c | 1029
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/longhaul.h | 353
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/longrun.c | 327
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/mperf.c | 51
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/mperf.h | 9
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 331
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 624
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 261
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 752
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k7.h | 43
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 1607
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 224
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/sc520_freq.c | 194
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 636
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 452
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 481
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-lib.h | 49
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 467
-rw-r--r-- arch/x86/kernel/cpu/intel.c | 29
-rw-r--r-- arch/x86/kernel/cpu/intel_cacheinfo.c | 20
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c | 25
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_amd.c | 1
-rw-r--r-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 22
-rw-r--r-- arch/x86/kernel/cpu/perf_event.c | 50
-rw-r--r-- arch/x86/kernel/cpu/perf_event_amd.c | 36
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel.c | 162
-rw-r--r-- arch/x86/kernel/cpu/perf_event_p4.c | 26
-rw-r--r-- arch/x86/kernel/devicetree.c | 8
-rw-r--r-- arch/x86/kernel/dumpstack.c | 17
-rw-r--r-- arch/x86/kernel/ftrace.c | 4
-rw-r--r-- arch/x86/kernel/head32.c | 1
-rw-r--r-- arch/x86/kernel/hpet.c | 72
-rw-r--r-- arch/x86/kernel/i8253.c | 86
-rw-r--r-- arch/x86/kernel/irq.c | 5
-rw-r--r-- arch/x86/kernel/jump_label.c | 5
-rw-r--r-- arch/x86/kernel/kprobes.c | 5
-rw-r--r-- arch/x86/kernel/kvmclock.c | 6
-rw-r--r-- arch/x86/kernel/module.c | 1
-rw-r--r-- arch/x86/kernel/mpparse.c | 8
-rw-r--r-- arch/x86/kernel/pci-dma.c | 64
-rw-r--r-- arch/x86/kernel/pci-iommu_table.c | 18
-rw-r--r-- arch/x86/kernel/probe_roms.c (renamed from arch/x86/kernel/probe_roms_32.c) | 101
-rw-r--r-- arch/x86/kernel/process.c | 4
-rw-r--r-- arch/x86/kernel/ptrace.c | 40
-rw-r--r-- arch/x86/kernel/reboot.c | 24
-rw-r--r-- arch/x86/kernel/reboot_32.S | 12
-rw-r--r-- arch/x86/kernel/setup.c | 9
-rw-r--r-- arch/x86/kernel/signal.c | 14
-rw-r--r-- arch/x86/kernel/smp.c | 5
-rw-r--r-- arch/x86/kernel/smpboot.c | 4
-rw-r--r-- arch/x86/kernel/stacktrace.c | 13
-rw-r--r-- arch/x86/kernel/syscall_table_32.S | 1
-rw-r--r-- arch/x86/kernel/tboot.c | 1
-rw-r--r-- arch/x86/kernel/test_nx.c | 2
-rw-r--r-- arch/x86/kernel/time.c | 2
-rw-r--r-- arch/x86/kernel/tsc.c | 19
-rw-r--r-- arch/x86/kernel/vmlinux.lds.S | 43
-rw-r--r-- arch/x86/kernel/vread_tsc_64.c | 36
-rw-r--r-- arch/x86/kernel/vsyscall_64.c | 48
-rw-r--r-- arch/x86/kernel/x86_init.c | 6
-rw-r--r-- arch/x86/kvm/emulate.c | 1754
-rw-r--r-- arch/x86/kvm/i8254.h | 2
-rw-r--r-- arch/x86/kvm/irq.h | 2
-rw-r--r-- arch/x86/kvm/mmu.c | 19
-rw-r--r-- arch/x86/kvm/paging_tmpl.h | 83
-rw-r--r-- arch/x86/kvm/svm.c | 585
-rw-r--r-- arch/x86/kvm/vmx.c | 228
-rw-r--r-- arch/x86/kvm/x86.c | 570
-rw-r--r-- arch/x86/kvm/x86.h | 2
-rw-r--r-- arch/x86/lguest/boot.c | 6
-rw-r--r-- arch/x86/lib/clear_page_64.S | 33
-rw-r--r-- arch/x86/lib/copy_user_64.S | 69
-rw-r--r-- arch/x86/lib/memcpy_64.S | 47
-rw-r--r-- arch/x86/lib/memmove_64.S | 29
-rw-r--r-- arch/x86/lib/memset_64.S | 54
-rw-r--r-- arch/x86/mm/Makefile | 4
-rw-r--r-- arch/x86/mm/amdtopology.c (renamed from arch/x86/mm/amdtopology_64.c) | 21
-rw-r--r-- arch/x86/mm/fault.c | 13
-rw-r--r-- arch/x86/mm/hugetlbpage.c | 4
-rw-r--r-- arch/x86/mm/init.c | 26
-rw-r--r-- arch/x86/mm/init_32.c | 3
-rw-r--r-- arch/x86/mm/init_64.c | 10
-rw-r--r-- arch/x86/mm/ioremap.c | 14
-rw-r--r-- arch/x86/mm/numa.c | 581
-rw-r--r-- arch/x86/mm/numa_32.c | 398
-rw-r--r-- arch/x86/mm/numa_64.c | 644
-rw-r--r-- arch/x86/mm/numa_emulation.c | 36
-rw-r--r-- arch/x86/mm/numa_internal.h | 8
-rw-r--r-- arch/x86/mm/pf_in.c | 14
-rw-r--r-- arch/x86/mm/srat.c (renamed from arch/x86/mm/srat_64.c) | 82
-rw-r--r-- arch/x86/mm/srat_32.c | 288
-rw-r--r-- arch/x86/net/Makefile | 4
-rw-r--r-- arch/x86/net/bpf_jit.S | 140
-rw-r--r-- arch/x86/net/bpf_jit_comp.c | 654
-rw-r--r-- arch/x86/oprofile/backtrace.c | 13
-rw-r--r-- arch/x86/pci/direct.c | 17
-rw-r--r-- arch/x86/pci/irq.c | 4
-rw-r--r-- arch/x86/pci/mmconfig-shared.c | 10
-rw-r--r-- arch/x86/pci/xen.c | 96
-rw-r--r-- arch/x86/platform/ce4100/falconfalls.dts | 8
-rw-r--r-- arch/x86/platform/efi/efi.c | 78
-rw-r--r-- arch/x86/platform/efi/efi_64.c | 34
-rw-r--r-- arch/x86/platform/mrst/mrst.c | 14
-rw-r--r-- arch/x86/platform/olpc/Makefile | 4
-rw-r--r-- arch/x86/platform/olpc/olpc.c | 51
-rw-r--r-- arch/x86/platform/olpc/olpc_dt.c | 19
-rw-r--r-- arch/x86/platform/uv/tlb_uv.c | 92
-rw-r--r-- arch/x86/platform/uv/uv_time.c | 6
-rw-r--r-- arch/x86/vdso/Makefile | 17
-rw-r--r-- arch/x86/vdso/vclock_gettime.c | 74
-rw-r--r-- arch/x86/vdso/vdso.lds.S | 9
-rw-r--r-- arch/x86/vdso/vextern.h | 16
-rw-r--r-- arch/x86/vdso/vgetcpu.c | 3
-rw-r--r-- arch/x86/vdso/vma.c | 27
-rw-r--r-- arch/x86/vdso/vvar.c | 12
-rw-r--r-- arch/x86/xen/Kconfig | 1
-rw-r--r-- arch/x86/xen/enlighten.c | 41
-rw-r--r-- arch/x86/xen/irq.c | 2
-rw-r--r-- arch/x86/xen/mmu.c | 352
-rw-r--r-- arch/x86/xen/mmu.h | 37
-rw-r--r-- arch/x86/xen/p2m.c | 43
-rw-r--r-- arch/x86/xen/pci-swiotlb-xen.c | 2
-rw-r--r-- arch/x86/xen/setup.c | 10
-rw-r--r-- arch/x86/xen/smp.c | 13
-rw-r--r-- arch/x86/xen/time.c | 14
-rw-r--r-- arch/x86/xen/xen-ops.h | 2
237 files changed, 7003 insertions, 15647 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 0e103236b754..0e9dec6cadd1 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -15,3 +15,4 @@ obj-y += vdso/
15obj-$(CONFIG_IA32_EMULATION) += ia32/ 15obj-$(CONFIG_IA32_EMULATION) += ia32/
16 16
17obj-y += platform/ 17obj-y += platform/
18obj-y += net/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a95bfd..da349723d411 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -8,6 +8,7 @@ config 64BIT
8 8
9config X86_32 9config X86_32
10 def_bool !64BIT 10 def_bool !64BIT
11 select CLKSRC_I8253
11 12
12config X86_64 13config X86_64
13 def_bool 64BIT 14 def_bool 64BIT
@@ -16,8 +17,6 @@ config X86_64
16config X86 17config X86
17 def_bool y 18 def_bool y
18 select HAVE_AOUT if X86_32 19 select HAVE_AOUT if X86_32
19 select HAVE_READQ
20 select HAVE_WRITEQ
21 select HAVE_UNSTABLE_SCHED_CLOCK 20 select HAVE_UNSTABLE_SCHED_CLOCK
22 select HAVE_IDE 21 select HAVE_IDE
23 select HAVE_OPROFILE 22 select HAVE_OPROFILE
@@ -65,13 +64,12 @@ config X86
65 select HAVE_GENERIC_HARDIRQS 64 select HAVE_GENERIC_HARDIRQS
66 select HAVE_SPARSE_IRQ 65 select HAVE_SPARSE_IRQ
67 select GENERIC_FIND_FIRST_BIT 66 select GENERIC_FIND_FIRST_BIT
68 select GENERIC_FIND_NEXT_BIT
69 select GENERIC_IRQ_PROBE 67 select GENERIC_IRQ_PROBE
70 select GENERIC_PENDING_IRQ if SMP 68 select GENERIC_PENDING_IRQ if SMP
71 select GENERIC_IRQ_SHOW 69 select GENERIC_IRQ_SHOW
72 select IRQ_FORCED_THREADING 70 select IRQ_FORCED_THREADING
73 select USE_GENERIC_SMP_HELPERS if SMP 71 select USE_GENERIC_SMP_HELPERS if SMP
74 select ARCH_NO_SYSDEV_OPS 72 select HAVE_BPF_JIT if (X86_64 && NET)
75 73
76config INSTRUCTION_DECODER 74config INSTRUCTION_DECODER
77 def_bool (KPROBES || PERF_EVENTS) 75 def_bool (KPROBES || PERF_EVENTS)
@@ -112,7 +110,14 @@ config MMU
112 def_bool y 110 def_bool y
113 111
114config ZONE_DMA 112config ZONE_DMA
115 def_bool y 113 bool "DMA memory allocation support" if EXPERT
114 default y
115 help
116 DMA memory allocation support allows devices with less than 32-bit
117 addressing to allocate within the first 16MB of address space.
118 Disable if no such devices will be used.
119
120 If unsure, say Y.
116 121
117config SBUS 122config SBUS
118 bool 123 bool
@@ -365,17 +370,6 @@ config X86_UV
365# Following is an alphabetically sorted list of 32 bit extended platforms 370# Following is an alphabetically sorted list of 32 bit extended platforms
366# Please maintain the alphabetic order if and when there are additions 371# Please maintain the alphabetic order if and when there are additions
367 372
368config X86_ELAN
369 bool "AMD Elan"
370 depends on X86_32
371 depends on X86_EXTENDED_PLATFORM
372 ---help---
373 Select this for an AMD Elan processor.
374
375 Do not use this option for K6/Athlon/Opteron processors!
376
377 If unsure, choose "PC-compatible" instead.
378
379config X86_INTEL_CE 373config X86_INTEL_CE
380 bool "CE4100 TV platform" 374 bool "CE4100 TV platform"
381 depends on PCI 375 depends on PCI
@@ -690,6 +684,7 @@ config AMD_IOMMU
690 bool "AMD IOMMU support" 684 bool "AMD IOMMU support"
691 select SWIOTLB 685 select SWIOTLB
692 select PCI_MSI 686 select PCI_MSI
687 select PCI_IOV
693 depends on X86_64 && PCI && ACPI 688 depends on X86_64 && PCI && ACPI
694 ---help--- 689 ---help---
695 With this option you can enable support for AMD IOMMU hardware in 690 With this option you can enable support for AMD IOMMU hardware in
@@ -919,6 +914,7 @@ config TOSHIBA
919 914
920config I8K 915config I8K
921 tristate "Dell laptop support" 916 tristate "Dell laptop support"
917 select HWMON
922 ---help--- 918 ---help---
923 This adds a driver to safely access the System Management Mode 919 This adds a driver to safely access the System Management Mode
924 of the CPU on the Dell Inspiron 8000. The System Management Mode 920 of the CPU on the Dell Inspiron 8000. The System Management Mode
@@ -1174,7 +1170,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
1174config AMD_NUMA 1170config AMD_NUMA
1175 def_bool y 1171 def_bool y
1176 prompt "Old style AMD Opteron NUMA detection" 1172 prompt "Old style AMD Opteron NUMA detection"
1177 depends on X86_64 && NUMA && PCI 1173 depends on NUMA && PCI
1178 ---help--- 1174 ---help---
1179 Enable AMD NUMA node topology detection. You should say Y here if 1175 Enable AMD NUMA node topology detection. You should say Y here if
1180 you have a multi processor AMD system. This uses an old method to 1176 you have a multi processor AMD system. This uses an old method to
@@ -1201,7 +1197,7 @@ config NODES_SPAN_OTHER_NODES
1201 1197
1202config NUMA_EMU 1198config NUMA_EMU
1203 bool "NUMA emulation" 1199 bool "NUMA emulation"
1204 depends on X86_64 && NUMA 1200 depends on NUMA
1205 ---help--- 1201 ---help---
1206 Enable NUMA emulation. A flat machine will be split 1202 Enable NUMA emulation. A flat machine will be split
1207 into virtual nodes when booted with "numa=fake=N", where N is the 1203 into virtual nodes when booted with "numa=fake=N", where N is the
@@ -1223,6 +1219,10 @@ config HAVE_ARCH_BOOTMEM
1223 def_bool y 1219 def_bool y
1224 depends on X86_32 && NUMA 1220 depends on X86_32 && NUMA
1225 1221
1222config HAVE_ARCH_ALLOC_REMAP
1223 def_bool y
1224 depends on X86_32 && NUMA
1225
1226config ARCH_HAVE_MEMORY_PRESENT 1226config ARCH_HAVE_MEMORY_PRESENT
1227 def_bool y 1227 def_bool y
1228 depends on X86_32 && DISCONTIGMEM 1228 depends on X86_32 && DISCONTIGMEM
@@ -1231,13 +1231,9 @@ config NEED_NODE_MEMMAP_SIZE
1231 def_bool y 1231 def_bool y
1232 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) 1232 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
1233 1233
1234config HAVE_ARCH_ALLOC_REMAP
1235 def_bool y
1236 depends on X86_32 && NUMA
1237
1238config ARCH_FLATMEM_ENABLE 1234config ARCH_FLATMEM_ENABLE
1239 def_bool y 1235 def_bool y
1240 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA 1236 depends on X86_32 && !NUMA
1241 1237
1242config ARCH_DISCONTIGMEM_ENABLE 1238config ARCH_DISCONTIGMEM_ENABLE
1243 def_bool y 1239 def_bool y
@@ -1247,20 +1243,16 @@ config ARCH_DISCONTIGMEM_DEFAULT
1247 def_bool y 1243 def_bool y
1248 depends on NUMA && X86_32 1244 depends on NUMA && X86_32
1249 1245
1250config ARCH_PROC_KCORE_TEXT
1251 def_bool y
1252 depends on X86_64 && PROC_KCORE
1253
1254config ARCH_SPARSEMEM_DEFAULT
1255 def_bool y
1256 depends on X86_64
1257
1258config ARCH_SPARSEMEM_ENABLE 1246config ARCH_SPARSEMEM_ENABLE
1259 def_bool y 1247 def_bool y
1260 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD 1248 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
1261 select SPARSEMEM_STATIC if X86_32 1249 select SPARSEMEM_STATIC if X86_32
1262 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 1250 select SPARSEMEM_VMEMMAP_ENABLE if X86_64
1263 1251
1252config ARCH_SPARSEMEM_DEFAULT
1253 def_bool y
1254 depends on X86_64
1255
1264config ARCH_SELECT_MEMORY_MODEL 1256config ARCH_SELECT_MEMORY_MODEL
1265 def_bool y 1257 def_bool y
1266 depends on ARCH_SPARSEMEM_ENABLE 1258 depends on ARCH_SPARSEMEM_ENABLE
@@ -1269,6 +1261,10 @@ config ARCH_MEMORY_PROBE
1269 def_bool X86_64 1261 def_bool X86_64
1270 depends on MEMORY_HOTPLUG 1262 depends on MEMORY_HOTPLUG
1271 1263
1264config ARCH_PROC_KCORE_TEXT
1265 def_bool y
1266 depends on X86_64 && PROC_KCORE
1267
1272config ILLEGAL_POINTER_VALUE 1268config ILLEGAL_POINTER_VALUE
1273 hex 1269 hex
1274 default 0 if X86_32 1270 default 0 if X86_32
@@ -1703,10 +1699,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
1703 def_bool y 1699 def_bool y
1704 depends on MEMORY_HOTPLUG 1700 depends on MEMORY_HOTPLUG
1705 1701
1706config HAVE_ARCH_EARLY_PFN_TO_NID
1707 def_bool X86_64
1708 depends on NUMA
1709
1710config USE_PERCPU_NUMA_NODE_ID 1702config USE_PERCPU_NUMA_NODE_ID
1711 def_bool y 1703 def_bool y
1712 depends on NUMA 1704 depends on NUMA
@@ -1848,7 +1840,7 @@ config APM_ALLOW_INTS
1848 1840
1849endif # APM 1841endif # APM
1850 1842
1851source "arch/x86/kernel/cpu/cpufreq/Kconfig" 1843source "drivers/cpufreq/Kconfig"
1852 1844
1853source "drivers/cpuidle/Kconfig" 1845source "drivers/cpuidle/Kconfig"
1854 1846
@@ -2076,7 +2068,7 @@ config OLPC
2076 depends on !X86_PAE 2068 depends on !X86_PAE
2077 select GPIOLIB 2069 select GPIOLIB
2078 select OF 2070 select OF
2079 select OF_PROMTREE if PROC_DEVICETREE 2071 select OF_PROMTREE
2080 ---help--- 2072 ---help---
2081 Add support for detecting the unique features of the OLPC 2073 Add support for detecting the unique features of the OLPC
2082 XO hardware. 2074 XO hardware.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index d161e939df62..6a7cfdf8ff69 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -1,6 +1,4 @@
1# Put here option for CPU selection and depending optimization 1# Put here option for CPU selection and depending optimization
2if !X86_ELAN
3
4choice 2choice
5 prompt "Processor family" 3 prompt "Processor family"
6 default M686 if X86_32 4 default M686 if X86_32
@@ -203,6 +201,14 @@ config MWINCHIP3D
203 stores for this CPU, which can increase performance of some 201 stores for this CPU, which can increase performance of some
204 operations. 202 operations.
205 203
204config MELAN
205 bool "AMD Elan"
206 depends on X86_32
207 ---help---
208 Select this for an AMD Elan processor.
209
210 Do not use this option for K6/Athlon/Opteron processors!
211
206config MGEODEGX1 212config MGEODEGX1
207 bool "GeodeGX1" 213 bool "GeodeGX1"
208 depends on X86_32 214 depends on X86_32
@@ -292,8 +298,6 @@ config X86_GENERIC
292 This is really intended for distributors who need more 298 This is really intended for distributors who need more
293 generic optimizations. 299 generic optimizations.
294 300
295endif
296
297# 301#
298# Define implied options from the CPU selection here 302# Define implied options from the CPU selection here
299config X86_INTERNODE_CACHE_SHIFT 303config X86_INTERNODE_CACHE_SHIFT
@@ -312,7 +316,7 @@ config X86_L1_CACHE_SHIFT
312 int 316 int
313 default "7" if MPENTIUM4 || MPSC 317 default "7" if MPENTIUM4 || MPSC
314 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU 318 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
315 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 319 default "4" if MELAN || M486 || M386 || MGEODEGX1
316 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 320 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
317 321
318config X86_XADD 322config X86_XADD
@@ -358,7 +362,7 @@ config X86_POPAD_OK
358 362
359config X86_ALIGNMENT_16 363config X86_ALIGNMENT_16
360 def_bool y 364 def_bool y
361 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 365 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
362 366
363config X86_INTEL_USERCOPY 367config X86_INTEL_USERCOPY
364 def_bool y 368 def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 615e18810f48..c0f8a5c88910 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -66,26 +66,6 @@ config DEBUG_STACKOVERFLOW
66 This option will cause messages to be printed if free stack space 66 This option will cause messages to be printed if free stack space
67 drops below a certain limit. 67 drops below a certain limit.
68 68
69config DEBUG_STACK_USAGE
70 bool "Stack utilization instrumentation"
71 depends on DEBUG_KERNEL
72 ---help---
73 Enables the display of the minimum amount of free stack which each
74 task has ever had available in the sysrq-T and sysrq-P debug output.
75
76 This option will slow down process creation somewhat.
77
78config DEBUG_PER_CPU_MAPS
79 bool "Debug access to per_cpu maps"
80 depends on DEBUG_KERNEL
81 depends on SMP
82 ---help---
83 Say Y to verify that the per_cpu map being accessed has
84 been setup. Adds a fair amount of code to kernel memory
85 and decreases performance.
86
87 Say N if unsure.
88
89config X86_PTDUMP 69config X86_PTDUMP
90 bool "Export kernel pagetable layout to userspace via debugfs" 70 bool "Export kernel pagetable layout to userspace via debugfs"
91 depends on DEBUG_KERNEL 71 depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index f2ee1abb1df9..86cee7b749e1 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -37,7 +37,7 @@ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=
37 $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) 37 $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
38 38
39# AMD Elan support 39# AMD Elan support
40cflags-$(CONFIG_X86_ELAN) += -march=i486 40cflags-$(CONFIG_MELAN) += -march=i486
41 41
42# Geode GX1 support 42# Geode GX1 support
43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx 43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index cae3feb1035e..db75d07c3645 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -91,7 +91,7 @@ static int detect_memory_e801(void)
91 if (oreg.ax > 15*1024) { 91 if (oreg.ax > 15*1024) {
92 return -1; /* Bogus! */ 92 return -1; /* Bogus! */
93 } else if (oreg.ax == 15*1024) { 93 } else if (oreg.ax == 15*1024) {
94 boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax; 94 boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
95 } else { 95 } else {
96 /* 96 /*
97 * This ignores memory above 16MB if we have a memory 97 * This ignores memory above 16MB if we have a memory
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 6f9872658dd2..2bf18059fbea 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -10,7 +10,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
10CONFIG_AUDIT=y 10CONFIG_AUDIT=y
11CONFIG_LOG_BUF_SHIFT=18 11CONFIG_LOG_BUF_SHIFT=18
12CONFIG_CGROUPS=y 12CONFIG_CGROUPS=y
13CONFIG_CGROUP_NS=y
14CONFIG_CGROUP_FREEZER=y 13CONFIG_CGROUP_FREEZER=y
15CONFIG_CPUSETS=y 14CONFIG_CPUSETS=y
16CONFIG_CGROUP_CPUACCT=y 15CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index ee01a9d5d4f0..22a0dc8e51dd 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -11,7 +11,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
11CONFIG_AUDIT=y 11CONFIG_AUDIT=y
12CONFIG_LOG_BUF_SHIFT=18 12CONFIG_LOG_BUF_SHIFT=18
13CONFIG_CGROUPS=y 13CONFIG_CGROUPS=y
14CONFIG_CGROUP_NS=y
15CONFIG_CGROUP_FREEZER=y 14CONFIG_CGROUP_FREEZER=y
16CONFIG_CPUSETS=y 15CONFIG_CPUSETS=y
17CONFIG_CGROUP_CPUACCT=y 16CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 1a58ad89fdf7..c04f1b7a9139 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,8 +2,6 @@
2# Arch-specific CryptoAPI modules. 2# Arch-specific CryptoAPI modules.
3# 3#
4 4
5obj-$(CONFIG_CRYPTO_FPU) += fpu.o
6
7obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
8obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
9obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o 7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -24,6 +22,6 @@ aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
24twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o 22twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
25salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 23salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
26 24
27aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o 25aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
28 26
29ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 27ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2577613fb32b..feee8ff1d05e 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -94,6 +94,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
94 const u8 *in, unsigned int len, u8 *iv); 94 const u8 *in, unsigned int len, u8 *iv);
95asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, 95asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
96 const u8 *in, unsigned int len, u8 *iv); 96 const u8 *in, unsigned int len, u8 *iv);
97
98int crypto_fpu_init(void);
99void crypto_fpu_exit(void);
100
97#ifdef CONFIG_X86_64 101#ifdef CONFIG_X86_64
98asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 102asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
99 const u8 *in, unsigned int len, u8 *iv); 103 const u8 *in, unsigned int len, u8 *iv);
@@ -1257,6 +1261,8 @@ static int __init aesni_init(void)
1257 return -ENODEV; 1261 return -ENODEV;
1258 } 1262 }
1259 1263
1264 if ((err = crypto_fpu_init()))
1265 goto fpu_err;
1260 if ((err = crypto_register_alg(&aesni_alg))) 1266 if ((err = crypto_register_alg(&aesni_alg)))
1261 goto aes_err; 1267 goto aes_err;
1262 if ((err = crypto_register_alg(&__aesni_alg))) 1268 if ((err = crypto_register_alg(&__aesni_alg)))
@@ -1334,6 +1340,7 @@ blk_ecb_err:
1334__aes_err: 1340__aes_err:
1335 crypto_unregister_alg(&aesni_alg); 1341 crypto_unregister_alg(&aesni_alg);
1336aes_err: 1342aes_err:
1343fpu_err:
1337 return err; 1344 return err;
1338} 1345}
1339 1346
@@ -1363,6 +1370,8 @@ static void __exit aesni_exit(void)
1363 crypto_unregister_alg(&blk_ecb_alg); 1370 crypto_unregister_alg(&blk_ecb_alg);
1364 crypto_unregister_alg(&__aesni_alg); 1371 crypto_unregister_alg(&__aesni_alg);
1365 crypto_unregister_alg(&aesni_alg); 1372 crypto_unregister_alg(&aesni_alg);
1373
1374 crypto_fpu_exit();
1366} 1375}
1367 1376
1368module_init(aesni_init); 1377module_init(aesni_init);
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index 1a8f8649c035..98d7a188f46b 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -150,18 +150,12 @@ static struct crypto_template crypto_fpu_tmpl = {
150 .module = THIS_MODULE, 150 .module = THIS_MODULE,
151}; 151};
152 152
153static int __init crypto_fpu_module_init(void) 153int __init crypto_fpu_init(void)
154{ 154{
155 return crypto_register_template(&crypto_fpu_tmpl); 155 return crypto_register_template(&crypto_fpu_tmpl);
156} 156}
157 157
158static void __exit crypto_fpu_module_exit(void) 158void __exit crypto_fpu_exit(void)
159{ 159{
160 crypto_unregister_template(&crypto_fpu_tmpl); 160 crypto_unregister_template(&crypto_fpu_tmpl);
161} 161}
162
163module_init(crypto_fpu_module_init);
164module_exit(crypto_fpu_module_exit);
165
166MODULE_LICENSE("GPL");
167MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 849a9d23c71d..95f5826be458 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -848,4 +848,5 @@ ia32_sys_call_table:
848 .quad compat_sys_open_by_handle_at 848 .quad compat_sys_open_by_handle_at
849 .quad compat_sys_clock_adjtime 849 .quad compat_sys_clock_adjtime
850 .quad sys_syncfs 850 .quad sys_syncfs
851 .quad compat_sys_sendmmsg /* 345 */
851ia32_syscall_end: 852ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 12e0e7dd869c..416d865eae39 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -183,8 +183,6 @@ static inline void disable_acpi(void) { }
183 183
184#define ARCH_HAS_POWER_INIT 1 184#define ARCH_HAS_POWER_INIT 1
185 185
186struct bootnode;
187
188#ifdef CONFIG_ACPI_NUMA 186#ifdef CONFIG_ACPI_NUMA
189extern int acpi_numa; 187extern int acpi_numa;
190extern int x86_acpi_numa_init(void); 188extern int x86_acpi_numa_init(void);
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index a63a68be1cce..94d420b360d1 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -15,4 +15,13 @@
15 .endm 15 .endm
16#endif 16#endif
17 17
18.macro altinstruction_entry orig alt feature orig_len alt_len
19 .align 8
20 .quad \orig
21 .quad \alt
22 .word \feature
23 .byte \orig_len
24 .byte \alt_len
25.endm
26
18#endif /* __ASSEMBLY__ */ 27#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 13009d1af99a..bf535f947e8c 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -4,7 +4,6 @@
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/stddef.h> 5#include <linux/stddef.h>
6#include <linux/stringify.h> 6#include <linux/stringify.h>
7#include <linux/jump_label.h>
8#include <asm/asm.h> 7#include <asm/asm.h>
9 8
10/* 9/*
@@ -191,12 +190,4 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
191extern void *text_poke_smp(void *addr, const void *opcode, size_t len); 190extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
192extern void text_poke_smp_batch(struct text_poke_param *params, int n); 191extern void text_poke_smp_batch(struct text_poke_param *params, int n);
193 192
194#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
195#define IDEAL_NOP_SIZE_5 5
196extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
197extern void arch_init_ideal_nop5(void);
198#else
199static inline void arch_init_ideal_nop5(void) {}
200#endif
201
202#endif /* _ASM_X86_ALTERNATIVE_H */ 193#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index 916bc8111a01..55d95eb789b3 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -19,13 +19,12 @@
19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H 19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
20#define _ASM_X86_AMD_IOMMU_PROTO_H 20#define _ASM_X86_AMD_IOMMU_PROTO_H
21 21
22struct amd_iommu; 22#include <asm/amd_iommu_types.h>
23 23
24extern int amd_iommu_init_dma_ops(void); 24extern int amd_iommu_init_dma_ops(void);
25extern int amd_iommu_init_passthrough(void); 25extern int amd_iommu_init_passthrough(void);
26extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
26extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 27extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
27extern void amd_iommu_flush_all_domains(void);
28extern void amd_iommu_flush_all_devices(void);
29extern void amd_iommu_apply_erratum_63(u16 devid); 28extern void amd_iommu_apply_erratum_63(u16 devid);
30extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); 29extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
31extern int amd_iommu_init_devices(void); 30extern int amd_iommu_init_devices(void);
@@ -44,4 +43,12 @@ static inline bool is_rd890_iommu(struct pci_dev *pdev)
44 (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); 43 (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
45} 44}
46 45
46static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
47{
48 if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
49 return false;
50
51 return !!(iommu->features & f);
52}
53
47#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ 54#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
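The new iommu_feature() helper returns false unless the IOMMU exposes the extended feature register (IOMMU_CAP_EFR) and the requested bit is set in iommu->features. A minimal sketch of a caller, assuming the FEATURE_IA (invalidate-all) bit added below in amd_iommu_types.h; the function name is illustrative only, not part of this patch:

	/* Illustrative only: gate an optional flush on an extended feature. */
	static void example_flush_all(struct amd_iommu *iommu)
	{
		if (!iommu_feature(iommu, FEATURE_IA))
			return;		/* no EFR, or INVALIDATE_ALL unsupported */

		/* ... build and queue a CMD_INV_ALL command here ... */
	}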
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index e3509fc303bf..4c9982995414 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -68,12 +68,25 @@
68#define MMIO_CONTROL_OFFSET 0x0018 68#define MMIO_CONTROL_OFFSET 0x0018
69#define MMIO_EXCL_BASE_OFFSET 0x0020 69#define MMIO_EXCL_BASE_OFFSET 0x0020
70#define MMIO_EXCL_LIMIT_OFFSET 0x0028 70#define MMIO_EXCL_LIMIT_OFFSET 0x0028
71#define MMIO_EXT_FEATURES 0x0030
71#define MMIO_CMD_HEAD_OFFSET 0x2000 72#define MMIO_CMD_HEAD_OFFSET 0x2000
72#define MMIO_CMD_TAIL_OFFSET 0x2008 73#define MMIO_CMD_TAIL_OFFSET 0x2008
73#define MMIO_EVT_HEAD_OFFSET 0x2010 74#define MMIO_EVT_HEAD_OFFSET 0x2010
74#define MMIO_EVT_TAIL_OFFSET 0x2018 75#define MMIO_EVT_TAIL_OFFSET 0x2018
75#define MMIO_STATUS_OFFSET 0x2020 76#define MMIO_STATUS_OFFSET 0x2020
76 77
78
79/* Extended Feature Bits */
80#define FEATURE_PREFETCH (1ULL<<0)
81#define FEATURE_PPR (1ULL<<1)
82#define FEATURE_X2APIC (1ULL<<2)
83#define FEATURE_NX (1ULL<<3)
84#define FEATURE_GT (1ULL<<4)
85#define FEATURE_IA (1ULL<<6)
86#define FEATURE_GA (1ULL<<7)
87#define FEATURE_HE (1ULL<<8)
88#define FEATURE_PC (1ULL<<9)
89
77/* MMIO status bits */ 90/* MMIO status bits */
78#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 91#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
79 92
@@ -113,7 +126,9 @@
113/* command specific defines */ 126/* command specific defines */
114#define CMD_COMPL_WAIT 0x01 127#define CMD_COMPL_WAIT 0x01
115#define CMD_INV_DEV_ENTRY 0x02 128#define CMD_INV_DEV_ENTRY 0x02
116#define CMD_INV_IOMMU_PAGES 0x03 129#define CMD_INV_IOMMU_PAGES 0x03
130#define CMD_INV_IOTLB_PAGES 0x04
131#define CMD_INV_ALL 0x08
117 132
118#define CMD_COMPL_WAIT_STORE_MASK 0x01 133#define CMD_COMPL_WAIT_STORE_MASK 0x01
119#define CMD_COMPL_WAIT_INT_MASK 0x02 134#define CMD_COMPL_WAIT_INT_MASK 0x02
@@ -215,6 +230,8 @@
215#define IOMMU_PTE_IR (1ULL << 61) 230#define IOMMU_PTE_IR (1ULL << 61)
216#define IOMMU_PTE_IW (1ULL << 62) 231#define IOMMU_PTE_IW (1ULL << 62)
217 232
233#define DTE_FLAG_IOTLB 0x01
234
218#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) 235#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
219#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) 236#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
220#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) 237#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
@@ -227,6 +244,7 @@
227/* IOMMU capabilities */ 244/* IOMMU capabilities */
228#define IOMMU_CAP_IOTLB 24 245#define IOMMU_CAP_IOTLB 24
229#define IOMMU_CAP_NPCACHE 26 246#define IOMMU_CAP_NPCACHE 26
247#define IOMMU_CAP_EFR 27
230 248
231#define MAX_DOMAIN_ID 65536 249#define MAX_DOMAIN_ID 65536
232 250
@@ -249,6 +267,8 @@ extern bool amd_iommu_dump;
249 267
250/* global flag if IOMMUs cache non-present entries */ 268/* global flag if IOMMUs cache non-present entries */
251extern bool amd_iommu_np_cache; 269extern bool amd_iommu_np_cache;
270/* Only true if all IOMMUs support device IOTLBs */
271extern bool amd_iommu_iotlb_sup;
252 272
253/* 273/*
254 * Make iterating over all IOMMUs easier 274 * Make iterating over all IOMMUs easier
@@ -371,6 +391,9 @@ struct amd_iommu {
371 /* flags read from acpi table */ 391 /* flags read from acpi table */
372 u8 acpi_flags; 392 u8 acpi_flags;
373 393
394 /* Extended features */
395 u64 features;
396
374 /* 397 /*
375 * Capability pointer. There could be more than one IOMMU per PCI 398 * Capability pointer. There could be more than one IOMMU per PCI
376 * device function if there are more than one AMD IOMMU capability 399 * device function if there are more than one AMD IOMMU capability
@@ -409,9 +432,6 @@ struct amd_iommu {
409 /* if one, we need to send a completion wait command */ 432 /* if one, we need to send a completion wait command */
410 bool need_sync; 433 bool need_sync;
411 434
412 /* becomes true if a command buffer reset is running */
413 bool reset_in_progress;
414
415 /* default dma_ops domain for that IOMMU */ 435 /* default dma_ops domain for that IOMMU */
416 struct dma_ops_domain *default_dom; 436 struct dma_ops_domain *default_dom;
417 437
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 331682231bb4..67f87f257611 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -11,7 +11,6 @@ struct amd_nb_bus_dev_range {
11 11
12extern const struct pci_device_id amd_nb_misc_ids[]; 12extern const struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; 13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
14struct bootnode;
15 14
16extern bool early_is_amd_nb(u32 value); 15extern bool early_is_amd_nb(u32 value);
17extern int amd_cache_northbridges(void); 16extern int amd_cache_northbridges(void);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 2b7d573be549..4a0b7c7e2cce 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -363,7 +363,12 @@ struct apic {
363 */ 363 */
364 int (*x86_32_early_logical_apicid)(int cpu); 364 int (*x86_32_early_logical_apicid)(int cpu);
365 365
366 /* determine CPU -> NUMA node mapping */ 366 /*
367 * Optional method called from setup_local_APIC() after logical
368 * apicid is guaranteed to be known to initialize apicid -> node
369 * mapping if NUMA initialization hasn't done so already. Don't
370 * add new users.
371 */
367 int (*x86_32_numa_cpu_node)(int cpu); 372 int (*x86_32_numa_cpu_node)(int cpu);
368#endif 373#endif
369}; 374};
@@ -376,6 +381,26 @@ struct apic {
376extern struct apic *apic; 381extern struct apic *apic;
377 382
378/* 383/*
384 * APIC drivers are probed based on how they are listed in the .apicdrivers
385 * section. So the order is important and enforced by the ordering
386 * of different apic driver files in the Makefile.
387 *
388 * For the files having two apic drivers, we use apic_drivers()
389 * to enforce the order with in them.
390 */
391#define apic_driver(sym) \
392 static struct apic *__apicdrivers_##sym __used \
393 __aligned(sizeof(struct apic *)) \
394 __section(.apicdrivers) = { &sym }
395
396#define apic_drivers(sym1, sym2) \
397 static struct apic *__apicdrivers_##sym1##sym2[2] __used \
398 __aligned(sizeof(struct apic *)) \
399 __section(.apicdrivers) = { &sym1, &sym2 }
400
401extern struct apic *__apicdrivers[], *__apicdrivers_end[];
402
403/*
379 * APIC functionality to boot other CPUs - only used on SMP: 404 * APIC functionality to boot other CPUs - only used on SMP:
380 */ 405 */
381#ifdef CONFIG_SMP 406#ifdef CONFIG_SMP
@@ -453,15 +478,10 @@ static inline unsigned default_get_apic_id(unsigned long x)
453#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469 478#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469
454 479
455#ifdef CONFIG_X86_64 480#ifdef CONFIG_X86_64
456extern struct apic apic_flat;
457extern struct apic apic_physflat;
458extern struct apic apic_x2apic_cluster;
459extern struct apic apic_x2apic_phys;
460extern int default_acpi_madt_oem_check(char *, char *); 481extern int default_acpi_madt_oem_check(char *, char *);
461 482
462extern void apic_send_IPI_self(int vector); 483extern void apic_send_IPI_self(int vector);
463 484
464extern struct apic apic_x2apic_uv_x;
465DECLARE_PER_CPU(int, x2apic_extra_bits); 485DECLARE_PER_CPU(int, x2apic_extra_bits);
466 486
467extern int default_cpu_present_to_apicid(int mps_cpu); 487extern int default_cpu_present_to_apicid(int mps_cpu);
@@ -475,7 +495,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert)
475 return; 495 return;
476} 496}
477 497
478extern void generic_bigsmp_probe(void); 498extern struct apic *generic_bigsmp_probe(void);
479 499
480 500
481#ifdef CONFIG_X86_LOCAL_APIC 501#ifdef CONFIG_X86_LOCAL_APIC
@@ -511,8 +531,6 @@ extern struct apic apic_noop;
511 531
512#ifdef CONFIG_X86_32 532#ifdef CONFIG_X86_32
513 533
514extern struct apic apic_default;
515
516static inline int noop_x86_32_early_logical_apicid(int cpu) 534static inline int noop_x86_32_early_logical_apicid(int cpu)
517{ 535{
518 return BAD_APICID; 536 return BAD_APICID;
@@ -537,8 +555,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
537 return cpuid_apic >> index_msb; 555 return cpuid_apic >> index_msb;
538} 556}
539 557
540extern int default_x86_32_numa_cpu_node(int cpu);
541
542#endif 558#endif
543 559
544static inline unsigned int 560static inline unsigned int
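The apic_driver()/apic_drivers() macros drop a pointer to each driver into the new .apicdrivers section, so probe order is simply link order (hence the Makefile ordering mentioned in the comment above). A hedged sketch of how a driver file would register itself; the driver name is made up for illustration:

	/* Illustrative only: register one APIC driver. */
	static struct apic apic_example = {
		.name	= "example",
		/* ... the usual callbacks ... */
	};

	/* Places &apic_example into .apicdrivers; the probe code walks
	 * __apicdrivers[] .. __apicdrivers_end[] in link order. */
	apic_driver(apic_example);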
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index d87988bacf3e..34595d5e1038 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -78,6 +78,7 @@
78#define APIC_DEST_LOGICAL 0x00800 78#define APIC_DEST_LOGICAL 0x00800
79#define APIC_DEST_PHYSICAL 0x00000 79#define APIC_DEST_PHYSICAL 0x00000
80#define APIC_DM_FIXED 0x00000 80#define APIC_DM_FIXED 0x00000
81#define APIC_DM_FIXED_MASK 0x00700
81#define APIC_DM_LOWEST 0x00100 82#define APIC_DM_LOWEST 0x00100
82#define APIC_DM_SMI 0x00200 83#define APIC_DM_SMI 0x00200
83#define APIC_DM_REMRD 0x00300 84#define APIC_DM_REMRD 0x00300
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
index 3c7521063d3f..aa6a3170ab5a 100644
--- a/arch/x86/include/asm/bios_ebda.h
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -4,16 +4,40 @@
4#include <asm/io.h> 4#include <asm/io.h>
5 5
6/* 6/*
7 * there is a real-mode segmented pointer pointing to the 7 * Returns physical address of EBDA. Returns 0 if there is no EBDA.
8 * 4K EBDA area at 0x40E.
9 */ 8 */
10static inline unsigned int get_bios_ebda(void) 9static inline unsigned int get_bios_ebda(void)
11{ 10{
11 /*
12 * There is a real-mode segmented pointer pointing to the
13 * 4K EBDA area at 0x40E.
14 */
12 unsigned int address = *(unsigned short *)phys_to_virt(0x40E); 15 unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
13 address <<= 4; 16 address <<= 4;
14 return address; /* 0 means none */ 17 return address; /* 0 means none */
15} 18}
16 19
20/*
21 * Return the sanitized length of the EBDA in bytes, if it exists.
22 */
23static inline unsigned int get_bios_ebda_length(void)
24{
25 unsigned int address;
26 unsigned int length;
27
28 address = get_bios_ebda();
29 if (!address)
30 return 0;
31
32 /* EBDA length is byte 0 of the EBDA (stored in KiB) */
33 length = *(unsigned char *)phys_to_virt(address);
34 length <<= 10;
35
36 /* Trim the length if it extends beyond 640KiB */
37 length = min_t(unsigned int, (640 * 1024) - address, length);
38 return length;
39}
40
17void reserve_ebda_region(void); 41void reserve_ebda_region(void);
18 42
19#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION 43#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
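get_bios_ebda_length() reads the size byte at the start of the EBDA (stored in KiB) and clamps it so the area never appears to run past 640KiB. A small illustrative caller, not part of this patch:

	/* Sketch: report the EBDA location and sanitized length. */
	static void __init example_print_ebda(void)
	{
		unsigned int start = get_bios_ebda();
		unsigned int len = get_bios_ebda_length();

		if (start)
			pr_info("EBDA at 0x%x, %u bytes\n", start, len);
	}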
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 91f3e087cf21..5dc6acc98dbd 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -195,6 +195,8 @@
195 195
196/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ 196/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
197#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ 197#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
198#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
199#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
198 200
199#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 201#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
200 202
@@ -207,8 +209,7 @@ extern const char * const x86_power_flags[32];
207#define test_cpu_cap(c, bit) \ 209#define test_cpu_cap(c, bit) \
208 test_bit(bit, (unsigned long *)((c)->x86_capability)) 210 test_bit(bit, (unsigned long *)((c)->x86_capability))
209 211
210#define cpu_has(c, bit) \ 212#define REQUIRED_MASK_BIT_SET(bit) \
211 (__builtin_constant_p(bit) && \
212 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ 213 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
213 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \ 214 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
214 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \ 215 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
@@ -218,10 +219,16 @@ extern const char * const x86_power_flags[32];
218 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ 219 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
219 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ 220 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
220 (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ 221 (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
221 (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ 222 (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
222 ? 1 : \ 223
224#define cpu_has(c, bit) \
225 (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
223 test_cpu_cap(c, bit)) 226 test_cpu_cap(c, bit))
224 227
228#define this_cpu_has(bit) \
229 (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
230 x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
231
225#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) 232#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
226 233
227#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) 234#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
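this_cpu_has() mirrors cpu_has() but tests the capability word of the CPU currently executing, via a per-cpu bit test on cpu_info, so callers no longer need a struct cpuinfo_x86 pointer. An illustrative use (the surrounding function is hypothetical):

	/* Sketch: branch on a feature of the running CPU. */
	static void example_check_arat(void)
	{
		if (this_cpu_has(X86_FEATURE_ARAT))
			pr_debug("APIC timer keeps running in deep C-states\n");
	}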
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index 057099e5faba..0bdb0c54d9a1 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -69,22 +69,18 @@
69 69
70#define MAX_DMA_CHANNELS 8 70#define MAX_DMA_CHANNELS 8
71 71
72#ifdef CONFIG_X86_32
73
74/* The maximum address that we can perform a DMA transfer to on this platform */
75#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
76
77#else
78
79/* 16MB ISA DMA zone */ 72/* 16MB ISA DMA zone */
80#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) 73#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
81 74
82/* 4GB broken PCI/AGP hardware bus master zone */ 75/* 4GB broken PCI/AGP hardware bus master zone */
83#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) 76#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
84 77
78#ifdef CONFIG_X86_32
79/* The maximum address that we can perform a DMA transfer to on this platform */
80#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
81#else
85/* Compat define for old dma zone */ 82/* Compat define for old dma zone */
86#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT)) 83#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
87
88#endif 84#endif
89 85
90/* 8237 DMA controllers */ 86/* 8237 DMA controllers */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8e4a16508d4e..7093e4a6a0bc 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
90#endif /* CONFIG_X86_32 */ 90#endif /* CONFIG_X86_32 */
91 91
92extern int add_efi_memmap; 92extern int add_efi_memmap;
93extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
93extern void efi_memblock_x86_reserve_range(void); 94extern void efi_memblock_x86_reserve_range(void);
94extern void efi_call_phys_prelog(void); 95extern void efi_call_phys_prelog(void);
95extern void efi_call_phys_epilog(void); 96extern void efi_call_phys_epilog(void);
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index db24c2278be0..268c783ab1c0 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -38,11 +38,10 @@ extern void mcount(void);
38static inline unsigned long ftrace_call_adjust(unsigned long addr) 38static inline unsigned long ftrace_call_adjust(unsigned long addr)
39{ 39{
40 /* 40 /*
41 * call mcount is "e8 <4 byte offset>" 41 * addr is the address of the mcount call instruction.
42 * The addr points to the 4 byte offset and the caller of this 42 * recordmcount does the necessary offset calculation.
43 * function wants the pointer to e8. Simply subtract one.
44 */ 43 */
45 return addr - 1; 44 return addr;
46} 45}
47 46
48#ifdef CONFIG_DYNAMIC_FTRACE 47#ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 43085bfc99c3..156cd5d18d2a 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -66,7 +66,7 @@ static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
66 * Don't enable translation but enable GART IO and CPU accesses. 66 * Don't enable translation but enable GART IO and CPU accesses.
67 * Also, set DISTLBWALKPRB since GART tables memory is UC. 67 * Also, set DISTLBWALKPRB since GART tables memory is UC.
68 */ 68 */
69 ctl = DISTLBWALKPRB | order << 1; 69 ctl = order << 1;
70 70
71 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); 71 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
72} 72}
@@ -75,17 +75,17 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
75{ 75{
76 u32 tmp, ctl; 76 u32 tmp, ctl;
77 77
78 /* address of the mappings table */ 78 /* address of the mappings table */
79 addr >>= 12; 79 addr >>= 12;
80 tmp = (u32) addr<<4; 80 tmp = (u32) addr<<4;
81 tmp &= ~0xf; 81 tmp &= ~0xf;
82 pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); 82 pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
83 83
84 /* Enable GART translation for this hammer. */ 84 /* Enable GART translation for this hammer. */
85 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 85 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
86 ctl |= GARTEN; 86 ctl |= GARTEN | DISTLBWALKPRB;
87 ctl &= ~(DISGARTCPU | DISGARTIO); 87 ctl &= ~(DISGARTCPU | DISGARTIO);
88 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); 88 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
89} 89}
90 90
91static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) 91static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index fc1f579fb965..65aaa91d5850 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -6,6 +6,8 @@
6#define PIT_CH0 0x40 6#define PIT_CH0 0x40
7#define PIT_CH2 0x42 7#define PIT_CH2 0x42
8 8
9#define PIT_LATCH LATCH
10
9extern raw_spinlock_t i8253_lock; 11extern raw_spinlock_t i8253_lock;
10 12
11extern struct clock_event_device *global_clock_event; 13extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 072273082528..d02804d650c4 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -38,7 +38,6 @@
38 38
39#include <linux/string.h> 39#include <linux/string.h>
40#include <linux/compiler.h> 40#include <linux/compiler.h>
41#include <asm-generic/int-ll64.h>
42#include <asm/page.h> 41#include <asm/page.h>
43 42
44#include <xen/xen.h> 43#include <xen/xen.h>
@@ -87,27 +86,6 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
87build_mmio_read(readq, "q", unsigned long, "=r", :"memory") 86build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
88build_mmio_write(writeq, "q", unsigned long, "r", :"memory") 87build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
89 88
90#else
91
92static inline __u64 readq(const volatile void __iomem *addr)
93{
94 const volatile u32 __iomem *p = addr;
95 u32 low, high;
96
97 low = readl(p);
98 high = readl(p + 1);
99
100 return low + ((u64)high << 32);
101}
102
103static inline void writeq(__u64 val, volatile void __iomem *addr)
104{
105 writel(val, addr);
106 writel(val >> 32, addr+4);
107}
108
109#endif
110
111#define readq_relaxed(a) readq(a) 89#define readq_relaxed(a) readq(a)
112 90
113#define __raw_readq(a) readq(a) 91#define __raw_readq(a) readq(a)
@@ -117,6 +95,8 @@ static inline void writeq(__u64 val, volatile void __iomem *addr)
117#define readq readq 95#define readq readq
118#define writeq writeq 96#define writeq writeq
119 97
98#endif
99
120/** 100/**
121 * virt_to_phys - map virtual addresses to physical 101 * virt_to_phys - map virtual addresses to physical
122 * @address: address to remap 102 * @address: address to remap
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index c4bd267dfc50..690d1cc9a877 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -105,12 +105,12 @@ struct IR_IO_APIC_route_entry {
105 * # of IO-APICs and # of IRQ routing registers 105 * # of IO-APICs and # of IRQ routing registers
106 */ 106 */
107extern int nr_ioapics; 107extern int nr_ioapics;
108extern int nr_ioapic_registers[MAX_IO_APICS];
109 108
110#define MP_MAX_IOAPIC_PIN 127 109extern int mpc_ioapic_id(int ioapic);
110extern unsigned int mpc_ioapic_addr(int ioapic);
111extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic);
111 112
112/* I/O APIC entries */ 113#define MP_MAX_IOAPIC_PIN 127
113extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
114 114
115/* # of MP IRQ source entries */ 115/* # of MP IRQ source entries */
116extern int mp_irq_entries; 116extern int mp_irq_entries;
@@ -150,13 +150,11 @@ void setup_IO_APIC_irq_extra(u32 gsi);
150extern void ioapic_and_gsi_init(void); 150extern void ioapic_and_gsi_init(void);
151extern void ioapic_insert_resources(void); 151extern void ioapic_insert_resources(void);
152 152
153int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr); 153int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
154 154
155extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 155extern int save_ioapic_entries(void);
156extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 156extern void mask_ioapic_entries(void);
157extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 157extern int restore_ioapic_entries(void);
158extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
159extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
160 158
161extern int get_nr_irqs_gsi(void); 159extern int get_nr_irqs_gsi(void);
162 160
@@ -192,19 +190,13 @@ struct io_apic_irq_attr;
192static inline int io_apic_set_pci_routing(struct device *dev, int irq, 190static inline int io_apic_set_pci_routing(struct device *dev, int irq,
193 struct io_apic_irq_attr *irq_attr) { return 0; } 191 struct io_apic_irq_attr *irq_attr) { return 0; }
194 192
195static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void) 193static inline int save_ioapic_entries(void)
196{
197 return NULL;
198}
199
200static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { }
201static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent)
202{ 194{
203 return -ENOMEM; 195 return -ENOMEM;
204} 196}
205 197
206static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { } 198static inline void mask_ioapic_entries(void) { }
207static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent) 199static inline int restore_ioapic_entries(void)
208{ 200{
209 return -ENOMEM; 201 return -ENOMEM;
210} 202}
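
The io_apic.h hunks above replace the caller-managed entry buffer (alloc_ioapic_entries() and friends) with save_ioapic_entries(), mask_ioapic_entries() and restore_ioapic_entries(), which keep the saved routing entries inside the IO-APIC code. A hypothetical caller sketch (names invented, not from this patch) showing the new shape of a suspend-style path:

static int example_ioapic_quiesce(void)
{
        int ret;

        ret = save_ioapic_entries();    /* snapshot every RTE internally */
        if (ret)
                return ret;             /* non-zero on failure */

        mask_ioapic_entries();          /* mask all pins */
        return 0;
}

static void example_ioapic_resume(void)
{
        restore_ioapic_entries();       /* write the snapshot back */
}
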
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 574dbc22893a..a32b18ce6ead 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -5,20 +5,25 @@
5 5
6#include <linux/types.h> 6#include <linux/types.h>
7#include <asm/nops.h> 7#include <asm/nops.h>
8#include <asm/asm.h>
8 9
9#define JUMP_LABEL_NOP_SIZE 5 10#define JUMP_LABEL_NOP_SIZE 5
10 11
11# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" 12#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
12 13
13# define JUMP_LABEL(key, label) \ 14static __always_inline bool arch_static_branch(struct jump_label_key *key)
14 do { \ 15{
15 asm goto("1:" \ 16 asm goto("1:"
16 JUMP_LABEL_INITIAL_NOP \ 17 JUMP_LABEL_INITIAL_NOP
17 ".pushsection __jump_table, \"aw\" \n\t"\ 18 ".pushsection __jump_table, \"aw\" \n\t"
18 _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ 19 _ASM_ALIGN "\n\t"
19 ".popsection \n\t" \ 20 _ASM_PTR "1b, %l[l_yes], %c0 \n\t"
20 : : "i" (key) : : label); \ 21 ".popsection \n\t"
21 } while (0) 22 : : "i" (key) : : l_yes);
23 return false;
24l_yes:
25 return true;
26}
22 27
23#endif /* __KERNEL__ */ 28#endif /* __KERNEL__ */
24 29
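
With the jump_label.h change above, the old statement-style JUMP_LABEL() macro becomes arch_static_branch(), an expression that returns false through the 5-byte NOP fast path and true through the patched jump. A usage sketch under the generic static_branch() wrapper of this era (the key and the slow-path function are made up here):

#include <linux/jump_label.h>

extern void do_rare_debug_work(void);   /* hypothetical slow path */

struct jump_label_key example_key;      /* starts disabled, branch is false */

void example_hot_path(void)
{
        if (static_branch(&example_key))        /* 5-byte NOP until the key is enabled */
                do_rare_debug_work();           /* patched to a relative jump when enabled */
}
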
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 396f5b5fc4d7..77e95f54570a 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -77,6 +77,7 @@ static inline void arch_kgdb_breakpoint(void)
77} 77}
78#define BREAK_INSTR_SIZE 1 78#define BREAK_INSTR_SIZE 1
79#define CACHE_FLUSH_IS_SAFE 1 79#define CACHE_FLUSH_IS_SAFE 1
80#define GDB_ADJUSTS_BREAK_OFFSET
80 81
81extern int kgdb_ll_trap(int cmd, const char *str, 82extern int kgdb_ll_trap(int cmd, const char *str,
82 struct pt_regs *regs, long err, int trap, int sig); 83 struct pt_regs *regs, long err, int trap, int sig);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0f5213564326..0049211959c0 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -14,6 +14,8 @@
14#include <asm/desc_defs.h> 14#include <asm/desc_defs.h>
15 15
16struct x86_emulate_ctxt; 16struct x86_emulate_ctxt;
17enum x86_intercept;
18enum x86_intercept_stage;
17 19
18struct x86_exception { 20struct x86_exception {
19 u8 vector; 21 u8 vector;
@@ -24,6 +26,24 @@ struct x86_exception {
24}; 26};
25 27
26/* 28/*
29 * This struct is used to carry enough information from the instruction
30 * decoder to main KVM so that a decision can be made whether the
31 * instruction needs to be intercepted or not.
32 */
33struct x86_instruction_info {
34 u8 intercept; /* which intercept */
35 u8 rep_prefix; /* rep prefix? */
36 u8 modrm_mod; /* mod part of modrm */
37 u8 modrm_reg; /* index of register used */
38 u8 modrm_rm; /* rm part of modrm */
39 u64 src_val; /* value of source operand */
40 u8 src_bytes; /* size of source operand */
41 u8 dst_bytes; /* size of destination operand */
42 u8 ad_bytes; /* size of src/dst address */
43 u64 next_rip; /* rip following the instruction */
44};
45
46/*
27 * x86_emulate_ops: 47 * x86_emulate_ops:
28 * 48 *
29 * These operations represent the instruction emulator's interface to memory. 49 * These operations represent the instruction emulator's interface to memory.
@@ -62,6 +82,7 @@ struct x86_exception {
62#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ 82#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
63#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ 83#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
64#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ 84#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
85#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
65 86
66struct x86_emulate_ops { 87struct x86_emulate_ops {
67 /* 88 /*
@@ -71,8 +92,9 @@ struct x86_emulate_ops {
71 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 92 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
72 * @bytes: [IN ] Number of bytes to read from memory. 93 * @bytes: [IN ] Number of bytes to read from memory.
73 */ 94 */
74 int (*read_std)(unsigned long addr, void *val, 95 int (*read_std)(struct x86_emulate_ctxt *ctxt,
75 unsigned int bytes, struct kvm_vcpu *vcpu, 96 unsigned long addr, void *val,
97 unsigned int bytes,
76 struct x86_exception *fault); 98 struct x86_exception *fault);
77 99
78 /* 100 /*
@@ -82,8 +104,8 @@ struct x86_emulate_ops {
82 * @val: [OUT] Value write to memory, zero-extended to 'u_long'. 104 * @val: [OUT] Value write to memory, zero-extended to 'u_long'.
83 * @bytes: [IN ] Number of bytes to write to memory. 105 * @bytes: [IN ] Number of bytes to write to memory.
84 */ 106 */
85 int (*write_std)(unsigned long addr, void *val, 107 int (*write_std)(struct x86_emulate_ctxt *ctxt,
86 unsigned int bytes, struct kvm_vcpu *vcpu, 108 unsigned long addr, void *val, unsigned int bytes,
87 struct x86_exception *fault); 109 struct x86_exception *fault);
88 /* 110 /*
89 * fetch: Read bytes of standard (non-emulated/special) memory. 111 * fetch: Read bytes of standard (non-emulated/special) memory.
@@ -92,8 +114,8 @@ struct x86_emulate_ops {
92 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 114 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
93 * @bytes: [IN ] Number of bytes to read from memory. 115 * @bytes: [IN ] Number of bytes to read from memory.
94 */ 116 */
95 int (*fetch)(unsigned long addr, void *val, 117 int (*fetch)(struct x86_emulate_ctxt *ctxt,
96 unsigned int bytes, struct kvm_vcpu *vcpu, 118 unsigned long addr, void *val, unsigned int bytes,
97 struct x86_exception *fault); 119 struct x86_exception *fault);
98 120
99 /* 121 /*
@@ -102,11 +124,9 @@ struct x86_emulate_ops {
102 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 124 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
103 * @bytes: [IN ] Number of bytes to read from memory. 125 * @bytes: [IN ] Number of bytes to read from memory.
104 */ 126 */
105 int (*read_emulated)(unsigned long addr, 127 int (*read_emulated)(struct x86_emulate_ctxt *ctxt,
106 void *val, 128 unsigned long addr, void *val, unsigned int bytes,
107 unsigned int bytes, 129 struct x86_exception *fault);
108 struct x86_exception *fault,
109 struct kvm_vcpu *vcpu);
110 130
111 /* 131 /*
112 * write_emulated: Write bytes to emulated/special memory area. 132 * write_emulated: Write bytes to emulated/special memory area.
@@ -115,11 +135,10 @@ struct x86_emulate_ops {
115 * required). 135 * required).
116 * @bytes: [IN ] Number of bytes to write to memory. 136 * @bytes: [IN ] Number of bytes to write to memory.
117 */ 137 */
118 int (*write_emulated)(unsigned long addr, 138 int (*write_emulated)(struct x86_emulate_ctxt *ctxt,
119 const void *val, 139 unsigned long addr, const void *val,
120 unsigned int bytes, 140 unsigned int bytes,
121 struct x86_exception *fault, 141 struct x86_exception *fault);
122 struct kvm_vcpu *vcpu);
123 142
124 /* 143 /*
125 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an 144 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
@@ -129,40 +148,54 @@ struct x86_emulate_ops {
129 * @new: [IN ] Value to write to @addr. 148 * @new: [IN ] Value to write to @addr.
130 * @bytes: [IN ] Number of bytes to access using CMPXCHG. 149 * @bytes: [IN ] Number of bytes to access using CMPXCHG.
131 */ 150 */
132 int (*cmpxchg_emulated)(unsigned long addr, 151 int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt,
152 unsigned long addr,
133 const void *old, 153 const void *old,
134 const void *new, 154 const void *new,
135 unsigned int bytes, 155 unsigned int bytes,
136 struct x86_exception *fault, 156 struct x86_exception *fault);
137 struct kvm_vcpu *vcpu); 157 void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr);
138 158
139 int (*pio_in_emulated)(int size, unsigned short port, void *val, 159 int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt,
140 unsigned int count, struct kvm_vcpu *vcpu); 160 int size, unsigned short port, void *val,
141 161 unsigned int count);
142 int (*pio_out_emulated)(int size, unsigned short port, const void *val, 162
143 unsigned int count, struct kvm_vcpu *vcpu); 163 int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt,
144 164 int size, unsigned short port, const void *val,
145 bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3, 165 unsigned int count);
146 int seg, struct kvm_vcpu *vcpu); 166
147 void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3, 167 bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector,
148 int seg, struct kvm_vcpu *vcpu); 168 struct desc_struct *desc, u32 *base3, int seg);
149 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 169 void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector,
150 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 170 struct desc_struct *desc, u32 base3, int seg);
151 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); 171 unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt,
152 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 172 int seg);
153 void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 173 void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
154 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); 174 void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
155 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 175 void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
156 int (*cpl)(struct kvm_vcpu *vcpu); 176 void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
157 int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); 177 ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
158 int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); 178 int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
159 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 179 int (*cpl)(struct x86_emulate_ctxt *ctxt);
160 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 180 int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
181 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
182 int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
183 int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
184 void (*halt)(struct x86_emulate_ctxt *ctxt);
185 void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
186 int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
187 void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */
188 void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */
189 int (*intercept)(struct x86_emulate_ctxt *ctxt,
190 struct x86_instruction_info *info,
191 enum x86_intercept_stage stage);
161}; 192};
162 193
194typedef u32 __attribute__((vector_size(16))) sse128_t;
195
163/* Type, address-of, and value of an instruction's operand. */ 196/* Type, address-of, and value of an instruction's operand. */
164struct operand { 197struct operand {
165 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; 198 enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type;
166 unsigned int bytes; 199 unsigned int bytes;
167 union { 200 union {
168 unsigned long orig_val; 201 unsigned long orig_val;
@@ -174,11 +207,13 @@ struct operand {
174 ulong ea; 207 ulong ea;
175 unsigned seg; 208 unsigned seg;
176 } mem; 209 } mem;
210 unsigned xmm;
177 } addr; 211 } addr;
178 union { 212 union {
179 unsigned long val; 213 unsigned long val;
180 u64 val64; 214 u64 val64;
181 char valptr[sizeof(unsigned long) + 2]; 215 char valptr[sizeof(unsigned long) + 2];
216 sse128_t vec_val;
182 }; 217 };
183}; 218};
184 219
@@ -197,6 +232,7 @@ struct read_cache {
197struct decode_cache { 232struct decode_cache {
198 u8 twobyte; 233 u8 twobyte;
199 u8 b; 234 u8 b;
235 u8 intercept;
200 u8 lock_prefix; 236 u8 lock_prefix;
201 u8 rep_prefix; 237 u8 rep_prefix;
202 u8 op_bytes; 238 u8 op_bytes;
@@ -209,6 +245,7 @@ struct decode_cache {
209 u8 seg_override; 245 u8 seg_override;
210 unsigned int d; 246 unsigned int d;
211 int (*execute)(struct x86_emulate_ctxt *ctxt); 247 int (*execute)(struct x86_emulate_ctxt *ctxt);
248 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
212 unsigned long regs[NR_VCPU_REGS]; 249 unsigned long regs[NR_VCPU_REGS];
213 unsigned long eip; 250 unsigned long eip;
214 /* modrm */ 251 /* modrm */
@@ -227,17 +264,15 @@ struct x86_emulate_ctxt {
227 struct x86_emulate_ops *ops; 264 struct x86_emulate_ops *ops;
228 265
229 /* Register state before/after emulation. */ 266 /* Register state before/after emulation. */
230 struct kvm_vcpu *vcpu;
231
232 unsigned long eflags; 267 unsigned long eflags;
233 unsigned long eip; /* eip before instruction emulation */ 268 unsigned long eip; /* eip before instruction emulation */
234 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 269 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
235 int mode; 270 int mode;
236 u32 cs_base;
237 271
238 /* interruptibility state, as a result of execution of STI or MOV SS */ 272 /* interruptibility state, as a result of execution of STI or MOV SS */
239 int interruptibility; 273 int interruptibility;
240 274
275 bool guest_mode; /* guest running a nested guest */
241 bool perm_ok; /* do not check permissions if true */ 276 bool perm_ok; /* do not check permissions if true */
242 bool only_vendor_specific_insn; 277 bool only_vendor_specific_insn;
243 278
@@ -249,8 +284,8 @@ struct x86_emulate_ctxt {
249}; 284};
250 285
251/* Repeat String Operation Prefix */ 286/* Repeat String Operation Prefix */
252#define REPE_PREFIX 1 287#define REPE_PREFIX 0xf3
253#define REPNE_PREFIX 2 288#define REPNE_PREFIX 0xf2
254 289
255/* Execution mode, passed to the emulator. */ 290/* Execution mode, passed to the emulator. */
256#define X86EMUL_MODE_REAL 0 /* Real mode. */ 291#define X86EMUL_MODE_REAL 0 /* Real mode. */
@@ -259,6 +294,69 @@ struct x86_emulate_ctxt {
259#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ 294#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
260#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ 295#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
261 296
297/* any protected mode */
298#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \
299 X86EMUL_MODE_PROT64)
300
301enum x86_intercept_stage {
302 X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */
303 X86_ICPT_PRE_EXCEPT,
304 X86_ICPT_POST_EXCEPT,
305 X86_ICPT_POST_MEMACCESS,
306};
307
308enum x86_intercept {
309 x86_intercept_none,
310 x86_intercept_cr_read,
311 x86_intercept_cr_write,
312 x86_intercept_clts,
313 x86_intercept_lmsw,
314 x86_intercept_smsw,
315 x86_intercept_dr_read,
316 x86_intercept_dr_write,
317 x86_intercept_lidt,
318 x86_intercept_sidt,
319 x86_intercept_lgdt,
320 x86_intercept_sgdt,
321 x86_intercept_lldt,
322 x86_intercept_sldt,
323 x86_intercept_ltr,
324 x86_intercept_str,
325 x86_intercept_rdtsc,
326 x86_intercept_rdpmc,
327 x86_intercept_pushf,
328 x86_intercept_popf,
329 x86_intercept_cpuid,
330 x86_intercept_rsm,
331 x86_intercept_iret,
332 x86_intercept_intn,
333 x86_intercept_invd,
334 x86_intercept_pause,
335 x86_intercept_hlt,
336 x86_intercept_invlpg,
337 x86_intercept_invlpga,
338 x86_intercept_vmrun,
339 x86_intercept_vmload,
340 x86_intercept_vmsave,
341 x86_intercept_vmmcall,
342 x86_intercept_stgi,
343 x86_intercept_clgi,
344 x86_intercept_skinit,
345 x86_intercept_rdtscp,
346 x86_intercept_icebp,
347 x86_intercept_wbinvd,
348 x86_intercept_monitor,
349 x86_intercept_mwait,
350 x86_intercept_rdmsr,
351 x86_intercept_wrmsr,
352 x86_intercept_in,
353 x86_intercept_ins,
354 x86_intercept_out,
355 x86_intercept_outs,
356
357 nr_x86_intercepts
358};
359
262/* Host execution mode. */ 360/* Host execution mode. */
263#if defined(CONFIG_X86_32) 361#if defined(CONFIG_X86_32)
264#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 362#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
@@ -270,6 +368,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
270#define EMULATION_FAILED -1 368#define EMULATION_FAILED -1
271#define EMULATION_OK 0 369#define EMULATION_OK 0
272#define EMULATION_RESTART 1 370#define EMULATION_RESTART 1
371#define EMULATION_INTERCEPTED 2
273int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); 372int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
274int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 373int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
275 u16 tss_selector, int reason, 374 u16 tss_selector, int reason,
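
The kvm_emulate.h changes above rework every x86_emulate_ops callback to take the emulator context as its first argument instead of a struct kvm_vcpu, and add the x86_instruction_info / x86_intercept machinery used to decide whether a nested guest intercepts an instruction. A minimal sketch of the new calling convention (hypothetical backend, not taken from this patch; the real KVM implementation recovers its vcpu from the context internally):

static int example_read_std(struct x86_emulate_ctxt *ctxt,
                            unsigned long addr, void *val,
                            unsigned int bytes,
                            struct x86_exception *fault)
{
        /* a real backend would translate 'addr' through the guest page tables */
        memset(val, 0, bytes);          /* pretend the read returned zeroes */
        return X86EMUL_CONTINUE;
}

static struct x86_emulate_ops example_ops = {
        .read_std = example_read_std,
        /* ...the remaining callbacks follow the same ctxt-first pattern... */
};
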
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c8af0991fdf0..d2ac8e2ee897 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -30,14 +30,30 @@
30#define KVM_MEMORY_SLOTS 32 30#define KVM_MEMORY_SLOTS 32
31/* memory slots that does not exposed to userspace */ 31/* memory slots that does not exposed to userspace */
32#define KVM_PRIVATE_MEM_SLOTS 4 32#define KVM_PRIVATE_MEM_SLOTS 4
33#define KVM_MMIO_SIZE 16
33 34
34#define KVM_PIO_PAGE_OFFSET 1 35#define KVM_PIO_PAGE_OFFSET 1
35#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 36#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
36 37
38#define CR0_RESERVED_BITS \
39 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
40 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
41 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
42
37#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 43#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
38#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) 44#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
39#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 45#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
40 0xFFFFFF0000000000ULL) 46 0xFFFFFF0000000000ULL)
47#define CR4_RESERVED_BITS \
48 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
49 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
50 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
51 | X86_CR4_OSXSAVE \
52 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
53
54#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
55
56
41 57
42#define INVALID_PAGE (~(hpa_t)0) 58#define INVALID_PAGE (~(hpa_t)0)
43#define VALID_PAGE(x) ((x) != INVALID_PAGE) 59#define VALID_PAGE(x) ((x) != INVALID_PAGE)
@@ -118,6 +134,9 @@ enum kvm_reg {
118enum kvm_reg_ex { 134enum kvm_reg_ex {
119 VCPU_EXREG_PDPTR = NR_VCPU_REGS, 135 VCPU_EXREG_PDPTR = NR_VCPU_REGS,
120 VCPU_EXREG_CR3, 136 VCPU_EXREG_CR3,
137 VCPU_EXREG_RFLAGS,
138 VCPU_EXREG_CPL,
139 VCPU_EXREG_SEGMENTS,
121}; 140};
122 141
123enum { 142enum {
@@ -256,7 +275,7 @@ struct kvm_mmu {
256 struct kvm_mmu_page *sp); 275 struct kvm_mmu_page *sp);
257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 276 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
258 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 277 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
259 u64 *spte, const void *pte, unsigned long mmu_seq); 278 u64 *spte, const void *pte);
260 hpa_t root_hpa; 279 hpa_t root_hpa;
261 int root_level; 280 int root_level;
262 int shadow_root_level; 281 int shadow_root_level;
@@ -340,7 +359,6 @@ struct kvm_vcpu_arch {
340 struct fpu guest_fpu; 359 struct fpu guest_fpu;
341 u64 xcr0; 360 u64 xcr0;
342 361
343 gva_t mmio_fault_cr2;
344 struct kvm_pio_request pio; 362 struct kvm_pio_request pio;
345 void *pio_data; 363 void *pio_data;
346 364
@@ -367,18 +385,22 @@ struct kvm_vcpu_arch {
367 /* emulate context */ 385 /* emulate context */
368 386
369 struct x86_emulate_ctxt emulate_ctxt; 387 struct x86_emulate_ctxt emulate_ctxt;
388 bool emulate_regs_need_sync_to_vcpu;
389 bool emulate_regs_need_sync_from_vcpu;
370 390
371 gpa_t time; 391 gpa_t time;
372 struct pvclock_vcpu_time_info hv_clock; 392 struct pvclock_vcpu_time_info hv_clock;
373 unsigned int hw_tsc_khz; 393 unsigned int hw_tsc_khz;
374 unsigned int time_offset; 394 unsigned int time_offset;
375 struct page *time_page; 395 struct page *time_page;
376 u64 last_host_tsc;
377 u64 last_guest_tsc; 396 u64 last_guest_tsc;
378 u64 last_kernel_ns; 397 u64 last_kernel_ns;
379 u64 last_tsc_nsec; 398 u64 last_tsc_nsec;
380 u64 last_tsc_write; 399 u64 last_tsc_write;
400 u32 virtual_tsc_khz;
381 bool tsc_catchup; 401 bool tsc_catchup;
402 u32 tsc_catchup_mult;
403 s8 tsc_catchup_shift;
382 404
383 bool nmi_pending; 405 bool nmi_pending;
384 bool nmi_injected; 406 bool nmi_injected;
@@ -448,9 +470,6 @@ struct kvm_arch {
448 u64 last_tsc_nsec; 470 u64 last_tsc_nsec;
449 u64 last_tsc_offset; 471 u64 last_tsc_offset;
450 u64 last_tsc_write; 472 u64 last_tsc_write;
451 u32 virtual_tsc_khz;
452 u32 virtual_tsc_mult;
453 s8 virtual_tsc_shift;
454 473
455 struct kvm_xen_hvm_config xen_hvm_config; 474 struct kvm_xen_hvm_config xen_hvm_config;
456 475
@@ -502,6 +521,8 @@ struct kvm_vcpu_stat {
502 u32 nmi_injections; 521 u32 nmi_injections;
503}; 522};
504 523
524struct x86_instruction_info;
525
505struct kvm_x86_ops { 526struct kvm_x86_ops {
506 int (*cpu_has_kvm_support)(void); /* __init */ 527 int (*cpu_has_kvm_support)(void); /* __init */
507 int (*disabled_by_bios)(void); /* __init */ 528 int (*disabled_by_bios)(void); /* __init */
@@ -586,9 +607,17 @@ struct kvm_x86_ops {
586 607
587 bool (*has_wbinvd_exit)(void); 608 bool (*has_wbinvd_exit)(void);
588 609
610 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
589 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 611 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
590 612
613 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
614
591 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); 615 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
616
617 int (*check_intercept)(struct kvm_vcpu *vcpu,
618 struct x86_instruction_info *info,
619 enum x86_intercept_stage stage);
620
592 const struct trace_print_flags *exit_reasons_str; 621 const struct trace_print_flags *exit_reasons_str;
593}; 622};
594 623
@@ -627,6 +656,13 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
627 656
628extern bool tdp_enabled; 657extern bool tdp_enabled;
629 658
659/* control of guest tsc rate supported? */
660extern bool kvm_has_tsc_control;
661/* minimum supported tsc_khz for guests */
662extern u32 kvm_min_guest_tsc_khz;
663/* maximum supported tsc_khz for guests */
664extern u32 kvm_max_guest_tsc_khz;
665
630enum emulation_result { 666enum emulation_result {
631 EMULATE_DONE, /* no further processing */ 667 EMULATE_DONE, /* no further processing */
632 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ 668 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
@@ -645,9 +681,6 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
645 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); 681 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
646} 682}
647 683
648void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
649void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
650
651void kvm_enable_efer_bits(u64); 684void kvm_enable_efer_bits(u64);
652int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 685int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
653int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 686int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
@@ -657,8 +690,6 @@ struct x86_emulate_ctxt;
657int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); 690int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
658void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 691void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
659int kvm_emulate_halt(struct kvm_vcpu *vcpu); 692int kvm_emulate_halt(struct kvm_vcpu *vcpu);
660int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
661int emulate_clts(struct kvm_vcpu *vcpu);
662int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); 693int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
663 694
664void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 695void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -721,8 +752,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
721 752
722int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 753int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
723 754
724int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
725
726int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, 755int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
727 void *insn, int insn_len); 756 void *insn, int insn_len);
728void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); 757void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
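
kvm_host.h now carries the CR0/CR4/CR8 reserved-bit masks alongside the per-vcpu TSC fields (virtual_tsc_khz, tsc_catchup_mult/shift) and the kvm_has_tsc_control / kvm_{min,max}_guest_tsc_khz exports. The reserved-bit masks are used the obvious way; a simplified illustration (the real kvm_set_cr4() also consults guest CPUID before accepting bits such as OSXSAVE):

static int example_check_cr4(unsigned long cr4)
{
        if (cr4 & CR4_RESERVED_BITS)
                return 1;       /* caller injects #GP: a reserved bit was set */
        return 0;
}
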
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 12d55e773eb6..48142971b25d 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -8,11 +8,6 @@
8 8
9#ifdef CONFIG_X86_32 9#ifdef CONFIG_X86_32
10#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) 10#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
11/*
12 * For 32-bit UML - mark functions implemented in assembly that use
13 * regparm input parameters:
14 */
15#define asmregparm __attribute__((regparm(3)))
16 11
17/* 12/*
18 * Make sure the compiler doesn't do anything stupid with the 13 * Make sure the compiler doesn't do anything stupid with the
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index eb16e94ae04f..021979a6e23f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -142,8 +142,6 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
142static inline void enable_p5_mce(void) {} 142static inline void enable_p5_mce(void) {}
143#endif 143#endif
144 144
145extern void (*x86_mce_decode_callback)(struct mce *m);
146
147void mce_setup(struct mce *m); 145void mce_setup(struct mce *m);
148void mce_log(struct mce *m); 146void mce_log(struct mce *m);
149DECLARE_PER_CPU(struct sys_device, mce_dev); 147DECLARE_PER_CPU(struct sys_device, mce_dev);
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 91df7c51806c..5e83a416eca8 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -13,31 +13,11 @@ extern struct pglist_data *node_data[];
13#define NODE_DATA(nid) (node_data[nid]) 13#define NODE_DATA(nid) (node_data[nid])
14 14
15#include <asm/numaq.h> 15#include <asm/numaq.h>
16/* summit or generic arch */
17#include <asm/srat.h>
18
19extern int get_memcfg_numa_flat(void);
20/*
21 * This allows any one NUMA architecture to be compiled
22 * for, and still fall back to the flat function if it
23 * fails.
24 */
25static inline void get_memcfg_numa(void)
26{
27
28 if (get_memcfg_numaq())
29 return;
30 if (get_memcfg_from_srat())
31 return;
32 get_memcfg_numa_flat();
33}
34 16
35extern void resume_map_numa_kva(pgd_t *pgd); 17extern void resume_map_numa_kva(pgd_t *pgd);
36 18
37#else /* !CONFIG_NUMA */ 19#else /* !CONFIG_NUMA */
38 20
39#define get_memcfg_numa get_memcfg_numa_flat
40
41static inline void resume_map_numa_kva(pgd_t *pgd) {} 21static inline void resume_map_numa_kva(pgd_t *pgd) {}
42 22
43#endif /* CONFIG_NUMA */ 23#endif /* CONFIG_NUMA */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index 288b96f815a6..b3f88d7867c7 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -4,36 +4,13 @@
4#ifndef _ASM_X86_MMZONE_64_H 4#ifndef _ASM_X86_MMZONE_64_H
5#define _ASM_X86_MMZONE_64_H 5#define _ASM_X86_MMZONE_64_H
6 6
7
8#ifdef CONFIG_NUMA 7#ifdef CONFIG_NUMA
9 8
10#include <linux/mmdebug.h> 9#include <linux/mmdebug.h>
11
12#include <asm/smp.h> 10#include <asm/smp.h>
13 11
14/* Simple perfect hash to map physical addresses to node numbers */
15struct memnode {
16 int shift;
17 unsigned int mapsize;
18 s16 *map;
19 s16 embedded_map[64 - 8];
20} ____cacheline_aligned; /* total size = 128 bytes */
21extern struct memnode memnode;
22#define memnode_shift memnode.shift
23#define memnodemap memnode.map
24#define memnodemapsize memnode.mapsize
25
26extern struct pglist_data *node_data[]; 12extern struct pglist_data *node_data[];
27 13
28static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
29{
30 unsigned nid;
31 VIRTUAL_BUG_ON(!memnodemap);
32 nid = memnodemap[addr >> memnode_shift];
33 VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
34 return nid;
35}
36
37#define NODE_DATA(nid) (node_data[nid]) 14#define NODE_DATA(nid) (node_data[nid])
38 15
39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 16#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 67763c5d8b4e..9eae7752ae9b 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -35,7 +35,7 @@
35#define MODULE_PROC_FAMILY "K7 " 35#define MODULE_PROC_FAMILY "K7 "
36#elif defined CONFIG_MK8 36#elif defined CONFIG_MK8
37#define MODULE_PROC_FAMILY "K8 " 37#define MODULE_PROC_FAMILY "K8 "
38#elif defined CONFIG_X86_ELAN 38#elif defined CONFIG_MELAN
39#define MODULE_PROC_FAMILY "ELAN " 39#define MODULE_PROC_FAMILY "ELAN "
40#elif defined CONFIG_MCRUSOE 40#elif defined CONFIG_MCRUSOE
41#define MODULE_PROC_FAMILY "CRUSOE " 41#define MODULE_PROC_FAMILY "CRUSOE "
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index fd5a1f365c95..485b4f1f079b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -96,11 +96,15 @@
96#define MSR_IA32_MC0_ADDR 0x00000402 96#define MSR_IA32_MC0_ADDR 0x00000402
97#define MSR_IA32_MC0_MISC 0x00000403 97#define MSR_IA32_MC0_MISC 0x00000403
98 98
99#define MSR_AMD64_MC0_MASK 0xc0010044
100
99#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) 101#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
100#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) 102#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
101#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) 103#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
102#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) 104#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
103 105
106#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x))
107
104/* These are consecutive and not in the normal 4er MCE bank block */ 108/* These are consecutive and not in the normal 4er MCE bank block */
105#define MSR_IA32_MC0_CTL2 0x00000280 109#define MSR_IA32_MC0_CTL2 0x00000280
106#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) 110#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
@@ -114,6 +118,7 @@
114 complete list. */ 118 complete list. */
115 119
116#define MSR_AMD64_PATCH_LEVEL 0x0000008b 120#define MSR_AMD64_PATCH_LEVEL 0x0000008b
121#define MSR_AMD64_TSC_RATIO 0xc0000104
117#define MSR_AMD64_NB_CFG 0xc001001f 122#define MSR_AMD64_NB_CFG 0xc001001f
118#define MSR_AMD64_PATCH_LOADER 0xc0010020 123#define MSR_AMD64_PATCH_LOADER 0xc0010020
119#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 124#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
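
The msr-index.h hunk adds MSR_AMD64_TSC_RATIO (apparently tied to the TSC-scaling work visible in the kvm_host.h hunk above) and the AMD MCi_MASK registers, which are consecutive starting at 0xc0010044. An illustrative expansion and access:

static void example_read_bank4_mask(void)
{
        u64 mask;

        /* MSR_AMD64_MCx_MASK(4) == 0xc0010044 + 4 == 0xc0010048 */
        rdmsrl(MSR_AMD64_MCx_MASK(4), mask);
        pr_info("MC4 mask register: 0x%llx\n", mask);
}
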
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index af788496020b..405b4032a60b 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -1,7 +1,13 @@
1#ifndef _ASM_X86_NOPS_H 1#ifndef _ASM_X86_NOPS_H
2#define _ASM_X86_NOPS_H 2#define _ASM_X86_NOPS_H
3 3
4/* Define nops for use with alternative() */ 4/*
5 * Define nops for use with alternative() and for tracing.
6 *
7 * *_NOP5_ATOMIC must be a single instruction.
8 */
9
10#define NOP_DS_PREFIX 0x3e
5 11
6/* generic versions from gas 12/* generic versions from gas
7 1: nop 13 1: nop
@@ -13,14 +19,15 @@
13 6: leal 0x00000000(%esi),%esi 19 6: leal 0x00000000(%esi),%esi
14 7: leal 0x00000000(,%esi,1),%esi 20 7: leal 0x00000000(,%esi,1),%esi
15*/ 21*/
16#define GENERIC_NOP1 ".byte 0x90\n" 22#define GENERIC_NOP1 0x90
17#define GENERIC_NOP2 ".byte 0x89,0xf6\n" 23#define GENERIC_NOP2 0x89,0xf6
18#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" 24#define GENERIC_NOP3 0x8d,0x76,0x00
19#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" 25#define GENERIC_NOP4 0x8d,0x74,0x26,0x00
20#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 26#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4
21#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" 27#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00
22#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" 28#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
23#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 29#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7
30#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4
24 31
25/* Opteron 64bit nops 32/* Opteron 64bit nops
26 1: nop 33 1: nop
@@ -29,13 +36,14 @@
29 4: osp osp osp nop 36 4: osp osp osp nop
30*/ 37*/
31#define K8_NOP1 GENERIC_NOP1 38#define K8_NOP1 GENERIC_NOP1
32#define K8_NOP2 ".byte 0x66,0x90\n" 39#define K8_NOP2 0x66,K8_NOP1
33#define K8_NOP3 ".byte 0x66,0x66,0x90\n" 40#define K8_NOP3 0x66,K8_NOP2
34#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" 41#define K8_NOP4 0x66,K8_NOP3
35#define K8_NOP5 K8_NOP3 K8_NOP2 42#define K8_NOP5 K8_NOP3,K8_NOP2
36#define K8_NOP6 K8_NOP3 K8_NOP3 43#define K8_NOP6 K8_NOP3,K8_NOP3
37#define K8_NOP7 K8_NOP4 K8_NOP3 44#define K8_NOP7 K8_NOP4,K8_NOP3
38#define K8_NOP8 K8_NOP4 K8_NOP4 45#define K8_NOP8 K8_NOP4,K8_NOP4
46#define K8_NOP5_ATOMIC 0x66,K8_NOP4
39 47
40/* K7 nops 48/* K7 nops
41 uses eax dependencies (arbitrary choice) 49 uses eax dependencies (arbitrary choice)
@@ -47,13 +55,14 @@
47 7: leal 0x00000000(,%eax,1),%eax 55 7: leal 0x00000000(,%eax,1),%eax
48*/ 56*/
49#define K7_NOP1 GENERIC_NOP1 57#define K7_NOP1 GENERIC_NOP1
50#define K7_NOP2 ".byte 0x8b,0xc0\n" 58#define K7_NOP2 0x8b,0xc0
51#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" 59#define K7_NOP3 0x8d,0x04,0x20
52#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" 60#define K7_NOP4 0x8d,0x44,0x20,0x00
53#define K7_NOP5 K7_NOP4 ASM_NOP1 61#define K7_NOP5 K7_NOP4,K7_NOP1
54#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" 62#define K7_NOP6 0x8d,0x80,0,0,0,0
55#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" 63#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0
56#define K7_NOP8 K7_NOP7 ASM_NOP1 64#define K7_NOP8 K7_NOP7,K7_NOP1
65#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4
57 66
58/* P6 nops 67/* P6 nops
59 uses eax dependencies (Intel-recommended choice) 68 uses eax dependencies (Intel-recommended choice)
@@ -69,52 +78,65 @@
69 There is kernel code that depends on this. 78 There is kernel code that depends on this.
70*/ 79*/
71#define P6_NOP1 GENERIC_NOP1 80#define P6_NOP1 GENERIC_NOP1
72#define P6_NOP2 ".byte 0x66,0x90\n" 81#define P6_NOP2 0x66,0x90
73#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" 82#define P6_NOP3 0x0f,0x1f,0x00
74#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" 83#define P6_NOP4 0x0f,0x1f,0x40,0
75#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" 84#define P6_NOP5 0x0f,0x1f,0x44,0x00,0
76#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" 85#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0
77#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" 86#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0
78#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" 87#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0
88#define P6_NOP5_ATOMIC P6_NOP5
89
90#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
79 91
80#if defined(CONFIG_MK7) 92#if defined(CONFIG_MK7)
81#define ASM_NOP1 K7_NOP1 93#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1)
82#define ASM_NOP2 K7_NOP2 94#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2)
83#define ASM_NOP3 K7_NOP3 95#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3)
84#define ASM_NOP4 K7_NOP4 96#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4)
85#define ASM_NOP5 K7_NOP5 97#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5)
86#define ASM_NOP6 K7_NOP6 98#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6)
87#define ASM_NOP7 K7_NOP7 99#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7)
88#define ASM_NOP8 K7_NOP8 100#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8)
101#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC)
89#elif defined(CONFIG_X86_P6_NOP) 102#elif defined(CONFIG_X86_P6_NOP)
90#define ASM_NOP1 P6_NOP1 103#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1)
91#define ASM_NOP2 P6_NOP2 104#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2)
92#define ASM_NOP3 P6_NOP3 105#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3)
93#define ASM_NOP4 P6_NOP4 106#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4)
94#define ASM_NOP5 P6_NOP5 107#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5)
95#define ASM_NOP6 P6_NOP6 108#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6)
96#define ASM_NOP7 P6_NOP7 109#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7)
97#define ASM_NOP8 P6_NOP8 110#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8)
111#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC)
98#elif defined(CONFIG_X86_64) 112#elif defined(CONFIG_X86_64)
99#define ASM_NOP1 K8_NOP1 113#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1)
100#define ASM_NOP2 K8_NOP2 114#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2)
101#define ASM_NOP3 K8_NOP3 115#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3)
102#define ASM_NOP4 K8_NOP4 116#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4)
103#define ASM_NOP5 K8_NOP5 117#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5)
104#define ASM_NOP6 K8_NOP6 118#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6)
105#define ASM_NOP7 K8_NOP7 119#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7)
106#define ASM_NOP8 K8_NOP8 120#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8)
121#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC)
107#else 122#else
108#define ASM_NOP1 GENERIC_NOP1 123#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1)
109#define ASM_NOP2 GENERIC_NOP2 124#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2)
110#define ASM_NOP3 GENERIC_NOP3 125#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3)
111#define ASM_NOP4 GENERIC_NOP4 126#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4)
112#define ASM_NOP5 GENERIC_NOP5 127#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5)
113#define ASM_NOP6 GENERIC_NOP6 128#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6)
114#define ASM_NOP7 GENERIC_NOP7 129#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7)
115#define ASM_NOP8 GENERIC_NOP8 130#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8)
131#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC)
116#endif 132#endif
117 133
118#define ASM_NOP_MAX 8 134#define ASM_NOP_MAX 8
135#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */
136
137#ifndef __ASSEMBLY__
138extern const unsigned char * const *ideal_nops;
139extern void arch_init_ideal_nops(void);
140#endif
119 141
120#endif /* _ASM_X86_NOPS_H */ 142#endif /* _ASM_X86_NOPS_H */
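
The nops.h rewrite turns each NOP macro from a ready-made ".byte ..." string into a bare comma-separated byte list: _ASM_MK_NOP() stringifies it back for inline asm, while C code can use the same list as an array initializer, which is what the new ideal_nops[] / arch_init_ideal_nops() interface relies on. A small sketch of both uses (the patch-site helper is hypothetical):

/* as data: K8_NOP5 expands to 0x66,0x66,0x90,0x66,0x90 */
static const unsigned char example_k8_nop5[] = { K8_NOP5 };

/* in asm: _ASM_MK_NOP(K8_NOP5) yields ".byte 0x66,0x66,0x90,0x66,0x90\n" */

static void example_fill_atomic_nop5(void *at)
{
        /* ideal_nops[NOP_ATOMIC5] is the best single-instruction 5-byte NOP */
        memcpy(at, ideal_nops[NOP_ATOMIC5], 5);
}
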
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 3d4dab43c994..bfacd2ccf651 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,12 +1,24 @@
1#ifndef _ASM_X86_NUMA_H 1#ifndef _ASM_X86_NUMA_H
2#define _ASM_X86_NUMA_H 2#define _ASM_X86_NUMA_H
3 3
4#include <linux/nodemask.h>
5
4#include <asm/topology.h> 6#include <asm/topology.h>
5#include <asm/apicdef.h> 7#include <asm/apicdef.h>
6 8
7#ifdef CONFIG_NUMA 9#ifdef CONFIG_NUMA
8 10
9#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 11#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
12#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
13
14/*
15 * Too small node sizes may confuse the VM badly. Usually they
16 * result from BIOS bugs. So dont recognize nodes as standalone
17 * NUMA entities that have less than this amount of RAM listed:
18 */
19#define NODE_MIN_SIZE (4*1024*1024)
20
21extern int numa_off;
10 22
11/* 23/*
12 * __apicid_to_node[] stores the raw mapping between physical apicid and 24 * __apicid_to_node[] stores the raw mapping between physical apicid and
@@ -17,15 +29,27 @@
17 * numa_cpu_node(). 29 * numa_cpu_node().
18 */ 30 */
19extern s16 __apicid_to_node[MAX_LOCAL_APIC]; 31extern s16 __apicid_to_node[MAX_LOCAL_APIC];
32extern nodemask_t numa_nodes_parsed __initdata;
33
34extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
35extern void __init numa_set_distance(int from, int to, int distance);
20 36
21static inline void set_apicid_to_node(int apicid, s16 node) 37static inline void set_apicid_to_node(int apicid, s16 node)
22{ 38{
23 __apicid_to_node[apicid] = node; 39 __apicid_to_node[apicid] = node;
24} 40}
41
42extern int __cpuinit numa_cpu_node(int cpu);
43
25#else /* CONFIG_NUMA */ 44#else /* CONFIG_NUMA */
26static inline void set_apicid_to_node(int apicid, s16 node) 45static inline void set_apicid_to_node(int apicid, s16 node)
27{ 46{
28} 47}
48
49static inline int numa_cpu_node(int cpu)
50{
51 return NUMA_NO_NODE;
52}
29#endif /* CONFIG_NUMA */ 53#endif /* CONFIG_NUMA */
30 54
31#ifdef CONFIG_X86_32 55#ifdef CONFIG_X86_32
@@ -37,21 +61,25 @@ static inline void set_apicid_to_node(int apicid, s16 node)
37#ifdef CONFIG_NUMA 61#ifdef CONFIG_NUMA
38extern void __cpuinit numa_set_node(int cpu, int node); 62extern void __cpuinit numa_set_node(int cpu, int node);
39extern void __cpuinit numa_clear_node(int cpu); 63extern void __cpuinit numa_clear_node(int cpu);
40extern void __init numa_init_array(void);
41extern void __init init_cpu_to_node(void); 64extern void __init init_cpu_to_node(void);
42extern void __cpuinit numa_add_cpu(int cpu); 65extern void __cpuinit numa_add_cpu(int cpu);
43extern void __cpuinit numa_remove_cpu(int cpu); 66extern void __cpuinit numa_remove_cpu(int cpu);
44#else /* CONFIG_NUMA */ 67#else /* CONFIG_NUMA */
45static inline void numa_set_node(int cpu, int node) { } 68static inline void numa_set_node(int cpu, int node) { }
46static inline void numa_clear_node(int cpu) { } 69static inline void numa_clear_node(int cpu) { }
47static inline void numa_init_array(void) { }
48static inline void init_cpu_to_node(void) { } 70static inline void init_cpu_to_node(void) { }
49static inline void numa_add_cpu(int cpu) { } 71static inline void numa_add_cpu(int cpu) { }
50static inline void numa_remove_cpu(int cpu) { } 72static inline void numa_remove_cpu(int cpu) { }
51#endif /* CONFIG_NUMA */ 73#endif /* CONFIG_NUMA */
52 74
53#ifdef CONFIG_DEBUG_PER_CPU_MAPS 75#ifdef CONFIG_DEBUG_PER_CPU_MAPS
54struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable); 76void debug_cpumask_set_cpu(int cpu, int node, bool enable);
55#endif 77#endif
56 78
79#ifdef CONFIG_NUMA_EMU
80#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
81#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
82void numa_emu_cmdline(char *);
83#endif /* CONFIG_NUMA_EMU */
84
57#endif /* _ASM_X86_NUMA_H */ 85#endif /* _ASM_X86_NUMA_H */
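
The numa.h additions pull numa_off, NODE_MIN_SIZE, numa_nodes_parsed, numa_add_memblk(), numa_set_distance(), numa_cpu_node() and the NUMA-emulation bits out of the 32/64-bit specific headers so both configurations share one interface. A hypothetical memory-affinity parser would report its findings like this (node layout and distance value invented):

static int __init example_numa_init(void)
{
        /* two nodes, 1 GB each, with a made-up remote distance of 20 */
        node_set(0, numa_nodes_parsed);
        numa_add_memblk(0, 0, 0x40000000ULL);

        node_set(1, numa_nodes_parsed);
        numa_add_memblk(1, 0x40000000ULL, 0x80000000ULL);

        numa_set_distance(0, 1, 20);
        return 0;
}
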
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index c6beed1ef103..e7d6b8254742 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,16 +1,6 @@
1#ifndef _ASM_X86_NUMA_32_H 1#ifndef _ASM_X86_NUMA_32_H
2#define _ASM_X86_NUMA_32_H 2#define _ASM_X86_NUMA_32_H
3 3
4extern int numa_off;
5
6extern int pxm_to_nid(int pxm);
7
8#ifdef CONFIG_NUMA
9extern int __cpuinit numa_cpu_node(int cpu);
10#else /* CONFIG_NUMA */
11static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
12#endif /* CONFIG_NUMA */
13
14#ifdef CONFIG_HIGHMEM 4#ifdef CONFIG_HIGHMEM
15extern void set_highmem_pages_init(void); 5extern void set_highmem_pages_init(void);
16#else 6#else
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 344eb1790b46..0c05f7ae46e8 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -1,42 +1,6 @@
1#ifndef _ASM_X86_NUMA_64_H 1#ifndef _ASM_X86_NUMA_64_H
2#define _ASM_X86_NUMA_64_H 2#define _ASM_X86_NUMA_64_H
3 3
4#include <linux/nodemask.h>
5
6struct bootnode {
7 u64 start;
8 u64 end;
9};
10
11#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
12
13extern int numa_off;
14
15extern unsigned long numa_free_all_bootmem(void); 4extern unsigned long numa_free_all_bootmem(void);
16extern void setup_node_bootmem(int nodeid, unsigned long start,
17 unsigned long end);
18
19#ifdef CONFIG_NUMA
20/*
21 * Too small node sizes may confuse the VM badly. Usually they
22 * result from BIOS bugs. So dont recognize nodes as standalone
23 * NUMA entities that have less than this amount of RAM listed:
24 */
25#define NODE_MIN_SIZE (4*1024*1024)
26
27extern nodemask_t numa_nodes_parsed __initdata;
28
29extern int __cpuinit numa_cpu_node(int cpu);
30extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
31extern void __init numa_set_distance(int from, int to, int distance);
32
33#ifdef CONFIG_NUMA_EMU
34#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
35#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
36void numa_emu_cmdline(char *);
37#endif /* CONFIG_NUMA_EMU */
38#else
39static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
40#endif
41 5
42#endif /* _ASM_X86_NUMA_64_H */ 6#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 37c516545ec8..c3b3c322fd87 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -29,7 +29,7 @@
29#ifdef CONFIG_X86_NUMAQ 29#ifdef CONFIG_X86_NUMAQ
30 30
31extern int found_numaq; 31extern int found_numaq;
32extern int get_memcfg_numaq(void); 32extern int numaq_numa_init(void);
33extern int pci_numaq_init(void); 33extern int pci_numaq_init(void);
34 34
35extern void *xquad_portio; 35extern void *xquad_portio;
@@ -166,11 +166,6 @@ struct sys_cfg_data {
166 166
167void numaq_tsc_disable(void); 167void numaq_tsc_disable(void);
168 168
169#else
170static inline int get_memcfg_numaq(void)
171{
172 return 0;
173}
174#endif /* CONFIG_X86_NUMAQ */ 169#endif /* CONFIG_X86_NUMAQ */
175#endif /* _ASM_X86_NUMAQ_H */ 170#endif /* _ASM_X86_NUMAQ_H */
176 171
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index c5d3a5abbb9f..24487712e0b1 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -26,15 +26,12 @@ extern void setup_olpc_ofw_pgd(void);
26/* check if OFW was detected during boot */ 26/* check if OFW was detected during boot */
27extern bool olpc_ofw_present(void); 27extern bool olpc_ofw_present(void);
28 28
29extern void olpc_dt_build_devicetree(void);
30
29#else /* !CONFIG_OLPC */ 31#else /* !CONFIG_OLPC */
30static inline void olpc_ofw_detect(void) { } 32static inline void olpc_ofw_detect(void) { }
31static inline void setup_olpc_ofw_pgd(void) { } 33static inline void setup_olpc_ofw_pgd(void) { }
32#endif /* !CONFIG_OLPC */
33
34#ifdef CONFIG_OF_PROMTREE
35extern void olpc_dt_build_devicetree(void);
36#else
37static inline void olpc_dt_build_devicetree(void) { } 34static inline void olpc_dt_build_devicetree(void) { }
38#endif 35#endif /* !CONFIG_OLPC */
39 36
40#endif /* _ASM_X86_OLPC_OFW_H */ 37#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 676129229630..d498943b906c 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -135,8 +135,6 @@ void default_teardown_msi_irqs(struct pci_dev *dev);
135#include "pci_64.h" 135#include "pci_64.h"
136#endif 136#endif
137 137
138void dma32_reserve_bootmem(void);
139
140/* implement the pci_ DMA API in terms of the generic device dma_ one */ 138/* implement the pci_ DMA API in terms of the generic device dma_ one */
141#include <asm-generic/pci-dma-compat.h> 139#include <asm-generic/pci-dma-compat.h>
142 140
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index d475b4398d8b..a0a9779084d1 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -509,6 +509,11 @@ do { \
509 * it in software. The address used in the cmpxchg16 instruction must be 509 * it in software. The address used in the cmpxchg16 instruction must be
510 * aligned to a 16 byte boundary. 510 * aligned to a 16 byte boundary.
511 */ 511 */
512#ifdef CONFIG_SMP
513#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
514#else
515#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
516#endif
512#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \ 517#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
513({ \ 518({ \
514 char __ret; \ 519 char __ret; \
@@ -517,7 +522,7 @@ do { \
517 typeof(o2) __o2 = o2; \ 522 typeof(o2) __o2 = o2; \
518 typeof(o2) __n2 = n2; \ 523 typeof(o2) __n2 = n2; \
519 typeof(o2) __dummy; \ 524 typeof(o2) __dummy; \
520 alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \ 525 alternative_io(CMPXCHG16B_EMU_CALL, \
521 "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \ 526 "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \
522 X86_FEATURE_CX16, \ 527 X86_FEATURE_CX16, \
523 ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ 528 ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
@@ -542,6 +547,33 @@ do { \
542 old__; \ 547 old__; \
543}) 548})
544 549
550static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
551 const unsigned long __percpu *addr)
552{
553 unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
554
555 return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
556}
557
558static inline int x86_this_cpu_variable_test_bit(int nr,
559 const unsigned long __percpu *addr)
560{
561 int oldbit;
562
563 asm volatile("bt "__percpu_arg(2)",%1\n\t"
564 "sbb %0,%0"
565 : "=r" (oldbit)
566 : "m" (*(unsigned long *)addr), "Ir" (nr));
567
568 return oldbit;
569}
570
571#define x86_this_cpu_test_bit(nr, addr) \
572 (__builtin_constant_p((nr)) \
573 ? x86_this_cpu_constant_test_bit((nr), (addr)) \
574 : x86_this_cpu_variable_test_bit((nr), (addr)))
575
576
545#include <asm-generic/percpu.h> 577#include <asm-generic/percpu.h>
546 578
547/* We can use this directly for local CPU (faster). */ 579/* We can use this directly for local CPU (faster). */
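
The percpu.h hunks do two things: the cmpxchg16b emulation call is now padded with ASM_NOP3 on SMP and ASM_NOP2 on UP, which appears to keep the inline call the same length as the replacement cmpxchg16b/setz sequence with and without the %gs prefix, and x86_this_cpu_test_bit() is added with a constant-bit fast path (a single percpu read plus mask) and a variable-bit path using bt against the per-cpu segment. A usage sketch on a hypothetical per-cpu bitmap:

DECLARE_PER_CPU(unsigned long, example_flags);  /* hypothetical per-cpu bitmap */

static bool example_flag_set(int nr)
{
        /* constant nr folds to percpu_read() + mask, variable nr uses bt */
        return x86_this_cpu_test_bit(nr, &example_flags);
}
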
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 7db7723d1f32..d56187c6b838 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
299/* Install a pte for a particular vaddr in kernel space. */ 299/* Install a pte for a particular vaddr in kernel space. */
300void set_pte_vaddr(unsigned long vaddr, pte_t pte); 300void set_pte_vaddr(unsigned long vaddr, pte_t pte);
301 301
302extern void native_pagetable_reserve(u64 start, u64 end);
302#ifdef CONFIG_X86_32 303#ifdef CONFIG_X86_32
303extern void native_pagetable_setup_start(pgd_t *base); 304extern void native_pagetable_setup_start(pgd_t *base);
304extern void native_pagetable_setup_done(pgd_t *base); 305extern void native_pagetable_setup_done(pgd_t *base);
diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h
new file mode 100644
index 000000000000..4950a0b1d09c
--- /dev/null
+++ b/arch/x86/include/asm/probe_roms.h
@@ -0,0 +1,8 @@
1#ifndef _PROBE_ROMS_H_
2#define _PROBE_ROMS_H_
3struct pci_dev;
4
5extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
6extern void pci_unmap_biosrom(void __iomem *rom);
7extern size_t pci_biosrom_size(struct pci_dev *pdev);
8#endif
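
probe_roms.h is a new header declaring helpers for mapping a PCI device's shadowed option ROM. A hypothetical driver-side caller (error handling trimmed):

static int example_copy_option_rom(struct pci_dev *pdev, void *buf, size_t len)
{
        void __iomem *rom = pci_map_biosrom(pdev);
        size_t size = pci_biosrom_size(pdev);

        if (!rom)
                return -ENODEV;

        memcpy_fromio(buf, rom, min(len, size));
        pci_unmap_biosrom(rom);
        return 0;
}
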
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index a898a2b6e10c..59ab4dffa377 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -60,6 +60,7 @@
60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ 60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ 61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
62#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ 62#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
63#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
63 64
64/* 65/*
65 * x86-64 Task Priority Register, CR8 66 * x86-64 Task Priority Register, CR8
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 1babf8adecdf..94e7618fcac8 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -136,6 +136,7 @@ struct cpuinfo_x86;
136struct task_struct; 136struct task_struct;
137 137
138extern unsigned long profile_pc(struct pt_regs *regs); 138extern unsigned long profile_pc(struct pt_regs *regs);
139#define profile_pc profile_pc
139 140
140extern unsigned long 141extern unsigned long
141convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); 142convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
@@ -202,20 +203,11 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
202#endif 203#endif
203} 204}
204 205
205static inline unsigned long instruction_pointer(struct pt_regs *regs) 206#define GET_IP(regs) ((regs)->ip)
206{ 207#define GET_FP(regs) ((regs)->bp)
207 return regs->ip; 208#define GET_USP(regs) ((regs)->sp)
208}
209
210static inline unsigned long frame_pointer(struct pt_regs *regs)
211{
212 return regs->bp;
213}
214 209
215static inline unsigned long user_stack_pointer(struct pt_regs *regs) 210#include <asm-generic/ptrace.h>
216{
217 return regs->sp;
218}
219 211
220/* Query offset/name of register from its name/offset */ 212/* Query offset/name of register from its name/offset */
221extern int regs_query_register_offset(const char *name); 213extern int regs_query_register_offset(const char *name);
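
The ptrace.h change drops the open-coded accessors in favour of GET_IP/GET_FP/GET_USP plus <asm-generic/ptrace.h>, which rebuilds the usual helpers from those macros. Paraphrased from the generic header of this era (details may differ):

static inline unsigned long instruction_pointer(struct pt_regs *regs)
{
        return GET_IP(regs);
}

static inline unsigned long frame_pointer(struct pt_regs *regs)
{
        return GET_FP(regs);
}

static inline unsigned long user_stack_pointer(struct pt_regs *regs)
{
        return GET_USP(regs);
}
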
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index db8aa19a08a2..9756551ec760 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -88,7 +88,7 @@ void *extend_brk(size_t size, size_t align);
88 * executable.) 88 * executable.)
89 */ 89 */
90#define RESERVE_BRK(name,sz) \ 90#define RESERVE_BRK(name,sz) \
91 static void __section(.discard.text) __used \ 91 static void __section(.discard.text) __used notrace \
92 __brk_reservation_fn_##name##__(void) { \ 92 __brk_reservation_fn_##name##__(void) { \
93 asm volatile ( \ 93 asm volatile ( \
94 ".pushsection .brk_reservation,\"aw\",@nobits;" \ 94 ".pushsection .brk_reservation,\"aw\",@nobits;" \
@@ -104,10 +104,10 @@ void *extend_brk(size_t size, size_t align);
104 type *name; \ 104 type *name; \
105 RESERVE_BRK(name, sizeof(type) * entries) 105 RESERVE_BRK(name, sizeof(type) * entries)
106 106
107extern void probe_roms(void);
107#ifdef __i386__ 108#ifdef __i386__
108 109
109void __init i386_start_kernel(void); 110void __init i386_start_kernel(void);
110extern void probe_roms(void);
111 111
112#else 112#else
113void __init x86_64_start_kernel(char *real_mode); 113void __init x86_64_start_kernel(char *real_mode);
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h
deleted file mode 100644
index b508d639d1a7..000000000000
--- a/arch/x86/include/asm/srat.h
+++ /dev/null
@@ -1,39 +0,0 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26
27#ifndef _ASM_X86_SRAT_H
28#define _ASM_X86_SRAT_H
29
30#ifdef CONFIG_ACPI_NUMA
31extern int get_memcfg_from_srat(void);
32#else
33static inline int get_memcfg_from_srat(void)
34{
35 return 0;
36}
37#endif
38
39#endif /* _ASM_X86_SRAT_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index d7e89c83645d..70bbe39043a9 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -37,9 +37,6 @@ print_context_stack_bp(struct thread_info *tinfo,
37/* Generic stack tracer with callbacks */ 37/* Generic stack tracer with callbacks */
38 38
39struct stacktrace_ops { 39struct stacktrace_ops {
40 void (*warning)(void *data, char *msg);
41 /* msg must contain %s for the symbol */
42 void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
43 void (*address)(void *data, unsigned long address, int reliable); 40 void (*address)(void *data, unsigned long address, int reliable);
44 /* On negative return stop dumping */ 41 /* On negative return stop dumping */
45 int (*stack)(void *data, char *name); 42 int (*stack)(void *data, char *name);
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index fd921c3a6841..487055c8c1aa 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -9,8 +9,6 @@
9#include <asm/desc.h> 9#include <asm/desc.h>
10#include <asm/i387.h> 10#include <asm/i387.h>
11 11
12static inline int arch_prepare_suspend(void) { return 0; }
13
14/* image of the saved processor state */ 12/* image of the saved processor state */
15struct saved_context { 13struct saved_context {
16 u16 es, fs, gs, ss; 14 u16 es, fs, gs, ss;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 8d942afae681..09b0bf104156 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -9,11 +9,6 @@
9#include <asm/desc.h> 9#include <asm/desc.h>
10#include <asm/i387.h> 10#include <asm/i387.h>
11 11
12static inline int arch_prepare_suspend(void)
13{
14 return 0;
15}
16
17/* 12/*
18 * Image of the saved processor state, used by the low level ACPI suspend to 13 * Image of the saved processor state, used by the low level ACPI suspend to
19 * RAM code and by the low level hibernation code. 14 * RAM code and by the low level hibernation code.
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 12569e691ce3..c2ff2a1d845e 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -303,24 +303,81 @@ static inline void native_wbinvd(void)
303#ifdef CONFIG_PARAVIRT 303#ifdef CONFIG_PARAVIRT
304#include <asm/paravirt.h> 304#include <asm/paravirt.h>
305#else 305#else
306#define read_cr0() (native_read_cr0()) 306
307#define write_cr0(x) (native_write_cr0(x)) 307static inline unsigned long read_cr0(void)
308#define read_cr2() (native_read_cr2()) 308{
309#define write_cr2(x) (native_write_cr2(x)) 309 return native_read_cr0();
310#define read_cr3() (native_read_cr3()) 310}
311#define write_cr3(x) (native_write_cr3(x)) 311
312#define read_cr4() (native_read_cr4()) 312static inline void write_cr0(unsigned long x)
313#define read_cr4_safe() (native_read_cr4_safe()) 313{
314#define write_cr4(x) (native_write_cr4(x)) 314 native_write_cr0(x);
315#define wbinvd() (native_wbinvd()) 315}
316
317static inline unsigned long read_cr2(void)
318{
319 return native_read_cr2();
320}
321
322static inline void write_cr2(unsigned long x)
323{
324 native_write_cr2(x);
325}
326
327static inline unsigned long read_cr3(void)
328{
329 return native_read_cr3();
330}
331
332static inline void write_cr3(unsigned long x)
333{
334 native_write_cr3(x);
335}
336
337static inline unsigned long read_cr4(void)
338{
339 return native_read_cr4();
340}
341
342static inline unsigned long read_cr4_safe(void)
343{
344 return native_read_cr4_safe();
345}
346
347static inline void write_cr4(unsigned long x)
348{
349 native_write_cr4(x);
350}
351
352static inline void wbinvd(void)
353{
354 native_wbinvd();
355}
356
316#ifdef CONFIG_X86_64 357#ifdef CONFIG_X86_64
317#define read_cr8() (native_read_cr8()) 358
318#define write_cr8(x) (native_write_cr8(x)) 359static inline unsigned long read_cr8(void)
319#define load_gs_index native_load_gs_index 360{
361 return native_read_cr8();
362}
363
364static inline void write_cr8(unsigned long x)
365{
366 native_write_cr8(x);
367}
368
369static inline void load_gs_index(unsigned selector)
370{
371 native_load_gs_index(selector);
372}
373
320#endif 374#endif
321 375
322/* Clear the 'TS' bit */ 376/* Clear the 'TS' bit */
323#define clts() (native_clts()) 377static inline void clts(void)
378{
379 native_clts();
380}
324 381
325#endif/* CONFIG_PARAVIRT */ 382#endif/* CONFIG_PARAVIRT */
326 383
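
The conversion above trades the old cr*() accessor macros for static inline functions, which gives the compiler real prototypes to type-check and lets callers take the functions' addresses. A minimal standalone sketch of the difference; fake_cr0 and the native_* stubs below are stand-ins for illustration, not the kernel's symbols:

/* Userspace sketch: contrast a function-like macro with a static inline
 * wrapper, mirroring the read_cr*/write_cr* conversion above. */
#include <stdio.h>

static unsigned long fake_cr0;

static void native_write_cr0(unsigned long x) { fake_cr0 = x; }
static unsigned long native_read_cr0(void)    { return fake_cr0; }

/* old style: the argument is not type-checked */
#define write_cr0_macro(x) (native_write_cr0(x))

/* new style: the compiler checks that x is an unsigned long */
static inline void write_cr0(unsigned long x) { native_write_cr0(x); }
static inline unsigned long read_cr0(void)    { return native_read_cr0(); }

int main(void)
{
	write_cr0(0x80050033UL);
	printf("cr0 = %#lx\n", read_cr0());
	return 0;
}
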
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 910a7084f7f2..c00692476e9f 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -93,19 +93,11 @@ extern void setup_node_to_cpumask_map(void);
93#define pcibus_to_node(bus) __pcibus_to_node(bus) 93#define pcibus_to_node(bus) __pcibus_to_node(bus)
94 94
95#ifdef CONFIG_X86_32 95#ifdef CONFIG_X86_32
96extern unsigned long node_start_pfn[];
97extern unsigned long node_end_pfn[];
98extern unsigned long node_remap_size[];
99#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
100
101# define SD_CACHE_NICE_TRIES 1 96# define SD_CACHE_NICE_TRIES 1
102# define SD_IDLE_IDX 1 97# define SD_IDLE_IDX 1
103
104#else 98#else
105
106# define SD_CACHE_NICE_TRIES 2 99# define SD_CACHE_NICE_TRIES 2
107# define SD_IDLE_IDX 2 100# define SD_IDLE_IDX 2
108
109#endif 101#endif
110 102
111/* sched_domains SD_NODE_INIT for NUMA machines */ 103/* sched_domains SD_NODE_INIT for NUMA machines */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 83e2efd181e2..9db5583b6d38 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,10 @@ extern int unsynchronized_tsc(void);
51extern int check_tsc_unstable(void); 51extern int check_tsc_unstable(void);
52extern unsigned long native_calibrate_tsc(void); 52extern unsigned long native_calibrate_tsc(void);
53 53
54#ifdef CONFIG_X86_64
55extern cycles_t vread_tsc(void);
56#endif
57
54/* 58/*
55 * Boot-time check whether the TSCs are synchronized across 59 * Boot-time check whether the TSCs are synchronized across
56 * all CPUs/cores: 60 * all CPUs/cores:
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index abd3e0ea762a..99ddd148a760 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -6,7 +6,6 @@
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/compiler.h> 7#include <linux/compiler.h>
8#include <linux/thread_info.h> 8#include <linux/thread_info.h>
9#include <linux/prefetch.h>
10#include <linux/string.h> 9#include <linux/string.h>
11#include <asm/asm.h> 10#include <asm/asm.h>
12#include <asm/page.h> 11#include <asm/page.h>
@@ -42,7 +41,7 @@
42 * Returns 0 if the range is valid, nonzero otherwise. 41 * Returns 0 if the range is valid, nonzero otherwise.
43 * 42 *
44 * This is equivalent to the following test: 43 * This is equivalent to the following test:
45 * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64) 44 * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
46 * 45 *
47 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... 46 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
48 */ 47 */
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 088d09fb1615..566e803cc602 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -6,7 +6,6 @@
6 */ 6 */
7#include <linux/errno.h> 7#include <linux/errno.h>
8#include <linux/thread_info.h> 8#include <linux/thread_info.h>
9#include <linux/prefetch.h>
10#include <linux/string.h> 9#include <linux/string.h>
11#include <asm/asm.h> 10#include <asm/asm.h>
12#include <asm/page.h> 11#include <asm/page.h>
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 316708d5af92..1c66d30971ad 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -6,7 +6,6 @@
6 */ 6 */
7#include <linux/compiler.h> 7#include <linux/compiler.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/prefetch.h>
10#include <linux/lockdep.h> 9#include <linux/lockdep.h>
11#include <asm/alternative.h> 10#include <asm/alternative.h>
12#include <asm/cpufeature.h> 11#include <asm/cpufeature.h>
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index a755ef5e5977..fb6a625c99bf 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -350,10 +350,11 @@
350#define __NR_open_by_handle_at 342 350#define __NR_open_by_handle_at 342
351#define __NR_clock_adjtime 343 351#define __NR_clock_adjtime 343
352#define __NR_syncfs 344 352#define __NR_syncfs 344
353#define __NR_sendmmsg 345
353 354
354#ifdef __KERNEL__ 355#ifdef __KERNEL__
355 356
356#define NR_syscalls 345 357#define NR_syscalls 346
357 358
358#define __ARCH_WANT_IPC_PARSE_VERSION 359#define __ARCH_WANT_IPC_PARSE_VERSION
359#define __ARCH_WANT_OLD_READDIR 360#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 160fa76bd578..79f90eb15aad 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -677,6 +677,8 @@ __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
677__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) 677__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
678#define __NR_syncfs 306 678#define __NR_syncfs 306
679__SYSCALL(__NR_syncfs, sys_syncfs) 679__SYSCALL(__NR_syncfs, sys_syncfs)
680#define __NR_sendmmsg 307
681__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
680 682
681#ifndef __NO_STUBS 683#ifndef __NO_STUBS
682#define __ARCH_WANT_OLD_READDIR 684#define __ARCH_WANT_OLD_READDIR
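
With __NR_sendmmsg wired up for both 32-bit (345) and 64-bit (307), userspace can batch datagram sends into a single syscall. A small sketch of how a caller might exercise it, assuming a libc that already exposes the sendmmsg(2) wrapper (otherwise syscall(__NR_sendmmsg, ...) works too); the address and port are arbitrary and error handling is omitted for brevity:

#define _GNU_SOURCE
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(9999),
	};
	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);

	struct iovec iov[2] = {
		{ .iov_base = "one", .iov_len = 3 },
		{ .iov_base = "two", .iov_len = 3 },
	};
	struct mmsghdr msgs[2];
	memset(msgs, 0, sizeof(msgs));
	for (int i = 0; i < 2; i++) {
		msgs[i].msg_hdr.msg_name    = &dst;
		msgs[i].msg_hdr.msg_namelen = sizeof(dst);
		msgs[i].msg_hdr.msg_iov     = &iov[i];
		msgs[i].msg_hdr.msg_iovlen  = 1;
	}

	int sent = sendmmsg(fd, msgs, 2, 0);	/* one syscall, two datagrams */
	printf("sendmmsg sent %d messages\n", sent);
	return 0;
}
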
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 3e094af443c3..130f1eeee5fe 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -94,6 +94,8 @@
94/* after this # consecutive successes, bump up the throttle if it was lowered */ 94/* after this # consecutive successes, bump up the throttle if it was lowered */
95#define COMPLETE_THRESHOLD 5 95#define COMPLETE_THRESHOLD 5
96 96
97#define UV_LB_SUBNODEID 0x10
98
97/* 99/*
98 * number of entries in the destination side payload queue 100 * number of entries in the destination side payload queue
99 */ 101 */
@@ -124,7 +126,7 @@
124 * The distribution specification (32 bytes) is interpreted as a 256-bit 126 * The distribution specification (32 bytes) is interpreted as a 256-bit
125 * distribution vector. Adjacent bits correspond to consecutive even numbered 127 * distribution vector. Adjacent bits correspond to consecutive even numbered
126 * nodeIDs. The result of adding the index of a given bit to the 15-bit 128 * nodeIDs. The result of adding the index of a given bit to the 15-bit
127 * 'base_dest_nodeid' field of the header corresponds to the 129 * 'base_dest_nasid' field of the header corresponds to the
128 * destination nodeID associated with that specified bit. 130 * destination nodeID associated with that specified bit.
129 */ 131 */
130struct bau_target_uvhubmask { 132struct bau_target_uvhubmask {
@@ -176,7 +178,7 @@ struct bau_msg_payload {
176struct bau_msg_header { 178struct bau_msg_header {
177 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ 179 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
178 /* bits 5:0 */ 180 /* bits 5:0 */
179 unsigned int base_dest_nodeid:15; /* nasid of the */ 181 unsigned int base_dest_nasid:15; /* nasid of the */
180 /* bits 20:6 */ /* first bit in uvhub map */ 182 /* bits 20:6 */ /* first bit in uvhub map */
181 unsigned int command:8; /* message type */ 183 unsigned int command:8; /* message type */
182 /* bits 28:21 */ 184 /* bits 28:21 */
@@ -378,6 +380,10 @@ struct ptc_stats {
378 unsigned long d_rcanceled; /* number of messages canceled by resets */ 380 unsigned long d_rcanceled; /* number of messages canceled by resets */
379}; 381};
380 382
383struct hub_and_pnode {
384 short uvhub;
385 short pnode;
386};
381/* 387/*
382 * one per-cpu; to locate the software tables 388 * one per-cpu; to locate the software tables
383 */ 389 */
@@ -399,10 +405,12 @@ struct bau_control {
399 int baudisabled; 405 int baudisabled;
400 int set_bau_off; 406 int set_bau_off;
401 short cpu; 407 short cpu;
408 short osnode;
402 short uvhub_cpu; 409 short uvhub_cpu;
403 short uvhub; 410 short uvhub;
404 short cpus_in_socket; 411 short cpus_in_socket;
405 short cpus_in_uvhub; 412 short cpus_in_uvhub;
413 short partition_base_pnode;
406 unsigned short message_number; 414 unsigned short message_number;
407 unsigned short uvhub_quiesce; 415 unsigned short uvhub_quiesce;
408 short socket_acknowledge_count[DEST_Q_SIZE]; 416 short socket_acknowledge_count[DEST_Q_SIZE];
@@ -422,15 +430,16 @@ struct bau_control {
422 int congested_period; 430 int congested_period;
423 cycles_t period_time; 431 cycles_t period_time;
424 long period_requests; 432 long period_requests;
433 struct hub_and_pnode *target_hub_and_pnode;
425}; 434};
426 435
427static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) 436static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
428{ 437{
429 return constant_test_bit(uvhub, &dstp->bits[0]); 438 return constant_test_bit(uvhub, &dstp->bits[0]);
430} 439}
431static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp) 440static inline void bau_uvhub_set(int pnode, struct bau_target_uvhubmask *dstp)
432{ 441{
433 __set_bit(uvhub, &dstp->bits[0]); 442 __set_bit(pnode, &dstp->bits[0]);
434} 443}
435static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp, 444static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
436 int nbits) 445 int nbits)
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index a501741c2335..4298002d0c83 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -398,6 +398,8 @@ struct uv_blade_info {
398 unsigned short nr_online_cpus; 398 unsigned short nr_online_cpus;
399 unsigned short pnode; 399 unsigned short pnode;
400 short memory_nid; 400 short memory_nid;
401 spinlock_t nmi_lock;
402 unsigned long nmi_count;
401}; 403};
402extern struct uv_blade_info *uv_blade_info; 404extern struct uv_blade_info *uv_blade_info;
403extern short *uv_node_to_blade; 405extern short *uv_node_to_blade;
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 20cafeac7455..f5bb64a823d7 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV MMR definitions 6 * SGI UV MMR definitions
7 * 7 *
8 * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10 10
11#ifndef _ASM_X86_UV_UV_MMRS_H 11#ifndef _ASM_X86_UV_UV_MMRS_H
@@ -1099,5 +1099,19 @@ union uvh_rtc1_int_config_u {
1099 } s; 1099 } s;
1100}; 1100};
1101 1101
1102/* ========================================================================= */
1103/* UVH_SCRATCH5 */
1104/* ========================================================================= */
1105#define UVH_SCRATCH5 0x2d0200UL
1106#define UVH_SCRATCH5_32 0x00778
1107
1108#define UVH_SCRATCH5_SCRATCH5_SHFT 0
1109#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
1110union uvh_scratch5_u {
1111 unsigned long v;
1112 struct uvh_scratch5_s {
1113 unsigned long scratch5 : 64; /* RW, W1CS */
1114 } s;
1115};
1102 1116
1103#endif /* __ASM_UV_MMRS_X86_H__ */ 1117#endif /* __ASM_UV_MMRS_X86_H__ */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 9064052b73de..bb0522850b74 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,20 +1,6 @@
1#ifndef _ASM_X86_VDSO_H 1#ifndef _ASM_X86_VDSO_H
2#define _ASM_X86_VDSO_H 2#define _ASM_X86_VDSO_H
3 3
4#ifdef CONFIG_X86_64
5extern const char VDSO64_PRELINK[];
6
7/*
8 * Given a pointer to the vDSO image, find the pointer to VDSO64_name
9 * as that symbol is defined in the vDSO sources or linker script.
10 */
11#define VDSO64_SYMBOL(base, name) \
12({ \
13 extern const char VDSO64_##name[]; \
14 (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
15})
16#endif
17
18#if defined CONFIG_X86_32 || defined CONFIG_COMPAT 4#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
19extern const char VDSO32_PRELINK[]; 5extern const char VDSO32_PRELINK[];
20 6
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3d61e204826f..646b4c1ca695 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -23,8 +23,6 @@ struct vsyscall_gtod_data {
23 struct timespec wall_to_monotonic; 23 struct timespec wall_to_monotonic;
24 struct timespec wall_time_coarse; 24 struct timespec wall_time_coarse;
25}; 25};
26extern struct vsyscall_gtod_data __vsyscall_gtod_data
27__section_vsyscall_gtod_data;
28extern struct vsyscall_gtod_data vsyscall_gtod_data; 26extern struct vsyscall_gtod_data vsyscall_gtod_data;
29 27
30#endif /* _ASM_X86_VGTOD_H */ 28#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d255fbd..d55597351f6a 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -16,27 +16,19 @@ enum vsyscall_num {
16#ifdef __KERNEL__ 16#ifdef __KERNEL__
17#include <linux/seqlock.h> 17#include <linux/seqlock.h>
18 18
19#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
20#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
21
22/* Definitions for CONFIG_GENERIC_TIME definitions */ 19/* Definitions for CONFIG_GENERIC_TIME definitions */
23#define __section_vsyscall_gtod_data __attribute__ \
24 ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
25#define __section_vsyscall_clock __attribute__ \
26 ((unused, __section__ (".vsyscall_clock"),aligned(16)))
27#define __vsyscall_fn \ 20#define __vsyscall_fn \
28 __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace 21 __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
29 22
30#define VGETCPU_RDTSCP 1 23#define VGETCPU_RDTSCP 1
31#define VGETCPU_LSL 2 24#define VGETCPU_LSL 2
32 25
33extern int __vgetcpu_mode;
34extern volatile unsigned long __jiffies;
35
36/* kernel space (writeable) */ 26/* kernel space (writeable) */
37extern int vgetcpu_mode; 27extern int vgetcpu_mode;
38extern struct timezone sys_tz; 28extern struct timezone sys_tz;
39 29
30#include <asm/vvar.h>
31
40extern void map_vsyscall(void); 32extern void map_vsyscall(void);
41 33
42#endif /* __KERNEL__ */ 34#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
new file mode 100644
index 000000000000..341b3559452b
--- /dev/null
+++ b/arch/x86/include/asm/vvar.h
@@ -0,0 +1,52 @@
1/*
2 * vvar.h: Shared vDSO/kernel variable declarations
3 * Copyright (c) 2011 Andy Lutomirski
4 * Subject to the GNU General Public License, version 2
5 *
6 * A handful of variables are accessible (read-only) from userspace
7 * code in the vsyscall page and the vdso. They are declared here.
8 * Some other file must define them with DEFINE_VVAR.
9 *
10 * In normal kernel code, they are used like any other variable.
11 * In user code, they are accessed through the VVAR macro.
12 *
13 * Each of these variables lives in the vsyscall page, and each
14 * one needs a unique offset within the little piece of the page
15 * reserved for vvars. Specify that offset in DECLARE_VVAR.
16 * (There are 896 bytes available. If you mess up, the linker will
17 * catch it.)
18 */
19
20/* Offset of vars within vsyscall page */
21#define VSYSCALL_VARS_OFFSET (3072 + 128)
22
23#if defined(__VVAR_KERNEL_LDS)
24
25/* The kernel linker script defines its own magic to put vvars in the
26 * right place.
27 */
28#define DECLARE_VVAR(offset, type, name) \
29 EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset)
30
31#else
32
33#define DECLARE_VVAR(offset, type, name) \
34 static type const * const vvaraddr_ ## name = \
35 (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset));
36
37#define DEFINE_VVAR(type, name) \
38 type __vvar_ ## name \
39 __attribute__((section(".vsyscall_var_" #name), aligned(16)))
40
41#define VVAR(name) (*vvaraddr_ ## name)
42
43#endif
44
45/* DECLARE_VVAR(offset, type, name) */
46
47DECLARE_VVAR(0, volatile unsigned long, jiffies)
48DECLARE_VVAR(8, int, vgetcpu_mode)
49DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
50
51#undef DECLARE_VVAR
52#undef VSYSCALL_VARS_OFFSET
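
To make the DECLARE_VVAR/DEFINE_VVAR/VVAR split above concrete, here is a toy userspace re-creation in which a static buffer stands in for the fixed vsyscall page. In the kernel the linker script places each variable at VSYSCALL_VARS_OFFSET + offset, and the user-side pointer is const-qualified so the vDSO can only read; the const is dropped here so one program can play both sides:

#include <stdio.h>

static unsigned long fake_vsyscall_page[512];	/* 4096 bytes, aligned */
#define VSYSCALL_START        ((char *)fake_vsyscall_page)
#define VSYSCALL_VARS_OFFSET  (3072 + 128)

#define DECLARE_VVAR(offset, type, name) \
	static type * const vvaraddr_##name = \
		(type *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset));

#define VVAR(name) (*vvaraddr_##name)

DECLARE_VVAR(0, volatile unsigned long, jiffies)
DECLARE_VVAR(8, int, vgetcpu_mode)

int main(void)
{
	VVAR(jiffies) = 4242;		/* "kernel side" updates the vvar   */
	VVAR(vgetcpu_mode) = 1;
	printf("jiffies=%lu vgetcpu_mode=%d\n",
	       VVAR(jiffies), VVAR(vgetcpu_mode));	/* "vDSO side" reads */
	return 0;
}
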
diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
new file mode 100644
index 000000000000..6bf5b8e478c0
--- /dev/null
+++ b/arch/x86/include/asm/x2apic.h
@@ -0,0 +1,62 @@
1/*
2 * Common bits for X2APIC cluster/physical modes.
3 */
4
5#ifndef _ASM_X86_X2APIC_H
6#define _ASM_X86_X2APIC_H
7
8#include <asm/apic.h>
9#include <asm/ipi.h>
10#include <linux/cpumask.h>
11
12/*
13 * Need to use more than cpu 0, because we need more vectors
14 * when MSI-X are used.
15 */
16static const struct cpumask *x2apic_target_cpus(void)
17{
18 return cpu_online_mask;
19}
20
21static int x2apic_apic_id_registered(void)
22{
23 return 1;
24}
25
26/*
27 * For now each logical cpu is in its own vector allocation domain.
28 */
29static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
30{
31 cpumask_clear(retmask);
32 cpumask_set_cpu(cpu, retmask);
33}
34
35static void
36__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
37{
38 unsigned long cfg = __prepare_ICR(0, vector, dest);
39 native_x2apic_icr_write(cfg, apicid);
40}
41
42static unsigned int x2apic_get_apic_id(unsigned long id)
43{
44 return id;
45}
46
47static unsigned long x2apic_set_apic_id(unsigned int id)
48{
49 return id;
50}
51
52static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
53{
54 return initial_apicid >> index_msb;
55}
56
57static void x2apic_send_IPI_self(int vector)
58{
59 apic_write(APIC_SELF_IPI, vector);
60}
61
62#endif /* _ASM_X86_X2APIC_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 643ebf2e2ad8..d3d859035af9 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -68,6 +68,17 @@ struct x86_init_oem {
68}; 68};
69 69
70/** 70/**
71 * struct x86_init_mapping - platform specific initial kernel pagetable setup
72 * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
73 *
74 * For more details on the purpose of this hook, look in
75 * init_memory_mapping and the commit that added it.
76 */
77struct x86_init_mapping {
78 void (*pagetable_reserve)(u64 start, u64 end);
79};
80
81/**
71 * struct x86_init_paging - platform specific paging functions 82 * struct x86_init_paging - platform specific paging functions
72 * @pagetable_setup_start: platform specific pre paging_init() call 83 * @pagetable_setup_start: platform specific pre paging_init() call
73 * @pagetable_setup_done: platform specific post paging_init() call 84 * @pagetable_setup_done: platform specific post paging_init() call
@@ -123,6 +134,7 @@ struct x86_init_ops {
123 struct x86_init_mpparse mpparse; 134 struct x86_init_mpparse mpparse;
124 struct x86_init_irqs irqs; 135 struct x86_init_irqs irqs;
125 struct x86_init_oem oem; 136 struct x86_init_oem oem;
137 struct x86_init_mapping mapping;
126 struct x86_init_paging paging; 138 struct x86_init_paging paging;
127 struct x86_init_timers timers; 139 struct x86_init_timers timers;
128 struct x86_init_iommu iommu; 140 struct x86_init_iommu iommu;
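
The new x86_init_mapping member follows the usual x86_init pattern: an ops struct pre-populated with a native default that a platform can override early in boot. A standalone sketch of that pattern; the names below are illustrative, not the kernel's:

#include <stdio.h>

typedef unsigned long long u64;

struct init_mapping_ops {
	void (*pagetable_reserve)(u64 start, u64 end);
};

static void native_pagetable_reserve(u64 start, u64 end)
{
	printf("native: reserve %#llx-%#llx for pagetables\n", start, end);
}

static void hypervisor_pagetable_reserve(u64 start, u64 end)
{
	printf("guest: reserve %#llx-%#llx and mark it read-only\n", start, end);
}

static struct init_mapping_ops mapping = {
	.pagetable_reserve = native_pagetable_reserve,	/* safe default */
};

int main(void)
{
	mapping.pagetable_reserve(0x100000, 0x200000);

	/* A paravirtualized platform overrides the hook early in boot. */
	mapping.pagetable_reserve = hypervisor_pagetable_reserve;
	mapping.pagetable_reserve(0x100000, 0x200000);
	return 0;
}
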
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 8508bfe52296..d240ea950519 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -447,6 +447,13 @@ HYPERVISOR_hvm_op(int op, void *arg)
447 return _hypercall2(unsigned long, hvm_op, op, arg); 447 return _hypercall2(unsigned long, hvm_op, op, arg);
448} 448}
449 449
450static inline int
451HYPERVISOR_tmem_op(
452 struct tmem_op *op)
453{
454 return _hypercall1(int, tmem_op, op);
455}
456
450static inline void 457static inline void
451MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) 458MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
452{ 459{
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c61934fbf22a..64a619d47d34 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -47,8 +47,9 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
47extern unsigned long set_phys_range_identity(unsigned long pfn_s, 47extern unsigned long set_phys_range_identity(unsigned long pfn_s,
48 unsigned long pfn_e); 48 unsigned long pfn_e);
49 49
50extern int m2p_add_override(unsigned long mfn, struct page *page); 50extern int m2p_add_override(unsigned long mfn, struct page *page,
51extern int m2p_remove_override(struct page *page); 51 bool clear_pte);
52extern int m2p_remove_override(struct page *page, bool clear_pte);
52extern struct page *m2p_find_override(unsigned long mfn); 53extern struct page *m2p_find_override(unsigned long mfn);
53extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); 54extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
54 55
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index aa8620989162..4fbda9a3f339 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -15,10 +15,26 @@ static inline int pci_xen_hvm_init(void)
15#endif 15#endif
16#if defined(CONFIG_XEN_DOM0) 16#if defined(CONFIG_XEN_DOM0)
17void __init xen_setup_pirqs(void); 17void __init xen_setup_pirqs(void);
18int xen_find_device_domain_owner(struct pci_dev *dev);
19int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
20int xen_unregister_device_domain_owner(struct pci_dev *dev);
18#else 21#else
19static inline void __init xen_setup_pirqs(void) 22static inline void __init xen_setup_pirqs(void)
20{ 23{
21} 24}
25static inline int xen_find_device_domain_owner(struct pci_dev *dev)
26{
27 return -1;
28}
29static inline int xen_register_device_domain_owner(struct pci_dev *dev,
30 uint16_t domain)
31{
32 return -1;
33}
34static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
35{
36 return -1;
37}
22#endif 38#endif
23 39
24#if defined(CONFIG_PCI_MSI) 40#if defined(CONFIG_PCI_MSI)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 7338ef2218bc..f5abe3a245b8 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -8,7 +8,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
9ifdef CONFIG_FUNCTION_TRACER 9ifdef CONFIG_FUNCTION_TRACER
10# Do not profile debug and lowlevel utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 11CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg 12CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
14CFLAGS_REMOVE_pvclock.o = -pg 13CFLAGS_REMOVE_pvclock.o = -pg
@@ -24,22 +23,25 @@ endif
24nostackp := $(call cc-option, -fno-stack-protector) 23nostackp := $(call cc-option, -fno-stack-protector)
25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 24CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
26CFLAGS_hpet.o := $(nostackp) 25CFLAGS_hpet.o := $(nostackp)
27CFLAGS_tsc.o := $(nostackp) 26CFLAGS_vread_tsc_64.o := $(nostackp)
28CFLAGS_paravirt.o := $(nostackp) 27CFLAGS_paravirt.o := $(nostackp)
29GCOV_PROFILE_vsyscall_64.o := n 28GCOV_PROFILE_vsyscall_64.o := n
30GCOV_PROFILE_hpet.o := n 29GCOV_PROFILE_hpet.o := n
31GCOV_PROFILE_tsc.o := n 30GCOV_PROFILE_tsc.o := n
32GCOV_PROFILE_paravirt.o := n 31GCOV_PROFILE_paravirt.o := n
33 32
33# vread_tsc_64 is hot and should be fully optimized:
34CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
35
34obj-y := process_$(BITS).o signal.o entry_$(BITS).o 36obj-y := process_$(BITS).o signal.o entry_$(BITS).o
35obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 37obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
36obj-y += time.o ioport.o ldt.o dumpstack.o 38obj-y += time.o ioport.o ldt.o dumpstack.o
37obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o 39obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
38obj-$(CONFIG_IRQ_WORK) += irq_work.o 40obj-$(CONFIG_IRQ_WORK) += irq_work.o
39obj-$(CONFIG_X86_32) += probe_roms_32.o 41obj-y += probe_roms.o
40obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 42obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 43obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 44obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
43obj-y += bootflag.o e820.o 45obj-y += bootflag.o e820.o
44obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 46obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 47obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
@@ -117,7 +119,7 @@ obj-$(CONFIG_OF) += devicetree.o
117ifeq ($(CONFIG_X86_64),y) 119ifeq ($(CONFIG_X86_64),y)
118 obj-$(CONFIG_AUDIT) += audit_64.o 120 obj-$(CONFIG_AUDIT) += audit_64.o
119 121
120 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 122 obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
121 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 123 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
122 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o 124 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
123 125
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 9a966c579af5..4558f0d0822d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -970,7 +970,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
970 mp_irq.irqflag = (trigger << 2) | polarity; 970 mp_irq.irqflag = (trigger << 2) | polarity;
971 mp_irq.srcbus = MP_ISA_BUS; 971 mp_irq.srcbus = MP_ISA_BUS;
972 mp_irq.srcbusirq = bus_irq; /* IRQ */ 972 mp_irq.srcbusirq = bus_irq; /* IRQ */
973 mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ 973 mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
974 mp_irq.dstirq = pin; /* INTIN# */ 974 mp_irq.dstirq = pin; /* INTIN# */
975 975
976 mp_save_irq(&mp_irq); 976 mp_save_irq(&mp_irq);
@@ -1021,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1021 if (ioapic < 0) 1021 if (ioapic < 0)
1022 continue; 1022 continue;
1023 pin = mp_find_ioapic_pin(ioapic, gsi); 1023 pin = mp_find_ioapic_pin(ioapic, gsi);
1024 dstapic = mp_ioapics[ioapic].apicid; 1024 dstapic = mpc_ioapic_id(ioapic);
1025 1025
1026 for (idx = 0; idx < mp_irq_entries; idx++) { 1026 for (idx = 0; idx < mp_irq_entries; idx++) {
1027 struct mpc_intsrc *irq = mp_irqs + idx; 1027 struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1082,7 +1082,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
1082 mp_irq.srcbus = number; 1082 mp_irq.srcbus = number;
1083 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); 1083 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1084 ioapic = mp_find_ioapic(gsi); 1084 ioapic = mp_find_ioapic(gsi);
1085 mp_irq.dstapic = mp_ioapics[ioapic].apicid; 1085 mp_irq.dstapic = mpc_ioapic_id(ioapic);
1086 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); 1086 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1087 1087
1088 mp_save_irq(&mp_irq); 1088 mp_save_irq(&mp_irq);
@@ -1113,7 +1113,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1113 1113
1114 if (ioapic_pin > MP_MAX_IOAPIC_PIN) { 1114 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1115 printk(KERN_ERR "Invalid reference to IOAPIC pin " 1115 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1116 "%d-%d\n", mp_ioapics[ioapic].apicid, 1116 "%d-%d\n", mpc_ioapic_id(ioapic),
1117 ioapic_pin); 1117 ioapic_pin);
1118 return gsi; 1118 return gsi;
1119 } 1119 }
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ff93bc1b09c3..18a857ba7a25 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -112,11 +112,6 @@ static int __init acpi_sleep_setup(char *str)
112#ifdef CONFIG_HIBERNATION 112#ifdef CONFIG_HIBERNATION
113 if (strncmp(str, "s4_nohwsig", 10) == 0) 113 if (strncmp(str, "s4_nohwsig", 10) == 0)
114 acpi_no_s4_hw_signature(); 114 acpi_no_s4_hw_signature();
115 if (strncmp(str, "s4_nonvs", 8) == 0) {
116 pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
117 "please use acpi_sleep=nonvs instead");
118 acpi_nvs_nosave();
119 }
120#endif 115#endif
121 if (strncmp(str, "nonvs", 5) == 0) 116 if (strncmp(str, "nonvs", 5) == 0)
122 acpi_nvs_nosave(); 117 acpi_nvs_nosave();
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4a234677e213..a81f2d52f869 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -67,17 +67,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
67#define DPRINTK(fmt, args...) if (debug_alternative) \ 67#define DPRINTK(fmt, args...) if (debug_alternative) \
68 printk(KERN_DEBUG fmt, args) 68 printk(KERN_DEBUG fmt, args)
69 69
70/*
 71 * Each GENERIC_NOPX is X bytes long and is defined as an array of
 72 * bytes making up that nop. To get from one nop to the next, we
 73 * index into the array by an offset equal to the sum of the sizes
 74 * of all the nops that precede the one we are after.
75 *
76 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
77 * nice symmetry of sizes of the previous nops.
78 */
70#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) 79#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
71/* Use inline assembly to define this because the nops are defined 80static const unsigned char intelnops[] =
72 as inline assembly strings in the include files and we cannot 81{
73 get them easily into strings. */ 82 GENERIC_NOP1,
74asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " 83 GENERIC_NOP2,
75 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 84 GENERIC_NOP3,
76 GENERIC_NOP7 GENERIC_NOP8 85 GENERIC_NOP4,
77 "\t.previous"); 86 GENERIC_NOP5,
78extern const unsigned char intelnops[]; 87 GENERIC_NOP6,
79static const unsigned char *const __initconst_or_module 88 GENERIC_NOP7,
80intel_nops[ASM_NOP_MAX+1] = { 89 GENERIC_NOP8,
90 GENERIC_NOP5_ATOMIC
91};
92static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
93{
81 NULL, 94 NULL,
82 intelnops, 95 intelnops,
83 intelnops + 1, 96 intelnops + 1,
@@ -87,17 +100,25 @@ intel_nops[ASM_NOP_MAX+1] = {
87 intelnops + 1 + 2 + 3 + 4 + 5, 100 intelnops + 1 + 2 + 3 + 4 + 5,
88 intelnops + 1 + 2 + 3 + 4 + 5 + 6, 101 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
89 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 102 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
103 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
90}; 104};
91#endif 105#endif
92 106
93#ifdef K8_NOP1 107#ifdef K8_NOP1
94asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " 108static const unsigned char k8nops[] =
95 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 109{
96 K8_NOP7 K8_NOP8 110 K8_NOP1,
97 "\t.previous"); 111 K8_NOP2,
98extern const unsigned char k8nops[]; 112 K8_NOP3,
99static const unsigned char *const __initconst_or_module 113 K8_NOP4,
100k8_nops[ASM_NOP_MAX+1] = { 114 K8_NOP5,
115 K8_NOP6,
116 K8_NOP7,
117 K8_NOP8,
118 K8_NOP5_ATOMIC
119};
120static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
121{
101 NULL, 122 NULL,
102 k8nops, 123 k8nops,
103 k8nops + 1, 124 k8nops + 1,
@@ -107,17 +128,25 @@ k8_nops[ASM_NOP_MAX+1] = {
107 k8nops + 1 + 2 + 3 + 4 + 5, 128 k8nops + 1 + 2 + 3 + 4 + 5,
108 k8nops + 1 + 2 + 3 + 4 + 5 + 6, 129 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
109 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 130 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
131 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
110}; 132};
111#endif 133#endif
112 134
113#if defined(K7_NOP1) && !defined(CONFIG_X86_64) 135#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
114asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " 136static const unsigned char k7nops[] =
115 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 137{
116 K7_NOP7 K7_NOP8 138 K7_NOP1,
117 "\t.previous"); 139 K7_NOP2,
118extern const unsigned char k7nops[]; 140 K7_NOP3,
119static const unsigned char *const __initconst_or_module 141 K7_NOP4,
120k7_nops[ASM_NOP_MAX+1] = { 142 K7_NOP5,
143 K7_NOP6,
144 K7_NOP7,
145 K7_NOP8,
146 K7_NOP5_ATOMIC
147};
148static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
149{
121 NULL, 150 NULL,
122 k7nops, 151 k7nops,
123 k7nops + 1, 152 k7nops + 1,
@@ -127,17 +156,25 @@ k7_nops[ASM_NOP_MAX+1] = {
127 k7nops + 1 + 2 + 3 + 4 + 5, 156 k7nops + 1 + 2 + 3 + 4 + 5,
128 k7nops + 1 + 2 + 3 + 4 + 5 + 6, 157 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
129 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 158 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
159 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
130}; 160};
131#endif 161#endif
132 162
133#ifdef P6_NOP1 163#ifdef P6_NOP1
134asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " 164static const unsigned char __initconst_or_module p6nops[] =
135 P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 165{
136 P6_NOP7 P6_NOP8 166 P6_NOP1,
137 "\t.previous"); 167 P6_NOP2,
138extern const unsigned char p6nops[]; 168 P6_NOP3,
139static const unsigned char *const __initconst_or_module 169 P6_NOP4,
140p6_nops[ASM_NOP_MAX+1] = { 170 P6_NOP5,
171 P6_NOP6,
172 P6_NOP7,
173 P6_NOP8,
174 P6_NOP5_ATOMIC
175};
176static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
177{
141 NULL, 178 NULL,
142 p6nops, 179 p6nops,
143 p6nops + 1, 180 p6nops + 1,
@@ -147,47 +184,65 @@ p6_nops[ASM_NOP_MAX+1] = {
147 p6nops + 1 + 2 + 3 + 4 + 5, 184 p6nops + 1 + 2 + 3 + 4 + 5,
148 p6nops + 1 + 2 + 3 + 4 + 5 + 6, 185 p6nops + 1 + 2 + 3 + 4 + 5 + 6,
149 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 186 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
187 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
150}; 188};
151#endif 189#endif
152 190
191/* Initialize these to a safe default */
153#ifdef CONFIG_X86_64 192#ifdef CONFIG_X86_64
193const unsigned char * const *ideal_nops = p6_nops;
194#else
195const unsigned char * const *ideal_nops = intel_nops;
196#endif
154 197
155extern char __vsyscall_0; 198void __init arch_init_ideal_nops(void)
156static const unsigned char *const *__init_or_module find_nop_table(void)
157{ 199{
158 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 200 switch (boot_cpu_data.x86_vendor) {
159 boot_cpu_has(X86_FEATURE_NOPL)) 201 case X86_VENDOR_INTEL:
160 return p6_nops; 202 /*
161 else 203 * Due to a decoder implementation quirk, some
162 return k8_nops; 204 * specific Intel CPUs actually perform better with
163} 205 * the "k8_nops" than with the SDM-recommended NOPs.
164 206 */
165#else /* CONFIG_X86_64 */ 207 if (boot_cpu_data.x86 == 6 &&
208 boot_cpu_data.x86_model >= 0x0f &&
209 boot_cpu_data.x86_model != 0x1c &&
210 boot_cpu_data.x86_model != 0x26 &&
211 boot_cpu_data.x86_model != 0x27 &&
212 boot_cpu_data.x86_model < 0x30) {
213 ideal_nops = k8_nops;
214 } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
215 ideal_nops = p6_nops;
216 } else {
217#ifdef CONFIG_X86_64
218 ideal_nops = k8_nops;
219#else
220 ideal_nops = intel_nops;
221#endif
222 }
166 223
167static const unsigned char *const *__init_or_module find_nop_table(void) 224 default:
168{ 225#ifdef CONFIG_X86_64
169 if (boot_cpu_has(X86_FEATURE_K8)) 226 ideal_nops = k8_nops;
170 return k8_nops; 227#else
171 else if (boot_cpu_has(X86_FEATURE_K7)) 228 if (boot_cpu_has(X86_FEATURE_K8))
172 return k7_nops; 229 ideal_nops = k8_nops;
173 else if (boot_cpu_has(X86_FEATURE_NOPL)) 230 else if (boot_cpu_has(X86_FEATURE_K7))
174 return p6_nops; 231 ideal_nops = k7_nops;
175 else 232 else
176 return intel_nops; 233 ideal_nops = intel_nops;
234#endif
235 }
177} 236}
178 237
179#endif /* CONFIG_X86_64 */
180
181/* Use this to add nops to a buffer, then text_poke the whole buffer. */ 238/* Use this to add nops to a buffer, then text_poke the whole buffer. */
182static void __init_or_module add_nops(void *insns, unsigned int len) 239static void __init_or_module add_nops(void *insns, unsigned int len)
183{ 240{
184 const unsigned char *const *noptable = find_nop_table();
185
186 while (len > 0) { 241 while (len > 0) {
187 unsigned int noplen = len; 242 unsigned int noplen = len;
188 if (noplen > ASM_NOP_MAX) 243 if (noplen > ASM_NOP_MAX)
189 noplen = ASM_NOP_MAX; 244 noplen = ASM_NOP_MAX;
190 memcpy(insns, noptable[noplen], noplen); 245 memcpy(insns, ideal_nops[noplen], noplen);
191 insns += noplen; 246 insns += noplen;
192 len -= noplen; 247 len -= noplen;
193 } 248 }
@@ -195,6 +250,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
195 250
196extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 251extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
197extern s32 __smp_locks[], __smp_locks_end[]; 252extern s32 __smp_locks[], __smp_locks_end[];
253extern char __vsyscall_0;
198void *text_poke_early(void *addr, const void *opcode, size_t len); 254void *text_poke_early(void *addr, const void *opcode, size_t len);
199 255
200/* Replace instructions with better alternatives for this CPU type. 256/* Replace instructions with better alternatives for this CPU type.
@@ -210,6 +266,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
210 u8 insnbuf[MAX_PATCH_LEN]; 266 u8 insnbuf[MAX_PATCH_LEN];
211 267
212 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 268 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
269 /*
 270	 * The scan order should be from start to end. An alternative
 271	 * scanned later can overwrite one scanned earlier.
 272	 * Some kernel functions (e.g. memcpy, memset) rely on this order
 273	 * when they patch code.
274 *
275 * So be careful if you want to change the scan order to any other
276 * order.
277 */
213 for (a = start; a < end; a++) { 278 for (a = start; a < end; a++) {
214 u8 *instr = a->instr; 279 u8 *instr = a->instr;
215 BUG_ON(a->replacementlen > a->instrlen); 280 BUG_ON(a->replacementlen > a->instrlen);
@@ -678,29 +743,3 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
678 wrote_text = 0; 743 wrote_text = 0;
679 __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); 744 __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
680} 745}
681
682#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
683
684#ifdef CONFIG_X86_64
685unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 };
686#else
687unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 };
688#endif
689
690void __init arch_init_ideal_nop5(void)
691{
692 /*
693 * There is no good nop for all x86 archs. This selection
694 * algorithm should be unified with the one in find_nop_table(),
695 * but this should be good enough for now.
696 *
697 * For cases other than the ones below, use the safe (as in
698 * always functional) defaults above.
699 */
700#ifdef CONFIG_X86_64
701 /* Don't use these on 32 bits due to broken virtualizers */
702 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
703 memcpy(ideal_nop5, p6_nops[5], 5);
704#endif
705}
706#endif
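
After this rework, ideal_nops points at the best per-CPU NOP table and add_nops() pads patch buffers from it, largest NOP first. A standalone sketch of that consumption pattern using a deliberately tiny two-entry table; the byte sequences are the standard one- and two-byte x86 NOPs, everything else is illustrative:

#include <stdio.h>
#include <string.h>

#define ASM_NOP_MAX 2	/* keep the toy table small */

static const unsigned char nop1[] = { 0x90 };		/* nop     */
static const unsigned char nop2[] = { 0x66, 0x90 };	/* osp nop */

static const unsigned char * const toy_nops[ASM_NOP_MAX + 1] = {
	NULL, nop1, nop2,
};

/* Fill a patch buffer with NOPs, largest first, like add_nops() above. */
static void add_nops(unsigned char *insns, unsigned int len)
{
	while (len > 0) {
		unsigned int noplen = len > ASM_NOP_MAX ? ASM_NOP_MAX : len;
		memcpy(insns, toy_nops[noplen], noplen);
		insns += noplen;
		len   -= noplen;
	}
}

int main(void)
{
	unsigned char buf[5];
	add_nops(buf, sizeof(buf));
	for (unsigned int i = 0; i < sizeof(buf); i++)
		printf("%02x ", buf[i]);
	printf("\n");	/* 66 90 66 90 90 */
	return 0;
}
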
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 82ada01625b9..b117efd24f71 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -81,6 +81,9 @@ static u32 gart_unmapped_entry;
81#define AGPEXTERN 81#define AGPEXTERN
82#endif 82#endif
83 83
84/* GART can only remap to physical addresses < 1TB */
85#define GART_MAX_PHYS_ADDR (1ULL << 40)
86
84/* backdoor interface to AGP driver */ 87/* backdoor interface to AGP driver */
85AGPEXTERN int agp_memory_reserved; 88AGPEXTERN int agp_memory_reserved;
86AGPEXTERN __u32 *agp_gatt_table; 89AGPEXTERN __u32 *agp_gatt_table;
@@ -212,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
212 size_t size, int dir, unsigned long align_mask) 215 size_t size, int dir, unsigned long align_mask)
213{ 216{
214 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); 217 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
215 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); 218 unsigned long iommu_page;
216 int i; 219 int i;
217 220
221 if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
222 return bad_dma_addr;
223
224 iommu_page = alloc_iommu(dev, npages, align_mask);
218 if (iommu_page == -1) { 225 if (iommu_page == -1) {
219 if (!nonforced_iommu(dev, phys_mem, size)) 226 if (!nonforced_iommu(dev, phys_mem, size))
220 return phys_mem; 227 return phys_mem;
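
The new GART_MAX_PHYS_ADDR guard makes dma_map_area() fail early for addresses the GART cannot remap (at or above 1 TB) instead of programming a bogus aperture entry. A standalone sketch of just that bounds check; only the constants mirror the patch, the rest is illustrative:

#include <stdio.h>

#define GART_MAX_PHYS_ADDR (1ULL << 40)		/* GART remaps < 1 TB only */
#define BAD_DMA_ADDR       (~0ULL)		/* stand-in for bad_dma_addr */

static unsigned long long toy_map_area(unsigned long long phys,
				       unsigned long long size)
{
	if (phys + size > GART_MAX_PHYS_ADDR)
		return BAD_DMA_ADDR;		/* reject, as the patch does */
	return phys;				/* pretend identity mapping  */
}

int main(void)
{
	printf("%llx\n", toy_map_area(0x12345000ULL, 0x1000));	/* mapped   */
	printf("%llx\n", toy_map_area(1ULL << 41, 0x1000));	/* rejected */
	return 0;
}
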
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 57ca77787220..cd8cbeb5fa34 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/pci-ats.h>
21#include <linux/bitmap.h> 22#include <linux/bitmap.h>
22#include <linux/slab.h> 23#include <linux/slab.h>
23#include <linux/debugfs.h> 24#include <linux/debugfs.h>
@@ -25,6 +26,7 @@
25#include <linux/dma-mapping.h> 26#include <linux/dma-mapping.h>
26#include <linux/iommu-helper.h> 27#include <linux/iommu-helper.h>
27#include <linux/iommu.h> 28#include <linux/iommu.h>
29#include <linux/delay.h>
28#include <asm/proto.h> 30#include <asm/proto.h>
29#include <asm/iommu.h> 31#include <asm/iommu.h>
30#include <asm/gart.h> 32#include <asm/gart.h>
@@ -34,7 +36,7 @@
34 36
35#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 37#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
36 38
37#define EXIT_LOOP_COUNT 10000000 39#define LOOP_TIMEOUT 100000
38 40
39static DEFINE_RWLOCK(amd_iommu_devtable_lock); 41static DEFINE_RWLOCK(amd_iommu_devtable_lock);
40 42
@@ -57,7 +59,6 @@ struct iommu_cmd {
57 u32 data[4]; 59 u32 data[4];
58}; 60};
59 61
60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
61static void update_domain(struct protection_domain *domain); 62static void update_domain(struct protection_domain *domain);
62 63
63/**************************************************************************** 64/****************************************************************************
@@ -322,8 +323,6 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
322 break; 323 break;
323 case EVENT_TYPE_ILL_CMD: 324 case EVENT_TYPE_ILL_CMD:
324 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 325 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
325 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 dump_command(address); 326 dump_command(address);
328 break; 327 break;
329 case EVENT_TYPE_CMD_HARD_ERR: 328 case EVENT_TYPE_CMD_HARD_ERR:
@@ -367,7 +366,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
367 spin_unlock_irqrestore(&iommu->lock, flags); 366 spin_unlock_irqrestore(&iommu->lock, flags);
368} 367}
369 368
370irqreturn_t amd_iommu_int_handler(int irq, void *data) 369irqreturn_t amd_iommu_int_thread(int irq, void *data)
371{ 370{
372 struct amd_iommu *iommu; 371 struct amd_iommu *iommu;
373 372
@@ -377,192 +376,300 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
377 return IRQ_HANDLED; 376 return IRQ_HANDLED;
378} 377}
379 378
379irqreturn_t amd_iommu_int_handler(int irq, void *data)
380{
381 return IRQ_WAKE_THREAD;
382}
383
380/**************************************************************************** 384/****************************************************************************
381 * 385 *
382 * IOMMU command queuing functions 386 * IOMMU command queuing functions
383 * 387 *
384 ****************************************************************************/ 388 ****************************************************************************/
385 389
386/* 390static int wait_on_sem(volatile u64 *sem)
387 * Writes the command to the IOMMUs command buffer and informs the 391{
388 * hardware about the new command. Must be called with iommu->lock held. 392 int i = 0;
389 */ 393
390static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 394 while (*sem == 0 && i < LOOP_TIMEOUT) {
395 udelay(1);
396 i += 1;
397 }
398
399 if (i == LOOP_TIMEOUT) {
400 pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
401 return -EIO;
402 }
403
404 return 0;
405}
406
407static void copy_cmd_to_buffer(struct amd_iommu *iommu,
408 struct iommu_cmd *cmd,
409 u32 tail)
391{ 410{
392 u32 tail, head;
393 u8 *target; 411 u8 *target;
394 412
395 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
396 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
397 target = iommu->cmd_buf + tail; 413 target = iommu->cmd_buf + tail;
398 memcpy_toio(target, cmd, sizeof(*cmd)); 414 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
399 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; 415
400 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 416 /* Copy command to buffer */
401 if (tail == head) 417 memcpy(target, cmd, sizeof(*cmd));
402 return -ENOMEM; 418
419 /* Tell the IOMMU about it */
403 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 420 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
421}
404 422
405 return 0; 423static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
424{
425 WARN_ON(address & 0x7ULL);
426
427 memset(cmd, 0, sizeof(*cmd));
428 cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
429 cmd->data[1] = upper_32_bits(__pa(address));
430 cmd->data[2] = 1;
431 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
432}
433
434static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
435{
436 memset(cmd, 0, sizeof(*cmd));
437 cmd->data[0] = devid;
438 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
439}
440
441static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
442 size_t size, u16 domid, int pde)
443{
444 u64 pages;
445 int s;
446
447 pages = iommu_num_pages(address, size, PAGE_SIZE);
448 s = 0;
449
450 if (pages > 1) {
451 /*
452 * If we have to flush more than one page, flush all
453 * TLB entries for this domain
454 */
455 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
456 s = 1;
457 }
458
459 address &= PAGE_MASK;
460
461 memset(cmd, 0, sizeof(*cmd));
462 cmd->data[1] |= domid;
463 cmd->data[2] = lower_32_bits(address);
464 cmd->data[3] = upper_32_bits(address);
465 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
466 if (s) /* size bit - we flush more than one 4kb page */
467 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
 468	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
469 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
470}
471
472static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
473 u64 address, size_t size)
474{
475 u64 pages;
476 int s;
477
478 pages = iommu_num_pages(address, size, PAGE_SIZE);
479 s = 0;
480
481 if (pages > 1) {
482 /*
483 * If we have to flush more than one page, flush all
484 * TLB entries for this domain
485 */
486 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
487 s = 1;
488 }
489
490 address &= PAGE_MASK;
491
492 memset(cmd, 0, sizeof(*cmd));
493 cmd->data[0] = devid;
494 cmd->data[0] |= (qdep & 0xff) << 24;
495 cmd->data[1] = devid;
496 cmd->data[2] = lower_32_bits(address);
497 cmd->data[3] = upper_32_bits(address);
498 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
499 if (s)
500 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
501}
502
503static void build_inv_all(struct iommu_cmd *cmd)
504{
505 memset(cmd, 0, sizeof(*cmd));
506 CMD_SET_TYPE(cmd, CMD_INV_ALL);
406} 507}
407 508
408/* 509/*
409 * General queuing function for commands. Takes iommu->lock and calls 510 * Writes the command to the IOMMUs command buffer and informs the
410 * __iommu_queue_command(). 511 * hardware about the new command.
411 */ 512 */
412static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 513static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
413{ 514{
515 u32 left, tail, head, next_tail;
414 unsigned long flags; 516 unsigned long flags;
415 int ret;
416 517
518 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
519
520again:
417 spin_lock_irqsave(&iommu->lock, flags); 521 spin_lock_irqsave(&iommu->lock, flags);
418 ret = __iommu_queue_command(iommu, cmd);
419 if (!ret)
420 iommu->need_sync = true;
421 spin_unlock_irqrestore(&iommu->lock, flags);
422 522
423 return ret; 523 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
424} 524 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
525 next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
526 left = (head - next_tail) % iommu->cmd_buf_size;
425 527
426/* 528 if (left <= 2) {
427 * This function waits until an IOMMU has completed a completion 529 struct iommu_cmd sync_cmd;
428 * wait command 530 volatile u64 sem = 0;
429 */ 531 int ret;
430static void __iommu_wait_for_completion(struct amd_iommu *iommu) 532
431{ 533 build_completion_wait(&sync_cmd, (u64)&sem);
432 int ready = 0; 534 copy_cmd_to_buffer(iommu, &sync_cmd, tail);
433 unsigned status = 0;
434 unsigned long i = 0;
435 535
436 INC_STATS_COUNTER(compl_wait); 536 spin_unlock_irqrestore(&iommu->lock, flags);
537
538 if ((ret = wait_on_sem(&sem)) != 0)
539 return ret;
437 540
438 while (!ready && (i < EXIT_LOOP_COUNT)) { 541 goto again;
439 ++i;
440 /* wait for the bit to become one */
441 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
442 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
443 } 542 }
444 543
445 /* set bit back to zero */ 544 copy_cmd_to_buffer(iommu, cmd, tail);
446 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 545
447 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 546 /* We need to sync now to make sure all commands are processed */
547 iommu->need_sync = true;
548
549 spin_unlock_irqrestore(&iommu->lock, flags);
448 550
449 if (unlikely(i == EXIT_LOOP_COUNT)) 551 return 0;
450 iommu->reset_in_progress = true;
451} 552}
452 553
453/* 554/*
454 * This function queues a completion wait command into the command 555 * This function queues a completion wait command into the command
455 * buffer of an IOMMU 556 * buffer of an IOMMU
456 */ 557 */
457static int __iommu_completion_wait(struct amd_iommu *iommu) 558static int iommu_completion_wait(struct amd_iommu *iommu)
458{ 559{
459 struct iommu_cmd cmd; 560 struct iommu_cmd cmd;
561 volatile u64 sem = 0;
562 int ret;
563
564 if (!iommu->need_sync)
565 return 0;
460 566
461 memset(&cmd, 0, sizeof(cmd)); 567 build_completion_wait(&cmd, (u64)&sem);
462 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
463 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
464 568
465 return __iommu_queue_command(iommu, &cmd); 569 ret = iommu_queue_command(iommu, &cmd);
570 if (ret)
571 return ret;
572
573 return wait_on_sem(&sem);
466} 574}
467 575
468/* 576static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
469 * This function is called whenever we need to ensure that the IOMMU has
470 * completed execution of all commands we sent. It sends a
471 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
472 * us about that by writing a value to a physical address we pass with
473 * the command.
474 */
475static int iommu_completion_wait(struct amd_iommu *iommu)
476{ 577{
477 int ret = 0; 578 struct iommu_cmd cmd;
478 unsigned long flags;
479
480 spin_lock_irqsave(&iommu->lock, flags);
481 579
482 if (!iommu->need_sync) 580 build_inv_dte(&cmd, devid);
483 goto out;
484 581
485 ret = __iommu_completion_wait(iommu); 582 return iommu_queue_command(iommu, &cmd);
583}
486 584
487 iommu->need_sync = false; 585static void iommu_flush_dte_all(struct amd_iommu *iommu)
586{
587 u32 devid;
488 588
489 if (ret) 589 for (devid = 0; devid <= 0xffff; ++devid)
490 goto out; 590 iommu_flush_dte(iommu, devid);
491 591
492 __iommu_wait_for_completion(iommu); 592 iommu_completion_wait(iommu);
593}
493 594
494out: 595/*
495 spin_unlock_irqrestore(&iommu->lock, flags); 596 * This function uses heavy locking and may disable irqs for some time. But
597 * this is no issue because it is only called during resume.
598 */
599static void iommu_flush_tlb_all(struct amd_iommu *iommu)
600{
601 u32 dom_id;
496 602
497 if (iommu->reset_in_progress) 603 for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
498 reset_iommu_command_buffer(iommu); 604 struct iommu_cmd cmd;
605 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
606 dom_id, 1);
607 iommu_queue_command(iommu, &cmd);
608 }
499 609
500 return 0; 610 iommu_completion_wait(iommu);
501} 611}
502 612
503static void iommu_flush_complete(struct protection_domain *domain) 613static void iommu_flush_all(struct amd_iommu *iommu)
504{ 614{
505 int i; 615 struct iommu_cmd cmd;
506 616
507 for (i = 0; i < amd_iommus_present; ++i) { 617 build_inv_all(&cmd);
508 if (!domain->dev_iommu[i])
509 continue;
510 618
511 /* 619 iommu_queue_command(iommu, &cmd);
512 * Devices of this domain are behind this IOMMU 620 iommu_completion_wait(iommu);
513 * We need to wait for completion of all commands. 621}
514 */ 622
515 iommu_completion_wait(amd_iommus[i]); 623void iommu_flush_all_caches(struct amd_iommu *iommu)
624{
625 if (iommu_feature(iommu, FEATURE_IA)) {
626 iommu_flush_all(iommu);
627 } else {
628 iommu_flush_dte_all(iommu);
629 iommu_flush_tlb_all(iommu);
516 } 630 }
517} 631}
518 632
519/* 633/*
520 * Command send function for invalidating a device table entry 634 * Command send function for flushing on-device TLB
521 */ 635 */
522static int iommu_flush_device(struct device *dev) 636static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
523{ 637{
638 struct pci_dev *pdev = to_pci_dev(dev);
524 struct amd_iommu *iommu; 639 struct amd_iommu *iommu;
525 struct iommu_cmd cmd; 640 struct iommu_cmd cmd;
526 u16 devid; 641 u16 devid;
642 int qdep;
527 643
644 qdep = pci_ats_queue_depth(pdev);
528 devid = get_device_id(dev); 645 devid = get_device_id(dev);
529 iommu = amd_iommu_rlookup_table[devid]; 646 iommu = amd_iommu_rlookup_table[devid];
530 647
531 /* Build command */ 648 build_inv_iotlb_pages(&cmd, devid, qdep, address, size);
532 memset(&cmd, 0, sizeof(cmd));
533 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
534 cmd.data[0] = devid;
535 649
536 return iommu_queue_command(iommu, &cmd); 650 return iommu_queue_command(iommu, &cmd);
537} 651}
538 652
539static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
540 u16 domid, int pde, int s)
541{
542 memset(cmd, 0, sizeof(*cmd));
543 address &= PAGE_MASK;
544 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
545 cmd->data[1] |= domid;
546 cmd->data[2] = lower_32_bits(address);
547 cmd->data[3] = upper_32_bits(address);
548 if (s) /* size bit - we flush more than one 4kb page */
549 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
550 if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
551 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
552}
553
554/* 653/*
555 * Generic command send function for invalidaing TLB entries 654 * Command send function for invalidating a device table entry
556 */ 655 */
557static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, 656static int device_flush_dte(struct device *dev)
558 u64 address, u16 domid, int pde, int s)
559{ 657{
560 struct iommu_cmd cmd; 658 struct amd_iommu *iommu;
659 struct pci_dev *pdev;
660 u16 devid;
561 int ret; 661 int ret;
562 662
563 __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); 663 pdev = to_pci_dev(dev);
664 devid = get_device_id(dev);
665 iommu = amd_iommu_rlookup_table[devid];
564 666
565 ret = iommu_queue_command(iommu, &cmd); 667 ret = iommu_flush_dte(iommu, devid);
668 if (ret)
669 return ret;
670
671 if (pci_ats_enabled(pdev))
672 ret = device_flush_iotlb(dev, 0, ~0UL);
566 673
567 return ret; 674 return ret;
568} 675}
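
device_flush_dte() and device_flush_iotlb() together form the ATS-aware invalidation: the device table entry is always flushed on the IOMMU side, and when ATS is enabled on the function the on-device IOTLB is invalidated as well. A condensed sketch of the pattern the attach and notifier paths further down rely on (setup of iommu is omitted; it is assumed to be the amd_iommu that owns dev):

    /* after the DTE for "dev" has been rewritten */
    device_flush_dte(dev);           /* DTE cache, plus on-device IOTLB if ATS is on */
    iommu_completion_wait(iommu);    /* wait until the IOMMU has drained the command */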
@@ -572,23 +679,14 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
572 * It invalidates a single PTE if the range to flush is within a single 679 * It invalidates a single PTE if the range to flush is within a single
573 * page. Otherwise it flushes the whole TLB of the IOMMU. 680 * page. Otherwise it flushes the whole TLB of the IOMMU.
574 */ 681 */
575static void __iommu_flush_pages(struct protection_domain *domain, 682static void __domain_flush_pages(struct protection_domain *domain,
576 u64 address, size_t size, int pde) 683 u64 address, size_t size, int pde)
577{ 684{
578 int s = 0, i; 685 struct iommu_dev_data *dev_data;
579 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); 686 struct iommu_cmd cmd;
580 687 int ret = 0, i;
581 address &= PAGE_MASK;
582
583 if (pages > 1) {
584 /*
585 * If we have to flush more than one page, flush all
586 * TLB entries for this domain
587 */
588 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
589 s = 1;
590 }
591 688
689 build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
592 690
593 for (i = 0; i < amd_iommus_present; ++i) { 691 for (i = 0; i < amd_iommus_present; ++i) {
594 if (!domain->dev_iommu[i]) 692 if (!domain->dev_iommu[i])
@@ -598,101 +696,70 @@ static void __iommu_flush_pages(struct protection_domain *domain,
598 * Devices of this domain are behind this IOMMU 696 * Devices of this domain are behind this IOMMU
599 * We need a TLB flush 697 * We need a TLB flush
600 */ 698 */
601 iommu_queue_inv_iommu_pages(amd_iommus[i], address, 699 ret |= iommu_queue_command(amd_iommus[i], &cmd);
602 domain->id, pde, s); 700 }
701
702 list_for_each_entry(dev_data, &domain->dev_list, list) {
703 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
704
705 if (!pci_ats_enabled(pdev))
706 continue;
707
708 ret |= device_flush_iotlb(dev_data->dev, address, size);
603 } 709 }
604 710
605 return; 711 WARN_ON(ret);
606} 712}
607 713
608static void iommu_flush_pages(struct protection_domain *domain, 714static void domain_flush_pages(struct protection_domain *domain,
609 u64 address, size_t size) 715 u64 address, size_t size)
610{ 716{
611 __iommu_flush_pages(domain, address, size, 0); 717 __domain_flush_pages(domain, address, size, 0);
612} 718}
613 719
614/* Flush the whole IO/TLB for a given protection domain */ 720/* Flush the whole IO/TLB for a given protection domain */
615static void iommu_flush_tlb(struct protection_domain *domain) 721static void domain_flush_tlb(struct protection_domain *domain)
616{ 722{
617 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); 723 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
618} 724}
619 725
620/* Flush the whole IO/TLB for a given protection domain - including PDE */ 726/* Flush the whole IO/TLB for a given protection domain - including PDE */
621static void iommu_flush_tlb_pde(struct protection_domain *domain) 727static void domain_flush_tlb_pde(struct protection_domain *domain)
622{ 728{
623 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); 729 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
624}
625
626
627/*
628 * This function flushes the DTEs for all devices in domain
629 */
630static void iommu_flush_domain_devices(struct protection_domain *domain)
631{
632 struct iommu_dev_data *dev_data;
633 unsigned long flags;
634
635 spin_lock_irqsave(&domain->lock, flags);
636
637 list_for_each_entry(dev_data, &domain->dev_list, list)
638 iommu_flush_device(dev_data->dev);
639
640 spin_unlock_irqrestore(&domain->lock, flags);
641} 730}
642 731
643static void iommu_flush_all_domain_devices(void) 732static void domain_flush_complete(struct protection_domain *domain)
644{ 733{
645 struct protection_domain *domain; 734 int i;
646 unsigned long flags;
647 735
648 spin_lock_irqsave(&amd_iommu_pd_lock, flags); 736 for (i = 0; i < amd_iommus_present; ++i) {
737 if (!domain->dev_iommu[i])
738 continue;
649 739
650 list_for_each_entry(domain, &amd_iommu_pd_list, list) { 740 /*
651 iommu_flush_domain_devices(domain); 741 * Devices of this domain are behind this IOMMU
652 iommu_flush_complete(domain); 742 * We need to wait for completion of all commands.
743 */
744 iommu_completion_wait(amd_iommus[i]);
653 } 745 }
654
655 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
656} 746}
657 747
658void amd_iommu_flush_all_devices(void)
659{
660 iommu_flush_all_domain_devices();
661}
662 748
663/* 749/*
664 * This function uses heavy locking and may disable irqs for some time. But 750 * This function flushes the DTEs for all devices in domain
665 * this is no issue because it is only called during resume.
666 */ 751 */
667void amd_iommu_flush_all_domains(void) 752static void domain_flush_devices(struct protection_domain *domain)
668{ 753{
669 struct protection_domain *domain; 754 struct iommu_dev_data *dev_data;
670 unsigned long flags; 755 unsigned long flags;
671 756
672 spin_lock_irqsave(&amd_iommu_pd_lock, flags); 757 spin_lock_irqsave(&domain->lock, flags);
673
674 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
675 spin_lock(&domain->lock);
676 iommu_flush_tlb_pde(domain);
677 iommu_flush_complete(domain);
678 spin_unlock(&domain->lock);
679 }
680
681 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
682}
683
684static void reset_iommu_command_buffer(struct amd_iommu *iommu)
685{
686 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
687
688 if (iommu->reset_in_progress)
689 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
690 758
691 amd_iommu_reset_cmd_buffer(iommu); 759 list_for_each_entry(dev_data, &domain->dev_list, list)
692 amd_iommu_flush_all_devices(); 760 device_flush_dte(dev_data->dev);
693 amd_iommu_flush_all_domains();
694 761
695 iommu->reset_in_progress = false; 762 spin_unlock_irqrestore(&domain->lock, flags);
696} 763}
697 764
698/**************************************************************************** 765/****************************************************************************
@@ -1410,17 +1477,22 @@ static bool dma_ops_domain(struct protection_domain *domain)
1410 return domain->flags & PD_DMA_OPS_MASK; 1477 return domain->flags & PD_DMA_OPS_MASK;
1411} 1478}
1412 1479
1413static void set_dte_entry(u16 devid, struct protection_domain *domain) 1480static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
1414{ 1481{
1415 u64 pte_root = virt_to_phys(domain->pt_root); 1482 u64 pte_root = virt_to_phys(domain->pt_root);
1483 u32 flags = 0;
1416 1484
1417 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 1485 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1418 << DEV_ENTRY_MODE_SHIFT; 1486 << DEV_ENTRY_MODE_SHIFT;
1419 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; 1487 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1420 1488
1421 amd_iommu_dev_table[devid].data[2] = domain->id; 1489 if (ats)
1422 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1490 flags |= DTE_FLAG_IOTLB;
1423 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1491
1492 amd_iommu_dev_table[devid].data[3] |= flags;
1493 amd_iommu_dev_table[devid].data[2] = domain->id;
1494 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1495 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1424} 1496}
1425 1497
1426static void clear_dte_entry(u16 devid) 1498static void clear_dte_entry(u16 devid)
@@ -1437,23 +1509,29 @@ static void do_attach(struct device *dev, struct protection_domain *domain)
1437{ 1509{
1438 struct iommu_dev_data *dev_data; 1510 struct iommu_dev_data *dev_data;
1439 struct amd_iommu *iommu; 1511 struct amd_iommu *iommu;
1512 struct pci_dev *pdev;
1513 bool ats = false;
1440 u16 devid; 1514 u16 devid;
1441 1515
1442 devid = get_device_id(dev); 1516 devid = get_device_id(dev);
1443 iommu = amd_iommu_rlookup_table[devid]; 1517 iommu = amd_iommu_rlookup_table[devid];
1444 dev_data = get_dev_data(dev); 1518 dev_data = get_dev_data(dev);
1519 pdev = to_pci_dev(dev);
1520
1521 if (amd_iommu_iotlb_sup)
1522 ats = pci_ats_enabled(pdev);
1445 1523
1446 /* Update data structures */ 1524 /* Update data structures */
1447 dev_data->domain = domain; 1525 dev_data->domain = domain;
1448 list_add(&dev_data->list, &domain->dev_list); 1526 list_add(&dev_data->list, &domain->dev_list);
1449 set_dte_entry(devid, domain); 1527 set_dte_entry(devid, domain, ats);
1450 1528
1451 /* Do reference counting */ 1529 /* Do reference counting */
1452 domain->dev_iommu[iommu->index] += 1; 1530 domain->dev_iommu[iommu->index] += 1;
1453 domain->dev_cnt += 1; 1531 domain->dev_cnt += 1;
1454 1532
1455 /* Flush the DTE entry */ 1533 /* Flush the DTE entry */
1456 iommu_flush_device(dev); 1534 device_flush_dte(dev);
1457} 1535}
1458 1536
1459static void do_detach(struct device *dev) 1537static void do_detach(struct device *dev)
@@ -1476,7 +1554,7 @@ static void do_detach(struct device *dev)
1476 clear_dte_entry(devid); 1554 clear_dte_entry(devid);
1477 1555
1478 /* Flush the DTE entry */ 1556 /* Flush the DTE entry */
1479 iommu_flush_device(dev); 1557 device_flush_dte(dev);
1480} 1558}
1481 1559
1482/* 1560/*
@@ -1539,9 +1617,13 @@ out_unlock:
1539static int attach_device(struct device *dev, 1617static int attach_device(struct device *dev,
1540 struct protection_domain *domain) 1618 struct protection_domain *domain)
1541{ 1619{
1620 struct pci_dev *pdev = to_pci_dev(dev);
1542 unsigned long flags; 1621 unsigned long flags;
1543 int ret; 1622 int ret;
1544 1623
1624 if (amd_iommu_iotlb_sup)
1625 pci_enable_ats(pdev, PAGE_SHIFT);
1626
1545 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1627 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1546 ret = __attach_device(dev, domain); 1628 ret = __attach_device(dev, domain);
1547 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1629 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
@@ -1551,7 +1633,7 @@ static int attach_device(struct device *dev,
1551 * left the caches in the IOMMU dirty. So we have to flush 1633 * left the caches in the IOMMU dirty. So we have to flush
1552 * here to evict all dirty stuff. 1634 * here to evict all dirty stuff.
1553 */ 1635 */
1554 iommu_flush_tlb_pde(domain); 1636 domain_flush_tlb_pde(domain);
1555 1637
1556 return ret; 1638 return ret;
1557} 1639}
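
attach_device() now turns on PCI ATS before the device is installed behind the domain, so that set_dte_entry() can set DTE_FLAG_IOTLB and the flush paths know to send IOTLB invalidations; detach_device() below switches ATS off again. A condensed view of that lifecycle, as a sketch assuming a device and IOMMU that both report ATS/IOTLB support:

    /* attach */
    pci_enable_ats(pdev, PAGE_SHIFT);      /* on-device IOTLB becomes active  */
    set_dte_entry(devid, domain, true);    /* DTE now carries DTE_FLAG_IOTLB  */
    device_flush_dte(dev);

    /* detach */
    __detach_device(dev);                  /* clears and flushes the DTE      */
    if (pci_ats_enabled(pdev))
            pci_disable_ats(pdev);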
@@ -1598,12 +1680,16 @@ static void __detach_device(struct device *dev)
1598 */ 1680 */
1599static void detach_device(struct device *dev) 1681static void detach_device(struct device *dev)
1600{ 1682{
1683 struct pci_dev *pdev = to_pci_dev(dev);
1601 unsigned long flags; 1684 unsigned long flags;
1602 1685
1603 /* lock device table */ 1686 /* lock device table */
1604 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1687 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1605 __detach_device(dev); 1688 __detach_device(dev);
1606 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1689 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1690
1691 if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
1692 pci_disable_ats(pdev);
1607} 1693}
1608 1694
1609/* 1695/*
@@ -1615,10 +1701,9 @@ static struct protection_domain *domain_for_device(struct device *dev)
1615 struct protection_domain *dom; 1701 struct protection_domain *dom;
1616 struct iommu_dev_data *dev_data, *alias_data; 1702 struct iommu_dev_data *dev_data, *alias_data;
1617 unsigned long flags; 1703 unsigned long flags;
1618 u16 devid, alias; 1704 u16 devid;
1619 1705
1620 devid = get_device_id(dev); 1706 devid = get_device_id(dev);
1621 alias = amd_iommu_alias_table[devid];
1622 dev_data = get_dev_data(dev); 1707 dev_data = get_dev_data(dev);
1623 alias_data = get_dev_data(dev_data->alias); 1708 alias_data = get_dev_data(dev_data->alias);
1624 if (!alias_data) 1709 if (!alias_data)
@@ -1692,7 +1777,7 @@ static int device_change_notifier(struct notifier_block *nb,
1692 goto out; 1777 goto out;
1693 } 1778 }
1694 1779
1695 iommu_flush_device(dev); 1780 device_flush_dte(dev);
1696 iommu_completion_wait(iommu); 1781 iommu_completion_wait(iommu);
1697 1782
1698out: 1783out:
@@ -1753,8 +1838,9 @@ static void update_device_table(struct protection_domain *domain)
1753 struct iommu_dev_data *dev_data; 1838 struct iommu_dev_data *dev_data;
1754 1839
1755 list_for_each_entry(dev_data, &domain->dev_list, list) { 1840 list_for_each_entry(dev_data, &domain->dev_list, list) {
1841 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
1756 u16 devid = get_device_id(dev_data->dev); 1842 u16 devid = get_device_id(dev_data->dev);
1757 set_dte_entry(devid, domain); 1843 set_dte_entry(devid, domain, pci_ats_enabled(pdev));
1758 } 1844 }
1759} 1845}
1760 1846
@@ -1764,8 +1850,9 @@ static void update_domain(struct protection_domain *domain)
1764 return; 1850 return;
1765 1851
1766 update_device_table(domain); 1852 update_device_table(domain);
1767 iommu_flush_domain_devices(domain); 1853
1768 iommu_flush_tlb_pde(domain); 1854 domain_flush_devices(domain);
1855 domain_flush_tlb_pde(domain);
1769 1856
1770 domain->updated = false; 1857 domain->updated = false;
1771} 1858}
@@ -1924,10 +2011,10 @@ retry:
1924 ADD_STATS_COUNTER(alloced_io_mem, size); 2011 ADD_STATS_COUNTER(alloced_io_mem, size);
1925 2012
1926 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 2013 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1927 iommu_flush_tlb(&dma_dom->domain); 2014 domain_flush_tlb(&dma_dom->domain);
1928 dma_dom->need_flush = false; 2015 dma_dom->need_flush = false;
1929 } else if (unlikely(amd_iommu_np_cache)) 2016 } else if (unlikely(amd_iommu_np_cache))
1930 iommu_flush_pages(&dma_dom->domain, address, size); 2017 domain_flush_pages(&dma_dom->domain, address, size);
1931 2018
1932out: 2019out:
1933 return address; 2020 return address;
@@ -1976,7 +2063,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
1976 dma_ops_free_addresses(dma_dom, dma_addr, pages); 2063 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1977 2064
1978 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 2065 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1979 iommu_flush_pages(&dma_dom->domain, flush_addr, size); 2066 domain_flush_pages(&dma_dom->domain, flush_addr, size);
1980 dma_dom->need_flush = false; 2067 dma_dom->need_flush = false;
1981 } 2068 }
1982} 2069}
@@ -2012,7 +2099,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
2012 if (addr == DMA_ERROR_CODE) 2099 if (addr == DMA_ERROR_CODE)
2013 goto out; 2100 goto out;
2014 2101
2015 iommu_flush_complete(domain); 2102 domain_flush_complete(domain);
2016 2103
2017out: 2104out:
2018 spin_unlock_irqrestore(&domain->lock, flags); 2105 spin_unlock_irqrestore(&domain->lock, flags);
@@ -2039,7 +2126,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
2039 2126
2040 __unmap_single(domain->priv, dma_addr, size, dir); 2127 __unmap_single(domain->priv, dma_addr, size, dir);
2041 2128
2042 iommu_flush_complete(domain); 2129 domain_flush_complete(domain);
2043 2130
2044 spin_unlock_irqrestore(&domain->lock, flags); 2131 spin_unlock_irqrestore(&domain->lock, flags);
2045} 2132}
@@ -2104,7 +2191,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
2104 goto unmap; 2191 goto unmap;
2105 } 2192 }
2106 2193
2107 iommu_flush_complete(domain); 2194 domain_flush_complete(domain);
2108 2195
2109out: 2196out:
2110 spin_unlock_irqrestore(&domain->lock, flags); 2197 spin_unlock_irqrestore(&domain->lock, flags);
@@ -2150,7 +2237,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
2150 s->dma_address = s->dma_length = 0; 2237 s->dma_address = s->dma_length = 0;
2151 } 2238 }
2152 2239
2153 iommu_flush_complete(domain); 2240 domain_flush_complete(domain);
2154 2241
2155 spin_unlock_irqrestore(&domain->lock, flags); 2242 spin_unlock_irqrestore(&domain->lock, flags);
2156} 2243}
@@ -2200,7 +2287,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
2200 goto out_free; 2287 goto out_free;
2201 } 2288 }
2202 2289
2203 iommu_flush_complete(domain); 2290 domain_flush_complete(domain);
2204 2291
2205 spin_unlock_irqrestore(&domain->lock, flags); 2292 spin_unlock_irqrestore(&domain->lock, flags);
2206 2293
@@ -2232,7 +2319,7 @@ static void free_coherent(struct device *dev, size_t size,
2232 2319
2233 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2320 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2234 2321
2235 iommu_flush_complete(domain); 2322 domain_flush_complete(domain);
2236 2323
2237 spin_unlock_irqrestore(&domain->lock, flags); 2324 spin_unlock_irqrestore(&domain->lock, flags);
2238 2325
@@ -2476,7 +2563,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom,
2476 if (!iommu) 2563 if (!iommu)
2477 return; 2564 return;
2478 2565
2479 iommu_flush_device(dev); 2566 device_flush_dte(dev);
2480 iommu_completion_wait(iommu); 2567 iommu_completion_wait(iommu);
2481} 2568}
2482 2569
@@ -2542,7 +2629,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2542 unmap_size = iommu_unmap_page(domain, iova, page_size); 2629 unmap_size = iommu_unmap_page(domain, iova, page_size);
2543 mutex_unlock(&domain->api_lock); 2630 mutex_unlock(&domain->api_lock);
2544 2631
2545 iommu_flush_tlb_pde(domain); 2632 domain_flush_tlb_pde(domain);
2546 2633
2547 return get_order(unmap_size); 2634 return get_order(unmap_size);
2548} 2635}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 246d727b65b7..9179c21120a8 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -137,6 +137,7 @@ int amd_iommus_present;
137 137
138/* IOMMUs have a non-present cache? */ 138/* IOMMUs have a non-present cache? */
139bool amd_iommu_np_cache __read_mostly; 139bool amd_iommu_np_cache __read_mostly;
140bool amd_iommu_iotlb_sup __read_mostly = true;
140 141
141/* 142/*
142 * The ACPI table parsing functions set this variable on an error 143 * The ACPI table parsing functions set this variable on an error
@@ -180,6 +181,12 @@ static u32 dev_table_size; /* size of the device table */
180static u32 alias_table_size; /* size of the alias table */ 181static u32 alias_table_size; /* size of the alias table */
181static u32 rlookup_table_size; /* size of the rlookup table */ 182static u32 rlookup_table_size; /* size of the rlookup table */
182 183
184/*
185 * This function flushes all internal caches of
186 * the IOMMU used by this driver.
187 */
188extern void iommu_flush_all_caches(struct amd_iommu *iommu);
189
183static inline void update_last_devid(u16 devid) 190static inline void update_last_devid(u16 devid)
184{ 191{
185 if (devid > amd_iommu_last_bdf) 192 if (devid > amd_iommu_last_bdf)
@@ -293,9 +300,23 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
293/* Function to enable the hardware */ 300/* Function to enable the hardware */
294static void iommu_enable(struct amd_iommu *iommu) 301static void iommu_enable(struct amd_iommu *iommu)
295{ 302{
296 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", 303 static const char * const feat_str[] = {
304 "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
305 "IA", "GA", "HE", "PC", NULL
306 };
307 int i;
308
309 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
297 dev_name(&iommu->dev->dev), iommu->cap_ptr); 310 dev_name(&iommu->dev->dev), iommu->cap_ptr);
298 311
312 if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
313 printk(KERN_CONT " extended features: ");
314 for (i = 0; feat_str[i]; ++i)
315 if (iommu_feature(iommu, (1ULL << i)))
316 printk(KERN_CONT " %s", feat_str[i]);
317 }
318 printk(KERN_CONT "\n");
319
299 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 320 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
300} 321}
301 322
@@ -651,7 +672,7 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
651static void __init init_iommu_from_pci(struct amd_iommu *iommu) 672static void __init init_iommu_from_pci(struct amd_iommu *iommu)
652{ 673{
653 int cap_ptr = iommu->cap_ptr; 674 int cap_ptr = iommu->cap_ptr;
654 u32 range, misc; 675 u32 range, misc, low, high;
655 int i, j; 676 int i, j;
656 677
657 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, 678 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
@@ -667,6 +688,15 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
667 MMIO_GET_LD(range)); 688 MMIO_GET_LD(range));
668 iommu->evt_msi_num = MMIO_MSI_NUM(misc); 689 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
669 690
691 if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
692 amd_iommu_iotlb_sup = false;
693
694 /* read extended feature bits */
695 low = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
696 high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
697
698 iommu->features = ((u64)high << 32) | low;
699
670 if (!is_rd890_iommu(iommu->dev)) 700 if (!is_rd890_iommu(iommu->dev))
671 return; 701 return;
672 702
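
init_iommu_from_pci() now reads the 64-bit extended feature register as two 32-bit MMIO words and keeps the combined value in iommu->features. iommu_feature(), used by iommu_enable() and iommu_flush_all_caches(), is not part of this hunk; presumably it is little more than a bit test, roughly:

    /* Sketch only - the real helper lives in a header, not in this patch. */
    static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
    {
            return (iommu->features & f) != 0;
    }

FEATURE_IA is the bit that lets iommu_flush_all_caches() use a single INVALIDATE_ALL command instead of flushing every DTE and every domain TLB individually.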
@@ -1004,10 +1034,11 @@ static int iommu_setup_msi(struct amd_iommu *iommu)
1004 if (pci_enable_msi(iommu->dev)) 1034 if (pci_enable_msi(iommu->dev))
1005 return 1; 1035 return 1;
1006 1036
1007 r = request_irq(iommu->dev->irq, amd_iommu_int_handler, 1037 r = request_threaded_irq(iommu->dev->irq,
1008 IRQF_SAMPLE_RANDOM, 1038 amd_iommu_int_handler,
1009 "AMD-Vi", 1039 amd_iommu_int_thread,
1010 NULL); 1040 0, "AMD-Vi",
1041 iommu->dev);
1011 1042
1012 if (r) { 1043 if (r) {
1013 pci_disable_msi(iommu->dev); 1044 pci_disable_msi(iommu->dev);
@@ -1244,6 +1275,7 @@ static void enable_iommus(void)
1244 iommu_set_exclusion_range(iommu); 1275 iommu_set_exclusion_range(iommu);
1245 iommu_init_msi(iommu); 1276 iommu_init_msi(iommu);
1246 iommu_enable(iommu); 1277 iommu_enable(iommu);
1278 iommu_flush_all_caches(iommu);
1247 } 1279 }
1248} 1280}
1249 1281
@@ -1274,8 +1306,8 @@ static void amd_iommu_resume(void)
1274 * we have to flush after the IOMMUs are enabled because a 1306 * we have to flush after the IOMMUs are enabled because a
1275 * disabled IOMMU will never execute the commands we send 1307 * disabled IOMMU will never execute the commands we send
1276 */ 1308 */
1277 amd_iommu_flush_all_devices(); 1309 for_each_iommu(iommu)
1278 amd_iommu_flush_all_domains(); 1310 iommu_flush_all_caches(iommu);
1279} 1311}
1280 1312
1281static int amd_iommu_suspend(void) 1313static int amd_iommu_suspend(void)
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index cd1ffed4ee22..289e92862fd9 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = {
177 .rating = APBT_CLOCKSOURCE_RATING, 177 .rating = APBT_CLOCKSOURCE_RATING,
178 .read = apbt_read_clocksource, 178 .read = apbt_read_clocksource,
179 .mask = APBT_MASK, 179 .mask = APBT_MASK,
180 .shift = APBT_SHIFT,
181 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 180 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
182 .resume = apbt_restart_clocksource, 181 .resume = apbt_restart_clocksource,
183}; 182};
@@ -543,14 +542,7 @@ static int apbt_clocksource_register(void)
543 if (t1 == apbt_read_clocksource(&clocksource_apbt)) 542 if (t1 == apbt_read_clocksource(&clocksource_apbt))
544 panic("APBT counter not counting. APBT disabled\n"); 543 panic("APBT counter not counting. APBT disabled\n");
545 544
546 /* 545 clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000);
547 * initialize and register APBT clocksource
548 * convert that to ns/clock cycle
549 * mult = (ns/c) * 2^APBT_SHIFT
550 */
551 clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
552 (unsigned long) apbt_freq, APBT_SHIFT);
553 clocksource_register(&clocksource_apbt);
554 546
555 return 0; 547 return 0;
556} 548}
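
The apbt conversion drops the hand-computed mult/shift pair: clocksource_register_khz() lets the clocksource core derive both from the frequency while keeping the invariant ns = (cycles * mult) >> shift. A small illustrative calculation (the frequency and shift are made up, not taken from this patch):

    /* Example: a 12.5 MHz APB timer, i.e. 12500 kHz.
     * If the core picked shift = 20, it would choose roughly
     *     mult = (1000000 << 20) / 12500 = 83886080
     * so one cycle converts to (1 * mult) >> 20 = 80 ns, matching the
     * 80 ns clock period.
     */
    clocksource_register_khz(&clocksource_apbt, 12500);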
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 86d1ad4962a7..3d2661ca6542 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -30,6 +30,22 @@
30#include <asm/amd_nb.h> 30#include <asm/amd_nb.h>
31#include <asm/x86_init.h> 31#include <asm/x86_init.h>
32 32
33/*
34 * Using 512M as the goal, in case kexec will load kernel_big
35 * that will do the in-place decompress, and could overlap with
36 * the gart aperture that is used.
37 * Sequence:
38 * kernel_small
39 * ==> kexec (with kdump trigger path or gart still enabled)
40 * ==> kernel_small (gart area becomes e820_reserved)
41 * ==> kexec (with kdump trigger path or gart still enabled)
42 * ==> kernel_big (uncompressed size will be bigger than 64M or 128M)
43 * So don't use the space below 512M for the gart iommu; leave it
44 * for kernel code, to be safe.
45 */
46#define GART_MIN_ADDR (512ULL << 20)
47#define GART_MAX_ADDR (1ULL << 32)
48
33int gart_iommu_aperture; 49int gart_iommu_aperture;
34int gart_iommu_aperture_disabled __initdata; 50int gart_iommu_aperture_disabled __initdata;
35int gart_iommu_aperture_allowed __initdata; 51int gart_iommu_aperture_allowed __initdata;
@@ -70,21 +86,9 @@ static u32 __init allocate_aperture(void)
70 * memory. Unfortunately we cannot move it up because that would 86 * memory. Unfortunately we cannot move it up because that would
71 * make the IOMMU useless. 87 * make the IOMMU useless.
72 */ 88 */
73 /* 89 addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
74 * using 512M as goal, in case kexec will load kernel_big 90 aper_size, aper_size);
75 * that will do the on position decompress, and could overlap with 91 if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
76 * that position with gart that is used.
77 * sequende:
78 * kernel_small
79 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
80 * ==> kernel_small(gart area become e820_reserved)
81 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
82 * ==> kerne_big (uncompressed size will be big than 64M or 128M)
83 * so don't use 512M below as gart iommu, leave the space for kernel
84 * code for safe
85 */
86 addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
87 if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
88 printk(KERN_ERR 92 printk(KERN_ERR
89 "Cannot allocate aperture memory hole (%lx,%uK)\n", 93 "Cannot allocate aperture memory hole (%lx,%uK)\n",
90 addr, aper_size>>10); 94 addr, aper_size>>10);
@@ -499,7 +503,7 @@ out:
499 * Don't enable translation yet but enable GART IO and CPU 503 * Don't enable translation yet but enable GART IO and CPU
500 * accesses and set DISTLBWALKPRB since GART table memory is UC. 504 * accesses and set DISTLBWALKPRB since GART table memory is UC.
501 */ 505 */
502 u32 ctl = DISTLBWALKPRB | aper_order << 1; 506 u32 ctl = aper_order << 1;
503 507
504 bus = amd_nb_bus_dev_ranges[i].bus; 508 bus = amd_nb_bus_dev_ranges[i].bus;
505 dev_base = amd_nb_bus_dev_ranges[i].dev_base; 509 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 3966b564ea47..767fd04f2843 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,20 +2,25 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o
6obj-y += hw_nmi.o 6obj-y += hw_nmi.o
7 7
8obj-$(CONFIG_X86_IO_APIC) += io_apic.o 8obj-$(CONFIG_X86_IO_APIC) += io_apic.o
9obj-$(CONFIG_SMP) += ipi.o 9obj-$(CONFIG_SMP) += ipi.o
10 10
11ifeq ($(CONFIG_X86_64),y) 11ifeq ($(CONFIG_X86_64),y)
12obj-y += apic_flat_64.o 12# APIC probe will depend on the listing order here
13obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
14obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
15obj-$(CONFIG_X86_UV) += x2apic_uv_x.o 13obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
14obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
15obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
16obj-y += apic_flat_64.o
16endif 17endif
17 18
18obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o 19# APIC probe will depend on the listing order here
19obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 20obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
20obj-$(CONFIG_X86_ES7000) += es7000_32.o
21obj-$(CONFIG_X86_SUMMIT) += summit_32.o 21obj-$(CONFIG_X86_SUMMIT) += summit_32.o
22obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
23obj-$(CONFIG_X86_ES7000) += es7000_32.o
24
25# For 32bit, probe_32 needs to be listed last
26obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index fabf01eff771..b961af86bfea 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -505,7 +505,7 @@ static void __cpuinit setup_APIC_timer(void)
505{ 505{
506 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 506 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
507 507
508 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) { 508 if (this_cpu_has(X86_FEATURE_ARAT)) {
509 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; 509 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
510 /* Make LAPIC timer preferable over percpu HPET */ 510 /* Make LAPIC timer preferable over percpu HPET */
511 lapic_clockevent.rating = 150; 511 lapic_clockevent.rating = 150;
@@ -1237,6 +1237,17 @@ void __cpuinit setup_local_APIC(void)
1237 /* always use the value from LDR */ 1237 /* always use the value from LDR */
1238 early_per_cpu(x86_cpu_to_logical_apicid, cpu) = 1238 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1239 logical_smp_processor_id(); 1239 logical_smp_processor_id();
1240
1241 /*
1242 * Some NUMA implementations (NUMAQ) don't initialize apicid to
1243 * node mapping during NUMA init. Now that logical apicid is
1244 * guaranteed to be known, give it another chance. This is already
1245 * a bit too late - percpu allocation has already happened without
1246 * proper NUMA affinity.
1247 */
1248 if (apic->x86_32_numa_cpu_node)
1249 set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
1250 apic->x86_32_numa_cpu_node(cpu));
1240#endif 1251#endif
1241 1252
1242 /* 1253 /*
@@ -1450,7 +1461,6 @@ int __init enable_IR(void)
1450void __init enable_IR_x2apic(void) 1461void __init enable_IR_x2apic(void)
1451{ 1462{
1452 unsigned long flags; 1463 unsigned long flags;
1453 struct IO_APIC_route_entry **ioapic_entries;
1454 int ret, x2apic_enabled = 0; 1464 int ret, x2apic_enabled = 0;
1455 int dmar_table_init_ret; 1465 int dmar_table_init_ret;
1456 1466
@@ -1458,13 +1468,7 @@ void __init enable_IR_x2apic(void)
1458 if (dmar_table_init_ret && !x2apic_supported()) 1468 if (dmar_table_init_ret && !x2apic_supported())
1459 return; 1469 return;
1460 1470
1461 ioapic_entries = alloc_ioapic_entries(); 1471 ret = save_ioapic_entries();
1462 if (!ioapic_entries) {
1463 pr_err("Allocate ioapic_entries failed\n");
1464 goto out;
1465 }
1466
1467 ret = save_IO_APIC_setup(ioapic_entries);
1468 if (ret) { 1472 if (ret) {
1469 pr_info("Saving IO-APIC state failed: %d\n", ret); 1473 pr_info("Saving IO-APIC state failed: %d\n", ret);
1470 goto out; 1474 goto out;
@@ -1472,7 +1476,7 @@ void __init enable_IR_x2apic(void)
1472 1476
1473 local_irq_save(flags); 1477 local_irq_save(flags);
1474 legacy_pic->mask_all(); 1478 legacy_pic->mask_all();
1475 mask_IO_APIC_setup(ioapic_entries); 1479 mask_ioapic_entries();
1476 1480
1477 if (dmar_table_init_ret) 1481 if (dmar_table_init_ret)
1478 ret = 0; 1482 ret = 0;
@@ -1503,14 +1507,11 @@ void __init enable_IR_x2apic(void)
1503 1507
1504nox2apic: 1508nox2apic:
1505 if (!ret) /* IR enabling failed */ 1509 if (!ret) /* IR enabling failed */
1506 restore_IO_APIC_setup(ioapic_entries); 1510 restore_ioapic_entries();
1507 legacy_pic->restore_mask(); 1511 legacy_pic->restore_mask();
1508 local_irq_restore(flags); 1512 local_irq_restore(flags);
1509 1513
1510out: 1514out:
1511 if (ioapic_entries)
1512 free_ioapic_entries(ioapic_entries);
1513
1514 if (x2apic_enabled) 1515 if (x2apic_enabled)
1515 return; 1516 return;
1516 1517
@@ -1812,30 +1813,41 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1812 */ 1813 */
1813void smp_error_interrupt(struct pt_regs *regs) 1814void smp_error_interrupt(struct pt_regs *regs)
1814{ 1815{
1815 u32 v, v1; 1816 u32 v0, v1;
1817 u32 i = 0;
1818 static const char * const error_interrupt_reason[] = {
1819 "Send CS error", /* APIC Error Bit 0 */
1820 "Receive CS error", /* APIC Error Bit 1 */
1821 "Send accept error", /* APIC Error Bit 2 */
1822 "Receive accept error", /* APIC Error Bit 3 */
1823 "Redirectable IPI", /* APIC Error Bit 4 */
1824 "Send illegal vector", /* APIC Error Bit 5 */
1825 "Received illegal vector", /* APIC Error Bit 6 */
1826 "Illegal register address", /* APIC Error Bit 7 */
1827 };
1816 1828
1817 exit_idle(); 1829 exit_idle();
1818 irq_enter(); 1830 irq_enter();
1819 /* First tickle the hardware, only then report what went on. -- REW */ 1831 /* First tickle the hardware, only then report what went on. -- REW */
1820 v = apic_read(APIC_ESR); 1832 v0 = apic_read(APIC_ESR);
1821 apic_write(APIC_ESR, 0); 1833 apic_write(APIC_ESR, 0);
1822 v1 = apic_read(APIC_ESR); 1834 v1 = apic_read(APIC_ESR);
1823 ack_APIC_irq(); 1835 ack_APIC_irq();
1824 atomic_inc(&irq_err_count); 1836 atomic_inc(&irq_err_count);
1825 1837
1826 /* 1838 apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
1827 * Here is what the APIC error bits mean: 1839 smp_processor_id(), v0 , v1);
1828 * 0: Send CS error 1840
1829 * 1: Receive CS error 1841 v1 = v1 & 0xff;
1830 * 2: Send accept error 1842 while (v1) {
1831 * 3: Receive accept error 1843 if (v1 & 0x1)
1832 * 4: Reserved 1844 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
1833 * 5: Send illegal vector 1845 i++;
1834 * 6: Received illegal vector 1846 v1 >>= 1;
1835 * 7: Illegal register address 1847 };
1836 */ 1848
1837 pr_debug("APIC error on CPU%d: %02x(%02x)\n", 1849 apic_printk(APIC_DEBUG, KERN_CONT "\n");
1838 smp_processor_id(), v , v1); 1850
1839 irq_exit(); 1851 irq_exit();
1840} 1852}
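
smp_error_interrupt() now decodes the low eight ESR bits into readable reasons instead of pointing the reader at a comment. A worked decode with an arbitrarily chosen value:

    /* Illustration only, not from the patch:
     *     v1 = 0x44 = 0b01000100
     *     bit 2 -> "Send accept error"
     *     bit 6 -> "Received illegal vector"
     * so the debug line gains " : Send accept error : Received illegal vector".
     */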
1841 1853
@@ -2003,21 +2015,6 @@ void default_init_apic_ldr(void)
2003 apic_write(APIC_LDR, val); 2015 apic_write(APIC_LDR, val);
2004} 2016}
2005 2017
2006#ifdef CONFIG_X86_32
2007int default_x86_32_numa_cpu_node(int cpu)
2008{
2009#ifdef CONFIG_NUMA
2010 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
2011
2012 if (apicid != BAD_APICID)
2013 return __apicid_to_node[apicid];
2014 return NUMA_NO_NODE;
2015#else
2016 return 0;
2017#endif
2018}
2019#endif
2020
2021/* 2018/*
2022 * Power management 2019 * Power management
2023 */ 2020 */
@@ -2088,28 +2085,20 @@ static void lapic_resume(void)
2088{ 2085{
2089 unsigned int l, h; 2086 unsigned int l, h;
2090 unsigned long flags; 2087 unsigned long flags;
2091 int maxlvt, ret; 2088 int maxlvt;
2092 struct IO_APIC_route_entry **ioapic_entries = NULL;
2093 2089
2094 if (!apic_pm_state.active) 2090 if (!apic_pm_state.active)
2095 return; 2091 return;
2096 2092
2097 local_irq_save(flags); 2093 local_irq_save(flags);
2098 if (intr_remapping_enabled) { 2094 if (intr_remapping_enabled) {
2099 ioapic_entries = alloc_ioapic_entries(); 2095 /*
2100 if (!ioapic_entries) { 2096 * IO-APIC and PIC have their own resume routines.
2101 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2097 * We just mask them here to make sure the interrupt
2102 goto restore; 2098 * subsystem is completely quiet while we enable x2apic
2103 } 2099 * and interrupt-remapping.
2104 2100 */
2105 ret = save_IO_APIC_setup(ioapic_entries); 2101 mask_ioapic_entries();
2106 if (ret) {
2107 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
2108 free_ioapic_entries(ioapic_entries);
2109 goto restore;
2110 }
2111
2112 mask_IO_APIC_setup(ioapic_entries);
2113 legacy_pic->mask_all(); 2102 legacy_pic->mask_all();
2114 } 2103 }
2115 2104
@@ -2152,13 +2141,9 @@ static void lapic_resume(void)
2152 apic_write(APIC_ESR, 0); 2141 apic_write(APIC_ESR, 0);
2153 apic_read(APIC_ESR); 2142 apic_read(APIC_ESR);
2154 2143
2155 if (intr_remapping_enabled) { 2144 if (intr_remapping_enabled)
2156 reenable_intr_remapping(x2apic_mode); 2145 reenable_intr_remapping(x2apic_mode);
2157 legacy_pic->restore_mask(); 2146
2158 restore_IO_APIC_setup(ioapic_entries);
2159 free_ioapic_entries(ioapic_entries);
2160 }
2161restore:
2162 local_irq_restore(flags); 2147 local_irq_restore(flags);
2163} 2148}
2164 2149
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 5652d31fe108..f7a41e4cae47 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -16,6 +16,7 @@
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/module.h>
19#include <asm/smp.h> 20#include <asm/smp.h>
20#include <asm/apic.h> 21#include <asm/apic.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
@@ -24,6 +25,12 @@
24#include <acpi/acpi_bus.h> 25#include <acpi/acpi_bus.h>
25#endif 26#endif
26 27
28static struct apic apic_physflat;
29static struct apic apic_flat;
30
31struct apic __read_mostly *apic = &apic_flat;
32EXPORT_SYMBOL_GPL(apic);
33
27static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 34static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
28{ 35{
29 return 1; 36 return 1;
@@ -164,7 +171,7 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
164 return initial_apic_id >> index_msb; 171 return initial_apic_id >> index_msb;
165} 172}
166 173
167struct apic apic_flat = { 174static struct apic apic_flat = {
168 .name = "flat", 175 .name = "flat",
169 .probe = NULL, 176 .probe = NULL,
170 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 177 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
@@ -312,10 +319,18 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
312 return per_cpu(x86_cpu_to_apicid, cpu); 319 return per_cpu(x86_cpu_to_apicid, cpu);
313} 320}
314 321
315struct apic apic_physflat = { 322static int physflat_probe(void)
323{
324 if (apic == &apic_physflat || num_possible_cpus() > 8)
325 return 1;
326
327 return 0;
328}
329
330static struct apic apic_physflat = {
316 331
317 .name = "physical flat", 332 .name = "physical flat",
318 .probe = NULL, 333 .probe = physflat_probe,
319 .acpi_madt_oem_check = physflat_acpi_madt_oem_check, 334 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
320 .apic_id_registered = flat_apic_id_registered, 335 .apic_id_registered = flat_apic_id_registered,
321 336
@@ -369,3 +384,8 @@ struct apic apic_physflat = {
369 .wait_icr_idle = native_apic_wait_icr_idle, 384 .wait_icr_idle = native_apic_wait_icr_idle,
370 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 385 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
371}; 386};
387
388/*
389 * We need to check for physflat first, so this order is important.
390 */
391apic_drivers(apic_physflat, apic_flat);
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index f1baa2dc087a..775b82bc655c 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -119,14 +119,6 @@ static void noop_apic_write(u32 reg, u32 v)
119 WARN_ON_ONCE(cpu_has_apic && !disable_apic); 119 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
120} 120}
121 121
122#ifdef CONFIG_X86_32
123static int noop_x86_32_numa_cpu_node(int cpu)
124{
125 /* we're always on node 0 */
126 return 0;
127}
128#endif
129
130struct apic apic_noop = { 122struct apic apic_noop = {
131 .name = "noop", 123 .name = "noop",
132 .probe = noop_probe, 124 .probe = noop_probe,
@@ -195,6 +187,5 @@ struct apic apic_noop = {
195 187
196#ifdef CONFIG_X86_32 188#ifdef CONFIG_X86_32
197 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, 189 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
198 .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node,
199#endif 190#endif
200}; 191};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 541a2e431659..efd737e827f4 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -193,7 +193,7 @@ static int probe_bigsmp(void)
193 return dmi_bigsmp; 193 return dmi_bigsmp;
194} 194}
195 195
196struct apic apic_bigsmp = { 196static struct apic apic_bigsmp = {
197 197
198 .name = "bigsmp", 198 .name = "bigsmp",
199 .probe = probe_bigsmp, 199 .probe = probe_bigsmp,
@@ -253,5 +253,14 @@ struct apic apic_bigsmp = {
253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
254 254
255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, 255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
256 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
257}; 256};
257
258struct apic * __init generic_bigsmp_probe(void)
259{
260 if (probe_bigsmp())
261 return &apic_bigsmp;
262
263 return NULL;
264}
265
266apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 3e9de4854c5b..9536b3fe43f8 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -510,11 +510,6 @@ static void es7000_setup_apic_routing(void)
510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
511} 511}
512 512
513static int es7000_numa_cpu_node(int cpu)
514{
515 return 0;
516}
517
518static int es7000_cpu_present_to_apicid(int mps_cpu) 513static int es7000_cpu_present_to_apicid(int mps_cpu)
519{ 514{
520 if (!mps_cpu) 515 if (!mps_cpu)
@@ -625,7 +620,7 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
625} 620}
626 621
627/* We've been warned by a false positive warning.Use __refdata to keep calm. */ 622/* We've been warned by a false positive warning.Use __refdata to keep calm. */
628struct apic __refdata apic_es7000_cluster = { 623static struct apic __refdata apic_es7000_cluster = {
629 624
630 .name = "es7000", 625 .name = "es7000",
631 .probe = probe_es7000, 626 .probe = probe_es7000,
@@ -688,10 +683,9 @@ struct apic __refdata apic_es7000_cluster = {
688 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 683 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
689 684
690 .x86_32_early_logical_apicid = es7000_early_logical_apicid, 685 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
691 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
692}; 686};
693 687
694struct apic __refdata apic_es7000 = { 688static struct apic __refdata apic_es7000 = {
695 689
696 .name = "es7000", 690 .name = "es7000",
697 .probe = probe_es7000, 691 .probe = probe_es7000,
@@ -752,5 +746,10 @@ struct apic __refdata apic_es7000 = {
752 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 746 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
753 747
754 .x86_32_early_logical_apicid = es7000_early_logical_apicid, 748 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
755 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
756}; 749};
750
751/*
752 * Need to check for es7000 followed by es7000_cluster, so this order
753 * in apic_drivers is important.
754 */
755apic_drivers(apic_es7000, apic_es7000_cluster);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 5260fe91bcb6..d5e57db0f7be 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -19,9 +19,9 @@
19#include <linux/delay.h> 19#include <linux/delay.h>
20 20
21#ifdef CONFIG_HARDLOCKUP_DETECTOR 21#ifdef CONFIG_HARDLOCKUP_DETECTOR
22u64 hw_nmi_get_sample_period(void) 22u64 hw_nmi_get_sample_period(int watchdog_thresh)
23{ 23{
24 return (u64)(cpu_khz) * 1000 * 60; 24 return (u64)(cpu_khz) * 1000 * watchdog_thresh;
25} 25}
26#endif 26#endif
27 27
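
The hardlockup sample period is now scaled by the configurable watchdog_thresh instead of the hard-coded 60. A worked example with assumed numbers (not from this patch):

    /* cpu_khz         = 2000000   -> a 2 GHz CPU
     * watchdog_thresh = 10        -> threshold in seconds
     * period          = 2000000 * 1000 * 10
     *                 = 20,000,000,000 cycles, i.e. 10 s worth of cycles
     */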
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 68df09bba92e..e5293394b548 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -76,17 +76,40 @@ int sis_apic_bug = -1;
76static DEFINE_RAW_SPINLOCK(ioapic_lock); 76static DEFINE_RAW_SPINLOCK(ioapic_lock);
77static DEFINE_RAW_SPINLOCK(vector_lock); 77static DEFINE_RAW_SPINLOCK(vector_lock);
78 78
79/* 79static struct ioapic {
80 * # of IRQ routing registers 80 /*
81 */ 81 * # of IRQ routing registers
82int nr_ioapic_registers[MAX_IO_APICS]; 82 */
83 int nr_registers;
84 /*
85 * Saved state during suspend/resume, or while enabling intr-remap.
86 */
87 struct IO_APIC_route_entry *saved_registers;
88 /* I/O APIC config */
89 struct mpc_ioapic mp_config;
90 /* IO APIC gsi routing info */
91 struct mp_ioapic_gsi gsi_config;
92 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
93} ioapics[MAX_IO_APICS];
83 94
84/* I/O APIC entries */ 95#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver
85struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; 96
86int nr_ioapics; 97int mpc_ioapic_id(int id)
98{
99 return ioapics[id].mp_config.apicid;
100}
87 101
88/* IO APIC gsi routing info */ 102unsigned int mpc_ioapic_addr(int id)
89struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; 103{
104 return ioapics[id].mp_config.apicaddr;
105}
106
107struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id)
108{
109 return &ioapics[id].gsi_config;
110}
111
112int nr_ioapics;
90 113
91/* The one past the highest gsi number used */ 114/* The one past the highest gsi number used */
92u32 gsi_top; 115u32 gsi_top;
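
With the per-IOAPIC bookkeeping gathered into struct ioapic, the rest of io_apic.c reads it through the small accessors above instead of the old parallel arrays. An illustrative (not from the patch) debug loop:

    int i;

    for (i = 0; i < nr_ioapics; i++)
            printk(KERN_DEBUG "IOAPIC %d: id %d at %#x, %d registers\n",
                   i, mpc_ioapic_id(i), mpc_ioapic_addr(i),
                   ioapics[i].nr_registers);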
@@ -128,8 +151,8 @@ static int __init parse_noapic(char *str)
128} 151}
129early_param("noapic", parse_noapic); 152early_param("noapic", parse_noapic);
130 153
131static int io_apic_setup_irq_pin_once(unsigned int irq, int node, 154static int io_apic_setup_irq_pin(unsigned int irq, int node,
132 struct io_apic_irq_attr *attr); 155 struct io_apic_irq_attr *attr);
133 156
134/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ 157/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
135void mp_save_irq(struct mpc_intsrc *m) 158void mp_save_irq(struct mpc_intsrc *m)
@@ -179,6 +202,14 @@ int __init arch_early_irq_init(void)
179 io_apic_irqs = ~0UL; 202 io_apic_irqs = ~0UL;
180 } 203 }
181 204
205 for (i = 0; i < nr_ioapics; i++) {
206 ioapics[i].saved_registers =
207 kzalloc(sizeof(struct IO_APIC_route_entry) *
208 ioapics[i].nr_registers, GFP_KERNEL);
209 if (!ioapics[i].saved_registers)
210 pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
211 }
212
182 cfg = irq_cfgx; 213 cfg = irq_cfgx;
183 count = ARRAY_SIZE(irq_cfgx); 214 count = ARRAY_SIZE(irq_cfgx);
184 node = cpu_to_node(0); 215 node = cpu_to_node(0);
@@ -297,7 +328,7 @@ struct io_apic {
297static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 328static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
298{ 329{
299 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 330 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
300 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); 331 + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
301} 332}
302 333
303static inline void io_apic_eoi(unsigned int apic, unsigned int vector) 334static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
@@ -573,7 +604,7 @@ static void clear_IO_APIC (void)
573 int apic, pin; 604 int apic, pin;
574 605
575 for (apic = 0; apic < nr_ioapics; apic++) 606 for (apic = 0; apic < nr_ioapics; apic++)
576 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 607 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
577 clear_IO_APIC_pin(apic, pin); 608 clear_IO_APIC_pin(apic, pin);
578} 609}
579 610
@@ -615,74 +646,43 @@ static int __init ioapic_pirq_setup(char *str)
615__setup("pirq=", ioapic_pirq_setup); 646__setup("pirq=", ioapic_pirq_setup);
616#endif /* CONFIG_X86_32 */ 647#endif /* CONFIG_X86_32 */
617 648
618struct IO_APIC_route_entry **alloc_ioapic_entries(void)
619{
620 int apic;
621 struct IO_APIC_route_entry **ioapic_entries;
622
623 ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
624 GFP_KERNEL);
625 if (!ioapic_entries)
626 return 0;
627
628 for (apic = 0; apic < nr_ioapics; apic++) {
629 ioapic_entries[apic] =
630 kzalloc(sizeof(struct IO_APIC_route_entry) *
631 nr_ioapic_registers[apic], GFP_KERNEL);
632 if (!ioapic_entries[apic])
633 goto nomem;
634 }
635
636 return ioapic_entries;
637
638nomem:
639 while (--apic >= 0)
640 kfree(ioapic_entries[apic]);
641 kfree(ioapic_entries);
642
643 return 0;
644}
645
646/* 649/*
647 * Saves all the IO-APIC RTE's 650 * Saves all the IO-APIC RTE's
648 */ 651 */
649int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 652int save_ioapic_entries(void)
650{ 653{
651 int apic, pin; 654 int apic, pin;
652 655 int err = 0;
653 if (!ioapic_entries)
654 return -ENOMEM;
655 656
656 for (apic = 0; apic < nr_ioapics; apic++) { 657 for (apic = 0; apic < nr_ioapics; apic++) {
657 if (!ioapic_entries[apic]) 658 if (!ioapics[apic].saved_registers) {
658 return -ENOMEM; 659 err = -ENOMEM;
660 continue;
661 }
659 662
660 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 663 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
661 ioapic_entries[apic][pin] = 664 ioapics[apic].saved_registers[pin] =
662 ioapic_read_entry(apic, pin); 665 ioapic_read_entry(apic, pin);
663 } 666 }
664 667
665 return 0; 668 return err;
666} 669}
667 670
668/* 671/*
669 * Mask all IO APIC entries. 672 * Mask all IO APIC entries.
670 */ 673 */
671void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 674void mask_ioapic_entries(void)
672{ 675{
673 int apic, pin; 676 int apic, pin;
674 677
675 if (!ioapic_entries)
676 return;
677
678 for (apic = 0; apic < nr_ioapics; apic++) { 678 for (apic = 0; apic < nr_ioapics; apic++) {
679 if (!ioapic_entries[apic]) 679 if (!ioapics[apic].saved_registers)
680 break; 680 continue;
681 681
682 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 682 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
683 struct IO_APIC_route_entry entry; 683 struct IO_APIC_route_entry entry;
684 684
685 entry = ioapic_entries[apic][pin]; 685 entry = ioapics[apic].saved_registers[pin];
686 if (!entry.mask) { 686 if (!entry.mask) {
687 entry.mask = 1; 687 entry.mask = 1;
688 ioapic_write_entry(apic, pin, entry); 688 ioapic_write_entry(apic, pin, entry);
@@ -692,36 +692,23 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
692} 692}
693 693
694/* 694/*
695 * Restore IO APIC entries which were saved in ioapic_entries. 695 * Restore IO APIC entries which were saved in the ioapic structure.
696 */ 696 */
697int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 697int restore_ioapic_entries(void)
698{ 698{
699 int apic, pin; 699 int apic, pin;
700 700
701 if (!ioapic_entries)
702 return -ENOMEM;
703
704 for (apic = 0; apic < nr_ioapics; apic++) { 701 for (apic = 0; apic < nr_ioapics; apic++) {
705 if (!ioapic_entries[apic]) 702 if (!ioapics[apic].saved_registers)
706 return -ENOMEM; 703 continue;
707 704
708 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 705 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
709 ioapic_write_entry(apic, pin, 706 ioapic_write_entry(apic, pin,
710 ioapic_entries[apic][pin]); 707 ioapics[apic].saved_registers[pin]);
711 } 708 }
712 return 0; 709 return 0;
713} 710}
714 711
715void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
716{
717 int apic;
718
719 for (apic = 0; apic < nr_ioapics; apic++)
720 kfree(ioapic_entries[apic]);
721
722 kfree(ioapic_entries);
723}
724
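
Because the saved_registers buffers are allocated once in arch_early_irq_init(), callers of the save/mask/restore trio no longer pass an entry array around; the enable_IR_x2apic() and lapic_resume() changes earlier in this patch follow the same pattern. Condensed, as a sketch rather than a verbatim call site:

    if (save_ioapic_entries())          /* snapshot all RTEs            */
            pr_info("Saving IO-APIC state failed\n");
    legacy_pic->mask_all();
    mask_ioapic_entries();              /* quiesce the IO-APICs         */
    /* ... switch interrupt remapping / x2apic mode ... */
    restore_ioapic_entries();           /* write the saved RTEs back    */
    legacy_pic->restore_mask();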
725/* 712/*
726 * Find the IRQ entry number of a certain pin. 713 * Find the IRQ entry number of a certain pin.
727 */ 714 */
@@ -731,7 +718,7 @@ static int find_irq_entry(int apic, int pin, int type)
731 718
732 for (i = 0; i < mp_irq_entries; i++) 719 for (i = 0; i < mp_irq_entries; i++)
733 if (mp_irqs[i].irqtype == type && 720 if (mp_irqs[i].irqtype == type &&
734 (mp_irqs[i].dstapic == mp_ioapics[apic].apicid || 721 (mp_irqs[i].dstapic == mpc_ioapic_id(apic) ||
735 mp_irqs[i].dstapic == MP_APIC_ALL) && 722 mp_irqs[i].dstapic == MP_APIC_ALL) &&
736 mp_irqs[i].dstirq == pin) 723 mp_irqs[i].dstirq == pin)
737 return i; 724 return i;
@@ -773,7 +760,7 @@ static int __init find_isa_irq_apic(int irq, int type)
773 if (i < mp_irq_entries) { 760 if (i < mp_irq_entries) {
774 int apic; 761 int apic;
775 for(apic = 0; apic < nr_ioapics; apic++) { 762 for(apic = 0; apic < nr_ioapics; apic++) {
776 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic) 763 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic)
777 return apic; 764 return apic;
778 } 765 }
779 } 766 }
@@ -942,6 +929,7 @@ static int pin_2_irq(int idx, int apic, int pin)
942{ 929{
943 int irq; 930 int irq;
944 int bus = mp_irqs[idx].srcbus; 931 int bus = mp_irqs[idx].srcbus;
932 struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
945 933
946 /* 934 /*
947 * Debugging check, we are in big trouble if this message pops up! 935 * Debugging check, we are in big trouble if this message pops up!
@@ -952,7 +940,7 @@ static int pin_2_irq(int idx, int apic, int pin)
952 if (test_bit(bus, mp_bus_not_pci)) { 940 if (test_bit(bus, mp_bus_not_pci)) {
953 irq = mp_irqs[idx].srcbusirq; 941 irq = mp_irqs[idx].srcbusirq;
954 } else { 942 } else {
955 u32 gsi = mp_gsi_routing[apic].gsi_base + pin; 943 u32 gsi = gsi_cfg->gsi_base + pin;
956 944
957 if (gsi >= NR_IRQS_LEGACY) 945 if (gsi >= NR_IRQS_LEGACY)
958 irq = gsi; 946 irq = gsi;
@@ -1003,7 +991,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1003 int lbus = mp_irqs[i].srcbus; 991 int lbus = mp_irqs[i].srcbus;
1004 992
1005 for (apic = 0; apic < nr_ioapics; apic++) 993 for (apic = 0; apic < nr_ioapics; apic++)
1006 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || 994 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic ||
1007 mp_irqs[i].dstapic == MP_APIC_ALL) 995 mp_irqs[i].dstapic == MP_APIC_ALL)
1008 break; 996 break;
1009 997
@@ -1222,7 +1210,7 @@ static inline int IO_APIC_irq_trigger(int irq)
1222 int apic, idx, pin; 1210 int apic, idx, pin;
1223 1211
1224 for (apic = 0; apic < nr_ioapics; apic++) { 1212 for (apic = 0; apic < nr_ioapics; apic++) {
1225 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1213 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
1226 idx = find_irq_entry(apic, pin, mp_INT); 1214 idx = find_irq_entry(apic, pin, mp_INT);
1227 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) 1215 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1228 return irq_trigger(idx); 1216 return irq_trigger(idx);
@@ -1350,14 +1338,14 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1350 apic_printk(APIC_VERBOSE,KERN_DEBUG 1338 apic_printk(APIC_VERBOSE,KERN_DEBUG
1351 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1339 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1352 "IRQ %d Mode:%i Active:%i)\n", 1340 "IRQ %d Mode:%i Active:%i)\n",
1353 apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector, 1341 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
1354 irq, trigger, polarity); 1342 irq, trigger, polarity);
1355 1343
1356 1344
1357 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, 1345 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
1358 dest, trigger, polarity, cfg->vector, pin)) { 1346 dest, trigger, polarity, cfg->vector, pin)) {
1359 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1347 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1360 mp_ioapics[apic_id].apicid, pin); 1348 mpc_ioapic_id(apic_id), pin);
1361 __clear_irq_vector(irq, cfg); 1349 __clear_irq_vector(irq, cfg);
1362 return; 1350 return;
1363 } 1351 }
@@ -1369,17 +1357,13 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1369 ioapic_write_entry(apic_id, pin, entry); 1357 ioapic_write_entry(apic_id, pin, entry);
1370} 1358}
1371 1359
1372static struct {
1373 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1374} mp_ioapic_routing[MAX_IO_APICS];
1375
1376static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) 1360static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
1377{ 1361{
1378 if (idx != -1) 1362 if (idx != -1)
1379 return false; 1363 return false;
1380 1364
1381 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", 1365 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
1382 mp_ioapics[apic_id].apicid, pin); 1366 mpc_ioapic_id(apic_id), pin);
1383 return true; 1367 return true;
1384} 1368}
1385 1369
@@ -1389,7 +1373,7 @@ static void __init __io_apic_setup_irqs(unsigned int apic_id)
1389 struct io_apic_irq_attr attr; 1373 struct io_apic_irq_attr attr;
1390 unsigned int pin, irq; 1374 unsigned int pin, irq;
1391 1375
1392 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1376 for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) {
1393 idx = find_irq_entry(apic_id, pin, mp_INT); 1377 idx = find_irq_entry(apic_id, pin, mp_INT);
1394 if (io_apic_pin_not_connected(idx, apic_id, pin)) 1378 if (io_apic_pin_not_connected(idx, apic_id, pin))
1395 continue; 1379 continue;
@@ -1511,7 +1495,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1511 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1495 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1512 for (i = 0; i < nr_ioapics; i++) 1496 for (i = 0; i < nr_ioapics; i++)
1513 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1497 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1514 mp_ioapics[i].apicid, nr_ioapic_registers[i]); 1498 mpc_ioapic_id(i), ioapics[i].nr_registers);
1515 1499
1516 /* 1500 /*
1517 * We are a bit conservative about what we expect. We have to 1501 * We are a bit conservative about what we expect. We have to
@@ -1531,7 +1515,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1531 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1515 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1532 1516
1533 printk("\n"); 1517 printk("\n");
1534 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); 1518 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic));
1535 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1519 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1536 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1520 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1537 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1521 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1825,7 +1809,7 @@ void __init enable_IO_APIC(void)
1825 for(apic = 0; apic < nr_ioapics; apic++) { 1809 for(apic = 0; apic < nr_ioapics; apic++) {
1826 int pin; 1810 int pin;
1827 /* See if any of the pins is in ExtINT mode */ 1811 /* See if any of the pins is in ExtINT mode */
1828 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1812 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
1829 struct IO_APIC_route_entry entry; 1813 struct IO_APIC_route_entry entry;
1830 entry = ioapic_read_entry(apic, pin); 1814 entry = ioapic_read_entry(apic, pin);
1831 1815
@@ -1949,14 +1933,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
1949 reg_00.raw = io_apic_read(apic_id, 0); 1933 reg_00.raw = io_apic_read(apic_id, 0);
1950 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1934 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1951 1935
1952 old_id = mp_ioapics[apic_id].apicid; 1936 old_id = mpc_ioapic_id(apic_id);
1953 1937
1954 if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) { 1938 if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) {
1955 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 1939 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1956 apic_id, mp_ioapics[apic_id].apicid); 1940 apic_id, mpc_ioapic_id(apic_id));
1957 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1941 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1958 reg_00.bits.ID); 1942 reg_00.bits.ID);
1959 mp_ioapics[apic_id].apicid = reg_00.bits.ID; 1943 ioapics[apic_id].mp_config.apicid = reg_00.bits.ID;
1960 } 1944 }
1961 1945
1962 /* 1946 /*
@@ -1965,9 +1949,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
1965 * 'stuck on smp_invalidate_needed IPI wait' messages. 1949 * 'stuck on smp_invalidate_needed IPI wait' messages.
1966 */ 1950 */
1967 if (apic->check_apicid_used(&phys_id_present_map, 1951 if (apic->check_apicid_used(&phys_id_present_map,
1968 mp_ioapics[apic_id].apicid)) { 1952 mpc_ioapic_id(apic_id))) {
1969 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 1953 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1970 apic_id, mp_ioapics[apic_id].apicid); 1954 apic_id, mpc_ioapic_id(apic_id));
1971 for (i = 0; i < get_physical_broadcast(); i++) 1955 for (i = 0; i < get_physical_broadcast(); i++)
1972 if (!physid_isset(i, phys_id_present_map)) 1956 if (!physid_isset(i, phys_id_present_map))
1973 break; 1957 break;
@@ -1976,13 +1960,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
1976 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1960 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1977 i); 1961 i);
1978 physid_set(i, phys_id_present_map); 1962 physid_set(i, phys_id_present_map);
1979 mp_ioapics[apic_id].apicid = i; 1963 ioapics[apic_id].mp_config.apicid = i;
1980 } else { 1964 } else {
1981 physid_mask_t tmp; 1965 physid_mask_t tmp;
1982 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); 1966 apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id),
1967 &tmp);
1983 apic_printk(APIC_VERBOSE, "Setting %d in the " 1968 apic_printk(APIC_VERBOSE, "Setting %d in the "
1984 "phys_id_present_map\n", 1969 "phys_id_present_map\n",
1985 mp_ioapics[apic_id].apicid); 1970 mpc_ioapic_id(apic_id));
1986 physids_or(phys_id_present_map, phys_id_present_map, tmp); 1971 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1987 } 1972 }
1988 1973
@@ -1990,24 +1975,24 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
1990 * We need to adjust the IRQ routing table 1975 * We need to adjust the IRQ routing table
1991 * if the ID changed. 1976 * if the ID changed.
1992 */ 1977 */
1993 if (old_id != mp_ioapics[apic_id].apicid) 1978 if (old_id != mpc_ioapic_id(apic_id))
1994 for (i = 0; i < mp_irq_entries; i++) 1979 for (i = 0; i < mp_irq_entries; i++)
1995 if (mp_irqs[i].dstapic == old_id) 1980 if (mp_irqs[i].dstapic == old_id)
1996 mp_irqs[i].dstapic 1981 mp_irqs[i].dstapic
1997 = mp_ioapics[apic_id].apicid; 1982 = mpc_ioapic_id(apic_id);
1998 1983
1999 /* 1984 /*
2000 * Update the ID register according to the right value 1985 * Update the ID register according to the right value
2001 * from the MPC table if they are different. 1986 * from the MPC table if they are different.
2002 */ 1987 */
2003 if (mp_ioapics[apic_id].apicid == reg_00.bits.ID) 1988 if (mpc_ioapic_id(apic_id) == reg_00.bits.ID)
2004 continue; 1989 continue;
2005 1990
2006 apic_printk(APIC_VERBOSE, KERN_INFO 1991 apic_printk(APIC_VERBOSE, KERN_INFO
2007 "...changing IO-APIC physical APIC ID to %d ...", 1992 "...changing IO-APIC physical APIC ID to %d ...",
2008 mp_ioapics[apic_id].apicid); 1993 mpc_ioapic_id(apic_id));
2009 1994
2010 reg_00.bits.ID = mp_ioapics[apic_id].apicid; 1995 reg_00.bits.ID = mpc_ioapic_id(apic_id);
2011 raw_spin_lock_irqsave(&ioapic_lock, flags); 1996 raw_spin_lock_irqsave(&ioapic_lock, flags);
2012 io_apic_write(apic_id, 0, reg_00.raw); 1997 io_apic_write(apic_id, 0, reg_00.raw);
2013 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1998 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2018,7 +2003,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
2018 raw_spin_lock_irqsave(&ioapic_lock, flags); 2003 raw_spin_lock_irqsave(&ioapic_lock, flags);
2019 reg_00.raw = io_apic_read(apic_id, 0); 2004 reg_00.raw = io_apic_read(apic_id, 0);
2020 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2005 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2021 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) 2006 if (reg_00.bits.ID != mpc_ioapic_id(apic_id))
2022 printk("could not set ID!\n"); 2007 printk("could not set ID!\n");
2023 else 2008 else
2024 apic_printk(APIC_VERBOSE, " ok.\n"); 2009 apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2404,7 +2389,7 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2404 2389
2405 raw_spin_lock_irqsave(&ioapic_lock, flags); 2390 raw_spin_lock_irqsave(&ioapic_lock, flags);
2406 for_each_irq_pin(entry, cfg->irq_2_pin) { 2391 for_each_irq_pin(entry, cfg->irq_2_pin) {
2407 if (mp_ioapics[entry->apic].apicver >= 0x20) { 2392 if (mpc_ioapic_ver(entry->apic) >= 0x20) {
2408 /* 2393 /*
2409 * Intr-remapping uses pin number as the virtual vector 2394 * Intr-remapping uses pin number as the virtual vector
2410 * in the RTE. Actual vector is programmed in 2395 * in the RTE. Actual vector is programmed in
@@ -2918,49 +2903,19 @@ static int __init io_apic_bug_finalize(void)
2918 2903
2919late_initcall(io_apic_bug_finalize); 2904late_initcall(io_apic_bug_finalize);
2920 2905
2921static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS]; 2906static void resume_ioapic_id(int ioapic_id)
2922
2923static void suspend_ioapic(int ioapic_id)
2924{ 2907{
2925 struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
2926 int i;
2927
2928 if (!saved_data)
2929 return;
2930
2931 for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
2932 saved_data[i] = ioapic_read_entry(ioapic_id, i);
2933}
2934
2935static int ioapic_suspend(void)
2936{
2937 int ioapic_id;
2938
2939 for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++)
2940 suspend_ioapic(ioapic_id);
2941
2942 return 0;
2943}
2944
2945static void resume_ioapic(int ioapic_id)
2946{
2947 struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
2948 unsigned long flags; 2908 unsigned long flags;
2949 union IO_APIC_reg_00 reg_00; 2909 union IO_APIC_reg_00 reg_00;
2950 int i;
2951 2910
2952 if (!saved_data)
2953 return;
2954 2911
2955 raw_spin_lock_irqsave(&ioapic_lock, flags); 2912 raw_spin_lock_irqsave(&ioapic_lock, flags);
2956 reg_00.raw = io_apic_read(ioapic_id, 0); 2913 reg_00.raw = io_apic_read(ioapic_id, 0);
2957 if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) { 2914 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) {
2958 reg_00.bits.ID = mp_ioapics[ioapic_id].apicid; 2915 reg_00.bits.ID = mpc_ioapic_id(ioapic_id);
2959 io_apic_write(ioapic_id, 0, reg_00.raw); 2916 io_apic_write(ioapic_id, 0, reg_00.raw);
2960 } 2917 }
2961 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2918 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2962 for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
2963 ioapic_write_entry(ioapic_id, i, saved_data[i]);
2964} 2919}
2965 2920
2966static void ioapic_resume(void) 2921static void ioapic_resume(void)
@@ -2968,28 +2923,18 @@ static void ioapic_resume(void)
2968 int ioapic_id; 2923 int ioapic_id;
2969 2924
2970 for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--) 2925 for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
2971 resume_ioapic(ioapic_id); 2926 resume_ioapic_id(ioapic_id);
2927
2928 restore_ioapic_entries();
2972} 2929}
2973 2930
2974static struct syscore_ops ioapic_syscore_ops = { 2931static struct syscore_ops ioapic_syscore_ops = {
2975 .suspend = ioapic_suspend, 2932 .suspend = save_ioapic_entries,
2976 .resume = ioapic_resume, 2933 .resume = ioapic_resume,
2977}; 2934};
2978 2935
2979static int __init ioapic_init_ops(void) 2936static int __init ioapic_init_ops(void)
2980{ 2937{
2981 int i;
2982
2983 for (i = 0; i < nr_ioapics; i++) {
2984 unsigned int size;
2985
2986 size = nr_ioapic_registers[i]
2987 * sizeof(struct IO_APIC_route_entry);
2988 ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL);
2989 if (!ioapic_saved_data[i])
2990 pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
2991 }
2992
2993 register_syscore_ops(&ioapic_syscore_ops); 2938 register_syscore_ops(&ioapic_syscore_ops);
2994 2939
2995 return 0; 2940 return 0;
@@ -3570,7 +3515,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3570} 3515}
3571#endif /* CONFIG_HT_IRQ */ 3516#endif /* CONFIG_HT_IRQ */
3572 3517
3573int 3518static int
3574io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) 3519io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
3575{ 3520{
3576 struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); 3521 struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
@@ -3585,21 +3530,21 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
3585 return ret; 3530 return ret;
3586} 3531}
3587 3532
3588static int io_apic_setup_irq_pin_once(unsigned int irq, int node, 3533int io_apic_setup_irq_pin_once(unsigned int irq, int node,
3589 struct io_apic_irq_attr *attr) 3534 struct io_apic_irq_attr *attr)
3590{ 3535{
3591 unsigned int id = attr->ioapic, pin = attr->ioapic_pin; 3536 unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
3592 int ret; 3537 int ret;
3593 3538
3594 /* Avoid redundant programming */ 3539 /* Avoid redundant programming */
3595 if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) { 3540 if (test_bit(pin, ioapics[id].pin_programmed)) {
3596 pr_debug("Pin %d-%d already programmed\n", 3541 pr_debug("Pin %d-%d already programmed\n",
3597 mp_ioapics[id].apicid, pin); 3542 mpc_ioapic_id(id), pin);
3598 return 0; 3543 return 0;
3599 } 3544 }
3600 ret = io_apic_setup_irq_pin(irq, node, attr); 3545 ret = io_apic_setup_irq_pin(irq, node, attr);
3601 if (!ret) 3546 if (!ret)
3602 set_bit(pin, mp_ioapic_routing[id].pin_programmed); 3547 set_bit(pin, ioapics[id].pin_programmed);
3603 return ret; 3548 return ret;
3604} 3549}
3605 3550
@@ -3764,8 +3709,7 @@ static u8 __init io_apic_unique_id(u8 id)
3764 3709
3765 bitmap_zero(used, 256); 3710 bitmap_zero(used, 256);
3766 for (i = 0; i < nr_ioapics; i++) { 3711 for (i = 0; i < nr_ioapics; i++) {
3767 struct mpc_ioapic *ia = &mp_ioapics[i]; 3712 __set_bit(mpc_ioapic_id(i), used);
3768 __set_bit(ia->apicid, used);
3769 } 3713 }
3770 if (!test_bit(id, used)) 3714 if (!test_bit(id, used))
3771 return id; 3715 return id;
@@ -3825,7 +3769,7 @@ void __init setup_ioapic_dest(void)
3825 return; 3769 return;
3826 3770
3827 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) 3771 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
3828 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 3772 for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
3829 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 3773 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
3830 if (irq_entry == -1) 3774 if (irq_entry == -1)
3831 continue; 3775 continue;
@@ -3896,7 +3840,7 @@ void __init ioapic_and_gsi_init(void)
3896 ioapic_res = ioapic_setup_resources(nr_ioapics); 3840 ioapic_res = ioapic_setup_resources(nr_ioapics);
3897 for (i = 0; i < nr_ioapics; i++) { 3841 for (i = 0; i < nr_ioapics; i++) {
3898 if (smp_found_config) { 3842 if (smp_found_config) {
3899 ioapic_phys = mp_ioapics[i].apicaddr; 3843 ioapic_phys = mpc_ioapic_addr(i);
3900#ifdef CONFIG_X86_32 3844#ifdef CONFIG_X86_32
3901 if (!ioapic_phys) { 3845 if (!ioapic_phys) {
3902 printk(KERN_ERR 3846 printk(KERN_ERR
@@ -3956,8 +3900,9 @@ int mp_find_ioapic(u32 gsi)
3956 3900
3957 /* Find the IOAPIC that manages this GSI. */ 3901 /* Find the IOAPIC that manages this GSI. */
3958 for (i = 0; i < nr_ioapics; i++) { 3902 for (i = 0; i < nr_ioapics; i++) {
3959 if ((gsi >= mp_gsi_routing[i].gsi_base) 3903 struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
3960 && (gsi <= mp_gsi_routing[i].gsi_end)) 3904 if ((gsi >= gsi_cfg->gsi_base)
3905 && (gsi <= gsi_cfg->gsi_end))
3961 return i; 3906 return i;
3962 } 3907 }
3963 3908
@@ -3967,12 +3912,16 @@ int mp_find_ioapic(u32 gsi)
3967 3912
3968int mp_find_ioapic_pin(int ioapic, u32 gsi) 3913int mp_find_ioapic_pin(int ioapic, u32 gsi)
3969{ 3914{
3915 struct mp_ioapic_gsi *gsi_cfg;
3916
3970 if (WARN_ON(ioapic == -1)) 3917 if (WARN_ON(ioapic == -1))
3971 return -1; 3918 return -1;
3972 if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) 3919
3920 gsi_cfg = mp_ioapic_gsi_routing(ioapic);
3921 if (WARN_ON(gsi > gsi_cfg->gsi_end))
3973 return -1; 3922 return -1;
3974 3923
3975 return gsi - mp_gsi_routing[ioapic].gsi_base; 3924 return gsi - gsi_cfg->gsi_base;
3976} 3925}
3977 3926
3978static __init int bad_ioapic(unsigned long address) 3927static __init int bad_ioapic(unsigned long address)
@@ -3994,40 +3943,42 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
3994{ 3943{
3995 int idx = 0; 3944 int idx = 0;
3996 int entries; 3945 int entries;
3946 struct mp_ioapic_gsi *gsi_cfg;
3997 3947
3998 if (bad_ioapic(address)) 3948 if (bad_ioapic(address))
3999 return; 3949 return;
4000 3950
4001 idx = nr_ioapics; 3951 idx = nr_ioapics;
4002 3952
4003 mp_ioapics[idx].type = MP_IOAPIC; 3953 ioapics[idx].mp_config.type = MP_IOAPIC;
4004 mp_ioapics[idx].flags = MPC_APIC_USABLE; 3954 ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
4005 mp_ioapics[idx].apicaddr = address; 3955 ioapics[idx].mp_config.apicaddr = address;
4006 3956
4007 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 3957 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
4008 mp_ioapics[idx].apicid = io_apic_unique_id(id); 3958 ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
4009 mp_ioapics[idx].apicver = io_apic_get_version(idx); 3959 ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
4010 3960
4011 /* 3961 /*
4012 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 3962 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4013 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 3963 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4014 */ 3964 */
4015 entries = io_apic_get_redir_entries(idx); 3965 entries = io_apic_get_redir_entries(idx);
4016 mp_gsi_routing[idx].gsi_base = gsi_base; 3966 gsi_cfg = mp_ioapic_gsi_routing(idx);
4017 mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1; 3967 gsi_cfg->gsi_base = gsi_base;
3968 gsi_cfg->gsi_end = gsi_base + entries - 1;
4018 3969
4019 /* 3970 /*
4020 * The number of IO-APIC IRQ registers (== #pins): 3971 * The number of IO-APIC IRQ registers (== #pins):
4021 */ 3972 */
4022 nr_ioapic_registers[idx] = entries; 3973 ioapics[idx].nr_registers = entries;
4023 3974
4024 if (mp_gsi_routing[idx].gsi_end >= gsi_top) 3975 if (gsi_cfg->gsi_end >= gsi_top)
4025 gsi_top = mp_gsi_routing[idx].gsi_end + 1; 3976 gsi_top = gsi_cfg->gsi_end + 1;
4026 3977
4027 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 3978 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4028 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, 3979 "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
4029 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, 3980 mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
4030 mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); 3981 gsi_cfg->gsi_base, gsi_cfg->gsi_end);
4031 3982
4032 nr_ioapics++; 3983 nr_ioapics++;
4033} 3984}
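The io_apic.c hunks above replace direct pokes at the old parallel arrays (mp_ioapics[], mp_gsi_routing[], nr_ioapic_registers[]) with accessor helpers over a single ioapics[] array. Below is a minimal user-space sketch of that accessor pattern and of the GSI-to-IOAPIC window lookup that mp_find_ioapic() performs; the struct layout, field names and sample values are simplified stand-ins, not the kernel's actual definitions.

#include <stdio.h>

#define MAX_IO_APICS 4

struct mp_ioapic_gsi { unsigned int gsi_base, gsi_end; };

/* Simplified stand-in for the kernel's per-IOAPIC bookkeeping. */
static struct ioapic {
	struct { unsigned char apicid, apicver; unsigned int apicaddr; } mp_config;
	struct mp_ioapic_gsi gsi_config;
	int nr_registers;
} ioapics[MAX_IO_APICS] = {
	{ { 2, 0x20, 0xfec00000u }, {  0, 23 }, 24 },
	{ { 3, 0x20, 0xfec01000u }, { 24, 55 }, 32 },
};
static int nr_ioapics = 2;

static int mpc_ioapic_id(int idx) { return ioapics[idx].mp_config.apicid; }
static struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int idx)
{
	return &ioapics[idx].gsi_config;
}

/* Same shape as mp_find_ioapic(): walk the table, match the GSI window. */
static int find_ioapic(unsigned int gsi)
{
	for (int i = 0; i < nr_ioapics; i++) {
		struct mp_ioapic_gsi *cfg = mp_ioapic_gsi_routing(i);
		if (gsi >= cfg->gsi_base && gsi <= cfg->gsi_end)
			return i;
	}
	return -1;
}

int main(void)
{
	int idx = find_ioapic(30);
	printf("GSI 30 -> IOAPIC index %d (apicid %d)\n", idx, mpc_ioapic_id(idx));
	return 0;
}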
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 6273eee5134b..c4a61ca1349a 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -48,8 +48,6 @@
48#include <asm/e820.h> 48#include <asm/e820.h>
49#include <asm/ipi.h> 49#include <asm/ipi.h>
50 50
51#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
52
53int found_numaq; 51int found_numaq;
54 52
55/* 53/*
@@ -79,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4];
79static inline void numaq_register_node(int node, struct sys_cfg_data *scd) 77static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
80{ 78{
81 struct eachquadmem *eq = scd->eq + node; 79 struct eachquadmem *eq = scd->eq + node;
80 u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
81 u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
82 int ret;
82 83
83 node_set_online(node); 84 node_set(node, numa_nodes_parsed);
84 85 ret = numa_add_memblk(node, start, end);
85 /* Convert to pages */ 86 BUG_ON(ret < 0);
86 node_start_pfn[node] =
87 MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
88
89 node_end_pfn[node] =
90 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
91
92 memblock_x86_register_active_regions(node, node_start_pfn[node],
93 node_end_pfn[node]);
94
95 memory_present(node, node_start_pfn[node], node_end_pfn[node]);
96
97 node_remap_size[node] = node_memmap_size_bytes(node,
98 node_start_pfn[node],
99 node_end_pfn[node]);
100} 87}
101 88
102/* 89/*
103 * Function: smp_dump_qct() 90 * Function: smp_dump_qct()
104 * 91 *
105 * Description: gets memory layout from the quad config table. This 92 * Description: gets memory layout from the quad config table. This
106 * function also updates node_online_map with the nodes (quads) present. 93 * function also updates numa_nodes_parsed with the nodes (quads) present.
107 */ 94 */
108static void __init smp_dump_qct(void) 95static void __init smp_dump_qct(void)
109{ 96{
@@ -112,7 +99,6 @@ static void __init smp_dump_qct(void)
112 99
113 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); 100 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
114 101
115 nodes_clear(node_online_map);
116 for_each_node(node) { 102 for_each_node(node) {
117 if (scd->quads_present31_0 & (1 << node)) 103 if (scd->quads_present31_0 & (1 << node))
118 numaq_register_node(node, scd); 104 numaq_register_node(node, scd);
@@ -282,14 +268,14 @@ static __init void early_check_numaq(void)
282 } 268 }
283} 269}
284 270
285int __init get_memcfg_numaq(void) 271int __init numaq_numa_init(void)
286{ 272{
287 early_check_numaq(); 273 early_check_numaq();
288 if (!found_numaq) 274 if (!found_numaq)
289 return 0; 275 return -ENOENT;
290 smp_dump_qct(); 276 smp_dump_qct();
291 277
292 return 1; 278 return 0;
293} 279}
294 280
295#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) 281#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
@@ -486,8 +472,8 @@ static void numaq_setup_portio_remap(void)
486 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); 472 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
487} 473}
488 474
489/* Use __refdata to keep false positive warning calm. */ 475/* Use __refdata to keep false positive warning calm. */
490struct apic __refdata apic_numaq = { 476static struct apic __refdata apic_numaq = {
491 477
492 .name = "NUMAQ", 478 .name = "NUMAQ",
493 .probe = probe_numaq, 479 .probe = probe_numaq,
@@ -551,3 +537,5 @@ struct apic __refdata apic_numaq = {
551 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, 537 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
552 .x86_32_numa_cpu_node = numaq_numa_cpu_node, 538 .x86_32_numa_cpu_node = numaq_numa_cpu_node,
553}; 539};
540
541apic_driver(apic_numaq);
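In the new numaq_register_node() above, the quad config table's megabyte-granular fields are shifted into byte addresses before being handed to numa_add_memblk(), replacing the old MB_TO_PAGES() pfn bookkeeping. A tiny stand-alone illustration of that conversion follows; the field values are made up and a printf stands in for the kernel call.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int hi_shrd_mem_start = 4096;  /* MB, example value */
	unsigned int hi_shrd_mem_size  = 2048;  /* MB, example value */
	unsigned int priv_mem_size     = 64;    /* MB, example value */

	/* Shift by 20 turns megabytes into bytes, matching the hunk above. */
	uint64_t start = (uint64_t)(hi_shrd_mem_start - priv_mem_size) << 20;
	uint64_t end   = (uint64_t)(hi_shrd_mem_start + hi_shrd_mem_size) << 20;

	/* In the kernel this range would go to numa_add_memblk(node, start, end). */
	printf("node memblk: [%#llx, %#llx)\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}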
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index fc84c7b61108..b5254ad044ab 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,31 +52,6 @@ static int __init print_ipi_mode(void)
52} 52}
53late_initcall(print_ipi_mode); 53late_initcall(print_ipi_mode);
54 54
55void __init default_setup_apic_routing(void)
56{
57 int version = apic_version[boot_cpu_physical_apicid];
58
59 if (num_possible_cpus() > 8) {
60 switch (boot_cpu_data.x86_vendor) {
61 case X86_VENDOR_INTEL:
62 if (!APIC_XAPIC(version)) {
63 def_to_bigsmp = 0;
64 break;
65 }
66 /* If P4 and above fall through */
67 case X86_VENDOR_AMD:
68 def_to_bigsmp = 1;
69 }
70 }
71
72#ifdef CONFIG_X86_BIGSMP
73 generic_bigsmp_probe();
74#endif
75
76 if (apic->setup_apic_routing)
77 apic->setup_apic_routing();
78}
79
80static int default_x86_32_early_logical_apicid(int cpu) 55static int default_x86_32_early_logical_apicid(int cpu)
81{ 56{
82 return 1 << cpu; 57 return 1 << cpu;
@@ -112,7 +87,7 @@ static int probe_default(void)
112 return 1; 87 return 1;
113} 88}
114 89
115struct apic apic_default = { 90static struct apic apic_default = {
116 91
117 .name = "default", 92 .name = "default",
118 .probe = probe_default, 93 .probe = probe_default,
@@ -172,47 +147,24 @@ struct apic apic_default = {
172 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 147 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
173 148
174 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, 149 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
175 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
176}; 150};
177 151
178extern struct apic apic_numaq; 152apic_driver(apic_default);
179extern struct apic apic_summit;
180extern struct apic apic_bigsmp;
181extern struct apic apic_es7000;
182extern struct apic apic_es7000_cluster;
183 153
184struct apic *apic = &apic_default; 154struct apic *apic = &apic_default;
185EXPORT_SYMBOL_GPL(apic); 155EXPORT_SYMBOL_GPL(apic);
186 156
187static struct apic *apic_probe[] __initdata = {
188#ifdef CONFIG_X86_NUMAQ
189 &apic_numaq,
190#endif
191#ifdef CONFIG_X86_SUMMIT
192 &apic_summit,
193#endif
194#ifdef CONFIG_X86_BIGSMP
195 &apic_bigsmp,
196#endif
197#ifdef CONFIG_X86_ES7000
198 &apic_es7000,
199 &apic_es7000_cluster,
200#endif
201 &apic_default, /* must be last */
202 NULL,
203};
204
205static int cmdline_apic __initdata; 157static int cmdline_apic __initdata;
206static int __init parse_apic(char *arg) 158static int __init parse_apic(char *arg)
207{ 159{
208 int i; 160 struct apic **drv;
209 161
210 if (!arg) 162 if (!arg)
211 return -EINVAL; 163 return -EINVAL;
212 164
213 for (i = 0; apic_probe[i]; i++) { 165 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
214 if (!strcmp(apic_probe[i]->name, arg)) { 166 if (!strcmp((*drv)->name, arg)) {
215 apic = apic_probe[i]; 167 apic = *drv;
216 cmdline_apic = 1; 168 cmdline_apic = 1;
217 return 0; 169 return 0;
218 } 170 }
@@ -223,38 +175,58 @@ static int __init parse_apic(char *arg)
223} 175}
224early_param("apic", parse_apic); 176early_param("apic", parse_apic);
225 177
226void __init generic_bigsmp_probe(void) 178void __init default_setup_apic_routing(void)
227{ 179{
180 int version = apic_version[boot_cpu_physical_apicid];
181
182 if (num_possible_cpus() > 8) {
183 switch (boot_cpu_data.x86_vendor) {
184 case X86_VENDOR_INTEL:
185 if (!APIC_XAPIC(version)) {
186 def_to_bigsmp = 0;
187 break;
188 }
189 /* If P4 and above fall through */
190 case X86_VENDOR_AMD:
191 def_to_bigsmp = 1;
192 }
193 }
194
228#ifdef CONFIG_X86_BIGSMP 195#ifdef CONFIG_X86_BIGSMP
229 /* 196 /*
230 * This routine is used to switch to bigsmp mode when 197 * This is used to switch to bigsmp mode when
231 * - There is no apic= option specified by the user 198 * - There is no apic= option specified by the user
232 * - generic_apic_probe() has chosen apic_default as the sub_arch 199 * - generic_apic_probe() has chosen apic_default as the sub_arch
233 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support 200 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
234 */ 201 */
235 202
236 if (!cmdline_apic && apic == &apic_default) { 203 if (!cmdline_apic && apic == &apic_default) {
237 if (apic_bigsmp.probe()) { 204 struct apic *bigsmp = generic_bigsmp_probe();
238 apic = &apic_bigsmp; 205 if (bigsmp) {
206 apic = bigsmp;
239 printk(KERN_INFO "Overriding APIC driver with %s\n", 207 printk(KERN_INFO "Overriding APIC driver with %s\n",
240 apic->name); 208 apic->name);
241 } 209 }
242 } 210 }
243#endif 211#endif
212
213 if (apic->setup_apic_routing)
214 apic->setup_apic_routing();
244} 215}
245 216
246void __init generic_apic_probe(void) 217void __init generic_apic_probe(void)
247{ 218{
248 if (!cmdline_apic) { 219 if (!cmdline_apic) {
249 int i; 220 struct apic **drv;
250 for (i = 0; apic_probe[i]; i++) { 221
251 if (apic_probe[i]->probe()) { 222 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
252 apic = apic_probe[i]; 223 if ((*drv)->probe()) {
224 apic = *drv;
253 break; 225 break;
254 } 226 }
255 } 227 }
256 /* Not visible without early console */ 228 /* Not visible without early console */
257 if (!apic_probe[i]) 229 if (drv == __apicdrivers_end)
258 panic("Didn't find an APIC driver"); 230 panic("Didn't find an APIC driver");
259 } 231 }
260 printk(KERN_INFO "Using APIC driver %s\n", apic->name); 232 printk(KERN_INFO "Using APIC driver %s\n", apic->name);
@@ -265,16 +237,16 @@ void __init generic_apic_probe(void)
265int __init 237int __init
266generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) 238generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
267{ 239{
268 int i; 240 struct apic **drv;
269 241
270 for (i = 0; apic_probe[i]; ++i) { 242 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
271 if (!apic_probe[i]->mps_oem_check) 243 if (!((*drv)->mps_oem_check))
272 continue; 244 continue;
273 if (!apic_probe[i]->mps_oem_check(mpc, oem, productid)) 245 if (!(*drv)->mps_oem_check(mpc, oem, productid))
274 continue; 246 continue;
275 247
276 if (!cmdline_apic) { 248 if (!cmdline_apic) {
277 apic = apic_probe[i]; 249 apic = *drv;
278 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 250 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
279 apic->name); 251 apic->name);
280 } 252 }
@@ -285,16 +257,16 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
285 257
286int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 258int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
287{ 259{
288 int i; 260 struct apic **drv;
289 261
290 for (i = 0; apic_probe[i]; ++i) { 262 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
291 if (!apic_probe[i]->acpi_madt_oem_check) 263 if (!(*drv)->acpi_madt_oem_check)
292 continue; 264 continue;
293 if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) 265 if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
294 continue; 266 continue;
295 267
296 if (!cmdline_apic) { 268 if (!cmdline_apic) {
297 apic = apic_probe[i]; 269 apic = *drv;
298 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 270 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
299 apic->name); 271 apic->name);
300 } 272 }
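The probe_32.c rework above drops the hand-maintained apic_probe[] array in favour of walking the __apicdrivers .. __apicdrivers_end range that apic_driver() fills in at link time. Below is a rough user-space model of that table-walk probe; a static pointer array stands in for the linker section and the probe functions are dummies, so names and behaviour here are illustrative only.

#include <stdio.h>

struct apic {
	const char *name;
	int (*probe)(void);
};

static int probe_bigsmp(void)  { return 0; }
static int probe_default(void) { return 1; }

static struct apic apic_bigsmp  = { "bigsmp",  probe_bigsmp  };
static struct apic apic_default = { "default", probe_default };

/*
 * Stand-in for the __apicdrivers[] .. __apicdrivers_end table that
 * apic_driver() populates via a dedicated linker section in the kernel.
 */
static struct apic *apicdrivers[] = { &apic_bigsmp, &apic_default };
#define NDRIVERS (sizeof(apicdrivers) / sizeof(apicdrivers[0]))

int main(void)
{
	struct apic *apic = NULL;

	/* Same shape as generic_apic_probe(): first driver whose probe() hits wins. */
	for (unsigned int i = 0; i < NDRIVERS; i++) {
		if (apicdrivers[i]->probe()) {
			apic = apicdrivers[i];
			break;
		}
	}
	if (!apic) {
		fprintf(stderr, "Didn't find an APIC driver\n");
		return 1;
	}
	printf("Using APIC driver %s\n", apic->name);
	return 0;
}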
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index d8c4a6feb286..3fe986698929 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,27 +23,6 @@
23#include <asm/ipi.h> 23#include <asm/ipi.h>
24#include <asm/setup.h> 24#include <asm/setup.h>
25 25
26extern struct apic apic_flat;
27extern struct apic apic_physflat;
28extern struct apic apic_x2xpic_uv_x;
29extern struct apic apic_x2apic_phys;
30extern struct apic apic_x2apic_cluster;
31
32struct apic __read_mostly *apic = &apic_flat;
33EXPORT_SYMBOL_GPL(apic);
34
35static struct apic *apic_probe[] __initdata = {
36#ifdef CONFIG_X86_UV
37 &apic_x2apic_uv_x,
38#endif
39#ifdef CONFIG_X86_X2APIC
40 &apic_x2apic_phys,
41 &apic_x2apic_cluster,
42#endif
43 &apic_physflat,
44 NULL,
45};
46
47static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) 26static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
48{ 27{
49 return hard_smp_processor_id() >> index_msb; 28 return hard_smp_processor_id() >> index_msb;
@@ -54,26 +33,20 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
54 */ 33 */
55void __init default_setup_apic_routing(void) 34void __init default_setup_apic_routing(void)
56{ 35{
36 struct apic **drv;
57 37
58 enable_IR_x2apic(); 38 enable_IR_x2apic();
59 39
60#ifdef CONFIG_X86_X2APIC 40 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
61 if (x2apic_mode 41 if ((*drv)->probe && (*drv)->probe()) {
62#ifdef CONFIG_X86_UV 42 if (apic != *drv) {
63 && apic != &apic_x2apic_uv_x 43 apic = *drv;
64#endif 44 pr_info("Switched APIC routing to %s.\n",
65 ) { 45 apic->name);
66 if (x2apic_phys) 46 }
67 apic = &apic_x2apic_phys; 47 break;
68 else 48 }
69 apic = &apic_x2apic_cluster;
70 } 49 }
71#endif
72
73 if (apic == &apic_flat && num_possible_cpus() > 8)
74 apic = &apic_physflat;
75
76 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
77 50
78 if (is_vsmp_box()) { 51 if (is_vsmp_box()) {
79 /* need to update phys_pkg_id */ 52 /* need to update phys_pkg_id */
@@ -90,13 +63,15 @@ void apic_send_IPI_self(int vector)
90 63
91int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 64int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
92{ 65{
93 int i; 66 struct apic **drv;
94 67
95 for (i = 0; apic_probe[i]; ++i) { 68 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
96 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { 69 if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
97 apic = apic_probe[i]; 70 if (apic != *drv) {
98 printk(KERN_INFO "Setting APIC routing to %s.\n", 71 apic = *drv;
99 apic->name); 72 pr_info("Setting APIC routing to %s.\n",
73 apic->name);
74 }
100 return 1; 75 return 1;
101 } 76 }
102 } 77 }
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index e4b8059b414a..19114423c58c 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -491,7 +491,7 @@ void setup_summit(void)
491} 491}
492#endif 492#endif
493 493
494struct apic apic_summit = { 494static struct apic apic_summit = {
495 495
496 .name = "summit", 496 .name = "summit",
497 .probe = probe_summit, 497 .probe = probe_summit,
@@ -551,5 +551,6 @@ struct apic apic_summit = {
551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
552 552
553 .x86_32_early_logical_apicid = summit_early_logical_apicid, 553 .x86_32_early_logical_apicid = summit_early_logical_apicid,
554 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
555}; 554};
555
556apic_driver(apic_summit);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 90949bbd566d..500795875827 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,118 +5,95 @@
5#include <linux/ctype.h> 5#include <linux/ctype.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8#include <linux/cpu.h>
8 9
9#include <asm/smp.h> 10#include <asm/smp.h>
10#include <asm/apic.h> 11#include <asm/x2apic.h>
11#include <asm/ipi.h>
12 12
13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
15static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
14 16
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 17static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 18{
17 return x2apic_enabled(); 19 return x2apic_enabled();
18} 20}
19 21
20/* 22static inline u32 x2apic_cluster(int cpu)
21 * need to use more than cpu 0, because we need more vectors when
22 * MSI-X are used.
23 */
24static const struct cpumask *x2apic_target_cpus(void)
25{ 23{
26 return cpu_online_mask; 24 return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
27}
28
29/*
30 * for now each logical cpu is in its own vector allocation domain.
31 */
32static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
33{
34 cpumask_clear(retmask);
35 cpumask_set_cpu(cpu, retmask);
36} 25}
37 26
38static void 27static void
39 __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) 28__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
40{ 29{
41 unsigned long cfg; 30 struct cpumask *cpus_in_cluster_ptr;
31 struct cpumask *ipi_mask_ptr;
32 unsigned int cpu, this_cpu;
33 unsigned long flags;
34 u32 dest;
35
36 x2apic_wrmsr_fence();
37
38 local_irq_save(flags);
42 39
43 cfg = __prepare_ICR(0, vector, dest); 40 this_cpu = smp_processor_id();
44 41
45 /* 42 /*
 46 * send the IPI. 43 * We are to modify mask, so we need our own copy
 44 * and must be sure it's manipulated with irqs off.
47 */ 45 */
48 native_x2apic_icr_write(cfg, apicid); 46 ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
49} 47 cpumask_copy(ipi_mask_ptr, mask);
50 48
51/* 49 /*
52 * for now, we send the IPI's one by one in the cpumask. 50 * The idea is to send one IPI per cluster.
53 * TBD: Based on the cpu mask, we can send the IPI's to the cluster group 51 */
54 * at once. We have 16 cpu's in a cluster. This will minimize IPI register 52 for_each_cpu(cpu, ipi_mask_ptr) {
55 * writes. 53 unsigned long i;
56 */
57static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58{
59 unsigned long query_cpu;
60 unsigned long flags;
61 54
62 x2apic_wrmsr_fence(); 55 cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
56 dest = 0;
63 57
64 local_irq_save(flags); 58 /* Collect cpus in cluster. */
65 for_each_cpu(query_cpu, mask) { 59 for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
66 __x2apic_send_IPI_dest( 60 if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
67 per_cpu(x86_cpu_to_logical_apicid, query_cpu), 61 dest |= per_cpu(x86_cpu_to_logical_apicid, i);
68 vector, apic->dest_logical); 62 }
63
64 if (!dest)
65 continue;
66
67 __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
68 /*
 69 * Cluster sibling cpus should be discarded now so
 70 * we do not send an IPI to them a second time.
71 */
72 cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
69 } 73 }
74
70 local_irq_restore(flags); 75 local_irq_restore(flags);
71} 76}
72 77
78static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
79{
80 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
81}
82
73static void 83static void
74 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 84 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
75{ 85{
76 unsigned long this_cpu = smp_processor_id(); 86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
77 unsigned long query_cpu;
78 unsigned long flags;
79
80 x2apic_wrmsr_fence();
81
82 local_irq_save(flags);
83 for_each_cpu(query_cpu, mask) {
84 if (query_cpu == this_cpu)
85 continue;
86 __x2apic_send_IPI_dest(
87 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
88 vector, apic->dest_logical);
89 }
90 local_irq_restore(flags);
91} 87}
92 88
93static void x2apic_send_IPI_allbutself(int vector) 89static void x2apic_send_IPI_allbutself(int vector)
94{ 90{
95 unsigned long this_cpu = smp_processor_id(); 91 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
96 unsigned long query_cpu;
97 unsigned long flags;
98
99 x2apic_wrmsr_fence();
100
101 local_irq_save(flags);
102 for_each_online_cpu(query_cpu) {
103 if (query_cpu == this_cpu)
104 continue;
105 __x2apic_send_IPI_dest(
106 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
107 vector, apic->dest_logical);
108 }
109 local_irq_restore(flags);
110} 92}
111 93
112static void x2apic_send_IPI_all(int vector) 94static void x2apic_send_IPI_all(int vector)
113{ 95{
114 x2apic_send_IPI_mask(cpu_online_mask, vector); 96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
115}
116
117static int x2apic_apic_id_registered(void)
118{
119 return 1;
120} 97}
121 98
122static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 99static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -151,43 +128,90 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
151 return per_cpu(x86_cpu_to_logical_apicid, cpu); 128 return per_cpu(x86_cpu_to_logical_apicid, cpu);
152} 129}
153 130
154static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) 131static void init_x2apic_ldr(void)
155{ 132{
156 unsigned int id; 133 unsigned int this_cpu = smp_processor_id();
134 unsigned int cpu;
157 135
158 id = x; 136 per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
159 return id; 137
138 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
139 for_each_online_cpu(cpu) {
140 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
141 continue;
142 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
143 __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
144 }
160} 145}
161 146
162static unsigned long set_apic_id(unsigned int id) 147 /*
148 * At CPU state changes, update the x2apic cluster sibling info.
149 */
150static int __cpuinit
151update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
163{ 152{
164 unsigned long x; 153 unsigned int this_cpu = (unsigned long)hcpu;
154 unsigned int cpu;
155 int err = 0;
156
157 switch (action) {
158 case CPU_UP_PREPARE:
159 if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
160 GFP_KERNEL)) {
161 err = -ENOMEM;
162 } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
163 GFP_KERNEL)) {
164 free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
165 err = -ENOMEM;
166 }
167 break;
168 case CPU_UP_CANCELED:
169 case CPU_UP_CANCELED_FROZEN:
170 case CPU_DEAD:
171 for_each_online_cpu(cpu) {
172 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
173 continue;
174 __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
175 __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
176 }
177 free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
178 free_cpumask_var(per_cpu(ipi_mask, this_cpu));
179 break;
180 }
165 181
166 x = id; 182 return notifier_from_errno(err);
167 return x;
168} 183}
169 184
170static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) 185static struct notifier_block __refdata x2apic_cpu_notifier = {
171{ 186 .notifier_call = update_clusterinfo,
172 return initial_apicid >> index_msb; 187};
173}
174 188
175static void x2apic_send_IPI_self(int vector) 189static int x2apic_init_cpu_notifier(void)
176{ 190{
177 apic_write(APIC_SELF_IPI, vector); 191 int cpu = smp_processor_id();
192
193 zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
194 zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
195
196 BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
197
198 __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
199 register_hotcpu_notifier(&x2apic_cpu_notifier);
200 return 1;
178} 201}
179 202
180static void init_x2apic_ldr(void) 203static int x2apic_cluster_probe(void)
181{ 204{
182 int cpu = smp_processor_id(); 205 if (x2apic_mode)
183 206 return x2apic_init_cpu_notifier();
184 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); 207 else
208 return 0;
185} 209}
186 210
187struct apic apic_x2apic_cluster = { 211static struct apic apic_x2apic_cluster = {
188 212
189 .name = "cluster x2apic", 213 .name = "cluster x2apic",
190 .probe = NULL, 214 .probe = x2apic_cluster_probe,
191 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 215 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
192 .apic_id_registered = x2apic_apic_id_registered, 216 .apic_id_registered = x2apic_apic_id_registered,
193 217
@@ -211,11 +235,11 @@ struct apic apic_x2apic_cluster = {
211 .setup_portio_remap = NULL, 235 .setup_portio_remap = NULL,
212 .check_phys_apicid_present = default_check_phys_apicid_present, 236 .check_phys_apicid_present = default_check_phys_apicid_present,
213 .enable_apic_mode = NULL, 237 .enable_apic_mode = NULL,
214 .phys_pkg_id = x2apic_cluster_phys_pkg_id, 238 .phys_pkg_id = x2apic_phys_pkg_id,
215 .mps_oem_check = NULL, 239 .mps_oem_check = NULL,
216 240
217 .get_apic_id = x2apic_cluster_phys_get_apic_id, 241 .get_apic_id = x2apic_get_apic_id,
218 .set_apic_id = set_apic_id, 242 .set_apic_id = x2apic_set_apic_id,
219 .apic_id_mask = 0xFFFFFFFFu, 243 .apic_id_mask = 0xFFFFFFFFu,
220 244
 221 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 245 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -240,3 +264,5 @@ struct apic apic_x2apic_cluster = {
240 .wait_icr_idle = native_x2apic_wait_icr_idle, 264 .wait_icr_idle = native_x2apic_wait_icr_idle,
241 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, 265 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
242}; 266};
267
268apic_driver(apic_x2apic_cluster);
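The new __x2apic_send_IPI_mask() batches destinations so that each x2APIC cluster receives a single logical-mode IPI: sibling CPUs in the same cluster have their logical APIC IDs OR-ed into one destination and are then dropped from the working mask. The following is a self-contained model of that batching, with made-up logical IDs and plain arrays in place of cpumasks and per-cpu data.

#include <stdio.h>
#include <stdint.h>

#define NCPUS 8

/* Logical APIC ID: cluster number in bits 31..16, CPU bit in bits 15..0. */
static uint32_t cpu_to_logical_apicid[NCPUS] = {
	0x00010001, 0x00010002, 0x00010004, 0x00010008,  /* cluster 1 */
	0x00020001, 0x00020002, 0x00020004, 0x00020008,  /* cluster 2 */
};

static uint32_t cluster_of(int cpu)
{
	return cpu_to_logical_apicid[cpu] >> 16;
}

int main(void)
{
	uint8_t pending[NCPUS];
	for (int cpu = 0; cpu < NCPUS; cpu++)
		pending[cpu] = 1;            /* pretend the mask covers all CPUs */

	/* One pass per cluster: OR the sibling IDs, then drop them from the mask. */
	for (int cpu = 0; cpu < NCPUS; cpu++) {
		uint32_t dest = 0;

		if (!pending[cpu])
			continue;
		for (int i = 0; i < NCPUS; i++) {
			if (pending[i] && cluster_of(i) == cluster_of(cpu)) {
				dest |= cpu_to_logical_apicid[i];
				pending[i] = 0;  /* don't IPI this sibling again */
			}
		}
		printf("cluster %u: one IPI to logical dest %#x\n",
		       (unsigned)cluster_of(cpu), (unsigned)dest);
	}
	return 0;
}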
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index c7e6d6645bf4..f5373dfde21e 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,11 +7,12 @@
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8 8
9#include <asm/smp.h> 9#include <asm/smp.h>
10#include <asm/apic.h> 10#include <asm/x2apic.h>
11#include <asm/ipi.h>
12 11
13int x2apic_phys; 12int x2apic_phys;
14 13
14static struct apic apic_x2apic_phys;
15
15static int set_x2apic_phys_mode(char *arg) 16static int set_x2apic_phys_mode(char *arg)
16{ 17{
17 x2apic_phys = 1; 18 x2apic_phys = 1;
@@ -27,94 +28,46 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
27 return 0; 28 return 0;
28} 29}
29 30
30/* 31static void
31 * need to use more than cpu 0, because we need more vectors when 32__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
32 * MSI-X are used.
33 */
34static const struct cpumask *x2apic_target_cpus(void)
35{
36 return cpu_online_mask;
37}
38
39static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
40{
41 cpumask_clear(retmask);
42 cpumask_set_cpu(cpu, retmask);
43}
44
45static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
46 unsigned int dest)
47{
48 unsigned long cfg;
49
50 cfg = __prepare_ICR(0, vector, dest);
51
52 /*
53 * send the IPI.
54 */
55 native_x2apic_icr_write(cfg, apicid);
56}
57
58static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
59{ 33{
60 unsigned long query_cpu; 34 unsigned long query_cpu;
35 unsigned long this_cpu;
61 unsigned long flags; 36 unsigned long flags;
62 37
63 x2apic_wrmsr_fence(); 38 x2apic_wrmsr_fence();
64 39
65 local_irq_save(flags); 40 local_irq_save(flags);
41
42 this_cpu = smp_processor_id();
66 for_each_cpu(query_cpu, mask) { 43 for_each_cpu(query_cpu, mask) {
44 if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
45 continue;
67 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 46 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
68 vector, APIC_DEST_PHYSICAL); 47 vector, APIC_DEST_PHYSICAL);
69 } 48 }
70 local_irq_restore(flags); 49 local_irq_restore(flags);
71} 50}
72 51
52static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
53{
54 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
55}
56
73static void 57static void
74 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 58 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
75{ 59{
76 unsigned long this_cpu = smp_processor_id(); 60 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
77 unsigned long query_cpu;
78 unsigned long flags;
79
80 x2apic_wrmsr_fence();
81
82 local_irq_save(flags);
83 for_each_cpu(query_cpu, mask) {
84 if (query_cpu != this_cpu)
85 __x2apic_send_IPI_dest(
86 per_cpu(x86_cpu_to_apicid, query_cpu),
87 vector, APIC_DEST_PHYSICAL);
88 }
89 local_irq_restore(flags);
90} 61}
91 62
92static void x2apic_send_IPI_allbutself(int vector) 63static void x2apic_send_IPI_allbutself(int vector)
93{ 64{
94 unsigned long this_cpu = smp_processor_id(); 65 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
95 unsigned long query_cpu;
96 unsigned long flags;
97
98 x2apic_wrmsr_fence();
99
100 local_irq_save(flags);
101 for_each_online_cpu(query_cpu) {
102 if (query_cpu == this_cpu)
103 continue;
104 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
105 vector, APIC_DEST_PHYSICAL);
106 }
107 local_irq_restore(flags);
108} 66}
109 67
110static void x2apic_send_IPI_all(int vector) 68static void x2apic_send_IPI_all(int vector)
111{ 69{
112 x2apic_send_IPI_mask(cpu_online_mask, vector); 70 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
113}
114
115static int x2apic_apic_id_registered(void)
116{
117 return 1;
118} 71}
119 72
120static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 73static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -149,34 +102,22 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
149 return per_cpu(x86_cpu_to_apicid, cpu); 102 return per_cpu(x86_cpu_to_apicid, cpu);
150} 103}
151 104
152static unsigned int x2apic_phys_get_apic_id(unsigned long x) 105static void init_x2apic_ldr(void)
153{
154 return x;
155}
156
157static unsigned long set_apic_id(unsigned int id)
158{
159 return id;
160}
161
162static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
163{ 106{
164 return initial_apicid >> index_msb;
165} 107}
166 108
167static void x2apic_send_IPI_self(int vector) 109static int x2apic_phys_probe(void)
168{ 110{
169 apic_write(APIC_SELF_IPI, vector); 111 if (x2apic_mode && x2apic_phys)
170} 112 return 1;
171 113
172static void init_x2apic_ldr(void) 114 return apic == &apic_x2apic_phys;
173{
174} 115}
175 116
176struct apic apic_x2apic_phys = { 117static struct apic apic_x2apic_phys = {
177 118
178 .name = "physical x2apic", 119 .name = "physical x2apic",
179 .probe = NULL, 120 .probe = x2apic_phys_probe,
180 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 121 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
181 .apic_id_registered = x2apic_apic_id_registered, 122 .apic_id_registered = x2apic_apic_id_registered,
182 123
@@ -203,8 +144,8 @@ struct apic apic_x2apic_phys = {
203 .phys_pkg_id = x2apic_phys_pkg_id, 144 .phys_pkg_id = x2apic_phys_pkg_id,
204 .mps_oem_check = NULL, 145 .mps_oem_check = NULL,
205 146
206 .get_apic_id = x2apic_phys_get_apic_id, 147 .get_apic_id = x2apic_get_apic_id,
207 .set_apic_id = set_apic_id, 148 .set_apic_id = x2apic_set_apic_id,
208 .apic_id_mask = 0xFFFFFFFFu, 149 .apic_id_mask = 0xFFFFFFFFu,
209 150
210 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 151 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -229,3 +170,5 @@ struct apic apic_x2apic_phys = {
229 .wait_icr_idle = native_x2apic_wait_icr_idle, 170 .wait_icr_idle = native_x2apic_wait_icr_idle,
230 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, 171 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
231}; 172};
173
174apic_driver(apic_x2apic_phys);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 33b10a0fc095..f450b683dfcf 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -37,6 +37,13 @@
37#include <asm/smp.h> 37#include <asm/smp.h>
38#include <asm/x86_init.h> 38#include <asm/x86_init.h>
39#include <asm/emergency-restart.h> 39#include <asm/emergency-restart.h>
40#include <asm/nmi.h>
41
 42/* BMC sets a bit in this MMR, making it non-zero, before sending an NMI */
43#define UVH_NMI_MMR UVH_SCRATCH5
44#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8)
45#define UV_NMI_PENDING_MASK (1UL << 63)
46DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
40 47
41DEFINE_PER_CPU(int, x2apic_extra_bits); 48DEFINE_PER_CPU(int, x2apic_extra_bits);
42 49
@@ -51,6 +58,8 @@ unsigned int uv_apicid_hibits;
51EXPORT_SYMBOL_GPL(uv_apicid_hibits); 58EXPORT_SYMBOL_GPL(uv_apicid_hibits);
52static DEFINE_SPINLOCK(uv_nmi_lock); 59static DEFINE_SPINLOCK(uv_nmi_lock);
53 60
61static struct apic apic_x2apic_uv_x;
62
54static unsigned long __init uv_early_read_mmr(unsigned long addr) 63static unsigned long __init uv_early_read_mmr(unsigned long addr)
55{ 64{
56 unsigned long val, *mmr; 65 unsigned long val, *mmr;
@@ -319,10 +328,15 @@ static void uv_send_IPI_self(int vector)
319 apic_write(APIC_SELF_IPI, vector); 328 apic_write(APIC_SELF_IPI, vector);
320} 329}
321 330
322struct apic __refdata apic_x2apic_uv_x = { 331static int uv_probe(void)
332{
333 return apic == &apic_x2apic_uv_x;
334}
335
336static struct apic __refdata apic_x2apic_uv_x = {
323 337
324 .name = "UV large system", 338 .name = "UV large system",
325 .probe = NULL, 339 .probe = uv_probe,
326 .acpi_madt_oem_check = uv_acpi_madt_oem_check, 340 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
327 .apic_id_registered = uv_apic_id_registered, 341 .apic_id_registered = uv_apic_id_registered,
328 342
@@ -642,18 +656,46 @@ void __cpuinit uv_cpu_init(void)
642 */ 656 */
643int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) 657int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
644{ 658{
659 unsigned long real_uv_nmi;
660 int bid;
661
645 if (reason != DIE_NMIUNKNOWN) 662 if (reason != DIE_NMIUNKNOWN)
646 return NOTIFY_OK; 663 return NOTIFY_OK;
647 664
648 if (in_crash_kexec) 665 if (in_crash_kexec)
649 /* do nothing if entering the crash kernel */ 666 /* do nothing if entering the crash kernel */
650 return NOTIFY_OK; 667 return NOTIFY_OK;
668
669 /*
670 * Each blade has an MMR that indicates when an NMI has been sent
671 * to cpus on the blade. If an NMI is detected, atomically
672 * clear the MMR and update a per-blade NMI count used to
673 * cause each cpu on the blade to notice a new NMI.
674 */
675 bid = uv_numa_blade_id();
676 real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
677
678 if (unlikely(real_uv_nmi)) {
679 spin_lock(&uv_blade_info[bid].nmi_lock);
680 real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
681 if (real_uv_nmi) {
682 uv_blade_info[bid].nmi_count++;
683 uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
684 }
685 spin_unlock(&uv_blade_info[bid].nmi_lock);
686 }
687
688 if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
689 return NOTIFY_DONE;
690
691 __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
692
651 /* 693 /*
652 * Use a lock so only one cpu prints at a time 694 * Use a lock so only one cpu prints at a time.
653 * to prevent intermixed output. 695 * This prevents intermixed output.
654 */ 696 */
655 spin_lock(&uv_nmi_lock); 697 spin_lock(&uv_nmi_lock);
656 pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); 698 pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
657 dump_stack(); 699 dump_stack();
658 spin_unlock(&uv_nmi_lock); 700 spin_unlock(&uv_nmi_lock);
659 701
@@ -661,7 +703,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
661} 703}
662 704
663static struct notifier_block uv_dump_stack_nmi_nb = { 705static struct notifier_block uv_dump_stack_nmi_nb = {
664 .notifier_call = uv_handle_nmi 706 .notifier_call = uv_handle_nmi,
707 .priority = NMI_LOCAL_LOW_PRIOR - 1,
665}; 708};
666 709
667void uv_register_nmi_notifier(void) 710void uv_register_nmi_notifier(void)
@@ -720,8 +763,9 @@ void __init uv_system_init(void)
720 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); 763 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
721 764
722 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 765 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
723 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 766 uv_blade_info = kzalloc(bytes, GFP_KERNEL);
724 BUG_ON(!uv_blade_info); 767 BUG_ON(!uv_blade_info);
768
725 for (blade = 0; blade < uv_num_possible_blades(); blade++) 769 for (blade = 0; blade < uv_num_possible_blades(); blade++)
726 uv_blade_info[blade].memory_nid = -1; 770 uv_blade_info[blade].memory_nid = -1;
727 771
@@ -747,6 +791,7 @@ void __init uv_system_init(void)
747 uv_blade_info[blade].pnode = pnode; 791 uv_blade_info[blade].pnode = pnode;
748 uv_blade_info[blade].nr_possible_cpus = 0; 792 uv_blade_info[blade].nr_possible_cpus = 0;
749 uv_blade_info[blade].nr_online_cpus = 0; 793 uv_blade_info[blade].nr_online_cpus = 0;
794 spin_lock_init(&uv_blade_info[blade].nmi_lock);
750 max_pnode = max(pnode, max_pnode); 795 max_pnode = max(pnode, max_pnode);
751 blade++; 796 blade++;
752 } 797 }
@@ -821,3 +866,5 @@ void __init uv_system_init(void)
 	if (is_kdump_kernel())
 		reboot_type = BOOT_ACPI;
 }
+
+apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 0b4be431c620..3bfa02235965 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,6 +228,7 @@
 #include <linux/kthread.h>
 #include <linux/jiffies.h>
 #include <linux/acpi.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -1237,7 +1238,7 @@ static int suspend(int vetoable)
 	dpm_suspend_noirq(PMSG_SUSPEND);
 
 	local_irq_disable();
-	sysdev_suspend(PMSG_SUSPEND);
+	syscore_suspend();
 
 	local_irq_enable();
 
@@ -1255,7 +1256,7 @@ static int suspend(int vetoable)
 		apm_error("suspend", err);
 	err = (err == APM_SUCCESS) ? 0 : -EIO;
 
-	sysdev_resume();
+	syscore_resume();
 	local_irq_enable();
 
 	dpm_resume_noirq(PMSG_RESUME);
@@ -1279,7 +1280,7 @@ static void standby(void)
 	dpm_suspend_noirq(PMSG_SUSPEND);
 
 	local_irq_disable();
-	sysdev_suspend(PMSG_SUSPEND);
+	syscore_suspend();
 	local_irq_enable();
 
 	err = set_system_power_state(APM_STATE_STANDBY);
@@ -1287,7 +1288,7 @@ static void standby(void)
 		apm_error("standby", err);
 
 	local_irq_disable();
-	sysdev_resume();
+	syscore_resume();
 	local_irq_enable();
 
 	dpm_resume_noirq(PMSG_RESUME);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3f0ebe429a01..6042981d0309 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 
 obj-$(CONFIG_X86_MCE) += mcheck/
 obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
 
 obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3ecece0217ef..8f5cabb3c5b0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -613,8 +613,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 #endif
 
 	/* As a rule processors have APIC timer running in deep C states */
-	if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400))
+	if (c->x86 > 0xf && !cpu_has_amd_erratum(amd_erratum_400))
 		set_cpu_cap(c, X86_FEATURE_ARAT);
+
+	/*
+	 * Disable GART TLB Walk Errors on Fam10h. We do this here
+	 * because this is always needed when GART is enabled, even in a
+	 * kernel which has no MCE support built in.
+	 */
+	if (c->x86 == 0x10) {
+		/*
+		 * The BIOS should disable GartTlbWlk Errors itself. If it
+		 * doesn't, do it here as suggested by the BKDG.
+		 *
+		 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
+		 */
+		u64 mask;
+		int err;
+
+		err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
+		if (err == 0) {
+			mask |= (1 << 10);
+			checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
+		}
+	}
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e2ced0074a45..c8b41623377f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
 }
 #endif
 
+static int disable_smep __cpuinitdata;
+static __init int setup_disable_smep(char *arg)
+{
+	disable_smep = 1;
+	return 1;
+}
+__setup("nosmep", setup_disable_smep);
+
+static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
+{
+	if (cpu_has(c, X86_FEATURE_SMEP)) {
+		if (unlikely(disable_smep)) {
+			setup_clear_cpu_cap(X86_FEATURE_SMEP);
+			clear_in_cr4(X86_CR4_SMEP);
+		} else
+			set_in_cr4(X86_CR4_SMEP);
+	}
+}
+
 /*
  * Some CPU features depend on higher CPUID levels, which may not always
  * be available due to CPUID level capping or broken virtualization
@@ -565,8 +584,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 
 		cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
 
-		if (eax > 0)
-			c->x86_capability[9] = ebx;
+		c->x86_capability[9] = ebx;
 	}
 
 	/* AMD-defined flags: level 0x80000001 */
@@ -668,6 +686,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	c->cpu_index = 0;
 #endif
 	filter_cpuid_features(c, false);
+
+	setup_smep(c);
 }
 
 void __init early_cpu_init(void)
@@ -753,6 +773,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
 #endif
 	}
 
+	setup_smep(c);
+
 	get_model_name(c); /* Default name */
 
 	detect_nopl(c);
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
deleted file mode 100644
index 870e6cc6ad28..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ /dev/null
@@ -1,266 +0,0 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
17 This driver adds support for the PCC interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
27config X86_ACPI_CPUFREQ
28 tristate "ACPI Processor P-States driver"
29 select CPU_FREQ_TABLE
30 depends on ACPI_PROCESSOR
31 help
32 This driver adds a CPUFreq driver which utilizes the ACPI
33 Processor Performance States.
34 This driver also supports Intel Enhanced Speedstep.
35
36 To compile this driver as a module, choose M here: the
37 module will be called acpi-cpufreq.
38
39 For details, take a look at <file:Documentation/cpu-freq/>.
40
41 If in doubt, say N.
42
43config ELAN_CPUFREQ
44 tristate "AMD Elan SC400 and SC410"
45 select CPU_FREQ_TABLE
46 depends on X86_ELAN
47 ---help---
48 This adds the CPUFreq driver for AMD Elan SC400 and SC410
49 processors.
50
51 You need to specify the processor maximum speed as boot
52 parameter: elanfreq=maxspeed (in kHz) or as module
53 parameter "max_freq".
54
55 For details, take a look at <file:Documentation/cpu-freq/>.
56
57 If in doubt, say N.
58
59config SC520_CPUFREQ
60 tristate "AMD Elan SC520"
61 select CPU_FREQ_TABLE
62 depends on X86_ELAN
63 ---help---
64 This adds the CPUFreq driver for AMD Elan SC520 processor.
65
66 For details, take a look at <file:Documentation/cpu-freq/>.
67
68 If in doubt, say N.
69
70
71config X86_POWERNOW_K6
72 tristate "AMD Mobile K6-2/K6-3 PowerNow!"
73 select CPU_FREQ_TABLE
74 depends on X86_32
75 help
76 This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
77 AMD K6-3+ processors.
78
79 For details, take a look at <file:Documentation/cpu-freq/>.
80
81 If in doubt, say N.
82
83config X86_POWERNOW_K7
84 tristate "AMD Mobile Athlon/Duron PowerNow!"
85 select CPU_FREQ_TABLE
86 depends on X86_32
87 help
 88 This adds the CPUFreq driver for mobile AMD K7 processors.
89
90 For details, take a look at <file:Documentation/cpu-freq/>.
91
92 If in doubt, say N.
93
94config X86_POWERNOW_K7_ACPI
95 bool
96 depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
97 depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
98 depends on X86_32
99 default y
100
101config X86_POWERNOW_K8
102 tristate "AMD Opteron/Athlon64 PowerNow!"
103 select CPU_FREQ_TABLE
104 depends on ACPI && ACPI_PROCESSOR
105 help
106 This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
107
108 To compile this driver as a module, choose M here: the
109 module will be called powernow-k8.
110
111 For details, take a look at <file:Documentation/cpu-freq/>.
112
113config X86_GX_SUSPMOD
114 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
115 depends on X86_32 && PCI
116 help
 117 This adds the CPUFreq driver for NatSemi Geode processors which
118 support suspend modulation.
119
120 For details, take a look at <file:Documentation/cpu-freq/>.
121
122 If in doubt, say N.
123
124config X86_SPEEDSTEP_CENTRINO
125 tristate "Intel Enhanced SpeedStep (deprecated)"
126 select CPU_FREQ_TABLE
127 select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32
128 depends on X86_32 || (X86_64 && ACPI_PROCESSOR)
129 help
130 This is deprecated and this functionality is now merged into
131 acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
132 speedstep_centrino.
133 This adds the CPUFreq driver for Enhanced SpeedStep enabled
134 mobile CPUs. This means Intel Pentium M (Centrino) CPUs
135 or 64bit enabled Intel Xeons.
136
137 To compile this driver as a module, choose M here: the
138 module will be called speedstep-centrino.
139
140 For details, take a look at <file:Documentation/cpu-freq/>.
141
142 If in doubt, say N.
143
144config X86_SPEEDSTEP_CENTRINO_TABLE
145 bool "Built-in tables for Banias CPUs"
146 depends on X86_32 && X86_SPEEDSTEP_CENTRINO
147 default y
148 help
149 Use built-in tables for Banias CPUs if ACPI encoding
150 is not available.
151
152 If in doubt, say N.
153
154config X86_SPEEDSTEP_ICH
155 tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
156 select CPU_FREQ_TABLE
157 depends on X86_32
158 help
159 This adds the CPUFreq driver for certain mobile Intel Pentium III
160 (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
161 mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
162 ICH3 or ICH4 southbridge.
163
164 For details, take a look at <file:Documentation/cpu-freq/>.
165
166 If in doubt, say N.
167
168config X86_SPEEDSTEP_SMI
169 tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
170 select CPU_FREQ_TABLE
171 depends on X86_32 && EXPERIMENTAL
172 help
173 This adds the CPUFreq driver for certain mobile Intel Pentium III
 174 (Coppermine) and all mobile Intel Pentium III-M (Tualatin) processors
175 on systems which have an Intel 440BX/ZX/MX southbridge.
176
177 For details, take a look at <file:Documentation/cpu-freq/>.
178
179 If in doubt, say N.
180
181config X86_P4_CLOCKMOD
182 tristate "Intel Pentium 4 clock modulation"
183 select CPU_FREQ_TABLE
184 help
185 This adds the CPUFreq driver for Intel Pentium 4 / XEON
186 processors. When enabled it will lower CPU temperature by skipping
187 clocks.
188
189 This driver should be only used in exceptional
190 circumstances when very low power is needed because it causes severe
191 slowdowns and noticeable latencies. Normally Speedstep should be used
192 instead.
193
194 To compile this driver as a module, choose M here: the
195 module will be called p4-clockmod.
196
197 For details, take a look at <file:Documentation/cpu-freq/>.
198
199 Unless you are absolutely sure say N.
200
201config X86_CPUFREQ_NFORCE2
202 tristate "nVidia nForce2 FSB changing"
203 depends on X86_32 && EXPERIMENTAL
204 help
205 This adds the CPUFreq driver for FSB changing on nVidia nForce2
206 platforms.
207
208 For details, take a look at <file:Documentation/cpu-freq/>.
209
210 If in doubt, say N.
211
212config X86_LONGRUN
213 tristate "Transmeta LongRun"
214 depends on X86_32
215 help
216 This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
217 which support LongRun.
218
219 For details, take a look at <file:Documentation/cpu-freq/>.
220
221 If in doubt, say N.
222
223config X86_LONGHAUL
224 tristate "VIA Cyrix III Longhaul"
225 select CPU_FREQ_TABLE
226 depends on X86_32 && ACPI_PROCESSOR
227 help
228 This adds the CPUFreq driver for VIA Samuel/CyrixIII,
229 VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
230 processors.
231
232 For details, take a look at <file:Documentation/cpu-freq/>.
233
234 If in doubt, say N.
235
236config X86_E_POWERSAVER
237 tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
238 select CPU_FREQ_TABLE
239 depends on X86_32 && EXPERIMENTAL
240 help
241 This adds the CPUFreq driver for VIA C7 processors. However, this driver
242 does not have any safeguards to prevent operating the CPU out of spec
243 and is thus considered dangerous. Please use the regular ACPI cpufreq
244 driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
245
246 If in doubt, say N.
247
248comment "shared options"
249
250config X86_SPEEDSTEP_LIB
251 tristate
252 default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
253
254config X86_SPEEDSTEP_RELAXED_CAP_CHECK
255 bool "Relaxed speedstep capability checks"
256 depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
257 help
258 Don't perform all checks for a speedstep capable system which would
259 normally be done. Some ancient or strange systems, though speedstep
260 capable, don't always indicate that they are speedstep capable. This
261 option lets the probing code bypass some of those checks if the
262 parameter "relaxed_check=1" is passed to the module.
263
264endif # CPU_FREQ
265
266endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
deleted file mode 100644
index bd54bf67e6fb..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
1# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod.
4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
11obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
12obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
13obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
14obj-$(CONFIG_X86_LONGRUN) += longrun.o
15obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
16obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
17obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
18obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
19obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
20obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
21obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
deleted file mode 100644
index a2baafb2fe6d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ /dev/null
@@ -1,776 +0,0 @@
1/*
2 * acpi-cpufreq.c - ACPI Processor P-States Driver
3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
7 * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
8 *
9 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or (at
14 * your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write to the Free Software Foundation, Inc.,
23 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
24 *
25 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 */
27
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/smp.h>
32#include <linux/sched.h>
33#include <linux/cpufreq.h>
34#include <linux/compiler.h>
35#include <linux/dmi.h>
36#include <linux/slab.h>
37
38#include <linux/acpi.h>
39#include <linux/io.h>
40#include <linux/delay.h>
41#include <linux/uaccess.h>
42
43#include <acpi/processor.h>
44
45#include <asm/msr.h>
46#include <asm/processor.h>
47#include <asm/cpufeature.h>
48#include "mperf.h"
49
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "acpi-cpufreq", msg)
52
53MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
54MODULE_DESCRIPTION("ACPI Processor P-States Driver");
55MODULE_LICENSE("GPL");
56
57enum {
58 UNDEFINED_CAPABLE = 0,
59 SYSTEM_INTEL_MSR_CAPABLE,
60 SYSTEM_IO_CAPABLE,
61};
62
63#define INTEL_MSR_RANGE (0xffff)
64
65struct acpi_cpufreq_data {
66 struct acpi_processor_performance *acpi_data;
67 struct cpufreq_frequency_table *freq_table;
68 unsigned int resume;
69 unsigned int cpu_feature;
70};
71
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
73
74/* acpi_perf_data is a pointer to percpu data. */
75static struct acpi_processor_performance __percpu *acpi_perf_data;
76
77static struct cpufreq_driver acpi_cpufreq_driver;
78
79static unsigned int acpi_pstate_strict;
80
81static int check_est_cpu(unsigned int cpuid)
82{
83 struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
84
85 return cpu_has(cpu, X86_FEATURE_EST);
86}
87
88static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
89{
90 struct acpi_processor_performance *perf;
91 int i;
92
93 perf = data->acpi_data;
94
95 for (i = 0; i < perf->state_count; i++) {
96 if (value == perf->states[i].status)
97 return data->freq_table[i].frequency;
98 }
99 return 0;
100}
101
102static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
103{
104 int i;
105 struct acpi_processor_performance *perf;
106
107 msr &= INTEL_MSR_RANGE;
108 perf = data->acpi_data;
109
110 for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
111 if (msr == perf->states[data->freq_table[i].index].status)
112 return data->freq_table[i].frequency;
113 }
114 return data->freq_table[0].frequency;
115}
116
117static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
118{
119 switch (data->cpu_feature) {
120 case SYSTEM_INTEL_MSR_CAPABLE:
121 return extract_msr(val, data);
122 case SYSTEM_IO_CAPABLE:
123 return extract_io(val, data);
124 default:
125 return 0;
126 }
127}
128
129struct msr_addr {
130 u32 reg;
131};
132
133struct io_addr {
134 u16 port;
135 u8 bit_width;
136};
137
138struct drv_cmd {
139 unsigned int type;
140 const struct cpumask *mask;
141 union {
142 struct msr_addr msr;
143 struct io_addr io;
144 } addr;
145 u32 val;
146};
147
148/* Called via smp_call_function_single(), on the target CPU */
149static void do_drv_read(void *_cmd)
150{
151 struct drv_cmd *cmd = _cmd;
152 u32 h;
153
154 switch (cmd->type) {
155 case SYSTEM_INTEL_MSR_CAPABLE:
156 rdmsr(cmd->addr.msr.reg, cmd->val, h);
157 break;
158 case SYSTEM_IO_CAPABLE:
159 acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
160 &cmd->val,
161 (u32)cmd->addr.io.bit_width);
162 break;
163 default:
164 break;
165 }
166}
167
168/* Called via smp_call_function_many(), on the target CPUs */
169static void do_drv_write(void *_cmd)
170{
171 struct drv_cmd *cmd = _cmd;
172 u32 lo, hi;
173
174 switch (cmd->type) {
175 case SYSTEM_INTEL_MSR_CAPABLE:
176 rdmsr(cmd->addr.msr.reg, lo, hi);
177 lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
178 wrmsr(cmd->addr.msr.reg, lo, hi);
179 break;
180 case SYSTEM_IO_CAPABLE:
181 acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
182 cmd->val,
183 (u32)cmd->addr.io.bit_width);
184 break;
185 default:
186 break;
187 }
188}
189
190static void drv_read(struct drv_cmd *cmd)
191{
192 int err;
193 cmd->val = 0;
194
195 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
196 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
197}
198
199static void drv_write(struct drv_cmd *cmd)
200{
201 int this_cpu;
202
203 this_cpu = get_cpu();
204 if (cpumask_test_cpu(this_cpu, cmd->mask))
205 do_drv_write(cmd);
206 smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
207 put_cpu();
208}
209
210static u32 get_cur_val(const struct cpumask *mask)
211{
212 struct acpi_processor_performance *perf;
213 struct drv_cmd cmd;
214
215 if (unlikely(cpumask_empty(mask)))
216 return 0;
217
218 switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
219 case SYSTEM_INTEL_MSR_CAPABLE:
220 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
221 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
222 break;
223 case SYSTEM_IO_CAPABLE:
224 cmd.type = SYSTEM_IO_CAPABLE;
225 perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
226 cmd.addr.io.port = perf->control_register.address;
227 cmd.addr.io.bit_width = perf->control_register.bit_width;
228 break;
229 default:
230 return 0;
231 }
232
233 cmd.mask = mask;
234 drv_read(&cmd);
235
236 dprintk("get_cur_val = %u\n", cmd.val);
237
238 return cmd.val;
239}
240
241static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
242{
243 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
244 unsigned int freq;
245 unsigned int cached_freq;
246
247 dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
248
249 if (unlikely(data == NULL ||
250 data->acpi_data == NULL || data->freq_table == NULL)) {
251 return 0;
252 }
253
254 cached_freq = data->freq_table[data->acpi_data->state].frequency;
255 freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
256 if (freq != cached_freq) {
257 /*
258 * The dreaded BIOS frequency change behind our back.
259 * Force set the frequency on next target call.
260 */
261 data->resume = 1;
262 }
263
264 dprintk("cur freq = %u\n", freq);
265
266 return freq;
267}
268
269static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
270 struct acpi_cpufreq_data *data)
271{
272 unsigned int cur_freq;
273 unsigned int i;
274
275 for (i = 0; i < 100; i++) {
276 cur_freq = extract_freq(get_cur_val(mask), data);
277 if (cur_freq == freq)
278 return 1;
279 udelay(10);
280 }
281 return 0;
282}
283
284static int acpi_cpufreq_target(struct cpufreq_policy *policy,
285 unsigned int target_freq, unsigned int relation)
286{
287 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
288 struct acpi_processor_performance *perf;
289 struct cpufreq_freqs freqs;
290 struct drv_cmd cmd;
291 unsigned int next_state = 0; /* Index into freq_table */
292 unsigned int next_perf_state = 0; /* Index into perf table */
293 unsigned int i;
294 int result = 0;
295
296 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
297
298 if (unlikely(data == NULL ||
299 data->acpi_data == NULL || data->freq_table == NULL)) {
300 return -ENODEV;
301 }
302
303 perf = data->acpi_data;
304 result = cpufreq_frequency_table_target(policy,
305 data->freq_table,
306 target_freq,
307 relation, &next_state);
308 if (unlikely(result)) {
309 result = -ENODEV;
310 goto out;
311 }
312
313 next_perf_state = data->freq_table[next_state].index;
314 if (perf->state == next_perf_state) {
315 if (unlikely(data->resume)) {
316 dprintk("Called after resume, resetting to P%d\n",
317 next_perf_state);
318 data->resume = 0;
319 } else {
320 dprintk("Already at target state (P%d)\n",
321 next_perf_state);
322 goto out;
323 }
324 }
325
326 switch (data->cpu_feature) {
327 case SYSTEM_INTEL_MSR_CAPABLE:
328 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
329 cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
330 cmd.val = (u32) perf->states[next_perf_state].control;
331 break;
332 case SYSTEM_IO_CAPABLE:
333 cmd.type = SYSTEM_IO_CAPABLE;
334 cmd.addr.io.port = perf->control_register.address;
335 cmd.addr.io.bit_width = perf->control_register.bit_width;
336 cmd.val = (u32) perf->states[next_perf_state].control;
337 break;
338 default:
339 result = -ENODEV;
340 goto out;
341 }
342
343 /* cpufreq holds the hotplug lock, so we are safe from here on */
344 if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
345 cmd.mask = policy->cpus;
346 else
347 cmd.mask = cpumask_of(policy->cpu);
348
349 freqs.old = perf->states[perf->state].core_frequency * 1000;
350 freqs.new = data->freq_table[next_state].frequency;
351 for_each_cpu(i, policy->cpus) {
352 freqs.cpu = i;
353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
354 }
355
356 drv_write(&cmd);
357
358 if (acpi_pstate_strict) {
359 if (!check_freqs(cmd.mask, freqs.new, data)) {
360 dprintk("acpi_cpufreq_target failed (%d)\n",
361 policy->cpu);
362 result = -EAGAIN;
363 goto out;
364 }
365 }
366
367 for_each_cpu(i, policy->cpus) {
368 freqs.cpu = i;
369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
370 }
371 perf->state = next_perf_state;
372
373out:
374 return result;
375}
376
377static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
378{
379 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
380
381 dprintk("acpi_cpufreq_verify\n");
382
383 return cpufreq_frequency_table_verify(policy, data->freq_table);
384}
385
386static unsigned long
387acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
388{
389 struct acpi_processor_performance *perf = data->acpi_data;
390
391 if (cpu_khz) {
392 /* search the closest match to cpu_khz */
393 unsigned int i;
394 unsigned long freq;
395 unsigned long freqn = perf->states[0].core_frequency * 1000;
396
397 for (i = 0; i < (perf->state_count-1); i++) {
398 freq = freqn;
399 freqn = perf->states[i+1].core_frequency * 1000;
400 if ((2 * cpu_khz) > (freqn + freq)) {
401 perf->state = i;
402 return freq;
403 }
404 }
405 perf->state = perf->state_count-1;
406 return freqn;
407 } else {
408 /* assume CPU is at P0... */
409 perf->state = 0;
410 return perf->states[0].core_frequency * 1000;
411 }
412}
413
414static void free_acpi_perf_data(void)
415{
416 unsigned int i;
417
418 /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
419 for_each_possible_cpu(i)
420 free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
421 ->shared_cpu_map);
422 free_percpu(acpi_perf_data);
423}
424
425/*
426 * acpi_cpufreq_early_init - initialize ACPI P-States library
427 *
428 * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
429 * in order to determine correct frequency and voltage pairings. We can
430 * do _PDC and _PSD and find out the processor dependency for the
431 * actual init that will happen later...
432 */
433static int __init acpi_cpufreq_early_init(void)
434{
435 unsigned int i;
436 dprintk("acpi_cpufreq_early_init\n");
437
438 acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
439 if (!acpi_perf_data) {
440 dprintk("Memory allocation error for acpi_perf_data.\n");
441 return -ENOMEM;
442 }
443 for_each_possible_cpu(i) {
444 if (!zalloc_cpumask_var_node(
445 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
446 GFP_KERNEL, cpu_to_node(i))) {
447
448 /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
449 free_acpi_perf_data();
450 return -ENOMEM;
451 }
452 }
453
454 /* Do initialization in ACPI core */
455 acpi_processor_preregister_performance(acpi_perf_data);
456 return 0;
457}
458
459#ifdef CONFIG_SMP
460/*
 461 * Some BIOSes do SW_ANY coordination internally, either setting it up in
 462 * hardware or doing it in BIOS firmware, without informing the OS. If not
 463 * detected, this has the side effect of making the CPU run at a different
 464 * speed than the OS intended. Detect it and handle it cleanly.
465 */
466static int bios_with_sw_any_bug;
467
468static int sw_any_bug_found(const struct dmi_system_id *d)
469{
470 bios_with_sw_any_bug = 1;
471 return 0;
472}
473
474static const struct dmi_system_id sw_any_bug_dmi_table[] = {
475 {
476 .callback = sw_any_bug_found,
477 .ident = "Supermicro Server X6DLP",
478 .matches = {
479 DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
480 DMI_MATCH(DMI_BIOS_VERSION, "080010"),
481 DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
482 },
483 },
484 { }
485};
486
487static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
488{
489 /* Intel Xeon Processor 7100 Series Specification Update
490 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
491 * AL30: A Machine Check Exception (MCE) Occurring during an
492 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
493 * Both Processor Cores to Lock Up. */
494 if (c->x86_vendor == X86_VENDOR_INTEL) {
495 if ((c->x86 == 15) &&
496 (c->x86_model == 6) &&
497 (c->x86_mask == 8)) {
498 printk(KERN_INFO "acpi-cpufreq: Intel(R) "
499 "Xeon(R) 7100 Errata AL30, processors may "
500 "lock up on frequency changes: disabling "
501 "acpi-cpufreq.\n");
502 return -ENODEV;
503 }
504 }
505 return 0;
506}
507#endif
508
509static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
510{
511 unsigned int i;
512 unsigned int valid_states = 0;
513 unsigned int cpu = policy->cpu;
514 struct acpi_cpufreq_data *data;
515 unsigned int result = 0;
516 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
517 struct acpi_processor_performance *perf;
518#ifdef CONFIG_SMP
519 static int blacklisted;
520#endif
521
522 dprintk("acpi_cpufreq_cpu_init\n");
523
524#ifdef CONFIG_SMP
525 if (blacklisted)
526 return blacklisted;
527 blacklisted = acpi_cpufreq_blacklist(c);
528 if (blacklisted)
529 return blacklisted;
530#endif
531
532 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
533 if (!data)
534 return -ENOMEM;
535
536 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
537 per_cpu(acfreq_data, cpu) = data;
538
539 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
540 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
541
542 result = acpi_processor_register_performance(data->acpi_data, cpu);
543 if (result)
544 goto err_free;
545
546 perf = data->acpi_data;
547 policy->shared_type = perf->shared_type;
548
549 /*
550 * Will let policy->cpus know about dependency only when software
551 * coordination is required.
552 */
553 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
554 policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
555 cpumask_copy(policy->cpus, perf->shared_cpu_map);
556 }
557 cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
558
559#ifdef CONFIG_SMP
560 dmi_check_system(sw_any_bug_dmi_table);
561 if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
562 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
563 cpumask_copy(policy->cpus, cpu_core_mask(cpu));
564 }
565#endif
566
567 /* capability check */
568 if (perf->state_count <= 1) {
569 dprintk("No P-States\n");
570 result = -ENODEV;
571 goto err_unreg;
572 }
573
574 if (perf->control_register.space_id != perf->status_register.space_id) {
575 result = -ENODEV;
576 goto err_unreg;
577 }
578
579 switch (perf->control_register.space_id) {
580 case ACPI_ADR_SPACE_SYSTEM_IO:
581 dprintk("SYSTEM IO addr space\n");
582 data->cpu_feature = SYSTEM_IO_CAPABLE;
583 break;
584 case ACPI_ADR_SPACE_FIXED_HARDWARE:
585 dprintk("HARDWARE addr space\n");
586 if (!check_est_cpu(cpu)) {
587 result = -ENODEV;
588 goto err_unreg;
589 }
590 data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
591 break;
592 default:
593 dprintk("Unknown addr space %d\n",
594 (u32) (perf->control_register.space_id));
595 result = -ENODEV;
596 goto err_unreg;
597 }
598
599 data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
600 (perf->state_count+1), GFP_KERNEL);
601 if (!data->freq_table) {
602 result = -ENOMEM;
603 goto err_unreg;
604 }
605
606 /* detect transition latency */
607 policy->cpuinfo.transition_latency = 0;
608 for (i = 0; i < perf->state_count; i++) {
609 if ((perf->states[i].transition_latency * 1000) >
610 policy->cpuinfo.transition_latency)
611 policy->cpuinfo.transition_latency =
612 perf->states[i].transition_latency * 1000;
613 }
614
615 /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
616 if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
617 policy->cpuinfo.transition_latency > 20 * 1000) {
618 policy->cpuinfo.transition_latency = 20 * 1000;
619 printk_once(KERN_INFO
620 "P-state transition latency capped at 20 uS\n");
621 }
622
623 /* table init */
624 for (i = 0; i < perf->state_count; i++) {
625 if (i > 0 && perf->states[i].core_frequency >=
626 data->freq_table[valid_states-1].frequency / 1000)
627 continue;
628
629 data->freq_table[valid_states].index = i;
630 data->freq_table[valid_states].frequency =
631 perf->states[i].core_frequency * 1000;
632 valid_states++;
633 }
634 data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
635 perf->state = 0;
636
637 result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
638 if (result)
639 goto err_freqfree;
640
641 if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
642 printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
643
644 switch (perf->control_register.space_id) {
645 case ACPI_ADR_SPACE_SYSTEM_IO:
646 /* Current speed is unknown and not detectable by IO port */
647 policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
648 break;
649 case ACPI_ADR_SPACE_FIXED_HARDWARE:
650 acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
651 policy->cur = get_cur_freq_on_cpu(cpu);
652 break;
653 default:
654 break;
655 }
656
657 /* notify BIOS that we exist */
658 acpi_processor_notify_smm(THIS_MODULE);
659
660 /* Check for APERF/MPERF support in hardware */
661 if (cpu_has(c, X86_FEATURE_APERFMPERF))
662 acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
663
664 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
665 for (i = 0; i < perf->state_count; i++)
666 dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
667 (i == perf->state ? '*' : ' '), i,
668 (u32) perf->states[i].core_frequency,
669 (u32) perf->states[i].power,
670 (u32) perf->states[i].transition_latency);
671
672 cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
673
674 /*
675 * the first call to ->target() should result in us actually
676 * writing something to the appropriate registers.
677 */
678 data->resume = 1;
679
680 return result;
681
682err_freqfree:
683 kfree(data->freq_table);
684err_unreg:
685 acpi_processor_unregister_performance(perf, cpu);
686err_free:
687 kfree(data);
688 per_cpu(acfreq_data, cpu) = NULL;
689
690 return result;
691}
692
693static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
694{
695 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
696
697 dprintk("acpi_cpufreq_cpu_exit\n");
698
699 if (data) {
700 cpufreq_frequency_table_put_attr(policy->cpu);
701 per_cpu(acfreq_data, policy->cpu) = NULL;
702 acpi_processor_unregister_performance(data->acpi_data,
703 policy->cpu);
704 kfree(data->freq_table);
705 kfree(data);
706 }
707
708 return 0;
709}
710
711static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
712{
713 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
714
715 dprintk("acpi_cpufreq_resume\n");
716
717 data->resume = 1;
718
719 return 0;
720}
721
722static struct freq_attr *acpi_cpufreq_attr[] = {
723 &cpufreq_freq_attr_scaling_available_freqs,
724 NULL,
725};
726
727static struct cpufreq_driver acpi_cpufreq_driver = {
728 .verify = acpi_cpufreq_verify,
729 .target = acpi_cpufreq_target,
730 .bios_limit = acpi_processor_get_bios_limit,
731 .init = acpi_cpufreq_cpu_init,
732 .exit = acpi_cpufreq_cpu_exit,
733 .resume = acpi_cpufreq_resume,
734 .name = "acpi-cpufreq",
735 .owner = THIS_MODULE,
736 .attr = acpi_cpufreq_attr,
737};
738
739static int __init acpi_cpufreq_init(void)
740{
741 int ret;
742
743 if (acpi_disabled)
744 return 0;
745
746 dprintk("acpi_cpufreq_init\n");
747
748 ret = acpi_cpufreq_early_init();
749 if (ret)
750 return ret;
751
752 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
753 if (ret)
754 free_acpi_perf_data();
755
756 return ret;
757}
758
759static void __exit acpi_cpufreq_exit(void)
760{
761 dprintk("acpi_cpufreq_exit\n");
762
763 cpufreq_unregister_driver(&acpi_cpufreq_driver);
764
765 free_percpu(acpi_perf_data);
766}
767
768module_param(acpi_pstate_strict, uint, 0644);
769MODULE_PARM_DESC(acpi_pstate_strict,
770 "value 0 or non-zero. non-zero -> strict ACPI checks are "
771 "performed during frequency changes.");
772
773late_initcall(acpi_cpufreq_init);
774module_exit(acpi_cpufreq_exit);
775
776MODULE_ALIAS("acpi");
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
deleted file mode 100644
index 141abebc4516..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ /dev/null
@@ -1,446 +0,0 @@
1/*
2 * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 * Based upon reverse engineered information
6 *
7 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/moduleparam.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/pci.h>
16#include <linux/delay.h>
17
18#define NFORCE2_XTAL 25
19#define NFORCE2_BOOTFSB 0x48
20#define NFORCE2_PLLENABLE 0xa8
21#define NFORCE2_PLLREG 0xa4
22#define NFORCE2_PLLADR 0xa0
23#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
24
25#define NFORCE2_MIN_FSB 50
26#define NFORCE2_SAFE_DISTANCE 50
27
28/* Delay in ms between FSB changes */
29/* #define NFORCE2_DELAY 10 */
30
31/*
32 * nforce2_chipset:
33 * FSB is changed using the chipset
34 */
35static struct pci_dev *nforce2_dev;
36
37/* fid:
38 * multiplier * 10
39 */
40static int fid;
41
42/* min_fsb, max_fsb:
43 * minimum and maximum FSB (= FSB at boot time)
44 */
45static int min_fsb;
46static int max_fsb;
47
48MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
49MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
50MODULE_LICENSE("GPL");
51
52module_param(fid, int, 0444);
53module_param(min_fsb, int, 0444);
54
55MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
56MODULE_PARM_DESC(min_fsb,
57 "Minimum FSB to use, if not defined: current FSB - 50");
58
59#define PFX "cpufreq-nforce2: "
60#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
61 "cpufreq-nforce2", msg)
62
63/**
64 * nforce2_calc_fsb - calculate FSB
65 * @pll: PLL value
66 *
67 * Calculates FSB from PLL value
68 */
69static int nforce2_calc_fsb(int pll)
70{
71 unsigned char mul, div;
72
73 mul = (pll >> 8) & 0xff;
74 div = pll & 0xff;
75
76 if (div > 0)
77 return NFORCE2_XTAL * mul / div;
78
79 return 0;
80}
81
82/**
83 * nforce2_calc_pll - calculate PLL value
84 * @fsb: FSB
85 *
86 * Calculate PLL value for given FSB
87 */
88static int nforce2_calc_pll(unsigned int fsb)
89{
90 unsigned char xmul, xdiv;
91 unsigned char mul = 0, div = 0;
92 int tried = 0;
93
94 /* Try to calculate multiplier and divider up to 4 times */
95 while (((mul == 0) || (div == 0)) && (tried <= 3)) {
96 for (xdiv = 2; xdiv <= 0x80; xdiv++)
97 for (xmul = 1; xmul <= 0xfe; xmul++)
98 if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
99 fsb + tried) {
100 mul = xmul;
101 div = xdiv;
102 }
103 tried++;
104 }
105
106 if ((mul == 0) || (div == 0))
107 return -1;
108
109 return NFORCE2_PLL(mul, div);
110}
111
112/**
113 * nforce2_write_pll - write PLL value to chipset
114 * @pll: PLL value
115 *
116 * Writes new FSB PLL value to chipset
117 */
118static void nforce2_write_pll(int pll)
119{
120 int temp;
121
122 /* Set the pll addr. to 0x00 */
123 pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
124
125 /* Now write the value in all 64 registers */
126 for (temp = 0; temp <= 0x3f; temp++)
127 pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
128
129 return;
130}
131
132/**
133 * nforce2_fsb_read - Read FSB
134 *
135 * Read FSB from chipset
136 * If bootfsb != 0, return FSB at boot-time
137 */
138static unsigned int nforce2_fsb_read(int bootfsb)
139{
140 struct pci_dev *nforce2_sub5;
141 u32 fsb, temp = 0;
142
143 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
144 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
145 PCI_ANY_ID, PCI_ANY_ID, NULL);
146 if (!nforce2_sub5)
147 return 0;
148
149 pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
150 fsb /= 1000000;
151
152 /* Check if PLL register is already set */
153 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
154
155 if (bootfsb || !temp)
156 return fsb;
157
158 /* Use PLL register FSB value */
159 pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
160 fsb = nforce2_calc_fsb(temp);
161
162 return fsb;
163}
164
165/**
166 * nforce2_set_fsb - set new FSB
167 * @fsb: New FSB
168 *
169 * Sets new FSB
170 */
171static int nforce2_set_fsb(unsigned int fsb)
172{
173 u32 temp = 0;
174 unsigned int tfsb;
175 int diff;
176 int pll = 0;
177
178 if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
179 printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
180 return -EINVAL;
181 }
182
183 tfsb = nforce2_fsb_read(0);
184 if (!tfsb) {
185 printk(KERN_ERR PFX "Error while reading the FSB\n");
186 return -EINVAL;
187 }
188
189 /* First write? Then set actual value */
190 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
191 if (!temp) {
192 pll = nforce2_calc_pll(tfsb);
193
194 if (pll < 0)
195 return -EINVAL;
196
197 nforce2_write_pll(pll);
198 }
199
200 /* Enable write access */
201 temp = 0x01;
202 pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
203
204 diff = tfsb - fsb;
205
206 if (!diff)
207 return 0;
208
209 while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
210 if (diff < 0)
211 tfsb++;
212 else
213 tfsb--;
214
215 /* Calculate the PLL reg. value */
216 pll = nforce2_calc_pll(tfsb);
217 if (pll == -1)
218 return -EINVAL;
219
220 nforce2_write_pll(pll);
221#ifdef NFORCE2_DELAY
222 mdelay(NFORCE2_DELAY);
223#endif
224 }
225
226 temp = 0x40;
227 pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
228
229 return 0;
230}
231
232/**
233 * nforce2_get - get the CPU frequency
234 * @cpu: CPU number
235 *
236 * Returns the CPU frequency
237 */
238static unsigned int nforce2_get(unsigned int cpu)
239{
240 if (cpu)
241 return 0;
242 return nforce2_fsb_read(0) * fid * 100;
243}
244
245/**
246 * nforce2_target - set a new CPUFreq policy
247 * @policy: new policy
248 * @target_freq: the target frequency
249 * @relation: how that frequency relates to achieved frequency
250 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
251 *
252 * Sets a new CPUFreq policy.
253 */
254static int nforce2_target(struct cpufreq_policy *policy,
255 unsigned int target_freq, unsigned int relation)
256{
257/* unsigned long flags; */
258 struct cpufreq_freqs freqs;
259 unsigned int target_fsb;
260
261 if ((target_freq > policy->max) || (target_freq < policy->min))
262 return -EINVAL;
263
264 target_fsb = target_freq / (fid * 100);
265
266 freqs.old = nforce2_get(policy->cpu);
267 freqs.new = target_fsb * fid * 100;
268 freqs.cpu = 0; /* Only one CPU on nForce2 platforms */
269
270 if (freqs.old == freqs.new)
271 return 0;
272
273 dprintk("Old CPU frequency %d kHz, new %d kHz\n",
274 freqs.old, freqs.new);
275
276 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
277
278 /* Disable IRQs */
279 /* local_irq_save(flags); */
280
281 if (nforce2_set_fsb(target_fsb) < 0)
282 printk(KERN_ERR PFX "Changing FSB to %d failed\n",
283 target_fsb);
284 else
285 dprintk("Changed FSB successfully to %d\n",
286 target_fsb);
287
288 /* Enable IRQs */
289 /* local_irq_restore(flags); */
290
291 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
292
293 return 0;
294}
295
296/**
297 * nforce2_verify - verifies a new CPUFreq policy
298 * @policy: new policy
299 */
300static int nforce2_verify(struct cpufreq_policy *policy)
301{
302 unsigned int fsb_pol_max;
303
304 fsb_pol_max = policy->max / (fid * 100);
305
306 if (policy->min < (fsb_pol_max * fid * 100))
307 policy->max = (fsb_pol_max + 1) * fid * 100;
308
309 cpufreq_verify_within_limits(policy,
310 policy->cpuinfo.min_freq,
311 policy->cpuinfo.max_freq);
312 return 0;
313}
314
315static int nforce2_cpu_init(struct cpufreq_policy *policy)
316{
317 unsigned int fsb;
318 unsigned int rfid;
319
320 /* capability check */
321 if (policy->cpu != 0)
322 return -ENODEV;
323
324 /* Get current FSB */
325 fsb = nforce2_fsb_read(0);
326
327 if (!fsb)
328 return -EIO;
329
330 /* FIX: Get FID from CPU */
331 if (!fid) {
332 if (!cpu_khz) {
333 printk(KERN_WARNING PFX
334 "cpu_khz not set, can't calculate multiplier!\n");
335 return -ENODEV;
336 }
337
338 fid = cpu_khz / (fsb * 100);
339 rfid = fid % 5;
340
341 if (rfid) {
342 if (rfid > 2)
343 fid += 5 - rfid;
344 else
345 fid -= rfid;
346 }
347 }
348
349 printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
350 fid / 10, fid % 10);
351
352 /* Set maximum FSB to FSB at boot time */
353 max_fsb = nforce2_fsb_read(1);
354
355 if (!max_fsb)
356 return -EIO;
357
358 if (!min_fsb)
359 min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
360
361 if (min_fsb < NFORCE2_MIN_FSB)
362 min_fsb = NFORCE2_MIN_FSB;
363
364 /* cpuinfo and default policy values */
365 policy->cpuinfo.min_freq = min_fsb * fid * 100;
366 policy->cpuinfo.max_freq = max_fsb * fid * 100;
367 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
368 policy->cur = nforce2_get(policy->cpu);
369 policy->min = policy->cpuinfo.min_freq;
370 policy->max = policy->cpuinfo.max_freq;
371
372 return 0;
373}
374
375static int nforce2_cpu_exit(struct cpufreq_policy *policy)
376{
377 return 0;
378}
379
380static struct cpufreq_driver nforce2_driver = {
381 .name = "nforce2",
382 .verify = nforce2_verify,
383 .target = nforce2_target,
384 .get = nforce2_get,
385 .init = nforce2_cpu_init,
386 .exit = nforce2_cpu_exit,
387 .owner = THIS_MODULE,
388};
389
390/**
391 * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
392 *
393 * Detects nForce2 A2 and C1 stepping
394 *
395 */
396static int nforce2_detect_chipset(void)
397{
398 nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
399 PCI_DEVICE_ID_NVIDIA_NFORCE2,
400 PCI_ANY_ID, PCI_ANY_ID, NULL);
401
402 if (nforce2_dev == NULL)
403 return -ENODEV;
404
405 printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
406 nforce2_dev->revision);
407 printk(KERN_INFO PFX
408 "FSB changing is maybe unstable and can lead to "
409 "crashes and data loss.\n");
410
411 return 0;
412}
413
414/**
415 * nforce2_init - initializes the nForce2 CPUFreq driver
416 *
417 * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
 418 * devices, -EINVAL on problems during initialization, and zero on
419 * success.
420 */
421static int __init nforce2_init(void)
422{
423 /* TODO: do we need to detect the processor? */
424
425 /* detect chipset */
426 if (nforce2_detect_chipset()) {
427 printk(KERN_INFO PFX "No nForce2 chipset.\n");
428 return -ENODEV;
429 }
430
431 return cpufreq_register_driver(&nforce2_driver);
432}
433
434/**
435 * nforce2_exit - unregisters cpufreq module
436 *
437 * Unregisters nForce2 FSB change support.
438 */
439static void __exit nforce2_exit(void)
440{
441 cpufreq_unregister_driver(&nforce2_driver);
442}
443
444module_init(nforce2_init);
445module_exit(nforce2_exit);
446
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
deleted file mode 100644
index 35a257dd4bb7..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ /dev/null
@@ -1,367 +0,0 @@
1/*
2 * Based on documentation provided by Dave Jones. Thanks!
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/ioport.h>
14#include <linux/slab.h>
15#include <linux/timex.h>
16#include <linux/io.h>
17#include <linux/delay.h>
18
19#include <asm/msr.h>
20#include <asm/tsc.h>
21
22#define EPS_BRAND_C7M 0
23#define EPS_BRAND_C7 1
24#define EPS_BRAND_EDEN 2
25#define EPS_BRAND_C3 3
26#define EPS_BRAND_C7D 4
27
28struct eps_cpu_data {
29 u32 fsb;
30 struct cpufreq_frequency_table freq_table[];
31};
32
33static struct eps_cpu_data *eps_cpu[NR_CPUS];
34
35
36static unsigned int eps_get(unsigned int cpu)
37{
38 struct eps_cpu_data *centaur;
39 u32 lo, hi;
40
41 if (cpu)
42 return 0;
43 centaur = eps_cpu[cpu];
44 if (centaur == NULL)
45 return 0;
46
47 /* Return current frequency */
48 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
49 return centaur->fsb * ((lo >> 8) & 0xff);
50}
51
52static int eps_set_state(struct eps_cpu_data *centaur,
53 unsigned int cpu,
54 u32 dest_state)
55{
56 struct cpufreq_freqs freqs;
57 u32 lo, hi;
58 int err = 0;
59 int i;
60
61 freqs.old = eps_get(cpu);
62 freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
63 freqs.cpu = cpu;
64 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
65
66 /* Wait while CPU is busy */
67 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
68 i = 0;
69 while (lo & ((1 << 16) | (1 << 17))) {
70 udelay(16);
71 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
72 i++;
73 if (unlikely(i > 64)) {
74 err = -ENODEV;
75 goto postchange;
76 }
77 }
78 /* Set new multiplier and voltage */
79 wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
80 /* Wait until transition end */
81 i = 0;
82 do {
83 udelay(16);
84 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
85 i++;
86 if (unlikely(i > 64)) {
87 err = -ENODEV;
88 goto postchange;
89 }
90 } while (lo & ((1 << 16) | (1 << 17)));
91
92 /* Return current frequency */
93postchange:
94 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
95 freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
96
97#ifdef DEBUG
98 {
99 u8 current_multiplier, current_voltage;
100
101 /* Print voltage and multiplier */
102 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
103 current_voltage = lo & 0xff;
104 printk(KERN_INFO "eps: Current voltage = %dmV\n",
105 current_voltage * 16 + 700);
106 current_multiplier = (lo >> 8) & 0xff;
107 printk(KERN_INFO "eps: Current multiplier = %d\n",
108 current_multiplier);
109 }
110#endif
111 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
112 return err;
113}
114
115static int eps_target(struct cpufreq_policy *policy,
116 unsigned int target_freq,
117 unsigned int relation)
118{
119 struct eps_cpu_data *centaur;
120 unsigned int newstate = 0;
121 unsigned int cpu = policy->cpu;
122 unsigned int dest_state;
123 int ret;
124
125 if (unlikely(eps_cpu[cpu] == NULL))
126 return -ENODEV;
127 centaur = eps_cpu[cpu];
128
129 if (unlikely(cpufreq_frequency_table_target(policy,
130 &eps_cpu[cpu]->freq_table[0],
131 target_freq,
132 relation,
133 &newstate))) {
134 return -EINVAL;
135 }
136
137 /* Make frequency transition */
138 dest_state = centaur->freq_table[newstate].index & 0xffff;
139 ret = eps_set_state(centaur, cpu, dest_state);
140 if (ret)
141 printk(KERN_ERR "eps: Timeout!\n");
142 return ret;
143}
144
145static int eps_verify(struct cpufreq_policy *policy)
146{
147 return cpufreq_frequency_table_verify(policy,
148 &eps_cpu[policy->cpu]->freq_table[0]);
149}
150
151static int eps_cpu_init(struct cpufreq_policy *policy)
152{
153 unsigned int i;
154 u32 lo, hi;
155 u64 val;
156 u8 current_multiplier, current_voltage;
157 u8 max_multiplier, max_voltage;
158 u8 min_multiplier, min_voltage;
159 u8 brand = 0;
160 u32 fsb;
161 struct eps_cpu_data *centaur;
162 struct cpuinfo_x86 *c = &cpu_data(0);
163 struct cpufreq_frequency_table *f_table;
164 int k, step, voltage;
165 int ret;
166 int states;
167
168 if (policy->cpu != 0)
169 return -ENODEV;
170
171 /* Check brand */
172 printk(KERN_INFO "eps: Detected VIA ");
173
174 switch (c->x86_model) {
175 case 10:
176 rdmsr(0x1153, lo, hi);
177 brand = (((lo >> 2) ^ lo) >> 18) & 3;
178 printk(KERN_CONT "Model A ");
179 break;
180 case 13:
181 rdmsr(0x1154, lo, hi);
182 brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff;
183 printk(KERN_CONT "Model D ");
184 break;
185 }
186
187 switch (brand) {
188 case EPS_BRAND_C7M:
189 printk(KERN_CONT "C7-M\n");
190 break;
191 case EPS_BRAND_C7:
192 printk(KERN_CONT "C7\n");
193 break;
194 case EPS_BRAND_EDEN:
195 printk(KERN_CONT "Eden\n");
196 break;
197 case EPS_BRAND_C7D:
198 printk(KERN_CONT "C7-D\n");
199 break;
200 case EPS_BRAND_C3:
201 printk(KERN_CONT "C3\n");
202 return -ENODEV;
203 break;
204 }
205 /* Enable Enhanced PowerSaver */
206 rdmsrl(MSR_IA32_MISC_ENABLE, val);
207 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
208 val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
209 wrmsrl(MSR_IA32_MISC_ENABLE, val);
210 /* Can be locked at 0 */
211 rdmsrl(MSR_IA32_MISC_ENABLE, val);
212 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
213 printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
214 return -ENODEV;
215 }
216 }
217
218 /* Print voltage and multiplier */
219 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
220 current_voltage = lo & 0xff;
221 printk(KERN_INFO "eps: Current voltage = %dmV\n",
222 current_voltage * 16 + 700);
223 current_multiplier = (lo >> 8) & 0xff;
224 printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
225
226 /* Print limits */
227 max_voltage = hi & 0xff;
228 printk(KERN_INFO "eps: Highest voltage = %dmV\n",
229 max_voltage * 16 + 700);
230 max_multiplier = (hi >> 8) & 0xff;
231 printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
232 min_voltage = (hi >> 16) & 0xff;
233 printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
234 min_voltage * 16 + 700);
235 min_multiplier = (hi >> 24) & 0xff;
236 printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
237
238 /* Sanity checks */
239 if (current_multiplier == 0 || max_multiplier == 0
240 || min_multiplier == 0)
241 return -EINVAL;
242 if (current_multiplier > max_multiplier
243 || max_multiplier <= min_multiplier)
244 return -EINVAL;
245 if (current_voltage > 0x1f || max_voltage > 0x1f)
246 return -EINVAL;
247 if (max_voltage < min_voltage)
248 return -EINVAL;
249
250 /* Calc FSB speed */
251 fsb = cpu_khz / current_multiplier;
252 /* Calc number of p-states supported */
253 if (brand == EPS_BRAND_C7M)
254 states = max_multiplier - min_multiplier + 1;
255 else
256 states = 2;
257
258 /* Allocate private data and frequency table for current cpu */
259 centaur = kzalloc(sizeof(struct eps_cpu_data)
260 + (states + 1) * sizeof(struct cpufreq_frequency_table),
261 GFP_KERNEL);
262 if (!centaur)
263 return -ENOMEM;
264 eps_cpu[0] = centaur;
265
266 /* Copy basic values */
267 centaur->fsb = fsb;
268
269 /* Fill frequency and MSR value table */
270 f_table = &centaur->freq_table[0];
271 if (brand != EPS_BRAND_C7M) {
272 f_table[0].frequency = fsb * min_multiplier;
273 f_table[0].index = (min_multiplier << 8) | min_voltage;
274 f_table[1].frequency = fsb * max_multiplier;
275 f_table[1].index = (max_multiplier << 8) | max_voltage;
276 f_table[2].frequency = CPUFREQ_TABLE_END;
277 } else {
278 k = 0;
279 step = ((max_voltage - min_voltage) * 256)
280 / (max_multiplier - min_multiplier);
281 for (i = min_multiplier; i <= max_multiplier; i++) {
282 voltage = (k * step) / 256 + min_voltage;
283 f_table[k].frequency = fsb * i;
284 f_table[k].index = (i << 8) | voltage;
285 k++;
286 }
287 f_table[k].frequency = CPUFREQ_TABLE_END;
288 }
289
290 policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
291 policy->cur = fsb * current_multiplier;
292
293 ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
294 if (ret) {
295 kfree(centaur);
296 return ret;
297 }
298
299 cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
300 return 0;
301}
302
303static int eps_cpu_exit(struct cpufreq_policy *policy)
304{
305 unsigned int cpu = policy->cpu;
306 struct eps_cpu_data *centaur;
307 u32 lo, hi;
308
309 if (eps_cpu[cpu] == NULL)
310 return -ENODEV;
311 centaur = eps_cpu[cpu];
312
313 /* Get max frequency */
314 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
315 /* Set max frequency */
316 eps_set_state(centaur, cpu, hi & 0xffff);
317 /* Bye */
318 cpufreq_frequency_table_put_attr(policy->cpu);
319 kfree(eps_cpu[cpu]);
320 eps_cpu[cpu] = NULL;
321 return 0;
322}
323
324static struct freq_attr *eps_attr[] = {
325 &cpufreq_freq_attr_scaling_available_freqs,
326 NULL,
327};
328
329static struct cpufreq_driver eps_driver = {
330 .verify = eps_verify,
331 .target = eps_target,
332 .init = eps_cpu_init,
333 .exit = eps_cpu_exit,
334 .get = eps_get,
335 .name = "e_powersaver",
336 .owner = THIS_MODULE,
337 .attr = eps_attr,
338};
339
340static int __init eps_init(void)
341{
342 struct cpuinfo_x86 *c = &cpu_data(0);
343
344 /* This driver will work only on Centaur C7 processors with
345 * Enhanced SpeedStep/PowerSaver registers */
346 if (c->x86_vendor != X86_VENDOR_CENTAUR
347 || c->x86 != 6 || c->x86_model < 10)
348 return -ENODEV;
349 if (!cpu_has(c, X86_FEATURE_EST))
350 return -ENODEV;
351
352 if (cpufreq_register_driver(&eps_driver))
353 return -EINVAL;
354 return 0;
355}
356
357static void __exit eps_exit(void)
358{
359 cpufreq_unregister_driver(&eps_driver);
360}
361
362MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
363MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPUs.");
364MODULE_LICENSE("GPL");
365
366module_init(eps_init);
367module_exit(eps_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
deleted file mode 100644
index c587db472a75..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ /dev/null
@@ -1,309 +0,0 @@
1/*
2 * elanfreq: cpufreq driver for the AMD ELAN family
3 *
4 * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
5 *
6 * Parts of this code are (c) Sven Geggus <sven@geggus.net>
7 *
8 * All Rights Reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/module.h>
21#include <linux/init.h>
22
23#include <linux/delay.h>
24#include <linux/cpufreq.h>
25
26#include <asm/msr.h>
27#include <linux/timex.h>
28#include <linux/io.h>
29
30#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
31#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
32
33/* Module parameter */
34static int max_freq;
35
36struct s_elan_multiplier {
37 int clock; /* frequency in kHz */
38 int val40h; /* PMU Force Mode register */
39 int val80h; /* CPU Clock Speed Register */
40};
41
42/*
43 * It is important that the frequencies
44 * are listed in ascending order here!
45 */
46static struct s_elan_multiplier elan_multiplier[] = {
47 {1000, 0x02, 0x18},
48 {2000, 0x02, 0x10},
49 {4000, 0x02, 0x08},
50 {8000, 0x00, 0x00},
51 {16000, 0x00, 0x02},
52 {33000, 0x00, 0x04},
53 {66000, 0x01, 0x04},
54 {99000, 0x01, 0x05}
55};
56
57static struct cpufreq_frequency_table elanfreq_table[] = {
58 {0, 1000},
59 {1, 2000},
60 {2, 4000},
61 {3, 8000},
62 {4, 16000},
63 {5, 33000},
64 {6, 66000},
65 {7, 99000},
66 {0, CPUFREQ_TABLE_END},
67};
68
69
70/**
71 * elanfreq_get_cpu_frequency: determine current cpu speed
72 *
73 * Finds out at which frequency the CPU of the Elan SOC runs
74 * at the moment. Frequencies from 1 to 33 MHz are generated
75 * the normal way, 66 and 99 MHz are called "Hyperspeed Mode"
76 * and have the rest of the chip running with 33 MHz.
77 */
78
79static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
80{
81 u8 clockspeed_reg; /* Clock Speed Register */
82
83 local_irq_disable();
84 outb_p(0x80, REG_CSCIR);
85 clockspeed_reg = inb_p(REG_CSCDR);
86 local_irq_enable();
87
88 if ((clockspeed_reg & 0xE0) == 0xE0)
89 return 0;
90
91 /* Are we in CPU clock multiplied mode (66/99 MHz)? */
92 if ((clockspeed_reg & 0xE0) == 0xC0) {
93 if ((clockspeed_reg & 0x01) == 0)
94 return 66000;
95 else
96 return 99000;
97 }
98
99 /* 33 MHz is not 32 MHz... */
100 if ((clockspeed_reg & 0xE0) == 0xA0)
101 return 33000;
102
103 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
104}
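/*
 * Worked examples for the decoding above (hypothetical register values):
 *   0xE0 -> invalid                          -> returns 0
 *   0xC1 -> Hyperspeed Mode, bit 0 set       -> 99000 kHz
 *   0xA0 -> special-cased                    -> 33000 kHz
 *   0x60 -> (1 << (0x60 >> 5)) * 1000        -> (1 << 3) * 1000 = 8000 kHz
 */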
105
106
107/**
 108 * elanfreq_set_cpu_state: Change the CPU core frequency
 109 * @state: index into the elan_multiplier table
 110 * (the corresponding frequency in kHz is elan_multiplier[state].clock)
 111 *
 112 * This function takes a state index and changes the CPU frequency
 113 * accordingly. Note that the frequency has to be checked by
 114 * elanfreq_verify() for correctness!
115 *
116 * There is no return value.
117 */
118
119static void elanfreq_set_cpu_state(unsigned int state)
120{
121 struct cpufreq_freqs freqs;
122
123 freqs.old = elanfreq_get_cpu_frequency(0);
124 freqs.new = elan_multiplier[state].clock;
125 freqs.cpu = 0; /* elanfreq.c is UP only driver */
126
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128
129 printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
130 elan_multiplier[state].clock);
131
132
133 /*
134 * Access to the Elan's internal registers is indexed via
135 * 0x22: Chip Setup & Control Register Index Register (CSCI)
136 * 0x23: Chip Setup & Control Register Data Register (CSCD)
137 *
138 */
139
140 /*
141 * 0x40 is the Power Management Unit's Force Mode Register.
 142 * Bit 6 enables Hyperspeed Mode (66/99 MHz core frequency)
143 */
144
145 local_irq_disable();
146 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
147 outb_p(0x00, REG_CSCDR);
148 local_irq_enable(); /* wait till internal pipelines and */
149 udelay(1000); /* buffers have cleaned up */
150
151 local_irq_disable();
152
153 /* now, set the CPU clock speed register (0x80) */
154 outb_p(0x80, REG_CSCIR);
155 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
156
157 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
158 outb_p(0x40, REG_CSCIR);
159 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
160 udelay(10000);
161 local_irq_enable();
162
163 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
164};
165
166
167/**
 168 * elanfreq_verify: test if frequency range is valid
169 * @policy: the policy to validate
170 *
171 * This function checks if a given frequency range in kHz is valid
172 * for the hardware supported by the driver.
173 */
174
175static int elanfreq_verify(struct cpufreq_policy *policy)
176{
177 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
178}
179
180static int elanfreq_target(struct cpufreq_policy *policy,
181 unsigned int target_freq,
182 unsigned int relation)
183{
184 unsigned int newstate = 0;
185
186 if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
187 target_freq, relation, &newstate))
188 return -EINVAL;
189
190 elanfreq_set_cpu_state(newstate);
191
192 return 0;
193}
194
195
196/*
197 * Module init and exit code
198 */
199
200static int elanfreq_cpu_init(struct cpufreq_policy *policy)
201{
202 struct cpuinfo_x86 *c = &cpu_data(0);
203 unsigned int i;
204 int result;
205
206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV;
210
211 /* max freq */
212 if (!max_freq)
213 max_freq = elanfreq_get_cpu_frequency(0);
214
215 /* table init */
216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 }
220
221 /* cpuinfo and default policy values */
222 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
223 policy->cur = elanfreq_get_cpu_frequency(0);
224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result)
227 return result;
228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0;
231}
232
233
234static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
235{
236 cpufreq_frequency_table_put_attr(policy->cpu);
237 return 0;
238}
239
240
241#ifndef MODULE
242/**
243 * elanfreq_setup - elanfreq command line parameter parsing
244 *
245 * elanfreq command line parameter. Use:
246 * elanfreq=66000
247 * to set the maximum CPU frequency to 66 MHz. Note that in
248 * case you do not give this boot parameter, the maximum
249 * frequency will fall back to _current_ CPU frequency which
250 * might be lower. If you build this as a module, use the
251 * max_freq module parameter instead.
252 */
253static int __init elanfreq_setup(char *str)
254{
255 max_freq = simple_strtoul(str, &str, 0);
256 printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
257 return 1;
258}
259__setup("elanfreq=", elanfreq_setup);
260#endif
261
262
263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL,
266};
267
268
269static struct cpufreq_driver elanfreq_driver = {
270 .get = elanfreq_get_cpu_frequency,
271 .verify = elanfreq_verify,
272 .target = elanfreq_target,
273 .init = elanfreq_cpu_init,
274 .exit = elanfreq_cpu_exit,
275 .name = "elanfreq",
276 .owner = THIS_MODULE,
277 .attr = elanfreq_attr,
278};
279
280
281static int __init elanfreq_init(void)
282{
283 struct cpuinfo_x86 *c = &cpu_data(0);
284
285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV;
290 }
291 return cpufreq_register_driver(&elanfreq_driver);
292}
293
294
295static void __exit elanfreq_exit(void)
296{
297 cpufreq_unregister_driver(&elanfreq_driver);
298}
299
300
301module_param(max_freq, int, 0444);
302
303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
305 "Sven Geggus <sven@geggus.net>");
306MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
307
308module_init(elanfreq_init);
309module_exit(elanfreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
deleted file mode 100644
index 32974cf84232..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ /dev/null
@@ -1,517 +0,0 @@
1/*
2 * Cyrix MediaGX and NatSemi Geode Suspend Modulation
3 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
4 * (C) 2002 Hiroshi Miura <miura@da-cha.org>
5 * All Rights Reserved
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation
10 *
11 * The author(s) of this software shall not be held liable for damages
12 * of any nature resulting due to the use of this software. This
13 * software is provided AS-IS with no warranties.
14 *
15 * Theoretical note:
16 *
17 * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
18 *
19 * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0
20 * are based on Suspend Modulation.
21 *
22 * Suspend Modulation works by asserting and de-asserting the SUSP# pin
23 * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP#
24 * the CPU enters an idle state. GX1 stops its core clock when SUSP# is
25 * asserted then power consumption is reduced.
26 *
27 * Suspend Modulation's OFF/ON duration are configurable
28 * with 'Suspend Modulation OFF Count Register'
29 * and 'Suspend Modulation ON Count Register'.
30 * These registers are 8bit counters that represent the number of
31 * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF)
32 * to the processor.
33 *
34 * These counters define a ratio which is the effective frequency
35 * of operation of the system.
36 *
37 * OFF Count
38 * F_eff = Fgx * ----------------------
39 * OFF Count + ON Count
40 *
41 * 0 <= On Count, Off Count <= 255
42 *
43 * From these limits, we can get register values
44 *
45 * off_duration + on_duration <= MAX_DURATION
46 * on_duration = off_duration * (stock_freq - freq) / freq
47 *
48 * off_duration = (freq * DURATION) / stock_freq
49 * on_duration = DURATION - off_duration
50 *
51 *
52 *---------------------------------------------------------------------------
53 *
54 * ChangeLog:
55 * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
56 * - fix on/off register mistake
57 * - fix cpu_khz calc when it stops cpu modulation.
58 *
59 * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
60 * - rewrite for Cyrix MediaGX Cx5510/5520 and
61 * NatSemi Geode Cs5530(A).
62 *
63 * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
64 * - cs5530_mod patch for 2.4.19-rc1.
65 *
66 *---------------------------------------------------------------------------
67 *
68 * Todo
69 * Test on machines with 5510, 5530, 5530A
70 */
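/*
 * A minimal sketch of how the relations above map to register values,
 * assuming a fixed window of 255 intervals; gx_validate_speed() below
 * instead searches every window size for the closest match:
 *
 *	duration     = 255;                                hypothetical window
 *	off_duration = (khz * duration) / stock_freq;      OFF count
 *	on_duration  = duration - off_duration;            ON count
 *	F_eff        = stock_freq * off_duration / duration;
 */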
71
72/************************************************************************
73 * Suspend Modulation - Definitions *
74 ************************************************************************/
75
76#include <linux/kernel.h>
77#include <linux/module.h>
78#include <linux/init.h>
79#include <linux/smp.h>
80#include <linux/cpufreq.h>
81#include <linux/pci.h>
82#include <linux/errno.h>
83#include <linux/slab.h>
84
85#include <asm/processor-cyrix.h>
86
87/* PCI config registers, all at F0 */
88#define PCI_PMER1 0x80 /* power management enable register 1 */
89#define PCI_PMER2 0x81 /* power management enable register 2 */
90#define PCI_PMER3 0x82 /* power management enable register 3 */
91#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
92#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
93#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
94#define PCI_MODON 0x95 /* suspend modulation ON counter register */
95#define PCI_SUSCFG 0x96 /* suspend configuration register */
96
97/* PMER1 bits */
98#define GPM (1<<0) /* global power management */
99#define GIT (1<<1) /* globally enable PM device idle timers */
100#define GTR (1<<2) /* globally enable IO traps */
101#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
102#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
103
104/* SUSCFG bits */
105#define SUSMOD (1<<0) /* enable/disable suspend modulation */
106/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */
107#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
108 /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
109#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
110/* the below is supported only with cs5530A */
111#define PWRSVE_ISA (1<<3) /* stop ISA clock */
112#define PWRSVE (1<<4) /* active idle */
113
114struct gxfreq_params {
115 u8 on_duration;
116 u8 off_duration;
117 u8 pci_suscfg;
118 u8 pci_pmer1;
119 u8 pci_pmer2;
120 struct pci_dev *cs55x0;
121};
122
123static struct gxfreq_params *gx_params;
124static int stock_freq;
125
 126/* PCI bus clock in kHz - defaults to 30000 (30 MHz) if cpu_khz is not available */
127static int pci_busclk;
128module_param(pci_busclk, int, 0444);
129
130/* maximum duration for which the cpu may be suspended
131 * (32us * MAX_DURATION). If no parameter is given, this defaults
132 * to 255.
133 * Note that this leads to a maximum of 8 ms(!) where the CPU clock
134 * is suspended -- processing power is just 0.39% of what it used to be,
135 * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
136static int max_duration = 255;
137module_param(max_duration, int, 0444);
138
139/* For the default policy, we want at least some processing power
140 * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
141 */
142#define POLICY_MIN_DIV 20
143
144
145#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
146 "gx-suspmod", msg)
147
148/**
 149 * we can detect the core multiplier from dir0_lsb
150 * from GX1 datasheet p.56,
151 * MULT[3:0]:
152 * 0000 = SYSCLK multiplied by 4 (test only)
153 * 0001 = SYSCLK multiplied by 10
154 * 0010 = SYSCLK multiplied by 4
155 * 0011 = SYSCLK multiplied by 6
156 * 0100 = SYSCLK multiplied by 9
157 * 0101 = SYSCLK multiplied by 5
158 * 0110 = SYSCLK multiplied by 7
159 * 0111 = SYSCLK multiplied by 8
160 * of 33.3MHz
161 **/
162static int gx_freq_mult[16] = {
163 4, 10, 4, 6, 9, 5, 7, 8,
164 0, 0, 0, 0, 0, 0, 0, 0
165};
166
167
168/****************************************************************
169 * Low Level chipset interface *
170 ****************************************************************/
171static struct pci_device_id gx_chipset_tbl[] __initdata = {
172 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
173 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
174 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
175 { 0, },
176};
177
178static void gx_write_byte(int reg, int value)
179{
180 pci_write_config_byte(gx_params->cs55x0, reg, value);
181}
182
183/**
184 * gx_detect_chipset:
185 *
186 **/
187static __init struct pci_dev *gx_detect_chipset(void)
188{
189 struct pci_dev *gx_pci = NULL;
190
191 /* check if CPU is a MediaGX or a Geode. */
192 if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
193 (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
194 dprintk("error: no MediaGX/Geode processor found!\n");
195 return NULL;
196 }
197
198 /* detect which companion chip is used */
199 for_each_pci_dev(gx_pci) {
200 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
201 return gx_pci;
202 }
203
204 dprintk("error: no supported chipset found!\n");
205 return NULL;
206}
207
208/**
209 * gx_get_cpuspeed:
210 *
211 * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi
212 * Geode CPU runs.
213 */
214static unsigned int gx_get_cpuspeed(unsigned int cpu)
215{
216 if ((gx_params->pci_suscfg & SUSMOD) == 0)
217 return stock_freq;
218
219 return (stock_freq * gx_params->off_duration)
220 / (gx_params->on_duration + gx_params->off_duration);
221}
222
223/**
224 * gx_validate_speed:
 225 * determine the closest achievable speed and the matching on/off durations
226 *
227 **/
228
229static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
230 u8 *off_duration)
231{
232 unsigned int i;
233 u8 tmp_on, tmp_off;
234 int old_tmp_freq = stock_freq;
235 int tmp_freq;
236
237 *off_duration = 1;
238 *on_duration = 0;
239
240 for (i = max_duration; i > 0; i--) {
241 tmp_off = ((khz * i) / stock_freq) & 0xff;
242 tmp_on = i - tmp_off;
243 tmp_freq = (stock_freq * tmp_off) / i;
244 /* if this relation is closer to khz, use this. If it's equal,
245 * prefer it, too - lower latency */
246 if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
247 *on_duration = tmp_on;
248 *off_duration = tmp_off;
249 old_tmp_freq = tmp_freq;
250 }
251 }
252
253 return old_tmp_freq;
254}
255
256
257/**
258 * gx_set_cpuspeed:
259 * set cpu speed in khz.
260 **/
261
262static void gx_set_cpuspeed(unsigned int khz)
263{
264 u8 suscfg, pmer1;
265 unsigned int new_khz;
266 unsigned long flags;
267 struct cpufreq_freqs freqs;
268
269 freqs.cpu = 0;
270 freqs.old = gx_get_cpuspeed(0);
271
272 new_khz = gx_validate_speed(khz, &gx_params->on_duration,
273 &gx_params->off_duration);
274
275 freqs.new = new_khz;
276
277 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
278 local_irq_save(flags);
279
280
281
282 if (new_khz != stock_freq) {
283 /* if new khz == 100% of CPU speed, it is special case */
284 switch (gx_params->cs55x0->device) {
285 case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
286 pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
287 /* FIXME: need to test other values -- Zwane,Miura */
288 /* typical 2 to 4ms */
289 gx_write_byte(PCI_IRQTC, 4);
290 /* typical 50 to 100ms */
291 gx_write_byte(PCI_VIDTC, 100);
292 gx_write_byte(PCI_PMER1, pmer1);
293
294 if (gx_params->cs55x0->revision < 0x10) {
295 /* CS5530(rev 1.2, 1.3) */
296 suscfg = gx_params->pci_suscfg|SUSMOD;
297 } else {
298 /* CS5530A,B.. */
299 suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
300 }
301 break;
302 case PCI_DEVICE_ID_CYRIX_5520:
303 case PCI_DEVICE_ID_CYRIX_5510:
304 suscfg = gx_params->pci_suscfg | SUSMOD;
305 break;
306 default:
307 local_irq_restore(flags);
308 dprintk("fatal: try to set unknown chipset.\n");
309 return;
310 }
311 } else {
312 suscfg = gx_params->pci_suscfg & ~(SUSMOD);
313 gx_params->off_duration = 0;
314 gx_params->on_duration = 0;
315 dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
316 }
317
318 gx_write_byte(PCI_MODOFF, gx_params->off_duration);
319 gx_write_byte(PCI_MODON, gx_params->on_duration);
320
321 gx_write_byte(PCI_SUSCFG, suscfg);
322 pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
323
324 local_irq_restore(flags);
325
326 gx_params->pci_suscfg = suscfg;
327
328 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
329
330 dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
331 gx_params->on_duration * 32, gx_params->off_duration * 32);
332 dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
333}
334
335/****************************************************************
336 * High level functions *
337 ****************************************************************/
338
339/*
340 * cpufreq_gx_verify: test if frequency range is valid
341 *
342 * This function checks if a given frequency range in kHz is valid
343 * for the hardware supported by the driver.
344 */
345
346static int cpufreq_gx_verify(struct cpufreq_policy *policy)
347{
348 unsigned int tmp_freq = 0;
349 u8 tmp1, tmp2;
350
351 if (!stock_freq || !policy)
352 return -EINVAL;
353
354 policy->cpu = 0;
355 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
356 stock_freq);
357
358 /* it needs to be assured that at least one supported frequency is
359 * within policy->min and policy->max. If it is not, policy->max
 360 * needs to be increased until one frequency is supported.
361 * policy->min may not be decreased, though. This way we guarantee a
362 * specific processing capacity.
363 */
364 tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
365 if (tmp_freq < policy->min)
366 tmp_freq += stock_freq / max_duration;
367 policy->min = tmp_freq;
368 if (policy->min > policy->max)
369 policy->max = tmp_freq;
370 tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
371 if (tmp_freq > policy->max)
372 tmp_freq -= stock_freq / max_duration;
373 policy->max = tmp_freq;
374 if (policy->max < policy->min)
375 policy->max = policy->min;
376 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
377 stock_freq);
378
379 return 0;
380}
381
382/*
383 * cpufreq_gx_target:
384 *
385 */
386static int cpufreq_gx_target(struct cpufreq_policy *policy,
387 unsigned int target_freq,
388 unsigned int relation)
389{
390 u8 tmp1, tmp2;
391 unsigned int tmp_freq;
392
393 if (!stock_freq || !policy)
394 return -EINVAL;
395
396 policy->cpu = 0;
397
398 tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
399 while (tmp_freq < policy->min) {
400 tmp_freq += stock_freq / max_duration;
401 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
402 }
403 while (tmp_freq > policy->max) {
404 tmp_freq -= stock_freq / max_duration;
405 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
406 }
407
408 gx_set_cpuspeed(tmp_freq);
409
410 return 0;
411}
412
413static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
414{
415 unsigned int maxfreq, curfreq;
416
417 if (!policy || policy->cpu != 0)
418 return -ENODEV;
419
420 /* determine maximum frequency */
421 if (pci_busclk)
422 maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
423 else if (cpu_khz)
424 maxfreq = cpu_khz;
425 else
426 maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
427
428 stock_freq = maxfreq;
429 curfreq = gx_get_cpuspeed(0);
430
431 dprintk("cpu max frequency is %d.\n", maxfreq);
432 dprintk("cpu current frequency is %dkHz.\n", curfreq);
433
434 /* setup basic struct for cpufreq API */
435 policy->cpu = 0;
436
437 if (max_duration < POLICY_MIN_DIV)
438 policy->min = maxfreq / max_duration;
439 else
440 policy->min = maxfreq / POLICY_MIN_DIV;
441 policy->max = maxfreq;
442 policy->cur = curfreq;
443 policy->cpuinfo.min_freq = maxfreq / max_duration;
444 policy->cpuinfo.max_freq = maxfreq;
445 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
446
447 return 0;
448}
449
450/*
451 * cpufreq_gx_init:
452 * MediaGX/Geode GX initialize cpufreq driver
453 */
454static struct cpufreq_driver gx_suspmod_driver = {
455 .get = gx_get_cpuspeed,
456 .verify = cpufreq_gx_verify,
457 .target = cpufreq_gx_target,
458 .init = cpufreq_gx_cpu_init,
459 .name = "gx-suspmod",
460 .owner = THIS_MODULE,
461};
462
463static int __init cpufreq_gx_init(void)
464{
465 int ret;
466 struct gxfreq_params *params;
467 struct pci_dev *gx_pci;
468
469 /* Test if we have the right hardware */
470 gx_pci = gx_detect_chipset();
471 if (gx_pci == NULL)
472 return -ENODEV;
473
474 /* check whether module parameters are sane */
475 if (max_duration > 0xff)
476 max_duration = 0xff;
477
478 dprintk("geode suspend modulation available.\n");
479
480 params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
481 if (params == NULL)
482 return -ENOMEM;
483
484 params->cs55x0 = gx_pci;
485 gx_params = params;
486
487 /* keep cs55x0 configurations */
488 pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
489 pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
490 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
491 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
492 pci_read_config_byte(params->cs55x0, PCI_MODOFF,
493 &(params->off_duration));
494
495 ret = cpufreq_register_driver(&gx_suspmod_driver);
496 if (ret) {
497 kfree(params);
498 return ret; /* register error! */
499 }
500
501 return 0;
502}
503
504static void __exit cpufreq_gx_exit(void)
505{
506 cpufreq_unregister_driver(&gx_suspmod_driver);
507 pci_dev_put(gx_params->cs55x0);
508 kfree(gx_params);
509}
510
511MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
512MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
513MODULE_LICENSE("GPL");
514
515module_init(cpufreq_gx_init);
516module_exit(cpufreq_gx_exit);
517
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
deleted file mode 100644
index cf48cdd6907d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ /dev/null
@@ -1,1029 +0,0 @@
1/*
2 * (C) 2001-2004 Dave Jones. <davej@redhat.com>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon datasheets & sample CPUs kindly provided by VIA.
7 *
 8 * VIA currently has 3 different versions of Longhaul.
9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
11 * Version 2 of longhaul is backward compatible with v1, but adds
12 * LONGHAUL MSR for purpose of both frequency and voltage scaling.
13 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
14 * Version 3 of longhaul got renamed to Powersaver and redesigned
15 * to use only the POWERSAVER MSR at 0x110a.
16 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
 17 * It's pretty much the same, feature-wise, as longhaul v2, though
 18 * there is provision for scaling the FSB too; this doesn't work
 19 * too well in practice, so we don't even try to use it.
20 *
21 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
22 */
23
24#include <linux/kernel.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/init.h>
28#include <linux/cpufreq.h>
29#include <linux/pci.h>
30#include <linux/slab.h>
31#include <linux/string.h>
32#include <linux/delay.h>
33#include <linux/timex.h>
34#include <linux/io.h>
35#include <linux/acpi.h>
36
37#include <asm/msr.h>
38#include <acpi/processor.h>
39
40#include "longhaul.h"
41
42#define PFX "longhaul: "
43
44#define TYPE_LONGHAUL_V1 1
45#define TYPE_LONGHAUL_V2 2
46#define TYPE_POWERSAVER 3
47
48#define CPU_SAMUEL 1
49#define CPU_SAMUEL2 2
50#define CPU_EZRA 3
51#define CPU_EZRA_T 4
52#define CPU_NEHEMIAH 5
53#define CPU_NEHEMIAH_C 6
54
55/* Flags */
56#define USE_ACPI_C3 (1 << 1)
57#define USE_NORTHBRIDGE (1 << 2)
58
59static int cpu_model;
60static unsigned int numscales = 16;
61static unsigned int fsb;
62
63static const struct mV_pos *vrm_mV_table;
64static const unsigned char *mV_vrm_table;
65
66static unsigned int highest_speed, lowest_speed; /* kHz */
67static unsigned int minmult, maxmult;
68static int can_scale_voltage;
69static struct acpi_processor *pr;
70static struct acpi_processor_cx *cx;
71static u32 acpi_regs_addr;
72static u8 longhaul_flags;
73static unsigned int longhaul_index;
74
75/* Module parameters */
76static int scale_voltage;
77static int disable_acpi_c3;
78static int revid_errata;
79
80#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
81 "longhaul", msg)
82
83
84/* Clock ratios multiplied by 10 */
85static int mults[32];
86static int eblcr[32];
87static int longhaul_version;
88static struct cpufreq_frequency_table *longhaul_table;
89
90#ifdef CONFIG_CPU_FREQ_DEBUG
91static char speedbuffer[8];
92
93static char *print_speed(int speed)
94{
95 if (speed < 1000) {
96 snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
97 return speedbuffer;
98 }
99
100 if (speed%1000 == 0)
101 snprintf(speedbuffer, sizeof(speedbuffer),
102 "%dGHz", speed/1000);
103 else
104 snprintf(speedbuffer, sizeof(speedbuffer),
105 "%d.%dGHz", speed/1000, (speed%1000)/100);
106
107 return speedbuffer;
108}
109#endif
110
111
112static unsigned int calc_speed(int mult)
113{
114 int khz;
115 khz = (mult/10)*fsb;
116 if (mult%10)
117 khz += fsb/2;
118 khz *= 1000;
119 return khz;
120}
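/*
 * Worked example (hypothetical values): with fsb = 133 and mult = 75
 * (a 7.5x ratio), calc_speed() returns (7 * 133 + 133 / 2) * 1000 =
 * 997000 kHz; the half-step is approximated by fsb / 2.
 */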
121
122
123static int longhaul_get_cpu_mult(void)
124{
125 unsigned long invalue = 0, lo, hi;
126
127 rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
128 invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
129 if (longhaul_version == TYPE_LONGHAUL_V2 ||
130 longhaul_version == TYPE_POWERSAVER) {
131 if (lo & (1<<27))
132 invalue += 16;
133 }
134 return eblcr[invalue];
135}
136
137/* For processor with BCR2 MSR */
138
139static void do_longhaul1(unsigned int mults_index)
140{
141 union msr_bcr2 bcr2;
142
143 rdmsrl(MSR_VIA_BCR2, bcr2.val);
144 /* Enable software clock multiplier */
145 bcr2.bits.ESOFTBF = 1;
146 bcr2.bits.CLOCKMUL = mults_index & 0xff;
147
148 /* Sync to timer tick */
149 safe_halt();
150 /* Change frequency on next halt or sleep */
151 wrmsrl(MSR_VIA_BCR2, bcr2.val);
152 /* Invoke transition */
153 ACPI_FLUSH_CPU_CACHE();
154 halt();
155
156 /* Disable software clock multiplier */
157 local_irq_disable();
158 rdmsrl(MSR_VIA_BCR2, bcr2.val);
159 bcr2.bits.ESOFTBF = 0;
160 wrmsrl(MSR_VIA_BCR2, bcr2.val);
161}
162
163/* For processor with Longhaul MSR */
164
165static void do_powersaver(int cx_address, unsigned int mults_index,
166 unsigned int dir)
167{
168 union msr_longhaul longhaul;
169 u32 t;
170
171 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
172 /* Setup new frequency */
173 if (!revid_errata)
174 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
175 else
176 longhaul.bits.RevisionKey = 0;
177 longhaul.bits.SoftBusRatio = mults_index & 0xf;
178 longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
179 /* Setup new voltage */
180 if (can_scale_voltage)
181 longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
182 /* Sync to timer tick */
183 safe_halt();
184 /* Raise voltage if necessary */
185 if (can_scale_voltage && dir) {
186 longhaul.bits.EnableSoftVID = 1;
187 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
188 /* Change voltage */
189 if (!cx_address) {
190 ACPI_FLUSH_CPU_CACHE();
191 halt();
192 } else {
193 ACPI_FLUSH_CPU_CACHE();
194 /* Invoke C3 */
195 inb(cx_address);
196 /* Dummy op - must do something useless after P_LVL3
197 * read */
198 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
199 }
200 longhaul.bits.EnableSoftVID = 0;
201 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
202 }
203
204 /* Change frequency on next halt or sleep */
205 longhaul.bits.EnableSoftBusRatio = 1;
206 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
207 if (!cx_address) {
208 ACPI_FLUSH_CPU_CACHE();
209 halt();
210 } else {
211 ACPI_FLUSH_CPU_CACHE();
212 /* Invoke C3 */
213 inb(cx_address);
214 /* Dummy op - must do something useless after P_LVL3 read */
215 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
216 }
217 /* Disable bus ratio bit */
218 longhaul.bits.EnableSoftBusRatio = 0;
219 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
220
221 /* Reduce voltage if necessary */
222 if (can_scale_voltage && !dir) {
223 longhaul.bits.EnableSoftVID = 1;
224 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
225 /* Change voltage */
226 if (!cx_address) {
227 ACPI_FLUSH_CPU_CACHE();
228 halt();
229 } else {
230 ACPI_FLUSH_CPU_CACHE();
231 /* Invoke C3 */
232 inb(cx_address);
233 /* Dummy op - must do something useless after P_LVL3
234 * read */
235 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
236 }
237 longhaul.bits.EnableSoftVID = 0;
238 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
239 }
240}
241
242/**
 243 * longhaul_setstate()
 244 * @table_index : index into longhaul_table selecting the new state.
245 *
246 * Sets a new clock ratio.
247 */
248
249static void longhaul_setstate(unsigned int table_index)
250{
251 unsigned int mults_index;
252 int speed, mult;
253 struct cpufreq_freqs freqs;
254 unsigned long flags;
255 unsigned int pic1_mask, pic2_mask;
256 u16 bm_status = 0;
257 u32 bm_timeout = 1000;
258 unsigned int dir = 0;
259
260 mults_index = longhaul_table[table_index].index;
261 /* Safety precautions */
262 mult = mults[mults_index & 0x1f];
263 if (mult == -1)
264 return;
265 speed = calc_speed(mult);
266 if ((speed > highest_speed) || (speed < lowest_speed))
267 return;
268 /* Voltage transition before frequency transition? */
269 if (can_scale_voltage && longhaul_index < table_index)
270 dir = 1;
271
272 freqs.old = calc_speed(longhaul_get_cpu_mult());
273 freqs.new = speed;
274 freqs.cpu = 0; /* longhaul.c is UP only driver */
275
276 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
277
278 dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
279 fsb, mult/10, mult%10, print_speed(speed/1000));
280retry_loop:
281 preempt_disable();
282 local_irq_save(flags);
283
284 pic2_mask = inb(0xA1);
285 pic1_mask = inb(0x21); /* works on C3. save mask. */
286 outb(0xFF, 0xA1); /* Overkill */
287 outb(0xFE, 0x21); /* TMR0 only */
288
289 /* Wait while PCI bus is busy. */
290 if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
291 || ((pr != NULL) && pr->flags.bm_control))) {
292 bm_status = inw(acpi_regs_addr);
293 bm_status &= 1 << 4;
294 while (bm_status && bm_timeout) {
295 outw(1 << 4, acpi_regs_addr);
296 bm_timeout--;
297 bm_status = inw(acpi_regs_addr);
298 bm_status &= 1 << 4;
299 }
300 }
301
302 if (longhaul_flags & USE_NORTHBRIDGE) {
303 /* Disable AGP and PCI arbiters */
304 outb(3, 0x22);
305 } else if ((pr != NULL) && pr->flags.bm_control) {
306 /* Disable bus master arbitration */
307 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
308 }
309 switch (longhaul_version) {
310
311 /*
312 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
313 * Software controlled multipliers only.
314 */
315 case TYPE_LONGHAUL_V1:
316 do_longhaul1(mults_index);
317 break;
318
319 /*
320 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
321 *
322 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
323 * Nehemiah can do FSB scaling too, but this has never been proven
324 * to work in practice.
325 */
326 case TYPE_LONGHAUL_V2:
327 case TYPE_POWERSAVER:
328 if (longhaul_flags & USE_ACPI_C3) {
329 /* Don't allow wakeup */
330 acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
331 do_powersaver(cx->address, mults_index, dir);
332 } else {
333 do_powersaver(0, mults_index, dir);
334 }
335 break;
336 }
337
338 if (longhaul_flags & USE_NORTHBRIDGE) {
339 /* Enable arbiters */
340 outb(0, 0x22);
341 } else if ((pr != NULL) && pr->flags.bm_control) {
342 /* Enable bus master arbitration */
343 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
344 }
345 outb(pic2_mask, 0xA1); /* restore mask */
346 outb(pic1_mask, 0x21);
347
348 local_irq_restore(flags);
349 preempt_enable();
350
351 freqs.new = calc_speed(longhaul_get_cpu_mult());
352 /* Check if requested frequency is set. */
353 if (unlikely(freqs.new != speed)) {
354 printk(KERN_INFO PFX "Failed to set requested frequency!\n");
 355 /* Revision ID = 1, but the processor is expecting a revision key
 356 * equal to 0. Jumpers at the bottom of the processor will change
 357 * the multiplier and FSB, but will not change bits in the Longhaul
 358 * MSR nor enable voltage scaling. */
359 if (!revid_errata) {
360 printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
361 "option.\n");
362 revid_errata = 1;
363 msleep(200);
364 goto retry_loop;
365 }
 366 /* Why ACPI C3 sometimes doesn't work is a mystery to me.
367 * But it does happen. Processor is entering ACPI C3 state,
368 * but it doesn't change frequency. I tried poking various
369 * bits in northbridge registers, but without success. */
370 if (longhaul_flags & USE_ACPI_C3) {
371 printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
372 longhaul_flags &= ~USE_ACPI_C3;
373 if (revid_errata) {
374 printk(KERN_INFO PFX "Disabling \"Ignore "
375 "Revision ID\" option.\n");
376 revid_errata = 0;
377 }
378 msleep(200);
379 goto retry_loop;
380 }
381 /* This shouldn't happen. Longhaul ver. 2 was reported not
382 * working on processors without voltage scaling, but with
383 * RevID = 1. RevID errata will make things right. Just
384 * to be 100% sure. */
385 if (longhaul_version == TYPE_LONGHAUL_V2) {
386 printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
387 longhaul_version = TYPE_LONGHAUL_V1;
388 msleep(200);
389 goto retry_loop;
390 }
391 }
392 /* Report true CPU frequency */
393 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
394
395 if (!bm_timeout)
396 printk(KERN_INFO PFX "Warning: Timeout while waiting for "
397 "idle PCI bus.\n");
398}
399
400/*
401 * Centaur decided to make life a little more tricky.
402 * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
403 * Samuel2 and above have to try and guess what the FSB is.
404 * We do this by assuming we booted at maximum multiplier, and interpolate
405 * between that value multiplied by possible FSBs and cpu_mhz which
406 * was calculated at boot time. Really ugly, but no other way to do this.
407 */
408
409#define ROUNDING 0xf
410
411static int guess_fsb(int mult)
412{
413 int speed = cpu_khz / 1000;
414 int i;
415 int speeds[] = { 666, 1000, 1333, 2000 };
416 int f_max, f_min;
417
418 for (i = 0; i < 4; i++) {
419 f_max = ((speeds[i] * mult) + 50) / 100;
420 f_max += (ROUNDING / 2);
421 f_min = f_max - ROUNDING;
422 if ((speed <= f_max) && (speed >= f_min))
423 return speeds[i] / 10;
424 }
425 return 0;
426}
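/*
 * Worked example (hypothetical values): for mult = 75 (7.5x) and
 * cpu_khz ~= 1000000, speed = 1000; the 133 MHz candidate gives
 * f_max = (1333 * 75 + 50) / 100 + 7 = 1007 and f_min = 992, so
 * 1000 falls inside the window and guess_fsb() returns 133.
 */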
427
428
429static int __cpuinit longhaul_get_ranges(void)
430{
431 unsigned int i, j, k = 0;
432 unsigned int ratio;
433 int mult;
434
435 /* Get current frequency */
436 mult = longhaul_get_cpu_mult();
437 if (mult == -1) {
438 printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
439 return -EINVAL;
440 }
441 fsb = guess_fsb(mult);
442 if (fsb == 0) {
443 printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
444 return -EINVAL;
445 }
446 /* Get max multiplier - as we always did.
447 * Longhaul MSR is useful only when voltage scaling is enabled.
448 * C3 is booting at max anyway. */
449 maxmult = mult;
450 /* Get min multiplier */
451 switch (cpu_model) {
452 case CPU_NEHEMIAH:
453 minmult = 50;
454 break;
455 case CPU_NEHEMIAH_C:
456 minmult = 40;
457 break;
458 default:
459 minmult = 30;
460 break;
461 }
462
463 dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
464 minmult/10, minmult%10, maxmult/10, maxmult%10);
465
466 highest_speed = calc_speed(maxmult);
467 lowest_speed = calc_speed(minmult);
468 dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
469 print_speed(lowest_speed/1000),
470 print_speed(highest_speed/1000));
471
472 if (lowest_speed == highest_speed) {
473 printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
474 return -EINVAL;
475 }
476 if (lowest_speed > highest_speed) {
477 printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
478 lowest_speed, highest_speed);
479 return -EINVAL;
480 }
481
482 longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
483 GFP_KERNEL);
484 if (!longhaul_table)
485 return -ENOMEM;
486
487 for (j = 0; j < numscales; j++) {
488 ratio = mults[j];
489 if (ratio == -1)
490 continue;
491 if (ratio > maxmult || ratio < minmult)
492 continue;
493 longhaul_table[k].frequency = calc_speed(ratio);
494 longhaul_table[k].index = j;
495 k++;
496 }
497 if (k <= 1) {
498 kfree(longhaul_table);
499 return -ENODEV;
500 }
501 /* Sort */
502 for (j = 0; j < k - 1; j++) {
503 unsigned int min_f, min_i;
504 min_f = longhaul_table[j].frequency;
505 min_i = j;
506 for (i = j + 1; i < k; i++) {
507 if (longhaul_table[i].frequency < min_f) {
508 min_f = longhaul_table[i].frequency;
509 min_i = i;
510 }
511 }
512 if (min_i != j) {
513 swap(longhaul_table[j].frequency,
514 longhaul_table[min_i].frequency);
515 swap(longhaul_table[j].index,
516 longhaul_table[min_i].index);
517 }
518 }
519
520 longhaul_table[k].frequency = CPUFREQ_TABLE_END;
521
522 /* Find index we are running on */
523 for (j = 0; j < k; j++) {
524 if (mults[longhaul_table[j].index & 0x1f] == mult) {
525 longhaul_index = j;
526 break;
527 }
528 }
529 return 0;
530}
531
532
533static void __cpuinit longhaul_setup_voltagescaling(void)
534{
535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid;
537 unsigned int j, speed, pos, kHz_step, numvscales;
538 int min_vid_speed;
539
540 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
541 if (!(longhaul.bits.RevisionID & 1)) {
542 printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
543 return;
544 }
545
546 if (!longhaul.bits.VRMRev) {
547 printk(KERN_INFO PFX "VRM 8.5\n");
548 vrm_mV_table = &vrm85_mV[0];
549 mV_vrm_table = &mV_vrm85[0];
550 } else {
551 printk(KERN_INFO PFX "Mobile VRM\n");
552 if (cpu_model < CPU_NEHEMIAH)
553 return;
554 vrm_mV_table = &mobilevrm_mV[0];
555 mV_vrm_table = &mV_mobilevrm[0];
556 }
557
558 minvid = vrm_mV_table[longhaul.bits.MinimumVID];
559 maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
560
561 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
562 printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
563 "Voltage scaling disabled.\n",
564 minvid.mV/1000, minvid.mV%1000,
565 maxvid.mV/1000, maxvid.mV%1000);
566 return;
567 }
568
569 if (minvid.mV == maxvid.mV) {
570 printk(KERN_INFO PFX "Claims to support voltage scaling but "
571 "min & max are both %d.%03d. "
572 "Voltage scaling disabled\n",
573 maxvid.mV/1000, maxvid.mV%1000);
574 return;
575 }
576
577 /* How many voltage steps*/
578 numvscales = maxvid.pos - minvid.pos + 1;
579 printk(KERN_INFO PFX
580 "Max VID=%d.%03d "
581 "Min VID=%d.%03d, "
582 "%d possible voltage scales\n",
583 maxvid.mV/1000, maxvid.mV%1000,
584 minvid.mV/1000, minvid.mV%1000,
585 numvscales);
586
587 /* Calculate max frequency at min voltage */
588 j = longhaul.bits.MinMHzBR;
589 if (longhaul.bits.MinMHzBR4)
590 j += 16;
591 min_vid_speed = eblcr[j];
592 if (min_vid_speed == -1)
593 return;
594 switch (longhaul.bits.MinMHzFSB) {
595 case 0:
596 min_vid_speed *= 13333;
597 break;
598 case 1:
599 min_vid_speed *= 10000;
600 break;
601 case 3:
602 min_vid_speed *= 6666;
603 break;
604 default:
605 return;
606 break;
607 }
608 if (min_vid_speed >= highest_speed)
609 return;
610 /* Calculate kHz for one voltage step */
611 kHz_step = (highest_speed - min_vid_speed) / numvscales;
612
613 j = 0;
614 while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
615 speed = longhaul_table[j].frequency;
616 if (speed > min_vid_speed)
617 pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
618 else
619 pos = minvid.pos;
620 longhaul_table[j].index |= mV_vrm_table[pos] << 8;
621 vid = vrm_mV_table[mV_vrm_table[pos]];
622 printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
623 speed, j, vid.mV);
624 j++;
625 }
626
627 can_scale_voltage = 1;
628 printk(KERN_INFO PFX "Voltage scaling enabled.\n");
629}
630
631
632static int longhaul_verify(struct cpufreq_policy *policy)
633{
634 return cpufreq_frequency_table_verify(policy, longhaul_table);
635}
636
637
638static int longhaul_target(struct cpufreq_policy *policy,
639 unsigned int target_freq, unsigned int relation)
640{
641 unsigned int table_index = 0;
642 unsigned int i;
643 unsigned int dir = 0;
644 u8 vid, current_vid;
645
646 if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
647 relation, &table_index))
648 return -EINVAL;
649
650 /* Don't set same frequency again */
651 if (longhaul_index == table_index)
652 return 0;
653
654 if (!can_scale_voltage)
655 longhaul_setstate(table_index);
656 else {
 657 /* On the test system, voltage transitions exceeding a single
 658 * step up or down were turning the motherboard off. Both
 659 * "ondemand" and "userspace" are unsafe. The C7 does this
 660 * in hardware; the C3 is old, so we need to do it
 661 * in software. */
662 i = longhaul_index;
663 current_vid = (longhaul_table[longhaul_index].index >> 8);
664 current_vid &= 0x1f;
665 if (table_index > longhaul_index)
666 dir = 1;
667 while (i != table_index) {
668 vid = (longhaul_table[i].index >> 8) & 0x1f;
669 if (vid != current_vid) {
670 longhaul_setstate(i);
671 current_vid = vid;
672 msleep(200);
673 }
674 if (dir)
675 i++;
676 else
677 i--;
678 }
679 longhaul_setstate(table_index);
680 }
681 longhaul_index = table_index;
682 return 0;
683}
684
685
686static unsigned int longhaul_get(unsigned int cpu)
687{
688 if (cpu)
689 return 0;
690 return calc_speed(longhaul_get_cpu_mult());
691}
692
693static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
694 u32 nesting_level,
695 void *context, void **return_value)
696{
697 struct acpi_device *d;
698
699 if (acpi_bus_get_device(obj_handle, &d))
700 return 0;
701
702 *return_value = acpi_driver_data(d);
703 return 1;
704}
705
706/* VIA don't support PM2 reg, but have something similar */
707static int enable_arbiter_disable(void)
708{
709 struct pci_dev *dev;
710 int status = 1;
711 int reg;
712 u8 pci_cmd;
713
714 /* Find PLE133 host bridge */
715 reg = 0x78;
716 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
717 NULL);
718 /* Find PM133/VT8605 host bridge */
719 if (dev == NULL)
720 dev = pci_get_device(PCI_VENDOR_ID_VIA,
721 PCI_DEVICE_ID_VIA_8605_0, NULL);
722 /* Find CLE266 host bridge */
723 if (dev == NULL) {
724 reg = 0x76;
725 dev = pci_get_device(PCI_VENDOR_ID_VIA,
726 PCI_DEVICE_ID_VIA_862X_0, NULL);
727 /* Find CN400 V-Link host bridge */
728 if (dev == NULL)
729 dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
730 }
731 if (dev != NULL) {
732 /* Enable access to port 0x22 */
733 pci_read_config_byte(dev, reg, &pci_cmd);
734 if (!(pci_cmd & 1<<7)) {
735 pci_cmd |= 1<<7;
736 pci_write_config_byte(dev, reg, pci_cmd);
737 pci_read_config_byte(dev, reg, &pci_cmd);
738 if (!(pci_cmd & 1<<7)) {
739 printk(KERN_ERR PFX
740 "Can't enable access to port 0x22.\n");
741 status = 0;
742 }
743 }
744 pci_dev_put(dev);
745 return status;
746 }
747 return 0;
748}
749
750static int longhaul_setup_southbridge(void)
751{
752 struct pci_dev *dev;
753 u8 pci_cmd;
754
755 /* Find VT8235 southbridge */
756 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
757 if (dev == NULL)
758 /* Find VT8237 southbridge */
759 dev = pci_get_device(PCI_VENDOR_ID_VIA,
760 PCI_DEVICE_ID_VIA_8237, NULL);
761 if (dev != NULL) {
762 /* Set transition time to max */
763 pci_read_config_byte(dev, 0xec, &pci_cmd);
764 pci_cmd &= ~(1 << 2);
765 pci_write_config_byte(dev, 0xec, pci_cmd);
766 pci_read_config_byte(dev, 0xe4, &pci_cmd);
767 pci_cmd &= ~(1 << 7);
768 pci_write_config_byte(dev, 0xe4, pci_cmd);
769 pci_read_config_byte(dev, 0xe5, &pci_cmd);
770 pci_cmd |= 1 << 7;
771 pci_write_config_byte(dev, 0xe5, pci_cmd);
772 /* Get address of ACPI registers block*/
773 pci_read_config_byte(dev, 0x81, &pci_cmd);
774 if (pci_cmd & 1 << 7) {
775 pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
776 acpi_regs_addr &= 0xff00;
777 printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
778 acpi_regs_addr);
779 }
780
781 pci_dev_put(dev);
782 return 1;
783 }
784 return 0;
785}
786
787static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
788{
789 struct cpuinfo_x86 *c = &cpu_data(0);
790 char *cpuname = NULL;
791 int ret;
792 u32 lo, hi;
793
794 /* Check what we have on this motherboard */
795 switch (c->x86_model) {
796 case 6:
797 cpu_model = CPU_SAMUEL;
798 cpuname = "C3 'Samuel' [C5A]";
799 longhaul_version = TYPE_LONGHAUL_V1;
800 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
801 memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
802 break;
803
804 case 7:
805 switch (c->x86_mask) {
806 case 0:
807 longhaul_version = TYPE_LONGHAUL_V1;
808 cpu_model = CPU_SAMUEL2;
809 cpuname = "C3 'Samuel 2' [C5B]";
 810 /* Note: this is not a typo; early Samuel2s had
 811 * Samuel1 ratios. */
812 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
814 break;
815 case 1 ... 15:
816 longhaul_version = TYPE_LONGHAUL_V2;
817 if (c->x86_mask < 8) {
818 cpu_model = CPU_SAMUEL2;
819 cpuname = "C3 'Samuel 2' [C5B]";
820 } else {
821 cpu_model = CPU_EZRA;
822 cpuname = "C3 'Ezra' [C5C]";
823 }
824 memcpy(mults, ezra_mults, sizeof(ezra_mults));
825 memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
826 break;
827 }
828 break;
829
830 case 8:
831 cpu_model = CPU_EZRA_T;
832 cpuname = "C3 'Ezra-T' [C5M]";
833 longhaul_version = TYPE_POWERSAVER;
834 numscales = 32;
835 memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
836 memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
837 break;
838
839 case 9:
840 longhaul_version = TYPE_POWERSAVER;
841 numscales = 32;
842 memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
843 memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
844 switch (c->x86_mask) {
845 case 0 ... 1:
846 cpu_model = CPU_NEHEMIAH;
847 cpuname = "C3 'Nehemiah A' [C5XLOE]";
848 break;
849 case 2 ... 4:
850 cpu_model = CPU_NEHEMIAH;
851 cpuname = "C3 'Nehemiah B' [C5XLOH]";
852 break;
853 case 5 ... 15:
854 cpu_model = CPU_NEHEMIAH_C;
855 cpuname = "C3 'Nehemiah C' [C5P]";
856 break;
857 }
858 break;
859
860 default:
861 cpuname = "Unknown";
862 break;
863 }
864 /* Check Longhaul ver. 2 */
865 if (longhaul_version == TYPE_LONGHAUL_V2) {
866 rdmsr(MSR_VIA_LONGHAUL, lo, hi);
867 if (lo == 0 && hi == 0)
868 /* Looks like MSR isn't present */
869 longhaul_version = TYPE_LONGHAUL_V1;
870 }
871
872 printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
873 switch (longhaul_version) {
874 case TYPE_LONGHAUL_V1:
875 case TYPE_LONGHAUL_V2:
876 printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
877 break;
878 case TYPE_POWERSAVER:
879 printk(KERN_CONT "Powersaver supported.\n");
880 break;
881 };
882
883 /* Doesn't hurt */
884 longhaul_setup_southbridge();
885
886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr);
890
891 /* Check ACPI support for C3 state */
892 if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
893 cx = &pr->power.states[ACPI_STATE_C3];
894 if (cx->address > 0 && cx->latency <= 1000)
895 longhaul_flags |= USE_ACPI_C3;
896 }
897 /* Disable if it isn't working */
898 if (disable_acpi_c3)
899 longhaul_flags &= ~USE_ACPI_C3;
900 /* Check if northbridge is friendly */
901 if (enable_arbiter_disable())
902 longhaul_flags |= USE_NORTHBRIDGE;
903
904 /* Check ACPI support for bus master arbiter disable */
905 if (!(longhaul_flags & USE_ACPI_C3
906 || longhaul_flags & USE_NORTHBRIDGE)
907 && ((pr == NULL) || !(pr->flags.bm_control))) {
908 printk(KERN_ERR PFX
909 "No ACPI support. Unsupported northbridge.\n");
910 return -ENODEV;
911 }
912
913 if (longhaul_flags & USE_NORTHBRIDGE)
914 printk(KERN_INFO PFX "Using northbridge support.\n");
915 if (longhaul_flags & USE_ACPI_C3)
916 printk(KERN_INFO PFX "Using ACPI support.\n");
917
918 ret = longhaul_get_ranges();
919 if (ret != 0)
920 return ret;
921
922 if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
923 longhaul_setup_voltagescaling();
924
925 policy->cpuinfo.transition_latency = 200000; /* nsec */
926 policy->cur = calc_speed(longhaul_get_cpu_mult());
927
928 ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
929 if (ret)
930 return ret;
931
932 cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
933
934 return 0;
935}
936
937static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
938{
939 cpufreq_frequency_table_put_attr(policy->cpu);
940 return 0;
941}
942
943static struct freq_attr *longhaul_attr[] = {
944 &cpufreq_freq_attr_scaling_available_freqs,
945 NULL,
946};
947
948static struct cpufreq_driver longhaul_driver = {
949 .verify = longhaul_verify,
950 .target = longhaul_target,
951 .get = longhaul_get,
952 .init = longhaul_cpu_init,
953 .exit = __devexit_p(longhaul_cpu_exit),
954 .name = "longhaul",
955 .owner = THIS_MODULE,
956 .attr = longhaul_attr,
957};
958
959
960static int __init longhaul_init(void)
961{
962 struct cpuinfo_x86 *c = &cpu_data(0);
963
964 if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
965 return -ENODEV;
966
967#ifdef CONFIG_SMP
968 if (num_online_cpus() > 1) {
969 printk(KERN_ERR PFX "More than 1 CPU detected, "
970 "longhaul disabled.\n");
971 return -ENODEV;
972 }
973#endif
974#ifdef CONFIG_X86_IO_APIC
975 if (cpu_has_apic) {
976 printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
977 "broken in this configuration.\n");
978 return -ENODEV;
979 }
980#endif
981 switch (c->x86_model) {
982 case 6 ... 9:
983 return cpufreq_register_driver(&longhaul_driver);
984 case 10:
985 printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
986 default:
987 ;
988 }
989
990 return -ENODEV;
991}
992
993
994static void __exit longhaul_exit(void)
995{
996 int i;
997
998 for (i = 0; i < numscales; i++) {
999 if (mults[i] == maxmult) {
1000 longhaul_setstate(i);
1001 break;
1002 }
1003 }
1004
1005 cpufreq_unregister_driver(&longhaul_driver);
1006 kfree(longhaul_table);
1007}
1008
1009/* Even if the BIOS exports an ACPI C3 state and it is used
1010 * successfully when the CPU is idle, this state doesn't
1011 * trigger a frequency transition in some cases. */
1012module_param(disable_acpi_c3, int, 0644);
1013MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
1014/* Change CPU voltage with frequency. Very useful to save
1015 * power, but most VIA C3 processors don't support it. */
1016module_param(scale_voltage, int, 0644);
1017MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1018/* Force revision key to 0 for processors which don't
1019 * support voltage scaling, but present themselves as
1020 * such. */
1021module_param(revid_errata, int, 0644);
1022MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1023
1024MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
1025MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
1026MODULE_LICENSE("GPL");
1027
1028late_initcall(longhaul_init);
1029module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
deleted file mode 100644
index cbf48fbca881..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ /dev/null
@@ -1,353 +0,0 @@
1/*
2 * longhaul.h
3 * (C) 2003 Dave Jones.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * VIA-specific information
8 */
9
10union msr_bcr2 {
11 struct {
12 unsigned Reseved:19, // 18:0
13 ESOFTBF:1, // 19
14 Reserved2:3, // 22:20
15 CLOCKMUL:4, // 26:23
16 Reserved3:5; // 31:27
17 } bits;
18 unsigned long val;
19};
20
21union msr_longhaul {
22 struct {
23 unsigned RevisionID:4, // 3:0
24 RevisionKey:4, // 7:4
25 EnableSoftBusRatio:1, // 8
26 EnableSoftVID:1, // 9
27 EnableSoftBSEL:1, // 10
 28 Reserved:3, // 13:11
29 SoftBusRatio4:1, // 14
30 VRMRev:1, // 15
31 SoftBusRatio:4, // 19:16
32 SoftVID:5, // 24:20
33 Reserved2:3, // 27:25
34 SoftBSEL:2, // 29:28
35 Reserved3:2, // 31:30
36 MaxMHzBR:4, // 35:32
37 MaximumVID:5, // 40:36
38 MaxMHzFSB:2, // 42:41
39 MaxMHzBR4:1, // 43
40 Reserved4:4, // 47:44
41 MinMHzBR:4, // 51:48
42 MinimumVID:5, // 56:52
43 MinMHzFSB:2, // 58:57
44 MinMHzBR4:1, // 59
45 Reserved5:4; // 63:60
46 } bits;
47 unsigned long long val;
48};
49
50/*
51 * Clock ratio tables. Div/Mod by 10 to get ratio.
52 * The eblcr values specify the ratio read from the CPU.
53 * The mults values specify what to write to the CPU.
54 */
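/*
 * Example: a table value of 75 encodes a 7.5x ratio (75 / 10 = 7,
 * 75 % 10 = 5); -1 marks a reserved encoding.
 */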
55
56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */
59static const int __cpuinitdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */
63 -1, /* 0011 -> RESERVED */
64 -1, /* 0100 -> RESERVED */
65 35, /* 0101 -> 3.5x */
66 45, /* 0110 -> 4.5x */
67 55, /* 0111 -> 5.5x */
68 60, /* 1000 -> 6.0x */
69 70, /* 1001 -> 7.0x */
70 80, /* 1010 -> 8.0x */
71 50, /* 1011 -> 5.0x */
72 65, /* 1100 -> 6.5x */
73 75, /* 1101 -> 7.5x */
74 -1, /* 1110 -> RESERVED */
75 -1, /* 1111 -> RESERVED */
76};
77
78static const int __cpuinitdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */
82 -1, /* 0011 -> RESERVED */
83 55, /* 0100 -> 5.5x */
84 35, /* 0101 -> 3.5x */
85 45, /* 0110 -> 4.5x */
86 -1, /* 0111 -> RESERVED */
87 -1, /* 1000 -> RESERVED */
88 70, /* 1001 -> 7.0x */
89 80, /* 1010 -> 8.0x */
90 60, /* 1011 -> 6.0x */
91 -1, /* 1100 -> RESERVED */
92 75, /* 1101 -> 7.5x */
93 -1, /* 1110 -> RESERVED */
94 65, /* 1111 -> 6.5x */
95};
96
97/*
98 * VIA C3 Samuel2 Stepping 1->15
99 */
100static const int __cpuinitdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */
104 100, /* 0011 -> 10.0x */
105 55, /* 0100 -> 5.5x */
106 35, /* 0101 -> 3.5x */
107 45, /* 0110 -> 4.5x */
108 110, /* 0111 -> 11.0x */
109 90, /* 1000 -> 9.0x */
110 70, /* 1001 -> 7.0x */
111 80, /* 1010 -> 8.0x */
112 60, /* 1011 -> 6.0x */
113 120, /* 1100 -> 12.0x */
114 75, /* 1101 -> 7.5x */
115 130, /* 1110 -> 13.0x */
116 65, /* 1111 -> 6.5x */
117};
118
119/*
120 * VIA C3 Ezra
121 */
122static const int __cpuinitdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */
126 90, /* 0011 -> 9.0x */
127 95, /* 0100 -> 9.5x */
128 35, /* 0101 -> 3.5x */
129 45, /* 0110 -> 4.5x */
130 55, /* 0111 -> 5.5x */
131 60, /* 1000 -> 6.0x */
132 70, /* 1001 -> 7.0x */
133 80, /* 1010 -> 8.0x */
134 50, /* 1011 -> 5.0x */
135 65, /* 1100 -> 6.5x */
136 75, /* 1101 -> 7.5x */
137 85, /* 1110 -> 8.5x */
138 120, /* 1111 -> 12.0x */
139};
140
141static const int __cpuinitdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */
145 100, /* 0011 -> 10.0x */
146 55, /* 0100 -> 5.5x */
147 35, /* 0101 -> 3.5x */
148 45, /* 0110 -> 4.5x */
149 95, /* 0111 -> 9.5x */
150 90, /* 1000 -> 9.0x */
151 70, /* 1001 -> 7.0x */
152 80, /* 1010 -> 8.0x */
153 60, /* 1011 -> 6.0x */
154 120, /* 1100 -> 12.0x */
155 75, /* 1101 -> 7.5x */
156 85, /* 1110 -> 8.5x */
157 65, /* 1111 -> 6.5x */
158};
159
160/*
161 * VIA C3 (Ezra-T) [C5M].
162 */
163static const int __cpuinitdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */
167 90, /* 0011 -> 9.0x */
168 95, /* 0100 -> 9.5x */
169 35, /* 0101 -> 3.5x */
170 45, /* 0110 -> 4.5x */
171 55, /* 0111 -> 5.5x */
172 60, /* 1000 -> 6.0x */
173 70, /* 1001 -> 7.0x */
174 80, /* 1010 -> 8.0x */
175 50, /* 1011 -> 5.0x */
176 65, /* 1100 -> 6.5x */
177 75, /* 1101 -> 7.5x */
178 85, /* 1110 -> 8.5x */
179 120, /* 1111 -> 12.0x */
180
181 -1, /* 0000 -> RESERVED (10.0x) */
182 110, /* 0001 -> 11.0x */
183 -1, /* 0010 -> 12.0x */
184 -1, /* 0011 -> RESERVED (9.0x)*/
185 105, /* 0100 -> 10.5x */
186 115, /* 0101 -> 11.5x */
187 125, /* 0110 -> 12.5x */
188 135, /* 0111 -> 13.5x */
189 140, /* 1000 -> 14.0x */
190 150, /* 1001 -> 15.0x */
191 160, /* 1010 -> 16.0x */
192 130, /* 1011 -> 13.0x */
193 145, /* 1100 -> 14.5x */
194 155, /* 1101 -> 15.5x */
195 -1, /* 1110 -> RESERVED (13.0x) */
196 -1, /* 1111 -> RESERVED (12.0x) */
197};
198
199static const int __cpuinitdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */
203 100, /* 0011 -> 10.0x */
204 55, /* 0100 -> 5.5x */
205 35, /* 0101 -> 3.5x */
206 45, /* 0110 -> 4.5x */
207 95, /* 0111 -> 9.5x */
208 90, /* 1000 -> 9.0x */
209 70, /* 1001 -> 7.0x */
210 80, /* 1010 -> 8.0x */
211 60, /* 1011 -> 6.0x */
212 120, /* 1100 -> 12.0x */
213 75, /* 1101 -> 7.5x */
214 85, /* 1110 -> 8.5x */
215 65, /* 1111 -> 6.5x */
216
217 -1, /* 0000 -> RESERVED (9.0x) */
218 110, /* 0001 -> 11.0x */
219 120, /* 0010 -> 12.0x */
220 -1, /* 0011 -> RESERVED (10.0x)*/
221 135, /* 0100 -> 13.5x */
222 115, /* 0101 -> 11.5x */
223 125, /* 0110 -> 12.5x */
224 105, /* 0111 -> 10.5x */
225 130, /* 1000 -> 13.0x */
226 150, /* 1001 -> 15.0x */
227 160, /* 1010 -> 16.0x */
228 140, /* 1011 -> 14.0x */
229 -1, /* 1100 -> RESERVED (12.0x) */
230 155, /* 1101 -> 15.5x */
231 -1, /* 1110 -> RESERVED (13.0x) */
232 145, /* 1111 -> 14.5x */
233};
234
235/*
236 * VIA C3 Nehemiah
237 */
238static const int __cpuinitdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */
242 90, /* 0011 -> 9.0x */
243 95, /* 0100 -> 9.5x */
244 -1, /* 0101 -> RESERVED */
245 45, /* 0110 -> 4.5x */
246 55, /* 0111 -> 5.5x */
247 60, /* 1000 -> 6.0x */
248 70, /* 1001 -> 7.0x */
249 80, /* 1010 -> 8.0x */
250 50, /* 1011 -> 5.0x */
251 65, /* 1100 -> 6.5x */
252 75, /* 1101 -> 7.5x */
253 85, /* 1110 -> 8.5x */
254 120, /* 1111 -> 12.0x */
255 -1, /* 0000 -> 10.0x */
256 110, /* 0001 -> 11.0x */
257 -1, /* 0010 -> 12.0x */
258 -1, /* 0011 -> 9.0x */
259 105, /* 0100 -> 10.5x */
260 115, /* 0101 -> 11.5x */
261 125, /* 0110 -> 12.5x */
262 135, /* 0111 -> 13.5x */
263 140, /* 1000 -> 14.0x */
264 150, /* 1001 -> 15.0x */
265 160, /* 1010 -> 16.0x */
266 130, /* 1011 -> 13.0x */
267 145, /* 1100 -> 14.5x */
268 155, /* 1101 -> 15.5x */
269 -1, /* 1110 -> RESERVED (13.0x) */
270 -1, /* 1111 -> 12.0x */
271};
272
273static const int __cpuinitdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */
277 100, /* 0011 -> 10.0x */
278 55, /* 0100 -> 5.5x */
279 -1, /* 0101 -> RESERVED */
280 45, /* 0110 -> 4.5x */
281 95, /* 0111 -> 9.5x */
282 90, /* 1000 -> 9.0x */
283 70, /* 1001 -> 7.0x */
284 80, /* 1010 -> 8.0x */
285 60, /* 1011 -> 6.0x */
286 120, /* 1100 -> 12.0x */
287 75, /* 1101 -> 7.5x */
288 85, /* 1110 -> 8.5x */
289 65, /* 1111 -> 6.5x */
290 90, /* 0000 -> 9.0x */
291 110, /* 0001 -> 11.0x */
292 120, /* 0010 -> 12.0x */
293 100, /* 0011 -> 10.0x */
294 135, /* 0100 -> 13.5x */
295 115, /* 0101 -> 11.5x */
296 125, /* 0110 -> 12.5x */
297 105, /* 0111 -> 10.5x */
298 130, /* 1000 -> 13.0x */
299 150, /* 1001 -> 15.0x */
300 160, /* 1010 -> 16.0x */
301 140, /* 1011 -> 14.0x */
302 120, /* 1100 -> 12.0x */
303 155, /* 1101 -> 15.5x */
304 -1, /* 1110 -> RESERVED (13.0x) */
305 145 /* 1111 -> 14.5x */
306};
307
308/*
309 * Voltage scales. Div/Mod by 1000 to get actual voltage.
310 * Which scale to use depends on the VRM type in use.
311 */
312
313struct mV_pos {
314 unsigned short mV;
315 unsigned short pos;
316};
317
318static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
322 {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
323 {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
324 {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
325 {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327};
328
329static const unsigned char __cpuinitdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334};
335
336static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
340 {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
341 {975, 15}, {950, 14}, {925, 13}, {900, 12},
342 {875, 11}, {850, 10}, {825, 9}, {800, 8},
343 {775, 7}, {750, 6}, {725, 5}, {700, 4},
344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345};
346
347static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
351 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
352};
353
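
The tables above encode everything in small integers: the mults/eblcr arrays hold the bus ratio multiplied by ten (as the header comment notes), and the VRM tables hold millivolts, so recovering human-readable values is a divide plus a remainder. A minimal userspace sketch of that decoding, assuming a 100 MHz front-side bus purely for illustration:

    #include <stdio.h>

    int main(void)
    {
            int mult = 75;          /* e.g. samuel1_mults[13]: 7.5x */
            int fsb_khz = 100000;   /* hypothetical 100 MHz FSB */
            int mv = 1350;          /* e.g. mobilevrm_mV[8].mV */

            /* ratio tables store ratio * 10, so divide by 10 for kHz */
            printf("%d.%dx -> %d kHz\n", mult / 10, mult % 10,
                   fsb_khz * mult / 10);
            /* voltage tables store millivolts */
            printf("VID -> %d.%03d V\n", mv / 1000, mv % 1000);
            return 0;
    }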
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
deleted file mode 100644
index d9f51367666b..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ /dev/null
@@ -1,327 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/timex.h>
14
15#include <asm/msr.h>
16#include <asm/processor.h>
17
18#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
19 "longrun", msg)
20
21static struct cpufreq_driver longrun_driver;
22
23/**
24 * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
25 * values into per cent values. In TMTA microcode, the following is valid:
26 * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
27 */
28static unsigned int longrun_low_freq, longrun_high_freq;
29
30
31/**
32 * longrun_get_policy - get the current LongRun policy
33 * @policy: struct cpufreq_policy where current policy is written into
34 *
35 * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
36 * and MSR_TMTA_LONGRUN_CTRL
37 */
38static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy)
39{
40 u32 msr_lo, msr_hi;
41
42 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
43 dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
44 if (msr_lo & 0x01)
45 policy->policy = CPUFREQ_POLICY_PERFORMANCE;
46 else
47 policy->policy = CPUFREQ_POLICY_POWERSAVE;
48
49 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
50 dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
51 msr_lo &= 0x0000007F;
52 msr_hi &= 0x0000007F;
53
54 if (longrun_high_freq <= longrun_low_freq) {
55 /* Assume degenerate Longrun table */
56 policy->min = policy->max = longrun_high_freq;
57 } else {
58 policy->min = longrun_low_freq + msr_lo *
59 ((longrun_high_freq - longrun_low_freq) / 100);
60 policy->max = longrun_low_freq + msr_hi *
61 ((longrun_high_freq - longrun_low_freq) / 100);
62 }
63 policy->cpu = 0;
64}
65
66
67/**
68 * longrun_set_policy - sets a new CPUFreq policy
69 * @policy: new policy
70 *
71 * Sets a new CPUFreq policy on LongRun-capable processors. This function
72 * has to be called with cpufreq_driver locked.
73 */
74static int longrun_set_policy(struct cpufreq_policy *policy)
75{
76 u32 msr_lo, msr_hi;
77 u32 pctg_lo, pctg_hi;
78
79 if (!policy)
80 return -EINVAL;
81
82 if (longrun_high_freq <= longrun_low_freq) {
83 /* Assume degenerate Longrun table */
84 pctg_lo = pctg_hi = 100;
85 } else {
86 pctg_lo = (policy->min - longrun_low_freq) /
87 ((longrun_high_freq - longrun_low_freq) / 100);
88 pctg_hi = (policy->max - longrun_low_freq) /
89 ((longrun_high_freq - longrun_low_freq) / 100);
90 }
91
92 if (pctg_hi > 100)
93 pctg_hi = 100;
94 if (pctg_lo > pctg_hi)
95 pctg_lo = pctg_hi;
96
97 /* performance or economy mode */
98 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
99 msr_lo &= 0xFFFFFFFE;
100 switch (policy->policy) {
101 case CPUFREQ_POLICY_PERFORMANCE:
102 msr_lo |= 0x00000001;
103 break;
104 case CPUFREQ_POLICY_POWERSAVE:
105 break;
106 }
107 wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
108
109 /* lower and upper boundary */
110 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
111 msr_lo &= 0xFFFFFF80;
112 msr_hi &= 0xFFFFFF80;
113 msr_lo |= pctg_lo;
114 msr_hi |= pctg_hi;
115 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
116
117 return 0;
118}
119
120
121/**
122 * longrun_verify_policy - verifies a new CPUFreq policy
123 * @policy: the policy to verify
124 *
125 * Validates a new CPUFreq policy. This function has to be called with
126 * cpufreq_driver locked.
127 */
128static int longrun_verify_policy(struct cpufreq_policy *policy)
129{
130 if (!policy)
131 return -EINVAL;
132
133 policy->cpu = 0;
134 cpufreq_verify_within_limits(policy,
135 policy->cpuinfo.min_freq,
136 policy->cpuinfo.max_freq);
137
138 if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
139 (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
140 return -EINVAL;
141
142 return 0;
143}
144
145static unsigned int longrun_get(unsigned int cpu)
146{
147 u32 eax, ebx, ecx, edx;
148
149 if (cpu)
150 return 0;
151
152 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
153 dprintk("cpuid eax is %u\n", eax);
154
155 return eax * 1000;
156}
157
158/**
159 * longrun_determine_freqs - determines the lowest and highest possible core frequency
160 * @low_freq: an int to put the lowest frequency into
161 * @high_freq: an int to put the highest frequency into
162 *
163 * Determines the lowest and highest possible core frequencies on this CPU.
164 * This is necessary to calculate the performance percentage according to
165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */
168static int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq)
170{
171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi;
173 u32 eax, ebx, ecx, edx;
174 u32 try_hi;
175 struct cpuinfo_x86 *c = &cpu_data(0);
176
177 if (!low_freq || !high_freq)
178 return -EINVAL;
179
180 if (cpu_has(c, X86_FEATURE_LRTI)) {
181 /* if the LongRun Table Interface is present, the
182 * detection is a bit easier:
183 * For minimum frequency, read out the maximum
184 * level (msr_hi), write that into "currently
185 * selected level", and read out the frequency.
186 * For maximum frequency, read out level zero.
187 */
188 /* minimum */
189 rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
190 wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
191 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
192 *low_freq = msr_lo * 1000; /* to kHz */
193
194 /* maximum */
195 wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
196 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
197 *high_freq = msr_lo * 1000; /* to kHz */
198
199 dprintk("longrun table interface told %u - %u kHz\n",
200 *low_freq, *high_freq);
201
202 if (*low_freq > *high_freq)
203 *low_freq = *high_freq;
204 return 0;
205 }
206
207 /* set the upper border to the value determined during TSC init */
208 *high_freq = (cpu_khz / 1000);
209 *high_freq = *high_freq * 1000;
210 dprintk("high frequency is %u kHz\n", *high_freq);
211
212 /* get current borders */
213 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
214 save_lo = msr_lo & 0x0000007F;
215 save_hi = msr_hi & 0x0000007F;
216
217 /* if current perf_pctg is larger than 90%, we need to decrease the
218 * upper limit to make the calculation more accurate.
219 */
220 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
221	/* try decreasing in 10% steps; some processors react only
222	 * at certain threshold values */
223 for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
224 /* set to 0 to try_hi perf_pctg */
225 msr_lo &= 0xFFFFFF80;
226 msr_hi &= 0xFFFFFF80;
227 msr_hi |= try_hi;
228 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
229
230 /* read out current core MHz and current perf_pctg */
231 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
232
233 /* restore values */
234 wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
235 }
236 dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
237
238 /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
239	 * equals
240 * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
241 *
242	 * high_freq * perf_pctg is stored temporarily in "ebx".
243 */
244 ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
245
246 if ((ecx > 95) || (ecx == 0) || (eax < ebx))
247 return -EIO;
248
249 edx = ((eax - ebx) * 100) / (100 - ecx);
250 *low_freq = edx * 1000; /* back to kHz */
251
252 dprintk("low frequency is %u kHz\n", *low_freq);
253
254 if (*low_freq > *high_freq)
255 *low_freq = *high_freq;
256
257 return 0;
258}
259
260
261static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
262{
263 int result = 0;
264
265 /* capability check */
266 if (policy->cpu != 0)
267 return -ENODEV;
268
269 /* detect low and high frequency */
270 result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
271 if (result)
272 return result;
273
274 /* cpuinfo and default policy values */
275 policy->cpuinfo.min_freq = longrun_low_freq;
276 policy->cpuinfo.max_freq = longrun_high_freq;
277 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
278 longrun_get_policy(policy);
279
280 return 0;
281}
282
283
284static struct cpufreq_driver longrun_driver = {
285 .flags = CPUFREQ_CONST_LOOPS,
286 .verify = longrun_verify_policy,
287 .setpolicy = longrun_set_policy,
288 .get = longrun_get,
289 .init = longrun_cpu_init,
290 .name = "longrun",
291 .owner = THIS_MODULE,
292};
293
294
295/**
296 * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
297 *
298 * Initializes the LongRun support.
299 */
300static int __init longrun_init(void)
301{
302 struct cpuinfo_x86 *c = &cpu_data(0);
303
304 if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
305 !cpu_has(c, X86_FEATURE_LONGRUN))
306 return -ENODEV;
307
308 return cpufreq_register_driver(&longrun_driver);
309}
310
311
312/**
313 * longrun_exit - unregisters LongRun support
314 */
315static void __exit longrun_exit(void)
316{
317 cpufreq_unregister_driver(&longrun_driver);
318}
319
320
321MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
322MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
323 "Efficeon processors.");
324MODULE_LICENSE("GPL");
325
326module_init(longrun_init);
327module_exit(longrun_exit);
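
LongRun itself picks the operating point; the driver only moves the two 0..100 boundaries in MSR_TMTA_LONGRUN_CTRL, interpreting them as a percentage of the span between longrun_low_freq and longrun_high_freq. Both directions of that conversion are plain linear interpolation, mirroring longrun_get_policy() and longrun_set_policy(); a small sketch with made-up frequencies:

    #include <stdio.h>

    int main(void)
    {
            unsigned int low = 300000, high = 600000; /* kHz, hypothetical */
            unsigned int pctg = 50;                   /* value read from the CTRL MSR */

            /* percentage -> frequency, as in longrun_get_policy() */
            unsigned int freq = low + pctg * ((high - low) / 100);
            printf("%u%% -> %u kHz\n", pctg, freq);

            /* frequency -> percentage, as in longrun_set_policy() */
            unsigned int back = (freq - low) / ((high - low) / 100);
            printf("%u kHz -> %u%%\n", freq, back);
            return 0;
    }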
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
deleted file mode 100644
index 911e193018ae..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.c
+++ /dev/null
@@ -1,51 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/smp.h>
3#include <linux/module.h>
4#include <linux/init.h>
5#include <linux/cpufreq.h>
6#include <linux/slab.h>
7
8#include "mperf.h"
9
10static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
11
12/* Called via smp_call_function_single(), on the target CPU */
13static void read_measured_perf_ctrs(void *_cur)
14{
15 struct aperfmperf *am = _cur;
16
17 get_aperfmperf(am);
18}
19
20/*
21 * Return the measured active (C0) frequency on this CPU since last call
22 * to this function.
23 * Input: cpu number
24 * Return: Average CPU frequency in terms of max frequency (zero on error)
25 *
26 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
27 * over a period of time, while CPU is in C0 state.
28 * IA32_MPERF counts at the rate of max advertised frequency
29 * IA32_APERF counts at the rate of actual CPU frequency
30 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
31 * no meaning should be associated with absolute values of these MSRs.
32 */
33unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
34 unsigned int cpu)
35{
36 struct aperfmperf perf;
37 unsigned long ratio;
38 unsigned int retval;
39
40 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
41 return 0;
42
43 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
44 per_cpu(acfreq_old_perf, cpu) = perf;
45
46 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
47
48 return retval;
49}
50EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
51MODULE_LICENSE("GPL");
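
Only the APERF/MPERF ratio carries information: MPERF ticks at the advertised maximum frequency and APERF at the frequency actually delivered while the CPU is in C0, so the measured frequency is the maximum scaled by delta(APERF)/delta(MPERF). A self-contained sketch of the same fixed-point scaling; the deltas are invented and the 10-bit shift is an assumption standing in for APERFMPERF_SHIFT:

    #include <stdio.h>

    #define SHIFT 10   /* assumed stand-in for APERFMPERF_SHIFT */

    int main(void)
    {
            unsigned long long d_aperf = 1500000, d_mperf = 2000000; /* hypothetical deltas */
            unsigned int max_khz = 2000000;                          /* 2 GHz */

            unsigned long long ratio = (d_aperf << SHIFT) / d_mperf; /* fixed-point ratio */
            unsigned int measured = (unsigned int)((max_khz * ratio) >> SHIFT);

            printf("measured C0 frequency: %u kHz\n", measured);
            return 0;
    }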
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
deleted file mode 100644
index 5dbf2950dc22..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.h
+++ /dev/null
@@ -1,9 +0,0 @@
1/*
2 * (c) 2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
9 unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
deleted file mode 100644
index 52c93648e492..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ /dev/null
@@ -1,331 +0,0 @@
1/*
2 * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
5 * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
6 * (C) 2002 Tora T. Engstad
7 * All Rights Reserved
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * The author(s) of this software shall not be held liable for damages
15 * of any nature resulting due to the use of this software. This
16 * software is provided AS-IS with no warranties.
17 *
18 * Date Errata Description
19 * 20020525 N44, O17 12.5% or 25% DC causes lockup
20 *
21 */
22
23#include <linux/kernel.h>
24#include <linux/module.h>
25#include <linux/init.h>
26#include <linux/smp.h>
27#include <linux/cpufreq.h>
28#include <linux/cpumask.h>
29#include <linux/timex.h>
30
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/timer.h>
34
35#include "speedstep-lib.h"
36
37#define PFX "p4-clockmod: "
38#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
39 "p4-clockmod", msg)
40
41/*
42 * Duty Cycle (3 bits). Note that DC_DISABLE is not specified in the
43 * Intel docs; it is simply used here to mean "disable"
44 */
45enum {
46 DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
47 DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
48};
49
50#define DC_ENTRIES 8
51
52
53static int has_N44_O17_errata[NR_CPUS];
54static unsigned int stock_freq;
55static struct cpufreq_driver p4clockmod_driver;
56static unsigned int cpufreq_p4_get(unsigned int cpu);
57
58static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
59{
60 u32 l, h;
61
62 if (!cpu_online(cpu) ||
63 (newstate > DC_DISABLE) || (newstate == DC_RESV))
64 return -EINVAL;
65
66 rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
67
68 if (l & 0x01)
69 dprintk("CPU#%d currently thermal throttled\n", cpu);
70
71 if (has_N44_O17_errata[cpu] &&
72 (newstate == DC_25PT || newstate == DC_DFLT))
73 newstate = DC_38PT;
74
75 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
76 if (newstate == DC_DISABLE) {
77 dprintk("CPU#%d disabling modulation\n", cpu);
78 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
79 } else {
80 dprintk("CPU#%d setting duty cycle to %d%%\n",
81 cpu, ((125 * newstate) / 10));
82 /* bits 63 - 5 : reserved
83 * bit 4 : enable/disable
84 * bits 3-1 : duty cycle
85 * bit 0 : reserved
86 */
87 l = (l & ~14);
88 l = l | (1<<4) | ((newstate & 0x7)<<1);
89 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
90 }
91
92 return 0;
93}
94
95
96static struct cpufreq_frequency_table p4clockmod_table[] = {
97 {DC_RESV, CPUFREQ_ENTRY_INVALID},
98 {DC_DFLT, 0},
99 {DC_25PT, 0},
100 {DC_38PT, 0},
101 {DC_50PT, 0},
102 {DC_64PT, 0},
103 {DC_75PT, 0},
104 {DC_88PT, 0},
105 {DC_DISABLE, 0},
106 {DC_RESV, CPUFREQ_TABLE_END},
107};
108
109
110static int cpufreq_p4_target(struct cpufreq_policy *policy,
111 unsigned int target_freq,
112 unsigned int relation)
113{
114 unsigned int newstate = DC_RESV;
115 struct cpufreq_freqs freqs;
116 int i;
117
118 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
119 target_freq, relation, &newstate))
120 return -EINVAL;
121
122 freqs.old = cpufreq_p4_get(policy->cpu);
123 freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
124
125 if (freqs.new == freqs.old)
126 return 0;
127
128 /* notifiers */
129 for_each_cpu(i, policy->cpus) {
130 freqs.cpu = i;
131 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
132 }
133
134 /* run on each logical CPU,
135 * see section 13.15.3 of IA32 Intel Architecture Software
136 * Developer's Manual, Volume 3
137 */
138 for_each_cpu(i, policy->cpus)
139 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
140
141 /* notifiers */
142 for_each_cpu(i, policy->cpus) {
143 freqs.cpu = i;
144 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
145 }
146
147 return 0;
148}
149
150
151static int cpufreq_p4_verify(struct cpufreq_policy *policy)
152{
153 return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
154}
155
156
157static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
158{
159 if (c->x86 == 0x06) {
160 if (cpu_has(c, X86_FEATURE_EST))
161 printk_once(KERN_WARNING PFX "Warning: EST-capable "
162 "CPU detected. The acpi-cpufreq module offers "
163 "voltage scaling in addition to frequency "
164 "scaling. You should use that instead of "
165 "p4-clockmod, if possible.\n");
166 switch (c->x86_model) {
167 case 0x0E: /* Core */
168 case 0x0F: /* Core Duo */
169 case 0x16: /* Celeron Core */
170 case 0x1C: /* Atom */
171 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
172 return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
173 case 0x0D: /* Pentium M (Dothan) */
174 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
175 /* fall through */
176 case 0x09: /* Pentium M (Banias) */
177 return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
178 }
179 }
180
181 if (c->x86 != 0xF)
182 return 0;
183
184	/* on P-4s, the TSC runs at a constant frequency independent of whether
185 * throttling is active or not. */
186 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
187
188 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
189 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
190 "The speedstep-ich or acpi cpufreq modules offer "
191			"voltage scaling in addition to frequency scaling. "
192 "You should use either one instead of p4-clockmod, "
193 "if possible.\n");
194 return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
195 }
196
197 return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
198}
199
200
201
202static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
203{
204 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
205 int cpuid = 0;
206 unsigned int i;
207
208#ifdef CONFIG_SMP
209 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
210#endif
211
212 /* Errata workaround */
213 cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
214 switch (cpuid) {
215 case 0x0f07:
216 case 0x0f0a:
217 case 0x0f11:
218 case 0x0f12:
219 has_N44_O17_errata[policy->cpu] = 1;
220 dprintk("has errata -- disabling low frequencies\n");
221 }
222
223 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
224 c->x86_model < 2) {
225 /* switch to maximum frequency and measure result */
226 cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
227 recalibrate_cpu_khz();
228 }
229 /* get max frequency */
230 stock_freq = cpufreq_p4_get_frequency(c);
231 if (!stock_freq)
232 return -EINVAL;
233
234 /* table init */
235 for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
236 if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
237 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
238 else
239 p4clockmod_table[i].frequency = (stock_freq * i)/8;
240 }
241 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
242
243 /* cpuinfo and default policy values */
244
245 /* the transition latency is set to be 1 higher than the maximum
246 * transition latency of the ondemand governor */
247 policy->cpuinfo.transition_latency = 10000001;
248 policy->cur = stock_freq;
249
250 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
251}
252
253
254static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
255{
256 cpufreq_frequency_table_put_attr(policy->cpu);
257 return 0;
258}
259
260static unsigned int cpufreq_p4_get(unsigned int cpu)
261{
262 u32 l, h;
263
264 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
265
266 if (l & 0x10) {
267 l = l >> 1;
268 l &= 0x7;
269 } else
270 l = DC_DISABLE;
271
272 if (l != DC_DISABLE)
273 return stock_freq * l / 8;
274
275 return stock_freq;
276}
277
278static struct freq_attr *p4clockmod_attr[] = {
279 &cpufreq_freq_attr_scaling_available_freqs,
280 NULL,
281};
282
283static struct cpufreq_driver p4clockmod_driver = {
284 .verify = cpufreq_p4_verify,
285 .target = cpufreq_p4_target,
286 .init = cpufreq_p4_cpu_init,
287 .exit = cpufreq_p4_cpu_exit,
288 .get = cpufreq_p4_get,
289 .name = "p4-clockmod",
290 .owner = THIS_MODULE,
291 .attr = p4clockmod_attr,
292};
293
294
295static int __init cpufreq_p4_init(void)
296{
297 struct cpuinfo_x86 *c = &cpu_data(0);
298 int ret;
299
300 /*
301 * THERM_CONTROL is architectural for IA32 now, so
302 * we can rely on the capability checks
303 */
304 if (c->x86_vendor != X86_VENDOR_INTEL)
305 return -ENODEV;
306
307 if (!test_cpu_cap(c, X86_FEATURE_ACPI) ||
308 !test_cpu_cap(c, X86_FEATURE_ACC))
309 return -ENODEV;
310
311 ret = cpufreq_register_driver(&p4clockmod_driver);
312 if (!ret)
313 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
314 "Modulation available\n");
315
316 return ret;
317}
318
319
320static void __exit cpufreq_p4_exit(void)
321{
322 cpufreq_unregister_driver(&p4clockmod_driver);
323}
324
325
326MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
327MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
328MODULE_LICENSE("GPL");
329
330late_initcall(cpufreq_p4_init);
331module_exit(cpufreq_p4_exit);
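
p4-clockmod never changes the core clock; it programs the on-demand clock-modulation field of IA32_THERM_CONTROL, where bit 4 enables modulation and bits 3:1 select one of eight 12.5% duty-cycle steps, so the effective frequency is stock_freq * step / 8. A sketch of the encoding and the resulting table; the stock frequency is a made-up value:

    #include <stdio.h>

    int main(void)
    {
            unsigned int stock_khz = 2400000;   /* hypothetical 2.4 GHz P4 */
            unsigned int step;

            for (step = 1; step < 8; step++) {
                    /* bit 4 = enable, bits 3:1 = duty-cycle step */
                    unsigned int therm_ctl = (1u << 4) | ((step & 0x7) << 1);
                    printf("step %u: field 0x%02x -> %u kHz (%u.%u%% duty)\n",
                           step, therm_ctl, stock_khz * step / 8,
                           step * 125 / 10, (step * 125) % 10);
            }
            return 0;
    }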
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
deleted file mode 100644
index 755a31e0f5b0..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ /dev/null
@@ -1,624 +0,0 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33#include <linux/slab.h>
34
35#include <linux/acpi.h>
36#include <linux/io.h>
37#include <linux/spinlock.h>
38#include <linux/uaccess.h>
39
40#include <acpi/processor.h>
41
42#define PCC_VERSION "1.00.00"
43#define POLL_LOOPS 300
44
45#define CMD_COMPLETE 0x1
46#define CMD_GET_FREQ 0x0
47#define CMD_SET_FREQ 0x1
48
49#define BUF_SZ 4
50
51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
52 "pcc-cpufreq", msg)
53
54struct pcc_register_resource {
55 u8 descriptor;
56 u16 length;
57 u8 space_id;
58 u8 bit_width;
59 u8 bit_offset;
60 u8 access_size;
61 u64 address;
62} __attribute__ ((packed));
63
64struct pcc_memory_resource {
65 u8 descriptor;
66 u16 length;
67 u8 space_id;
68 u8 resource_usage;
69 u8 type_specific;
70 u64 granularity;
71 u64 minimum;
72 u64 maximum;
73 u64 translation_offset;
74 u64 address_length;
75} __attribute__ ((packed));
76
77static struct cpufreq_driver pcc_cpufreq_driver;
78
79struct pcc_header {
80 u32 signature;
81 u16 length;
82 u8 major;
83 u8 minor;
84 u32 features;
85 u16 command;
86 u16 status;
87 u32 latency;
88 u32 minimum_time;
89 u32 maximum_time;
90 u32 nominal;
91 u32 throttled_frequency;
92 u32 minimum_frequency;
93};
94
95static void __iomem *pcch_virt_addr;
96static struct pcc_header __iomem *pcch_hdr;
97
98static DEFINE_SPINLOCK(pcc_lock);
99
100static struct acpi_generic_address doorbell;
101
102static u64 doorbell_preserve;
103static u64 doorbell_write;
104
105static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
106 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
107
108struct pcc_cpu {
109 u32 input_offset;
110 u32 output_offset;
111};
112
113static struct pcc_cpu __percpu *pcc_cpu_info;
114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{
117 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
118 policy->cpuinfo.max_freq);
119 return 0;
120}
121
122static inline void pcc_cmd(void)
123{
124 u64 doorbell_value;
125 int i;
126
127 acpi_read(&doorbell_value, &doorbell);
128 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
129 &doorbell);
130
131 for (i = 0; i < POLL_LOOPS; i++) {
132 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
133 break;
134 }
135}
136
137static inline void pcc_clear_mapping(void)
138{
139 if (pcch_virt_addr)
140 iounmap(pcch_virt_addr);
141 pcch_virt_addr = NULL;
142}
143
144static unsigned int pcc_get_freq(unsigned int cpu)
145{
146 struct pcc_cpu *pcc_cpu_data;
147 unsigned int curr_freq;
148 unsigned int freq_limit;
149 u16 status;
150 u32 input_buffer;
151 u32 output_buffer;
152
153 spin_lock(&pcc_lock);
154
155 dprintk("get: get_freq for CPU %d\n", cpu);
156 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
157
158 input_buffer = 0x1;
159 iowrite32(input_buffer,
160 (pcch_virt_addr + pcc_cpu_data->input_offset));
161 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
162
163 pcc_cmd();
164
165 output_buffer =
166 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
167
168 /* Clear the input buffer - we are done with the current command */
169 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
170
171 status = ioread16(&pcch_hdr->status);
172 if (status != CMD_COMPLETE) {
173 dprintk("get: FAILED: for CPU %d, status is %d\n",
174 cpu, status);
175 goto cmd_incomplete;
176 }
177 iowrite16(0, &pcch_hdr->status);
178 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
179 / 100) * 1000);
180
181 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
182 "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
183 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
184 output_buffer, curr_freq);
185
186 freq_limit = (output_buffer >> 8) & 0xff;
187 if (freq_limit != 0xff) {
188 dprintk("get: frequency for cpu %d is being temporarily"
189 " capped at %d\n", cpu, curr_freq);
190 }
191
192 spin_unlock(&pcc_lock);
193 return curr_freq;
194
195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock);
198 return 0;
199}
200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
202 unsigned int target_freq,
203 unsigned int relation)
204{
205 struct pcc_cpu *pcc_cpu_data;
206 struct cpufreq_freqs freqs;
207 u16 status;
208 u32 input_buffer;
209 int cpu;
210
211 spin_lock(&pcc_lock);
212 cpu = policy->cpu;
213 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
214
215 dprintk("target: CPU %d should go to target freq: %d "
216 "(virtual) input_offset is 0x%x\n",
217 cpu, target_freq,
218 (pcch_virt_addr + pcc_cpu_data->input_offset));
219
220 freqs.new = target_freq;
221 freqs.cpu = cpu;
222 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
223
224 input_buffer = 0x1 | (((target_freq * 100)
225 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
226 iowrite32(input_buffer,
227 (pcch_virt_addr + pcc_cpu_data->input_offset));
228 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
229
230 pcc_cmd();
231
232 /* Clear the input buffer - we are done with the current command */
233 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
234
235 status = ioread16(&pcch_hdr->status);
236 if (status != CMD_COMPLETE) {
237 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
238 cpu, status);
239 goto cmd_incomplete;
240 }
241 iowrite16(0, &pcch_hdr->status);
242
243 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
244 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
245 spin_unlock(&pcc_lock);
246
247 return 0;
248
249cmd_incomplete:
250 iowrite16(0, &pcch_hdr->status);
251 spin_unlock(&pcc_lock);
252 return -EINVAL;
253}
254
255static int pcc_get_offset(int cpu)
256{
257 acpi_status status;
258 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
259 union acpi_object *pccp, *offset;
260 struct pcc_cpu *pcc_cpu_data;
261 struct acpi_processor *pr;
262 int ret = 0;
263
264 pr = per_cpu(processors, cpu);
265 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
266
267 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
268 if (ACPI_FAILURE(status))
269 return -ENODEV;
270
271 pccp = buffer.pointer;
272 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
273 ret = -ENODEV;
274 goto out_free;
275 };
276
277 offset = &(pccp->package.elements[0]);
278 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
279 ret = -ENODEV;
280 goto out_free;
281 }
282
283 pcc_cpu_data->input_offset = offset->integer.value;
284
285 offset = &(pccp->package.elements[1]);
286 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
287 ret = -ENODEV;
288 goto out_free;
289 }
290
291 pcc_cpu_data->output_offset = offset->integer.value;
292
293 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
294 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
295
296 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
297 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
298 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
299out_free:
300 kfree(buffer.pointer);
301 return ret;
302}
303
304static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
305{
306 acpi_status status;
307 struct acpi_object_list input;
308 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
309 union acpi_object in_params[4];
310 union acpi_object *out_obj;
311 u32 capabilities[2];
312 u32 errors;
313 u32 supported;
314 int ret = 0;
315
316 input.count = 4;
317 input.pointer = in_params;
318 in_params[0].type = ACPI_TYPE_BUFFER;
319 in_params[0].buffer.length = 16;
320 in_params[0].buffer.pointer = OSC_UUID;
321 in_params[1].type = ACPI_TYPE_INTEGER;
322 in_params[1].integer.value = 1;
323 in_params[2].type = ACPI_TYPE_INTEGER;
324 in_params[2].integer.value = 2;
325 in_params[3].type = ACPI_TYPE_BUFFER;
326 in_params[3].buffer.length = 8;
327 in_params[3].buffer.pointer = (u8 *)&capabilities;
328
329 capabilities[0] = OSC_QUERY_ENABLE;
330 capabilities[1] = 0x1;
331
332 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
333 if (ACPI_FAILURE(status))
334 return -ENODEV;
335
336 if (!output.length)
337 return -ENODEV;
338
339 out_obj = output.pointer;
340 if (out_obj->type != ACPI_TYPE_BUFFER) {
341 ret = -ENODEV;
342 goto out_free;
343 }
344
345 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
346 if (errors) {
347 ret = -ENODEV;
348 goto out_free;
349 }
350
351 supported = *((u32 *)(out_obj->buffer.pointer + 4));
352 if (!(supported & 0x1)) {
353 ret = -ENODEV;
354 goto out_free;
355 }
356
357 kfree(output.pointer);
358 capabilities[0] = 0x0;
359 capabilities[1] = 0x1;
360
361 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
362 if (ACPI_FAILURE(status))
363 return -ENODEV;
364
365 if (!output.length)
366 return -ENODEV;
367
368 out_obj = output.pointer;
369 if (out_obj->type != ACPI_TYPE_BUFFER) {
370 ret = -ENODEV;
371 goto out_free;
372 }
373
374 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
375 if (errors) {
376 ret = -ENODEV;
377 goto out_free;
378 }
379
380 supported = *((u32 *)(out_obj->buffer.pointer + 4));
381 if (!(supported & 0x1)) {
382 ret = -ENODEV;
383 goto out_free;
384 }
385
386out_free:
387 kfree(output.pointer);
388 return ret;
389}
390
391static int __init pcc_cpufreq_probe(void)
392{
393 acpi_status status;
394 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
395 struct pcc_memory_resource *mem_resource;
396 struct pcc_register_resource *reg_resource;
397 union acpi_object *out_obj, *member;
398 acpi_handle handle, osc_handle, pcch_handle;
399 int ret = 0;
400
401 status = acpi_get_handle(NULL, "\\_SB", &handle);
402 if (ACPI_FAILURE(status))
403 return -ENODEV;
404
405 status = acpi_get_handle(handle, "PCCH", &pcch_handle);
406 if (ACPI_FAILURE(status))
407 return -ENODEV;
408
409 status = acpi_get_handle(handle, "_OSC", &osc_handle);
410 if (ACPI_SUCCESS(status)) {
411 ret = pcc_cpufreq_do_osc(&osc_handle);
412 if (ret)
413 dprintk("probe: _OSC evaluation did not succeed\n");
414 /* Firmware's use of _OSC is optional */
415 ret = 0;
416 }
417
418 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
419 if (ACPI_FAILURE(status))
420 return -ENODEV;
421
422 out_obj = output.pointer;
423 if (out_obj->type != ACPI_TYPE_PACKAGE) {
424 ret = -ENODEV;
425 goto out_free;
426 }
427
428 member = &out_obj->package.elements[0];
429 if (member->type != ACPI_TYPE_BUFFER) {
430 ret = -ENODEV;
431 goto out_free;
432 }
433
434 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
435
436 dprintk("probe: mem_resource descriptor: 0x%x,"
437 " length: %d, space_id: %d, resource_usage: %d,"
438 " type_specific: %d, granularity: 0x%llx,"
439 " minimum: 0x%llx, maximum: 0x%llx,"
440 " translation_offset: 0x%llx, address_length: 0x%llx\n",
441 mem_resource->descriptor, mem_resource->length,
442 mem_resource->space_id, mem_resource->resource_usage,
443 mem_resource->type_specific, mem_resource->granularity,
444 mem_resource->minimum, mem_resource->maximum,
445 mem_resource->translation_offset,
446 mem_resource->address_length);
447
448 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
449 ret = -ENODEV;
450 goto out_free;
451 }
452
453 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
454 mem_resource->address_length);
455 if (pcch_virt_addr == NULL) {
456 dprintk("probe: could not map shared mem region\n");
457 goto out_free;
458 }
459 pcch_hdr = pcch_virt_addr;
460
461 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
462 dprintk("probe: PCCH header is at physical address: 0x%llx,"
463 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
464 " supported features: 0x%x, command field: 0x%x,"
465 " status field: 0x%x, nominal latency: %d us\n",
466 mem_resource->minimum, ioread32(&pcch_hdr->signature),
467 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
468 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
469 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
470 ioread32(&pcch_hdr->latency));
471
472 dprintk("probe: min time between commands: %d us,"
473 " max time between commands: %d us,"
474 " nominal CPU frequency: %d MHz,"
475 " minimum CPU frequency: %d MHz,"
476 " minimum CPU frequency without throttling: %d MHz\n",
477 ioread32(&pcch_hdr->minimum_time),
478 ioread32(&pcch_hdr->maximum_time),
479 ioread32(&pcch_hdr->nominal),
480 ioread32(&pcch_hdr->throttled_frequency),
481 ioread32(&pcch_hdr->minimum_frequency));
482
483 member = &out_obj->package.elements[1];
484 if (member->type != ACPI_TYPE_BUFFER) {
485 ret = -ENODEV;
486 goto pcch_free;
487 }
488
489 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
490
491 doorbell.space_id = reg_resource->space_id;
492 doorbell.bit_width = reg_resource->bit_width;
493 doorbell.bit_offset = reg_resource->bit_offset;
494 doorbell.access_width = 64;
495 doorbell.address = reg_resource->address;
496
497 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
498 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
499 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
500 doorbell.access_width, reg_resource->address);
501
502 member = &out_obj->package.elements[2];
503 if (member->type != ACPI_TYPE_INTEGER) {
504 ret = -ENODEV;
505 goto pcch_free;
506 }
507
508 doorbell_preserve = member->integer.value;
509
510 member = &out_obj->package.elements[3];
511 if (member->type != ACPI_TYPE_INTEGER) {
512 ret = -ENODEV;
513 goto pcch_free;
514 }
515
516 doorbell_write = member->integer.value;
517
518 dprintk("probe: doorbell_preserve: 0x%llx,"
519 " doorbell_write: 0x%llx\n",
520 doorbell_preserve, doorbell_write);
521
522 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
523 if (!pcc_cpu_info) {
524 ret = -ENOMEM;
525 goto pcch_free;
526 }
527
528 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
529 " limits: %d MHz, %d MHz\n", PCC_VERSION,
530 ioread32(&pcch_hdr->minimum_frequency),
531 ioread32(&pcch_hdr->nominal));
532 kfree(output.pointer);
533 return ret;
534pcch_free:
535 pcc_clear_mapping();
536out_free:
537 kfree(output.pointer);
538 return ret;
539}
540
541static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
542{
543 unsigned int cpu = policy->cpu;
544 unsigned int result = 0;
545
546 if (!pcch_virt_addr) {
547 result = -1;
548 goto out;
549 }
550
551 result = pcc_get_offset(cpu);
552 if (result) {
553 dprintk("init: PCCP evaluation failed\n");
554 goto out;
555 }
556
557 policy->max = policy->cpuinfo.max_freq =
558 ioread32(&pcch_hdr->nominal) * 1000;
559 policy->min = policy->cpuinfo.min_freq =
560 ioread32(&pcch_hdr->minimum_frequency) * 1000;
561 policy->cur = pcc_get_freq(cpu);
562
563 if (!policy->cur) {
564 dprintk("init: Unable to get current CPU frequency\n");
565 result = -EINVAL;
566 goto out;
567 }
568
569 dprintk("init: policy->max is %d, policy->min is %d\n",
570 policy->max, policy->min);
571out:
572 return result;
573}
574
575static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
576{
577 return 0;
578}
579
580static struct cpufreq_driver pcc_cpufreq_driver = {
581 .flags = CPUFREQ_CONST_LOOPS,
582 .get = pcc_get_freq,
583 .verify = pcc_cpufreq_verify,
584 .target = pcc_cpufreq_target,
585 .init = pcc_cpufreq_cpu_init,
586 .exit = pcc_cpufreq_cpu_exit,
587 .name = "pcc-cpufreq",
588 .owner = THIS_MODULE,
589};
590
591static int __init pcc_cpufreq_init(void)
592{
593 int ret;
594
595 if (acpi_disabled)
596 return 0;
597
598 ret = pcc_cpufreq_probe();
599 if (ret) {
600 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
601 return ret;
602 }
603
604 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
605
606 return ret;
607}
608
609static void __exit pcc_cpufreq_exit(void)
610{
611 cpufreq_unregister_driver(&pcc_cpufreq_driver);
612
613 pcc_clear_mapping();
614
615 free_percpu(pcc_cpu_info);
616}
617
618MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
619MODULE_VERSION(PCC_VERSION);
620MODULE_DESCRIPTION("Processor Clocking Control interface driver");
621MODULE_LICENSE("GPL");
622
623late_initcall(pcc_cpufreq_init);
624module_exit(pcc_cpufreq_exit);
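
All PCC traffic goes through the shared mailbox: the per-CPU input word carries a command-valid bit plus a percentage of the nominal frequency in bits 15:8, the doorbell register kicks the firmware, and the answer comes back in the low byte of the output word. The GET path above therefore rebuilds kHz as nominal * percent / 100 * 1000, and the SET path encodes the requested percentage the same way. A minimal sketch of just that arithmetic; the nominal and target values are invented:

    #include <stdio.h>

    int main(void)
    {
            unsigned int nominal_mhz = 2933;    /* hypothetical PCCH nominal frequency */
            unsigned int target_khz = 1466000;

            /* SET: percentage of nominal in bits 15:8, bit 0 = command valid */
            unsigned int input = 0x1 | ((target_khz * 100 / (nominal_mhz * 1000)) << 8);
            printf("input word: 0x%08x (%u%% of nominal)\n", input, (input >> 8) & 0xff);

            /* GET: firmware reports the granted percentage in bits 7:0 of the output word */
            unsigned int output = (input >> 8) & 0xff; /* pretend firmware granted the request */
            unsigned int curr_khz = nominal_mhz * output / 100 * 1000;
            printf("current frequency: %u kHz\n", curr_khz);
            return 0;
    }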
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
deleted file mode 100644
index b3379d6a5c57..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ /dev/null
@@ -1,261 +0,0 @@
1/*
2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä,
4 * Dominik Brodowski.
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/ioport.h>
16#include <linux/timex.h>
17#include <linux/io.h>
18
19#include <asm/msr.h>
20
21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22 as it is unused */
23
24#define PFX "powernow-k6: "
25static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier;
27
28
29/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
30static struct cpufreq_frequency_table clock_ratio[] = {
31 {45, /* 000 -> 4.5x */ 0},
32 {50, /* 001 -> 5.0x */ 0},
33 {40, /* 010 -> 4.0x */ 0},
34 {55, /* 011 -> 5.5x */ 0},
35 {20, /* 100 -> 2.0x */ 0},
36 {30, /* 101 -> 3.0x */ 0},
37 {60, /* 110 -> 6.0x */ 0},
38 {35, /* 111 -> 3.5x */ 0},
39 {0, CPUFREQ_TABLE_END}
40};
41
42
43/**
44 * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
45 *
46 * Returns the current setting of the frequency multiplier. Core clock
47 * speed is frequency of the Front-Side Bus multiplied with this value.
48 */
49static int powernow_k6_get_cpu_multiplier(void)
50{
51 u64 invalue = 0;
52 u32 msrval;
53
54 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59
60 return clock_ratio[(invalue >> 5)&7].index;
61}
62
63
64/**
65 * powernow_k6_set_state - set the PowerNow! multiplier
66 * @best_i: clock_ratio[best_i] is the target multiplier
67 *
68 * Tries to change the PowerNow! multiplier
69 */
70static void powernow_k6_set_state(unsigned int best_i)
71{
72 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval;
74 struct cpufreq_freqs freqs;
75
76 if (clock_ratio[best_i].index > max_multiplier) {
77 printk(KERN_ERR PFX "invalid target frequency\n");
78 return;
79 }
80
81 freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
82 freqs.new = busfreq * clock_ratio[best_i].index;
83 freqs.cpu = 0; /* powernow-k6.c is UP only driver */
84
85 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
86
87 /* we now need to transform best_i to the BVC format, see AMD#23446 */
88
89 outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
90
91 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue;
96 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99
100 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
101
102 return;
103}
104
105
106/**
107 * powernow_k6_verify - verifies a new CPUfreq policy
108 * @policy: new policy
109 *
110 * Policy must be within lowest and highest possible CPU Frequency,
111 * and at least one possible state must be within min and max.
112 */
113static int powernow_k6_verify(struct cpufreq_policy *policy)
114{
115 return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
116}
117
118
119/**
120 * powernow_k6_target - sets a new CPUFreq target frequency
121 * @policy: new policy
122 * @target_freq: the target frequency
123 * @relation: how that frequency relates to achieved frequency
124 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
125 *
126 * sets a new target frequency for the CPU
127 */
128static int powernow_k6_target(struct cpufreq_policy *policy,
129 unsigned int target_freq,
130 unsigned int relation)
131{
132 unsigned int newstate = 0;
133
134 if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
135 target_freq, relation, &newstate))
136 return -EINVAL;
137
138 powernow_k6_set_state(newstate);
139
140 return 0;
141}
142
143
144static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
145{
146 unsigned int i, f;
147 int result;
148
149 if (policy->cpu != 0)
150 return -ENODEV;
151
152 /* get frequencies */
153 max_multiplier = powernow_k6_get_cpu_multiplier();
154 busfreq = cpu_khz / max_multiplier;
155
156 /* table init */
157 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
158 f = clock_ratio[i].index;
159 if (f > max_multiplier)
160 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
161 else
162 clock_ratio[i].frequency = busfreq * f;
163 }
164
165 /* cpuinfo and default policy values */
166 policy->cpuinfo.transition_latency = 200000;
167 policy->cur = busfreq * max_multiplier;
168
169 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
170 if (result)
171 return result;
172
173 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
174
175 return 0;
176}
177
178
179static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
180{
181 unsigned int i;
182 for (i = 0; i < 8; i++) {
183		if (clock_ratio[i].index == max_multiplier)
184 powernow_k6_set_state(i);
185 }
186 cpufreq_frequency_table_put_attr(policy->cpu);
187 return 0;
188}
189
190static unsigned int powernow_k6_get(unsigned int cpu)
191{
192 unsigned int ret;
193 ret = (busfreq * powernow_k6_get_cpu_multiplier());
194 return ret;
195}
196
197static struct freq_attr *powernow_k6_attr[] = {
198 &cpufreq_freq_attr_scaling_available_freqs,
199 NULL,
200};
201
202static struct cpufreq_driver powernow_k6_driver = {
203 .verify = powernow_k6_verify,
204 .target = powernow_k6_target,
205 .init = powernow_k6_cpu_init,
206 .exit = powernow_k6_cpu_exit,
207 .get = powernow_k6_get,
208 .name = "powernow-k6",
209 .owner = THIS_MODULE,
210 .attr = powernow_k6_attr,
211};
212
213
214/**
215 * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
216 *
217 * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
218 * devices, -EINVAL or -ENOMEM on problems during initialization, and zero
219 * on success.
220 */
221static int __init powernow_k6_init(void)
222{
223 struct cpuinfo_x86 *c = &cpu_data(0);
224
225 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
226 ((c->x86_model != 12) && (c->x86_model != 13)))
227 return -ENODEV;
228
229 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
230 printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
231 return -EIO;
232 }
233
234 if (cpufreq_register_driver(&powernow_k6_driver)) {
235 release_region(POWERNOW_IOPORT, 16);
236 return -EINVAL;
237 }
238
239 return 0;
240}
241
242
243/**
244 * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
245 *
246 * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
247 */
248static void __exit powernow_k6_exit(void)
249{
250 cpufreq_unregister_driver(&powernow_k6_driver);
251 release_region(POWERNOW_IOPORT, 16);
252}
253
254
255MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
256 "Dominik Brodowski <linux@brodo.de>");
257MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
258MODULE_LICENSE("GPL");
259
260module_init(powernow_k6_init);
261module_exit(powernow_k6_exit);
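
On the K6-2+/K6-3+ the multiplier sits behind an I/O window rather than an MSR: writing POWERNOW_IOPORT+1 to MSR_K6_EPMR opens the window, the BVC word is read at offset 0x8, and bits 7:5 index the eight entries of clock_ratio[] (stored as the ratio times ten). A tiny sketch of the decode and the resulting frequency; the BVC value and bus frequency are invented:

    #include <stdio.h>

    /* clock ratio * 10, indexed by bits 7:5 of the BVC word (AMD #23446, table 27) */
    static const int ratio_x10[8] = { 45, 50, 40, 55, 20, 30, 60, 35 };

    int main(void)
    {
            unsigned long bvc = 0x0123;     /* hypothetical value read from port +0x8 */
            unsigned int busfreq = 10000;   /* FSB in 10 kHz units, i.e. 100 MHz */

            int mult = ratio_x10[(bvc >> 5) & 7];
            printf("multiplier %d.%dx -> %u kHz\n",
                   mult / 10, mult % 10, busfreq * mult);
            return 0;
    }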
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
deleted file mode 100644
index 4a45fd6e41ba..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ /dev/null
@@ -1,752 +0,0 @@
1/*
2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 * Based upon datasheets & sample CPUs kindly provided by AMD.
8 *
9 * Errata 5:
10 * CPU may fail to execute a FID/VID change in presence of interrupt.
11 * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
12 * Errata 15:
13 * CPU with half frequency multipliers may hang upon wakeup from disconnect.
14 * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
15 */
16
17#include <linux/kernel.h>
18#include <linux/module.h>
19#include <linux/moduleparam.h>
20#include <linux/init.h>
21#include <linux/cpufreq.h>
22#include <linux/slab.h>
23#include <linux/string.h>
24#include <linux/dmi.h>
25#include <linux/timex.h>
26#include <linux/io.h>
27
28#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */
29#include <asm/msr.h>
30#include <asm/system.h>
31
32#ifdef CONFIG_X86_POWERNOW_K7_ACPI
33#include <linux/acpi.h>
34#include <acpi/processor.h>
35#endif
36
37#include "powernow-k7.h"
38
39#define PFX "powernow: "
40
41
42struct psb_s {
43 u8 signature[10];
44 u8 tableversion;
45 u8 flags;
46 u16 settlingtime;
47 u8 reserved1;
48 u8 numpst;
49};
50
51struct pst_s {
52 u32 cpuid;
53 u8 fsbspeed;
54 u8 maxfid;
55 u8 startvid;
56 u8 numpstates;
57};
58
59#ifdef CONFIG_X86_POWERNOW_K7_ACPI
60union powernow_acpi_control_t {
61 struct {
62 unsigned long fid:5,
63 vid:5,
64 sgtc:20,
65 res1:2;
66 } bits;
67 unsigned long val;
68};
69#endif
70
71#ifdef CONFIG_CPU_FREQ_DEBUG
72/* divide by 1000 to get VCore voltage in V. */
73static const int mobile_vid_table[32] = {
74 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
75 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
76 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
77 1075, 1050, 1025, 1000, 975, 950, 925, 0,
78};
79#endif
80
81/* divide by 10 to get FID. */
82static const int fid_codes[32] = {
83 110, 115, 120, 125, 50, 55, 60, 65,
84 70, 75, 80, 85, 90, 95, 100, 105,
85 30, 190, 40, 200, 130, 135, 140, 210,
86 150, 225, 160, 165, 170, 180, -1, -1,
87};
88
89/* This parameter is used to force the ACPI method instead of the legacy
90 * one for configuration purposes.
91 */
92
93static int acpi_force;
94
95static struct cpufreq_frequency_table *powernow_table;
96
97static unsigned int can_scale_bus;
98static unsigned int can_scale_vid;
99static unsigned int minimum_speed = -1;
100static unsigned int maximum_speed;
101static unsigned int number_scales;
102static unsigned int fsb;
103static unsigned int latency;
104static char have_a0;
105
106#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
107 "powernow-k7", msg)
108
109static int check_fsb(unsigned int fsbspeed)
110{
111 int delta;
112 unsigned int f = fsb / 1000;
113
114 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
115 return delta < 5;
116}
117
118static int check_powernow(void)
119{
120 struct cpuinfo_x86 *c = &cpu_data(0);
121 unsigned int maxei, eax, ebx, ecx, edx;
122
123 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
124#ifdef MODULE
125 printk(KERN_INFO PFX "This module only works with "
126 "AMD K7 CPUs\n");
127#endif
128 return 0;
129 }
130
131 /* Get maximum capabilities */
132 maxei = cpuid_eax(0x80000000);
133 if (maxei < 0x80000007) { /* Any powernow info ? */
134#ifdef MODULE
135 printk(KERN_INFO PFX "No powernow capabilities detected\n");
136#endif
137 return 0;
138 }
139
140 if ((c->x86_model == 6) && (c->x86_mask == 0)) {
141 printk(KERN_INFO PFX "K7 660[A0] core detected, "
142 "enabling errata workarounds\n");
143 have_a0 = 1;
144 }
145
146 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
147
148 /* Check we can actually do something before we say anything.*/
149 if (!(edx & (1 << 1 | 1 << 2)))
150 return 0;
151
152 printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
153
154 if (edx & 1 << 1) {
155 printk("frequency");
156 can_scale_bus = 1;
157 }
158
159 if ((edx & (1 << 1 | 1 << 2)) == 0x6)
160 printk(" and ");
161
162 if (edx & 1 << 2) {
163 printk("voltage");
164 can_scale_vid = 1;
165 }
166
167 printk(".\n");
168 return 1;
169}
170
171#ifdef CONFIG_X86_POWERNOW_K7_ACPI
172static void invalidate_entry(unsigned int entry)
173{
174 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
175}
176#endif
177
178static int get_ranges(unsigned char *pst)
179{
180 unsigned int j;
181 unsigned int speed;
182 u8 fid, vid;
183
184 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
185 (number_scales + 1)), GFP_KERNEL);
186 if (!powernow_table)
187 return -ENOMEM;
188
189 for (j = 0 ; j < number_scales; j++) {
190 fid = *pst++;
191
192 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
193 powernow_table[j].index = fid; /* lower 8 bits */
194
195 speed = powernow_table[j].frequency;
196
197 if ((fid_codes[fid] % 10) == 5) {
198#ifdef CONFIG_X86_POWERNOW_K7_ACPI
199 if (have_a0 == 1)
200 invalidate_entry(j);
201#endif
202 }
203
204 if (speed < minimum_speed)
205 minimum_speed = speed;
206 if (speed > maximum_speed)
207 maximum_speed = speed;
208
209 vid = *pst++;
210 powernow_table[j].index |= (vid << 8); /* upper 8 bits */
211
212 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
213 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
214 fid_codes[fid] % 10, speed/1000, vid,
215 mobile_vid_table[vid]/1000,
216 mobile_vid_table[vid]%1000);
217 }
218 powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
219 powernow_table[number_scales].index = 0;
220
221 return 0;
222}
223
224
225static void change_FID(int fid)
226{
227 union msr_fidvidctl fidvidctl;
228
229 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
230 if (fidvidctl.bits.FID != fid) {
231 fidvidctl.bits.SGTC = latency;
232 fidvidctl.bits.FID = fid;
233 fidvidctl.bits.VIDC = 0;
234 fidvidctl.bits.FIDC = 1;
235 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
236 }
237}
238
239
240static void change_VID(int vid)
241{
242 union msr_fidvidctl fidvidctl;
243
244 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
245 if (fidvidctl.bits.VID != vid) {
246 fidvidctl.bits.SGTC = latency;
247 fidvidctl.bits.VID = vid;
248 fidvidctl.bits.FIDC = 0;
249 fidvidctl.bits.VIDC = 1;
250 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
251 }
252}
253
254
255static void change_speed(unsigned int index)
256{
257 u8 fid, vid;
258 struct cpufreq_freqs freqs;
259 union msr_fidvidstatus fidvidstatus;
260 int cfid;
261
262 /* fid is the lower 8 bits of the index we stored into
263 * the cpufreq frequency table in powernow_decode_bios,
264 * vid is the upper 8 bits (see the sketch after this function).
265 */
266
267 fid = powernow_table[index].index & 0xFF;
268 vid = (powernow_table[index].index & 0xFF00) >> 8;
269
270 freqs.cpu = 0;
271
272 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
273 cfid = fidvidstatus.bits.CFID;
274 freqs.old = fsb * fid_codes[cfid] / 10;
275
276 freqs.new = powernow_table[index].frequency;
277
278 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
279
280 /* Now do the magic poking into the MSRs. */
281
282 if (have_a0 == 1) /* A0 errata 5 */
283 local_irq_disable();
284
285 if (freqs.old > freqs.new) {
286 /* Going down, so change FID first */
287 change_FID(fid);
288 change_VID(vid);
289 } else {
290 /* Going up, so change VID first */
291 change_VID(vid);
292 change_FID(fid);
293 }
294
295
296 if (have_a0 == 1)
297 local_irq_enable();
298
299 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
300}
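Illustration only, not part of the original driver: a small sketch of the fid/vid packing into the 16-bit cpufreq table index described in the comment above (lower 8 bits fid, upper 8 bits vid); the example codes are assumed.

/* Illustration only -- the fid/vid values here are arbitrary examples. */
#include <stdio.h>

int main(void)
{
	unsigned int fid = 0x0c, vid = 0x11;	/* assumed example codes */
	unsigned int index = fid | (vid << 8);	/* pack: low byte fid, high byte vid */

	/* unpack the same way change_speed() does */
	printf("index 0x%04x -> fid 0x%02x, vid 0x%02x\n",
	       index, index & 0xFF, (index & 0xFF00) >> 8);
	return 0;
}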
301
302
303#ifdef CONFIG_X86_POWERNOW_K7_ACPI
304
305static struct acpi_processor_performance *acpi_processor_perf;
306
307static int powernow_acpi_init(void)
308{
309 int i;
310 int retval = 0;
311 union powernow_acpi_control_t pc;
312
313 if (acpi_processor_perf != NULL && powernow_table != NULL) {
314 retval = -EINVAL;
315 goto err0;
316 }
317
318 acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
319 GFP_KERNEL);
320 if (!acpi_processor_perf) {
321 retval = -ENOMEM;
322 goto err0;
323 }
324
325 if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
326 GFP_KERNEL)) {
327 retval = -ENOMEM;
328 goto err05;
329 }
330
331 if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
332 retval = -EIO;
333 goto err1;
334 }
335
336 if (acpi_processor_perf->control_register.space_id !=
337 ACPI_ADR_SPACE_FIXED_HARDWARE) {
338 retval = -ENODEV;
339 goto err2;
340 }
341
342 if (acpi_processor_perf->status_register.space_id !=
343 ACPI_ADR_SPACE_FIXED_HARDWARE) {
344 retval = -ENODEV;
345 goto err2;
346 }
347
348 number_scales = acpi_processor_perf->state_count;
349
350 if (number_scales < 2) {
351 retval = -ENODEV;
352 goto err2;
353 }
354
355 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
356 (number_scales + 1)), GFP_KERNEL);
357 if (!powernow_table) {
358 retval = -ENOMEM;
359 goto err2;
360 }
361
362 pc.val = (unsigned long) acpi_processor_perf->states[0].control;
363 for (i = 0; i < number_scales; i++) {
364 u8 fid, vid;
365 struct acpi_processor_px *state =
366 &acpi_processor_perf->states[i];
367 unsigned int speed, speed_mhz;
368
369 pc.val = (unsigned long) state->control;
370 dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
371 i,
372 (u32) state->core_frequency,
373 (u32) state->power,
374 (u32) state->transition_latency,
375 (u32) state->control,
376 pc.bits.sgtc);
377
378 vid = pc.bits.vid;
379 fid = pc.bits.fid;
380
381 powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
382 powernow_table[i].index = fid; /* lower 8 bits */
383 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
384
385 speed = powernow_table[i].frequency;
386 speed_mhz = speed / 1000;
387
388 /* processor_perflib will multiply the MHz value by 1000 to
389 * get a KHz value (e.g. 1266000). However, powernow-k7 works
390 * with true KHz values (e.g. 1266768). To ensure that all
391 * powernow frequencies are available, we must ensure that
392 * ACPI doesn't restrict them, so we round up the MHz value
393 * to ensure that perflib's computed KHz value is greater than
394 * or equal to powernow's KHz value.
395 */
396 if (speed % 1000 > 0)
397 speed_mhz++;
398
399 if ((fid_codes[fid] % 10) == 5) {
400 if (have_a0 == 1)
401 invalidate_entry(i);
402 }
403
404 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
405 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
406 fid_codes[fid] % 10, speed_mhz, vid,
407 mobile_vid_table[vid]/1000,
408 mobile_vid_table[vid]%1000);
409
410 if (state->core_frequency != speed_mhz) {
411 state->core_frequency = speed_mhz;
412 dprintk(" Corrected ACPI frequency to %d\n",
413 speed_mhz);
414 }
415
416 if (latency < pc.bits.sgtc)
417 latency = pc.bits.sgtc;
418
419 if (speed < minimum_speed)
420 minimum_speed = speed;
421 if (speed > maximum_speed)
422 maximum_speed = speed;
423 }
424
425 powernow_table[i].frequency = CPUFREQ_TABLE_END;
426 powernow_table[i].index = 0;
427
428 /* notify BIOS that we exist */
429 acpi_processor_notify_smm(THIS_MODULE);
430
431 return 0;
432
433err2:
434 acpi_processor_unregister_performance(acpi_processor_perf, 0);
435err1:
436 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
437err05:
438 kfree(acpi_processor_perf);
439err0:
440 printk(KERN_WARNING PFX "ACPI perflib can not be used on "
441 "this platform\n");
442 acpi_processor_perf = NULL;
443 return retval;
444}
445#else
446static int powernow_acpi_init(void)
447{
448 printk(KERN_INFO PFX "no support for ACPI processor found."
449 " Please recompile your kernel with ACPI processor\n");
450 return -EINVAL;
451}
452#endif
453
454static void print_pst_entry(struct pst_s *pst, unsigned int j)
455{
456 dprintk("PST:%d (@%p)\n", j, pst);
457 dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
458 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
459}
460
461static int powernow_decode_bios(int maxfid, int startvid)
462{
463 struct psb_s *psb;
464 struct pst_s *pst;
465 unsigned int i, j;
466 unsigned char *p;
467 unsigned int etuple;
468 unsigned int ret;
469
470 etuple = cpuid_eax(0x80000001);
471
472 for (i = 0xC0000; i < 0xffff0 ; i += 16) {
473
474 p = phys_to_virt(i);
475
476 if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
477 dprintk("Found PSB header at %p\n", p);
478 psb = (struct psb_s *) p;
479 dprintk("Table version: 0x%x\n", psb->tableversion);
480 if (psb->tableversion != 0x12) {
481 printk(KERN_INFO PFX "Sorry, only v1.2 tables"
482 " supported right now\n");
483 return -ENODEV;
484 }
485
486 dprintk("Flags: 0x%x\n", psb->flags);
487 if ((psb->flags & 1) == 0)
488 dprintk("Mobile voltage regulator\n");
489 else
490 dprintk("Desktop voltage regulator\n");
491
492 latency = psb->settlingtime;
493 if (latency < 100) {
494 printk(KERN_INFO PFX "BIOS set settling time "
495 "to %d microseconds. "
496 "Should be at least 100. "
497 "Correcting.\n", latency);
498 latency = 100;
499 }
500 dprintk("Settling Time: %d microseconds.\n",
501 psb->settlingtime);
502 dprintk("Has %d PST tables. (Only dumping ones "
503 "relevant to this CPU).\n",
504 psb->numpst);
505
506 p += sizeof(struct psb_s);
507
508 pst = (struct pst_s *) p;
509
510 for (j = 0; j < psb->numpst; j++) {
511 pst = (struct pst_s *) p;
512 number_scales = pst->numpstates;
513
514 if ((etuple == pst->cpuid) &&
515 check_fsb(pst->fsbspeed) &&
516 (maxfid == pst->maxfid) &&
517 (startvid == pst->startvid)) {
518 print_pst_entry(pst, j);
519 p = (char *)pst + sizeof(struct pst_s);
520 ret = get_ranges(p);
521 return ret;
522 } else {
523 unsigned int k;
524 p = (char *)pst + sizeof(struct pst_s);
525 for (k = 0; k < number_scales; k++)
526 p += 2;
527 }
528 }
529 printk(KERN_INFO PFX "No PST tables match this cpuid "
530 "(0x%x)\n", etuple);
531 printk(KERN_INFO PFX "This is indicative of a broken "
532 "BIOS.\n");
533
534 return -EINVAL;
535 }
536 p++;
537 }
538
539 return -ENODEV;
540}
541
542
543static int powernow_target(struct cpufreq_policy *policy,
544 unsigned int target_freq,
545 unsigned int relation)
546{
547 unsigned int newstate;
548
549 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
550 relation, &newstate))
551 return -EINVAL;
552
553 change_speed(newstate);
554
555 return 0;
556}
557
558
559static int powernow_verify(struct cpufreq_policy *policy)
560{
561 return cpufreq_frequency_table_verify(policy, powernow_table);
562}
563
564/*
565 * We rely on the bus frequency being, in practice, a
566 * multiple of 100000/3 kHz, and compute sgtc according
567 * to that multiple.
568 * That way we match more closely how AMD intends this to work.
569 * We will then get the same kind of behaviour already tested under
570 * the "well-known" other OS.
571 */
572static int __cpuinit fixup_sgtc(void)
573{
574 unsigned int sgtc;
575 unsigned int m;
576
577 m = fsb / 3333;
578 if ((m % 10) >= 5)
579 m += 5;
580
581 m /= 10;
582
583 sgtc = 100 * m * latency;
584 sgtc = sgtc / 3;
585 if (sgtc > 0xfffff) {
586 printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
587 sgtc = 0xfffff;
588 }
589 return sgtc;
590}
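Illustration only (not part of the removed file): a standalone walk through the fixup_sgtc() rounding above, assuming a 100 MHz FSB and the minimum 100 us settling time.

/* Illustration only -- fsb and latency are assumed inputs. */
#include <stdio.h>

int main(void)
{
	unsigned int fsb = 100000;	/* kHz: three times 100000/3 */
	unsigned int latency = 100;	/* BIOS settling time in microseconds */
	unsigned int m = fsb / 3333;	/* ten times the 100000/3 kHz multiple, i.e. 30 here */

	if ((m % 10) >= 5)
		m += 5;
	m /= 10;			/* round to the nearest whole multiple, i.e. 3 here */

	printf("sgtc = %u\n", 100 * m * latency / 3);	/* 10000 for these inputs */
	return 0;
}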
591
592static unsigned int powernow_get(unsigned int cpu)
593{
594 union msr_fidvidstatus fidvidstatus;
595 unsigned int cfid;
596
597 if (cpu)
598 return 0;
599 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
600 cfid = fidvidstatus.bits.CFID;
601
602 return fsb * fid_codes[cfid] / 10;
603}
604
605
606static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
607{
608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n",
610 d->ident);
611 printk(KERN_WARNING PFX
612 "You need to downgrade to 3A21 (09/09/2002), or try a newer "
613 "BIOS than 3A71 (01/20/2003)\n");
614 printk(KERN_WARNING PFX
615 "cpufreq scaling has been disabled as a result of this.\n");
616 return 0;
617}
618
619/*
620 * Some Athlon laptops have badly broken PST tables.
621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq.
623 */
624static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
625 {
626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire",
628 .matches = {
629 DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
630 DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
631 },
632 },
633 { }
634};
635
636static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
637{
638 union msr_fidvidstatus fidvidstatus;
639 int result;
640
641 if (policy->cpu != 0)
642 return -ENODEV;
643
644 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
645
646 recalibrate_cpu_khz();
647
648 fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
649 if (!fsb) {
650 printk(KERN_WARNING PFX "can not determine bus frequency\n");
651 return -EINVAL;
652 }
653 dprintk("FSB: %3dMHz\n", fsb/1000);
654
655 if (dmi_check_system(powernow_dmi_table) || acpi_force) {
656 printk(KERN_INFO PFX "PSB/PST known to be broken. "
657 "Trying ACPI instead\n");
658 result = powernow_acpi_init();
659 } else {
660 result = powernow_decode_bios(fidvidstatus.bits.MFID,
661 fidvidstatus.bits.SVID);
662 if (result) {
663 printk(KERN_INFO PFX "Trying ACPI perflib\n");
664 maximum_speed = 0;
665 minimum_speed = -1;
666 latency = 0;
667 result = powernow_acpi_init();
668 if (result) {
669 printk(KERN_INFO PFX
670 "ACPI and legacy methods failed\n");
671 }
672 } else {
673 /* SGTC use the bus clock as timer */
674 latency = fixup_sgtc();
675 printk(KERN_INFO PFX "SGTC: %d\n", latency);
676 }
677 }
678
679 if (result)
680 return result;
681
682 printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
683 minimum_speed/1000, maximum_speed/1000);
684
685 policy->cpuinfo.transition_latency =
686 cpufreq_scale(2000000UL, fsb, latency);
687
688 policy->cur = powernow_get(0);
689
690 cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
691
692 return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
693}
694
695static int powernow_cpu_exit(struct cpufreq_policy *policy)
696{
697 cpufreq_frequency_table_put_attr(policy->cpu);
698
699#ifdef CONFIG_X86_POWERNOW_K7_ACPI
700 if (acpi_processor_perf) {
701 acpi_processor_unregister_performance(acpi_processor_perf, 0);
702 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
703 kfree(acpi_processor_perf);
704 }
705#endif
706
707 kfree(powernow_table);
708 return 0;
709}
710
711static struct freq_attr *powernow_table_attr[] = {
712 &cpufreq_freq_attr_scaling_available_freqs,
713 NULL,
714};
715
716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify,
718 .target = powernow_target,
719 .get = powernow_get,
720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .bios_limit = acpi_processor_get_bios_limit,
722#endif
723 .init = powernow_cpu_init,
724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
728};
729
730static int __init powernow_init(void)
731{
732 if (check_powernow() == 0)
733 return -ENODEV;
734 return cpufreq_register_driver(&powernow_driver);
735}
736
737
738static void __exit powernow_exit(void)
739{
740 cpufreq_unregister_driver(&powernow_driver);
741}
742
743module_param(acpi_force, int, 0444);
744MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
745
746MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
747MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
748MODULE_LICENSE("GPL");
749
750late_initcall(powernow_init);
751module_exit(powernow_exit);
752
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
deleted file mode 100644
index 35fb4eaf6e1c..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ /dev/null
@@ -1,43 +0,0 @@
1/*
2 * (C) 2003 Dave Jones.
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * AMD-specific information
7 *
8 */
9
10union msr_fidvidctl {
11 struct {
12 unsigned FID:5, // 4:0
13 reserved1:3, // 7:5
14 VID:5, // 12:8
15 reserved2:3, // 15:13
16 FIDC:1, // 16
17 VIDC:1, // 17
18 reserved3:2, // 19:18
19 FIDCHGRATIO:1, // 20
20 reserved4:11, // 31:21
21 SGTC:20, // 51:32
22 reserved5:12; // 63:52
23 } bits;
24 unsigned long long val;
25};
26
27union msr_fidvidstatus {
28 struct {
29 unsigned CFID:5, // 4:0
30 reserved1:3, // 7:5
31 SFID:5, // 12:8
32 reserved2:3, // 15:13
33 MFID:5, // 20:16
34 reserved3:11, // 31:21
35 CVID:5, // 36:32
36 reserved4:3, // 39:37
37 SVID:5, // 44:40
38 reserved5:3, // 47:45
39 MVID:5, // 52:48
40 reserved6:11; // 63:53
41 } bits;
42 unsigned long long val;
43};
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
deleted file mode 100644
index 2368e38327b3..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ /dev/null
@@ -1,1607 +0,0 @@
1/*
2 * (c) 2003-2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Support : mark.langsdorf@amd.com
8 *
9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@ucw.cz>
13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 *
16 * Valuable input gratefully received from Dave Jones, Pavel Machek,
17 * Dominik Brodowski, Jacob Shin, and others.
18 * Originally developed by Paul Devriendt.
19 * Processor information obtained from Chapter 9 (Power and Thermal Management)
20 * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
21 * Opteron Processors" available for download from www.amd.com
22 *
23 * Tables for specific CPUs can be inferred from
24 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
25 */
26
27#include <linux/kernel.h>
28#include <linux/smp.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/cpufreq.h>
32#include <linux/slab.h>
33#include <linux/string.h>
34#include <linux/cpumask.h>
35#include <linux/sched.h> /* for current / set_cpus_allowed() */
36#include <linux/io.h>
37#include <linux/delay.h>
38
39#include <asm/msr.h>
40
41#include <linux/acpi.h>
42#include <linux/mutex.h>
43#include <acpi/processor.h>
44
45#define PFX "powernow-k8: "
46#define VERSION "version 2.20.00"
47#include "powernow-k8.h"
48#include "mperf.h"
49
50/* serialize freq changes */
51static DEFINE_MUTEX(fidvid_mutex);
52
53static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
54
55static int cpu_family = CPU_OPTERON;
56
57/* core performance boost */
58static bool cpb_capable, cpb_enabled;
59static struct msr __percpu *msrs;
60
61static struct cpufreq_driver cpufreq_amd64_driver;
62
63#ifndef CONFIG_SMP
64static inline const struct cpumask *cpu_core_mask(int cpu)
65{
66 return cpumask_of(0);
67}
68#endif
69
70/* Return a frequency in MHz, given an input fid */
71static u32 find_freq_from_fid(u32 fid)
72{
73 return 800 + (fid * 100);
74}
75
76/* Return a frequency in KHz, given an input fid */
77static u32 find_khz_freq_from_fid(u32 fid)
78{
79 return 1000 * find_freq_from_fid(fid);
80}
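Illustration only (not part of powernow-k8.c): the K8 fid-to-frequency mapping used by the two helpers above, evaluated for a few assumed fid values.

/* Illustration only -- loops over a few example fid values. */
#include <stdio.h>

int main(void)
{
	unsigned int fid;

	for (fid = 0; fid <= 4; fid += 2)
		printf("fid 0x%x -> %u MHz (%u kHz)\n",
		       fid, 800 + fid * 100, 1000 * (800 + fid * 100));
	return 0;
}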
81
82static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
83 u32 pstate)
84{
85 return data[pstate].frequency;
86}
87
88/* Return the vco fid for an input fid
89 *
90 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
91 * only from corresponding high fids. This returns "high" fid corresponding to
92 * "low" one.
93 */
94static u32 convert_fid_to_vco_fid(u32 fid)
95{
96 if (fid < HI_FID_TABLE_BOTTOM)
97 return 8 + (2 * fid);
98 else
99 return fid;
100}
101
102/*
103 * Return 1 if the pending bit is set. Unless we just instructed the processor
104 * to transition to a new state, seeing this bit set is really bad news.
105 */
106static int pending_bit_stuck(void)
107{
108 u32 lo, hi;
109
110 if (cpu_family == CPU_HW_PSTATE)
111 return 0;
112
113 rdmsr(MSR_FIDVID_STATUS, lo, hi);
114 return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
115}
116
117/*
118 * Update the global current fid / vid values from the status msr.
119 * Returns 1 on error.
120 */
121static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
122{
123 u32 lo, hi;
124 u32 i = 0;
125
126 if (cpu_family == CPU_HW_PSTATE) {
127 rdmsr(MSR_PSTATE_STATUS, lo, hi);
128 i = lo & HW_PSTATE_MASK;
129 data->currpstate = i;
130
131 /*
132 * A workaround for family 11h erratum 311 might cause
133 * an "out-of-range" Pstate if the core is in Pstate-0.
134 */
135 if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
136 data->currpstate = HW_PSTATE_0;
137
138 return 0;
139 }
140 do {
141 if (i++ > 10000) {
142 dprintk("detected change pending stuck\n");
143 return 1;
144 }
145 rdmsr(MSR_FIDVID_STATUS, lo, hi);
146 } while (lo & MSR_S_LO_CHANGE_PENDING);
147
148 data->currvid = hi & MSR_S_HI_CURRENT_VID;
149 data->currfid = lo & MSR_S_LO_CURRENT_FID;
150
151 return 0;
152}
153
154/* the isochronous relief time */
155static void count_off_irt(struct powernow_k8_data *data)
156{
157 udelay((1 << data->irt) * 10);
158 return;
159}
160
161/* the voltage stabilization time */
162static void count_off_vst(struct powernow_k8_data *data)
163{
164 udelay(data->vstable * VST_UNITS_20US);
165 return;
166}
167
168/* need to init the control msr to a safe value (for each cpu) */
169static void fidvid_msr_init(void)
170{
171 u32 lo, hi;
172 u8 fid, vid;
173
174 rdmsr(MSR_FIDVID_STATUS, lo, hi);
175 vid = hi & MSR_S_HI_CURRENT_VID;
176 fid = lo & MSR_S_LO_CURRENT_FID;
177 lo = fid | (vid << MSR_C_LO_VID_SHIFT);
178 hi = MSR_C_HI_STP_GNT_BENIGN;
179 dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
180 wrmsr(MSR_FIDVID_CTL, lo, hi);
181}
182
183/* write the new fid value along with the other control fields to the msr */
184static int write_new_fid(struct powernow_k8_data *data, u32 fid)
185{
186 u32 lo;
187 u32 savevid = data->currvid;
188 u32 i = 0;
189
190 if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
191 printk(KERN_ERR PFX "internal error - overflow on fid write\n");
192 return 1;
193 }
194
195 lo = fid;
196 lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
197 lo |= MSR_C_LO_INIT_FID_VID;
198
199 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
200 fid, lo, data->plllock * PLL_LOCK_CONVERSION);
201
202 do {
203 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
204 if (i++ > 100) {
205 printk(KERN_ERR PFX
206 "Hardware error - pending bit very stuck - "
207 "no further pstate changes possible\n");
208 return 1;
209 }
210 } while (query_current_values_with_pending_wait(data));
211
212 count_off_irt(data);
213
214 if (savevid != data->currvid) {
215 printk(KERN_ERR PFX
216 "vid change on fid trans, old 0x%x, new 0x%x\n",
217 savevid, data->currvid);
218 return 1;
219 }
220
221 if (fid != data->currfid) {
222 printk(KERN_ERR PFX
223 "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
224 data->currfid);
225 return 1;
226 }
227
228 return 0;
229}
230
231/* Write a new vid to the hardware */
232static int write_new_vid(struct powernow_k8_data *data, u32 vid)
233{
234 u32 lo;
235 u32 savefid = data->currfid;
236 int i = 0;
237
238 if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
239 printk(KERN_ERR PFX "internal error - overflow on vid write\n");
240 return 1;
241 }
242
243 lo = data->currfid;
244 lo |= (vid << MSR_C_LO_VID_SHIFT);
245 lo |= MSR_C_LO_INIT_FID_VID;
246
247 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
248 vid, lo, STOP_GRANT_5NS);
249
250 do {
251 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
252 if (i++ > 100) {
253 printk(KERN_ERR PFX "internal error - pending bit "
254 "very stuck - no further pstate "
255 "changes possible\n");
256 return 1;
257 }
258 } while (query_current_values_with_pending_wait(data));
259
260 if (savefid != data->currfid) {
261 printk(KERN_ERR PFX "fid changed on vid trans, old "
262 "0x%x new 0x%x\n",
263 savefid, data->currfid);
264 return 1;
265 }
266
267 if (vid != data->currvid) {
268 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
269 "curr 0x%x\n",
270 vid, data->currvid);
271 return 1;
272 }
273
274 return 0;
275}
276
277/*
278 * Reduce the vid by the max of step or reqvid.
279 * Decreasing vid codes represent increasing voltages:
280 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
281 */
282static int decrease_vid_code_by_step(struct powernow_k8_data *data,
283 u32 reqvid, u32 step)
284{
285 if ((data->currvid - reqvid) > step)
286 reqvid = data->currvid - step;
287
288 if (write_new_vid(data, reqvid))
289 return 1;
290
291 count_off_vst(data);
292
293 return 0;
294}
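Illustration only: the vid-code-to-voltage relationship implied by the comment above (vid 0 = 1.550V, vid 0x1e = 0.800V), which works out to 25 mV per step; that step size is inferred here, not taken from the driver.

/* Illustration only -- the 25 mV/step figure is inferred from the two
 * endpoints quoted in the comment above, not read from the hardware. */
#include <stdio.h>

int main(void)
{
	unsigned int vid;

	for (vid = 0; vid <= 0x1e; vid += 0x0f)
		printf("vid 0x%02x -> %u mV\n", vid, 1550 - 25 * vid);
	return 0;
}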
295
296/* Change hardware pstate by single MSR write */
297static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
298{
299 wrmsr(MSR_PSTATE_CTRL, pstate, 0);
300 data->currpstate = pstate;
301 return 0;
302}
303
304/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
305static int transition_fid_vid(struct powernow_k8_data *data,
306 u32 reqfid, u32 reqvid)
307{
308 if (core_voltage_pre_transition(data, reqvid, reqfid))
309 return 1;
310
311 if (core_frequency_transition(data, reqfid))
312 return 1;
313
314 if (core_voltage_post_transition(data, reqvid))
315 return 1;
316
317 if (query_current_values_with_pending_wait(data))
318 return 1;
319
320 if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
321 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
322 "curr 0x%x 0x%x\n",
323 smp_processor_id(),
324 reqfid, reqvid, data->currfid, data->currvid);
325 return 1;
326 }
327
328 dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
329 smp_processor_id(), data->currfid, data->currvid);
330
331 return 0;
332}
333
334/* Phase 1 - core voltage transition ... setup voltage */
335static int core_voltage_pre_transition(struct powernow_k8_data *data,
336 u32 reqvid, u32 reqfid)
337{
338 u32 rvosteps = data->rvo;
339 u32 savefid = data->currfid;
340 u32 maxvid, lo, rvomult = 1;
341
342 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
343 "reqvid 0x%x, rvo 0x%x\n",
344 smp_processor_id(),
345 data->currfid, data->currvid, reqvid, data->rvo);
346
347 if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
348 rvomult = 2;
349 rvosteps *= rvomult;
350 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
351 maxvid = 0x1f & (maxvid >> 16);
352 dprintk("ph1 maxvid=0x%x\n", maxvid);
353 if (reqvid < maxvid) /* lower numbers are higher voltages */
354 reqvid = maxvid;
355
356 while (data->currvid > reqvid) {
357 dprintk("ph1: curr 0x%x, req vid 0x%x\n",
358 data->currvid, reqvid);
359 if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
360 return 1;
361 }
362
363 while ((rvosteps > 0) &&
364 ((rvomult * data->rvo + data->currvid) > reqvid)) {
365 if (data->currvid == maxvid) {
366 rvosteps = 0;
367 } else {
368 dprintk("ph1: changing vid for rvo, req 0x%x\n",
369 data->currvid - 1);
370 if (decrease_vid_code_by_step(data, data->currvid-1, 1))
371 return 1;
372 rvosteps--;
373 }
374 }
375
376 if (query_current_values_with_pending_wait(data))
377 return 1;
378
379 if (savefid != data->currfid) {
380 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
381 data->currfid);
382 return 1;
383 }
384
385 dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
386 data->currfid, data->currvid);
387
388 return 0;
389}
390
391/* Phase 2 - core frequency transition */
392static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
393{
394 u32 vcoreqfid, vcocurrfid, vcofiddiff;
395 u32 fid_interval, savevid = data->currvid;
396
397 if (data->currfid == reqfid) {
398 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
399 data->currfid);
400 return 0;
401 }
402
403 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
404 "reqfid 0x%x\n",
405 smp_processor_id(),
406 data->currfid, data->currvid, reqfid);
407
408 vcoreqfid = convert_fid_to_vco_fid(reqfid);
409 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
410 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
411 : vcoreqfid - vcocurrfid;
412
413 if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
414 vcofiddiff = 0;
415
416 while (vcofiddiff > 2) {
417 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
418
419 if (reqfid > data->currfid) {
420 if (data->currfid > LO_FID_TABLE_TOP) {
421 if (write_new_fid(data,
422 data->currfid + fid_interval))
423 return 1;
424 } else {
425 if (write_new_fid
426 (data,
427 2 + convert_fid_to_vco_fid(data->currfid)))
428 return 1;
429 }
430 } else {
431 if (write_new_fid(data, data->currfid - fid_interval))
432 return 1;
433 }
434
435 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
436 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
437 : vcoreqfid - vcocurrfid;
438 }
439
440 if (write_new_fid(data, reqfid))
441 return 1;
442
443 if (query_current_values_with_pending_wait(data))
444 return 1;
445
446 if (data->currfid != reqfid) {
447 printk(KERN_ERR PFX
448 "ph2: mismatch, failed fid transition, "
449 "curr 0x%x, req 0x%x\n",
450 data->currfid, reqfid);
451 return 1;
452 }
453
454 if (savevid != data->currvid) {
455 printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
456 savevid, data->currvid);
457 return 1;
458 }
459
460 dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
461 data->currfid, data->currvid);
462
463 return 0;
464}
465
466/* Phase 3 - core voltage transition flow ... jump to the final vid. */
467static int core_voltage_post_transition(struct powernow_k8_data *data,
468 u32 reqvid)
469{
470 u32 savefid = data->currfid;
471 u32 savereqvid = reqvid;
472
473 dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
474 smp_processor_id(),
475 data->currfid, data->currvid);
476
477 if (reqvid != data->currvid) {
478 if (write_new_vid(data, reqvid))
479 return 1;
480
481 if (savefid != data->currfid) {
482 printk(KERN_ERR PFX
483 "ph3: bad fid change, save 0x%x, curr 0x%x\n",
484 savefid, data->currfid);
485 return 1;
486 }
487
488 if (data->currvid != reqvid) {
489 printk(KERN_ERR PFX
490 "ph3: failed vid transition\n, "
491 "req 0x%x, curr 0x%x",
492 reqvid, data->currvid);
493 return 1;
494 }
495 }
496
497 if (query_current_values_with_pending_wait(data))
498 return 1;
499
500 if (savereqvid != data->currvid) {
501 dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
502 return 1;
503 }
504
505 if (savefid != data->currfid) {
506 dprintk("ph3 failed, currfid changed 0x%x\n",
507 data->currfid);
508 return 1;
509 }
510
511 dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
512 data->currfid, data->currvid);
513
514 return 0;
515}
516
517static void check_supported_cpu(void *_rc)
518{
519 u32 eax, ebx, ecx, edx;
520 int *rc = _rc;
521
522 *rc = -ENODEV;
523
524 if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD)
525 return;
526
527 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
528 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
529 ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
530 return;
531
532 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
533 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
534 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
535 printk(KERN_INFO PFX
536 "Processor cpuid %x not supported\n", eax);
537 return;
538 }
539
540 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
541 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
542 printk(KERN_INFO PFX
543 "No frequency change capabilities detected\n");
544 return;
545 }
546
547 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
548 if ((edx & P_STATE_TRANSITION_CAPABLE)
549 != P_STATE_TRANSITION_CAPABLE) {
550 printk(KERN_INFO PFX
551 "Power state transitions not supported\n");
552 return;
553 }
554 } else { /* must be a HW Pstate capable processor */
555 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
556 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
557 cpu_family = CPU_HW_PSTATE;
558 else
559 return;
560 }
561
562 *rc = 0;
563}
564
565static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
566 u8 maxvid)
567{
568 unsigned int j;
569 u8 lastfid = 0xff;
570
571 for (j = 0; j < data->numps; j++) {
572 if (pst[j].vid > LEAST_VID) {
573 printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
574 j, pst[j].vid);
575 return -EINVAL;
576 }
577 if (pst[j].vid < data->rvo) {
578 /* vid + rvo >= 0 */
579 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
580 " %d\n", j);
581 return -ENODEV;
582 }
583 if (pst[j].vid < maxvid + data->rvo) {
584 /* vid + rvo >= maxvid */
585 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
586 " %d\n", j);
587 return -ENODEV;
588 }
589 if (pst[j].fid > MAX_FID) {
590 printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
591 " %d\n", j);
592 return -ENODEV;
593 }
594 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
595 /* Only first fid is allowed to be in "low" range */
596 printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
597 "0x%x\n", j, pst[j].fid);
598 return -EINVAL;
599 }
600 if (pst[j].fid < lastfid)
601 lastfid = pst[j].fid;
602 }
603 if (lastfid & 1) {
604 printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
605 return -EINVAL;
606 }
607 if (lastfid > LO_FID_TABLE_TOP)
608 printk(KERN_INFO FW_BUG PFX
609 "first fid not from lo freq table\n");
610
611 return 0;
612}
613
614static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
615 unsigned int entry)
616{
617 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
618}
619
620static void print_basics(struct powernow_k8_data *data)
621{
622 int j;
623 for (j = 0; j < data->numps; j++) {
624 if (data->powernow_table[j].frequency !=
625 CPUFREQ_ENTRY_INVALID) {
626 if (cpu_family == CPU_HW_PSTATE) {
627 printk(KERN_INFO PFX
628 " %d : pstate %d (%d MHz)\n", j,
629 data->powernow_table[j].index,
630 data->powernow_table[j].frequency/1000);
631 } else {
632 printk(KERN_INFO PFX
633 "fid 0x%x (%d MHz), vid 0x%x\n",
634 data->powernow_table[j].index & 0xff,
635 data->powernow_table[j].frequency/1000,
636 data->powernow_table[j].index >> 8);
637 }
638 }
639 }
640 if (data->batps)
641 printk(KERN_INFO PFX "Only %d pstates on battery\n",
642 data->batps);
643}
644
645static u32 freq_from_fid_did(u32 fid, u32 did)
646{
647 u32 mhz = 0;
648
649 if (boot_cpu_data.x86 == 0x10)
650 mhz = (100 * (fid + 0x10)) >> did;
651 else if (boot_cpu_data.x86 == 0x11)
652 mhz = (100 * (fid + 8)) >> did;
653 else
654 BUG();
655
656 return mhz * 1000;
657}
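Illustration only (not part of the removed file): the family 10h core-frequency formula from freq_from_fid_did() evaluated for assumed CpuFid/CpuDid field values.

/* Illustration only -- fid and did are assumed example field values. */
#include <stdio.h>

int main(void)
{
	unsigned int fid = 0x10, did = 1;

	/* family 10h: MHz = (100 * (fid + 0x10)) >> did  ->  1600 MHz here */
	printf("fid 0x%x, did %u -> %u kHz\n",
	       fid, did, ((100 * (fid + 0x10)) >> did) * 1000);
	return 0;
}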
658
659static int fill_powernow_table(struct powernow_k8_data *data,
660 struct pst_s *pst, u8 maxvid)
661{
662 struct cpufreq_frequency_table *powernow_table;
663 unsigned int j;
664
665 if (data->batps) {
666 /* use ACPI support to get full speed on mains power */
667 printk(KERN_WARNING PFX
668 "Only %d pstates usable (use ACPI driver for full "
669 "range\n", data->batps);
670 data->numps = data->batps;
671 }
672
673 for (j = 1; j < data->numps; j++) {
674 if (pst[j-1].fid >= pst[j].fid) {
675 printk(KERN_ERR PFX "PST out of sequence\n");
676 return -EINVAL;
677 }
678 }
679
680 if (data->numps < 2) {
681 printk(KERN_ERR PFX "no p states to transition\n");
682 return -ENODEV;
683 }
684
685 if (check_pst_table(data, pst, maxvid))
686 return -EINVAL;
687
688 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
689 * (data->numps + 1)), GFP_KERNEL);
690 if (!powernow_table) {
691 printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
692 return -ENOMEM;
693 }
694
695 for (j = 0; j < data->numps; j++) {
696 int freq;
697 powernow_table[j].index = pst[j].fid; /* lower 8 bits */
698 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
699 freq = find_khz_freq_from_fid(pst[j].fid);
700 powernow_table[j].frequency = freq;
701 }
702 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
703 powernow_table[data->numps].index = 0;
704
705 if (query_current_values_with_pending_wait(data)) {
706 kfree(powernow_table);
707 return -EIO;
708 }
709
710 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
711 data->powernow_table = powernow_table;
712 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
713 print_basics(data);
714
715 for (j = 0; j < data->numps; j++)
716 if ((pst[j].fid == data->currfid) &&
717 (pst[j].vid == data->currvid))
718 return 0;
719
720 dprintk("currfid/vid do not match PST, ignoring\n");
721 return 0;
722}
723
724/* Find and validate the PSB/PST table in BIOS. */
725static int find_psb_table(struct powernow_k8_data *data)
726{
727 struct psb_s *psb;
728 unsigned int i;
729 u32 mvs;
730 u8 maxvid;
731 u32 cpst = 0;
732 u32 thiscpuid;
733
734 for (i = 0xc0000; i < 0xffff0; i += 0x10) {
735 /* Scan BIOS looking for the signature. */
736 /* It can not be at ffff0 - it is too big. */
737
738 psb = phys_to_virt(i);
739 if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
740 continue;
741
742 dprintk("found PSB header at 0x%p\n", psb);
743
744 dprintk("table vers: 0x%x\n", psb->tableversion);
745 if (psb->tableversion != PSB_VERSION_1_4) {
746 printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
747 return -ENODEV;
748 }
749
750 dprintk("flags: 0x%x\n", psb->flags1);
751 if (psb->flags1) {
752 printk(KERN_ERR FW_BUG PFX "unknown flags\n");
753 return -ENODEV;
754 }
755
756 data->vstable = psb->vstable;
757 dprintk("voltage stabilization time: %d(*20us)\n",
758 data->vstable);
759
760 dprintk("flags2: 0x%x\n", psb->flags2);
761 data->rvo = psb->flags2 & 3;
762 data->irt = ((psb->flags2) >> 2) & 3;
763 mvs = ((psb->flags2) >> 4) & 3;
764 data->vidmvs = 1 << mvs;
765 data->batps = ((psb->flags2) >> 6) & 3;
766
767 dprintk("ramp voltage offset: %d\n", data->rvo);
768 dprintk("isochronous relief time: %d\n", data->irt);
769 dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
770
771 dprintk("numpst: 0x%x\n", psb->num_tables);
772 cpst = psb->num_tables;
773 if ((psb->cpuid == 0x00000fc0) ||
774 (psb->cpuid == 0x00000fe0)) {
775 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
776 if ((thiscpuid == 0x00000fc0) ||
777 (thiscpuid == 0x00000fe0))
778 cpst = 1;
779 }
780 if (cpst != 1) {
781 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
782 return -ENODEV;
783 }
784
785 data->plllock = psb->plllocktime;
786 dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
787 dprintk("maxfid: 0x%x\n", psb->maxfid);
788 dprintk("maxvid: 0x%x\n", psb->maxvid);
789 maxvid = psb->maxvid;
790
791 data->numps = psb->numps;
792 dprintk("numpstates: 0x%x\n", data->numps);
793 return fill_powernow_table(data,
794 (struct pst_s *)(psb+1), maxvid);
795 }
796 /*
797 * If you see this message, complain to your BIOS manufacturer. If
798 * they tell you "we do not support Linux" or some similar
799 * nonsense, remember that Windows 2000 uses the same legacy
800 * mechanism that the old Linux PSB driver uses. Tell them it
801 * is broken with Windows 2000.
802 *
803 * The reference to the AMD documentation is chapter 9 in the
804 * BIOS and Kernel Developer's Guide, which is available on
805 * www.amd.com
806 */
807 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
808 printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
809 " and Cool'N'Quiet support is enabled in BIOS setup\n");
810 return -ENODEV;
811}
812
813static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
814 unsigned int index)
815{
816 u64 control;
817
818 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
819 return;
820
821 control = data->acpi_data.states[index].control;
822 data->irt = (control >> IRT_SHIFT) & IRT_MASK;
823 data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
824 data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
825 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
826 data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
827 data->vstable = (control >> VST_SHIFT) & VST_MASK;
828}
829
830static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
831{
832 struct cpufreq_frequency_table *powernow_table;
833 int ret_val = -ENODEV;
834 u64 control, status;
835
836 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
837 dprintk("register performance failed: bad ACPI data\n");
838 return -EIO;
839 }
840
841 /* verify the data contained in the ACPI structures */
842 if (data->acpi_data.state_count <= 1) {
843 dprintk("No ACPI P-States\n");
844 goto err_out;
845 }
846
847 control = data->acpi_data.control_register.space_id;
848 status = data->acpi_data.status_register.space_id;
849
850 if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
851 (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
852 dprintk("Invalid control/status registers (%x - %x)\n",
853 control, status);
854 goto err_out;
855 }
856
857 /* fill in data->powernow_table */
858 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
859 * (data->acpi_data.state_count + 1)), GFP_KERNEL);
860 if (!powernow_table) {
861 dprintk("powernow_table memory alloc failure\n");
862 goto err_out;
863 }
864
865 /* fill in data */
866 data->numps = data->acpi_data.state_count;
867 powernow_k8_acpi_pst_values(data, 0);
868
869 if (cpu_family == CPU_HW_PSTATE)
870 ret_val = fill_powernow_table_pstate(data, powernow_table);
871 else
872 ret_val = fill_powernow_table_fidvid(data, powernow_table);
873 if (ret_val)
874 goto err_out_mem;
875
876 powernow_table[data->acpi_data.state_count].frequency =
877 CPUFREQ_TABLE_END;
878 powernow_table[data->acpi_data.state_count].index = 0;
879 data->powernow_table = powernow_table;
880
881 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
882 print_basics(data);
883
884 /* notify BIOS that we exist */
885 acpi_processor_notify_smm(THIS_MODULE);
886
887 if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
888 printk(KERN_ERR PFX
889 "unable to alloc powernow_k8_data cpumask\n");
890 ret_val = -ENOMEM;
891 goto err_out_mem;
892 }
893
894 return 0;
895
896err_out_mem:
897 kfree(powernow_table);
898
899err_out:
900 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
901
902 /* data->acpi_data.state_count informs us at ->exit()
903 * whether ACPI was used */
904 data->acpi_data.state_count = 0;
905
906 return ret_val;
907}
908
909static int fill_powernow_table_pstate(struct powernow_k8_data *data,
910 struct cpufreq_frequency_table *powernow_table)
911{
912 int i;
913 u32 hi = 0, lo = 0;
914 rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
915 data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
916
917 for (i = 0; i < data->acpi_data.state_count; i++) {
918 u32 index;
919
920 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
921 if (index > data->max_hw_pstate) {
922 printk(KERN_ERR PFX "invalid pstate %d - "
923 "bad value %d.\n", i, index);
924 printk(KERN_ERR PFX "Please report to BIOS "
925 "manufacturer\n");
926 invalidate_entry(powernow_table, i);
927 continue;
928 }
929 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
930 if (!(hi & HW_PSTATE_VALID_MASK)) {
931 dprintk("invalid pstate %d, ignoring\n", index);
932 invalidate_entry(powernow_table, i);
933 continue;
934 }
935
936 powernow_table[i].index = index;
937
938 /* Frequency may be rounded for these */
939 if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
940 || boot_cpu_data.x86 == 0x11) {
941 powernow_table[i].frequency =
942 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
943 } else
944 powernow_table[i].frequency =
945 data->acpi_data.states[i].core_frequency * 1000;
946 }
947 return 0;
948}
949
950static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
951 struct cpufreq_frequency_table *powernow_table)
952{
953 int i;
954
955 for (i = 0; i < data->acpi_data.state_count; i++) {
956 u32 fid;
957 u32 vid;
958 u32 freq, index;
959 u64 status, control;
960
961 if (data->exttype) {
962 status = data->acpi_data.states[i].status;
963 fid = status & EXT_FID_MASK;
964 vid = (status >> VID_SHIFT) & EXT_VID_MASK;
965 } else {
966 control = data->acpi_data.states[i].control;
967 fid = control & FID_MASK;
968 vid = (control >> VID_SHIFT) & VID_MASK;
969 }
970
971 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
972
973 index = fid | (vid<<8);
974 powernow_table[i].index = index;
975
976 freq = find_khz_freq_from_fid(fid);
977 powernow_table[i].frequency = freq;
978
979 /* verify frequency is OK */
980 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
981 dprintk("invalid freq %u kHz, ignoring\n", freq);
982 invalidate_entry(powernow_table, i);
983 continue;
984 }
985
986 /* verify voltage is OK -
987 * BIOSs are using "off" to indicate invalid */
988 if (vid == VID_OFF) {
989 dprintk("invalid vid %u, ignoring\n", vid);
990 invalidate_entry(powernow_table, i);
991 continue;
992 }
993
994 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
995 printk(KERN_INFO PFX "invalid freq entries "
996 "%u kHz vs. %u kHz\n", freq,
997 (unsigned int)
998 (data->acpi_data.states[i].core_frequency
999 * 1000));
1000 invalidate_entry(powernow_table, i);
1001 continue;
1002 }
1003 }
1004 return 0;
1005}
1006
1007static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
1008{
1009 if (data->acpi_data.state_count)
1010 acpi_processor_unregister_performance(&data->acpi_data,
1011 data->cpu);
1012 free_cpumask_var(data->acpi_data.shared_cpu_map);
1013}
1014
1015static int get_transition_latency(struct powernow_k8_data *data)
1016{
1017 int max_latency = 0;
1018 int i;
1019 for (i = 0; i < data->acpi_data.state_count; i++) {
1020 int cur_latency = data->acpi_data.states[i].transition_latency
1021 + data->acpi_data.states[i].bus_master_latency;
1022 if (cur_latency > max_latency)
1023 max_latency = cur_latency;
1024 }
1025 if (max_latency == 0) {
1026 /*
1027 * Fam 11h and later may return 0 as transition latency. This
1028 * is intended and means "very fast". While the cpufreq core and
1029 * governors can currently handle that gracefully, it is better to
1030 * set it to 1 to avoid problems in the future.
1031 */
1032 if (boot_cpu_data.x86 < 0x11)
1033 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1034 "latency\n");
1035 max_latency = 1;
1036 }
1037 /* value in usecs, needs to be in nanoseconds */
1038 return 1000 * max_latency;
1039}
1040
1041/* Take a frequency, and issue the fid/vid transition command */
1042static int transition_frequency_fidvid(struct powernow_k8_data *data,
1043 unsigned int index)
1044{
1045 u32 fid = 0;
1046 u32 vid = 0;
1047 int res, i;
1048 struct cpufreq_freqs freqs;
1049
1050 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1051
1052 /* fid/vid correctness check for k8 */
1053 /* fid is the lower 8 bits of the index we stored into
1054 * the cpufreq frequency table in find_psb_table, vid
1055 * is the upper 8 bits.
1056 */
1057 fid = data->powernow_table[index].index & 0xFF;
1058 vid = (data->powernow_table[index].index & 0xFF00) >> 8;
1059
1060 dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
1061
1062 if (query_current_values_with_pending_wait(data))
1063 return 1;
1064
1065 if ((data->currvid == vid) && (data->currfid == fid)) {
1066 dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
1067 fid, vid);
1068 return 0;
1069 }
1070
1071 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
1072 smp_processor_id(), fid, vid);
1073 freqs.old = find_khz_freq_from_fid(data->currfid);
1074 freqs.new = find_khz_freq_from_fid(fid);
1075
1076 for_each_cpu(i, data->available_cores) {
1077 freqs.cpu = i;
1078 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1079 }
1080
1081 res = transition_fid_vid(data, fid, vid);
1082 freqs.new = find_khz_freq_from_fid(data->currfid);
1083
1084 for_each_cpu(i, data->available_cores) {
1085 freqs.cpu = i;
1086 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1087 }
1088 return res;
1089}
1090
1091/* Take a frequency, and issue the hardware pstate transition command */
1092static int transition_frequency_pstate(struct powernow_k8_data *data,
1093 unsigned int index)
1094{
1095 u32 pstate = 0;
1096 int res, i;
1097 struct cpufreq_freqs freqs;
1098
1099 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1100
1101 /* get MSR index for hardware pstate transition */
1102 pstate = index & HW_PSTATE_MASK;
1103 if (pstate > data->max_hw_pstate)
1104 return 0;
1105 freqs.old = find_khz_freq_from_pstate(data->powernow_table,
1106 data->currpstate);
1107 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1108
1109 for_each_cpu(i, data->available_cores) {
1110 freqs.cpu = i;
1111 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1112 }
1113
1114 res = transition_pstate(data, pstate);
1115 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1116
1117 for_each_cpu(i, data->available_cores) {
1118 freqs.cpu = i;
1119 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1120 }
1121 return res;
1122}
1123
1124/* Driver entry point to switch to the target frequency */
1125static int powernowk8_target(struct cpufreq_policy *pol,
1126 unsigned targfreq, unsigned relation)
1127{
1128 cpumask_var_t oldmask;
1129 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1130 u32 checkfid;
1131 u32 checkvid;
1132 unsigned int newstate;
1133 int ret = -EIO;
1134
1135 if (!data)
1136 return -EINVAL;
1137
1138 checkfid = data->currfid;
1139 checkvid = data->currvid;
1140
1141 /* only run on specific CPU from here on. */
1142 /* This is poor form: use a workqueue or smp_call_function_single */
1143 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1144 return -ENOMEM;
1145
1146 cpumask_copy(oldmask, tsk_cpus_allowed(current));
1147 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1148
1149 if (smp_processor_id() != pol->cpu) {
1150 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1151 goto err_out;
1152 }
1153
1154 if (pending_bit_stuck()) {
1155 printk(KERN_ERR PFX "failing targ, change pending bit set\n");
1156 goto err_out;
1157 }
1158
1159 dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
1160 pol->cpu, targfreq, pol->min, pol->max, relation);
1161
1162 if (query_current_values_with_pending_wait(data))
1163 goto err_out;
1164
1165 if (cpu_family != CPU_HW_PSTATE) {
1166 dprintk("targ: curr fid 0x%x, vid 0x%x\n",
1167 data->currfid, data->currvid);
1168
1169 if ((checkvid != data->currvid) ||
1170 (checkfid != data->currfid)) {
1171 printk(KERN_INFO PFX
1172 "error - out of sync, fix 0x%x 0x%x, "
1173 "vid 0x%x 0x%x\n",
1174 checkfid, data->currfid,
1175 checkvid, data->currvid);
1176 }
1177 }
1178
1179 if (cpufreq_frequency_table_target(pol, data->powernow_table,
1180 targfreq, relation, &newstate))
1181 goto err_out;
1182
1183 mutex_lock(&fidvid_mutex);
1184
1185 powernow_k8_acpi_pst_values(data, newstate);
1186
1187 if (cpu_family == CPU_HW_PSTATE)
1188 ret = transition_frequency_pstate(data, newstate);
1189 else
1190 ret = transition_frequency_fidvid(data, newstate);
1191 if (ret) {
1192 printk(KERN_ERR PFX "transition frequency failed\n");
1193 ret = 1;
1194 mutex_unlock(&fidvid_mutex);
1195 goto err_out;
1196 }
1197 mutex_unlock(&fidvid_mutex);
1198
1199 if (cpu_family == CPU_HW_PSTATE)
1200 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1201 newstate);
1202 else
1203 pol->cur = find_khz_freq_from_fid(data->currfid);
1204 ret = 0;
1205
1206err_out:
1207 set_cpus_allowed_ptr(current, oldmask);
1208 free_cpumask_var(oldmask);
1209 return ret;
1210}
1211
1212/* Driver entry point to verify the policy and range of frequencies */
1213static int powernowk8_verify(struct cpufreq_policy *pol)
1214{
1215 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1216
1217 if (!data)
1218 return -EINVAL;
1219
1220 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1221}
1222
1223struct init_on_cpu {
1224 struct powernow_k8_data *data;
1225 int rc;
1226};
1227
1228static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
1229{
1230 struct init_on_cpu *init_on_cpu = _init_on_cpu;
1231
1232 if (pending_bit_stuck()) {
1233 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1234 init_on_cpu->rc = -ENODEV;
1235 return;
1236 }
1237
1238 if (query_current_values_with_pending_wait(init_on_cpu->data)) {
1239 init_on_cpu->rc = -ENODEV;
1240 return;
1241 }
1242
1243 if (cpu_family == CPU_OPTERON)
1244 fidvid_msr_init();
1245
1246 init_on_cpu->rc = 0;
1247}
1248
1249/* per CPU init entry point to the driver */
1250static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1251{
1252 static const char ACPI_PSS_BIOS_BUG_MSG[] =
1253 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1254 FW_BUG PFX "Try again with latest BIOS.\n";
1255 struct powernow_k8_data *data;
1256 struct init_on_cpu init_on_cpu;
1257 int rc;
1258 struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
1259
1260 if (!cpu_online(pol->cpu))
1261 return -ENODEV;
1262
1263 smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
1264 if (rc)
1265 return -ENODEV;
1266
1267 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
1268 if (!data) {
1269 printk(KERN_ERR PFX "unable to alloc powernow_k8_data");
1270 return -ENOMEM;
1271 }
1272
1273 data->cpu = pol->cpu;
1274 data->currpstate = HW_PSTATE_INVALID;
1275
1276 if (powernow_k8_cpu_init_acpi(data)) {
1277 /*
1278 * Use the PSB BIOS structure. This is only available on
1279 * UP systems, and is deprecated by AMD.
1280 */
1281 if (num_online_cpus() != 1) {
1282 printk_once(ACPI_PSS_BIOS_BUG_MSG);
1283 goto err_out;
1284 }
1285 if (pol->cpu != 0) {
1286 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1287 "CPU other than CPU0. Complain to your BIOS "
1288 "vendor.\n");
1289 goto err_out;
1290 }
1291 rc = find_psb_table(data);
1292 if (rc)
1293 goto err_out;
1294
1295 /* Take a crude guess here.
1296 * That guess was in microseconds, so multiply with 1000 */
1297 pol->cpuinfo.transition_latency = (
1298 ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
1299 ((1 << data->irt) * 30)) * 1000;
1300 } else /* ACPI _PSS objects available */
1301 pol->cpuinfo.transition_latency = get_transition_latency(data);
1302
1303 /* only run on specific CPU from here on */
1304 init_on_cpu.data = data;
1305 smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
1306 &init_on_cpu, 1);
1307 rc = init_on_cpu.rc;
1308 if (rc != 0)
1309 goto err_out_exit_acpi;
1310
1311 if (cpu_family == CPU_HW_PSTATE)
1312 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
1313 else
1314 cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
1315 data->available_cores = pol->cpus;
1316
1317 if (cpu_family == CPU_HW_PSTATE)
1318 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1319 data->currpstate);
1320 else
1321 pol->cur = find_khz_freq_from_fid(data->currfid);
1322 dprintk("policy current frequency %d kHz\n", pol->cur);
1323
1324 /* min/max the cpu is capable of */
1325 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1326 printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
1327 powernow_k8_cpu_exit_acpi(data);
1328 kfree(data->powernow_table);
1329 kfree(data);
1330 return -EINVAL;
1331 }
1332
1333 /* Check for APERF/MPERF support in hardware */
1334 if (cpu_has(c, X86_FEATURE_APERFMPERF))
1335 cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
1336
1337 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1338
1339 if (cpu_family == CPU_HW_PSTATE)
1340 dprintk("cpu_init done, current pstate 0x%x\n",
1341 data->currpstate);
1342 else
1343 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1344 data->currfid, data->currvid);
1345
1346 per_cpu(powernow_data, pol->cpu) = data;
1347
1348 return 0;
1349
1350err_out_exit_acpi:
1351 powernow_k8_cpu_exit_acpi(data);
1352
1353err_out:
1354 kfree(data);
1355 return -ENODEV;
1356}
1357
1358static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1359{
1360 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1361
1362 if (!data)
1363 return -EINVAL;
1364
1365 powernow_k8_cpu_exit_acpi(data);
1366
1367 cpufreq_frequency_table_put_attr(pol->cpu);
1368
1369 kfree(data->powernow_table);
1370 kfree(data);
1371 per_cpu(powernow_data, pol->cpu) = NULL;
1372
1373 return 0;
1374}
1375
1376static void query_values_on_cpu(void *_err)
1377{
1378 int *err = _err;
1379 struct powernow_k8_data *data = __this_cpu_read(powernow_data);
1380
1381 *err = query_current_values_with_pending_wait(data);
1382}
1383
1384static unsigned int powernowk8_get(unsigned int cpu)
1385{
1386 struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
1387 unsigned int khz = 0;
1388 int err;
1389
1390 if (!data)
1391 return 0;
1392
1393 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1394 if (err)
1395 goto out;
1396
1397 if (cpu_family == CPU_HW_PSTATE)
1398 khz = find_khz_freq_from_pstate(data->powernow_table,
1399 data->currpstate);
1400 else
1401 khz = find_khz_freq_from_fid(data->currfid);
1402
1403
1404out:
1405 return khz;
1406}
1407
1408static void _cpb_toggle_msrs(bool t)
1409{
1410 int cpu;
1411
1412 get_online_cpus();
1413
1414 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1415
1416 for_each_cpu(cpu, cpu_online_mask) {
1417 struct msr *reg = per_cpu_ptr(msrs, cpu);
1418 if (t)
1419 reg->l &= ~BIT(25);
1420 else
1421 reg->l |= BIT(25);
1422 }
1423 wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1424
1425 put_online_cpus();
1426}
1427
1428/*
1429 * Switch core performance boosting on or off by toggling the
1430 * boost-disable bit (bit 25) of MSR_K7_HWCR on all online CPUs:
1431 * t == false: disable boosting,
1432 * t == true: enable boosting.
1433 */
1434static void cpb_toggle(bool t)
1435{
1436 if (!cpb_capable)
1437 return;
1438
1439 if (t && !cpb_enabled) {
1440 cpb_enabled = true;
1441 _cpb_toggle_msrs(t);
1442 printk(KERN_INFO PFX "Core Boosting enabled.\n");
1443 } else if (!t && cpb_enabled) {
1444 cpb_enabled = false;
1445 _cpb_toggle_msrs(t);
1446 printk(KERN_INFO PFX "Core Boosting disabled.\n");
1447 }
1448}
1449
1450static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
1451 size_t count)
1452{
1453 int ret = -EINVAL;
1454 unsigned long val = 0;
1455
1456 ret = strict_strtoul(buf, 10, &val);
1457 if (!ret && (val == 0 || val == 1) && cpb_capable)
1458 cpb_toggle(val);
1459 else
1460 return -EINVAL;
1461
1462 return count;
1463}
1464
1465static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
1466{
1467 return sprintf(buf, "%u\n", cpb_enabled);
1468}
1469
1470#define define_one_rw(_name) \
1471static struct freq_attr _name = \
1472__ATTR(_name, 0644, show_##_name, store_##_name)
1473
1474define_one_rw(cpb);
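/*
 * This creates a per-policy "cpb" sysfs file (typically found under
 * /sys/devices/system/cpu/cpuN/cpufreq/, path given as an illustration)
 * that accepts 0 or 1 and is backed by show_cpb()/store_cpb() above.
 */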
1475
1476static struct freq_attr *powernow_k8_attr[] = {
1477 &cpufreq_freq_attr_scaling_available_freqs,
1478 &cpb,
1479 NULL,
1480};
1481
1482static struct cpufreq_driver cpufreq_amd64_driver = {
1483 .verify = powernowk8_verify,
1484 .target = powernowk8_target,
1485 .bios_limit = acpi_processor_get_bios_limit,
1486 .init = powernowk8_cpu_init,
1487 .exit = __devexit_p(powernowk8_cpu_exit),
1488 .get = powernowk8_get,
1489 .name = "powernow-k8",
1490 .owner = THIS_MODULE,
1491 .attr = powernow_k8_attr,
1492};
1493
1494/*
1495 * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
1496 * cannot block the remaining ones from boosting. On the CPU_UP path we
1497 * simply keep the boost-disable flag in sync with the current global
1498 * state.
1499 */
1500static int cpb_notify(struct notifier_block *nb, unsigned long action,
1501 void *hcpu)
1502{
1503 unsigned cpu = (long)hcpu;
1504 u32 lo, hi;
1505
1506 switch (action) {
1507 case CPU_UP_PREPARE:
1508 case CPU_UP_PREPARE_FROZEN:
1509
1510 if (!cpb_enabled) {
1511 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1512 lo |= BIT(25);
1513 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1514 }
1515 break;
1516
1517 case CPU_DOWN_PREPARE:
1518 case CPU_DOWN_PREPARE_FROZEN:
1519 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1520 lo &= ~BIT(25);
1521 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1522 break;
1523
1524 default:
1525 break;
1526 }
1527
1528 return NOTIFY_OK;
1529}
1530
1531static struct notifier_block cpb_nb = {
1532 .notifier_call = cpb_notify,
1533};
1534
1535/* driver entry point for init */
1536static int __cpuinit powernowk8_init(void)
1537{
1538 unsigned int i, supported_cpus = 0, cpu;
1539 int rv;
1540
1541 for_each_online_cpu(i) {
1542 int rc;
1543 smp_call_function_single(i, check_supported_cpu, &rc, 1);
1544 if (rc == 0)
1545 supported_cpus++;
1546 }
1547
1548 if (supported_cpus != num_online_cpus())
1549 return -ENODEV;
1550
1551 printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
1552 num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
1553
1554 if (boot_cpu_has(X86_FEATURE_CPB)) {
1555
1556 cpb_capable = true;
1557
1558 msrs = msrs_alloc();
1559 if (!msrs) {
1560 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1561 return -ENOMEM;
1562 }
1563
1564 register_cpu_notifier(&cpb_nb);
1565
1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1567
1568 for_each_cpu(cpu, cpu_online_mask) {
1569 struct msr *reg = per_cpu_ptr(msrs, cpu);
1570 cpb_enabled |= !(!!(reg->l & BIT(25)));
1571 }
1572
1573 printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
1574 (cpb_enabled ? "on" : "off"));
1575 }
1576
1577 rv = cpufreq_register_driver(&cpufreq_amd64_driver);
1578 if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) {
1579 unregister_cpu_notifier(&cpb_nb);
1580 msrs_free(msrs);
1581 msrs = NULL;
1582 }
1583 return rv;
1584}
1585
1586/* driver entry point for term */
1587static void __exit powernowk8_exit(void)
1588{
1589 dprintk("exit\n");
1590
1591 if (boot_cpu_has(X86_FEATURE_CPB)) {
1592 msrs_free(msrs);
1593 msrs = NULL;
1594
1595 unregister_cpu_notifier(&cpb_nb);
1596 }
1597
1598 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1599}
1600
1601MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
1602 "Mark Langsdorf <mark.langsdorf@amd.com>");
1603MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
1604MODULE_LICENSE("GPL");
1605
1606late_initcall(powernowk8_init);
1607module_exit(powernowk8_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
deleted file mode 100644
index df3529b1c02d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ /dev/null
@@ -1,224 +0,0 @@
1/*
2 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8enum pstate {
9 HW_PSTATE_INVALID = 0xff,
10 HW_PSTATE_0 = 0,
11 HW_PSTATE_1 = 1,
12 HW_PSTATE_2 = 2,
13 HW_PSTATE_3 = 3,
14 HW_PSTATE_4 = 4,
15 HW_PSTATE_5 = 5,
16 HW_PSTATE_6 = 6,
17 HW_PSTATE_7 = 7,
18};
19
20struct powernow_k8_data {
21 unsigned int cpu;
22
23 u32 numps; /* number of p-states */
24 u32 batps; /* number of p-states supported on battery */
25 u32 max_hw_pstate; /* maximum legal hardware pstate */
26
27 /* these values are constant when the PSB is used to determine
28 * vid/fid pairings, but are modified during the ->target() call
29 * when ACPI is used */
30 u32 rvo; /* ramp voltage offset */
31 u32 irt; /* isochronous relief time */
32 u32 vidmvs; /* usable value calculated from mvs */
33 u32 vstable; /* voltage stabilization time, units 20 us */
34 u32 plllock; /* pll lock time, units 1 us */
35 u32 exttype; /* extended interface = 1 */
36
37 /* keep track of the current fid / vid or pstate */
38 u32 currvid;
39 u32 currfid;
40 enum pstate currpstate;
41
42 /* the powernow_table includes all frequency and vid/fid pairings:
43 * fid are the lower 8 bits of the index, vid are the upper 8 bits.
44 * frequency is in kHz */
45 struct cpufreq_frequency_table *powernow_table;
46
47 /* the acpi table needs to be kept. it's only available if ACPI was
48 * used to determine valid frequency/vid/fid states */
49 struct acpi_processor_performance acpi_data;
50
51 /* we need to keep track of associated cores, but let cpufreq
52 * handle hotplug events - so just point at cpufreq pol->cpus
53 * structure */
54 struct cpumask *available_cores;
55};
56
57/* processor's cpuid instruction support */
58#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
59#define CPUID_XFAM 0x0ff00000 /* extended family */
60#define CPUID_XFAM_K8 0
61#define CPUID_XMOD 0x000f0000 /* extended model */
62#define CPUID_XMOD_REV_MASK 0x000c0000
63#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */
64#define CPUID_USE_XFAM_XMOD 0x00000f00
65#define CPUID_GET_MAX_CAPABILITIES 0x80000000
66#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
67#define P_STATE_TRANSITION_CAPABLE 6
68
69/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
70/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
71/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
72/* the register number is placed in ecx, and the data is returned in edx:eax. */
73
74#define MSR_FIDVID_CTL 0xc0010041
75#define MSR_FIDVID_STATUS 0xc0010042
76
77/* Field definitions within the FID VID Low Control MSR : */
78#define MSR_C_LO_INIT_FID_VID 0x00010000
79#define MSR_C_LO_NEW_VID 0x00003f00
80#define MSR_C_LO_NEW_FID 0x0000003f
81#define MSR_C_LO_VID_SHIFT 8
82
83/* Field definitions within the FID VID High Control MSR : */
84#define MSR_C_HI_STP_GNT_TO 0x000fffff
85
86/* Field definitions within the FID VID Low Status MSR : */
87#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
88#define MSR_S_LO_MAX_RAMP_VID 0x3f000000
89#define MSR_S_LO_MAX_FID 0x003f0000
90#define MSR_S_LO_START_FID 0x00003f00
91#define MSR_S_LO_CURRENT_FID 0x0000003f
92
93/* Field definitions within the FID VID High Status MSR : */
94#define MSR_S_HI_MIN_WORKING_VID 0x3f000000
95#define MSR_S_HI_MAX_WORKING_VID 0x003f0000
96#define MSR_S_HI_START_VID 0x00003f00
97#define MSR_S_HI_CURRENT_VID 0x0000003f
98#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
99
100
101/* Hardware Pstate _PSS and MSR definitions */
102#define USE_HW_PSTATE 0x00000080
103#define HW_PSTATE_MASK 0x00000007
104#define HW_PSTATE_VALID_MASK 0x80000000
105#define HW_PSTATE_MAX_MASK 0x000000f0
106#define HW_PSTATE_MAX_SHIFT 4
107#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
108#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
109#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
110#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */
111
112/* define the two driver architectures */
113#define CPU_OPTERON 0
114#define CPU_HW_PSTATE 1
115
116
117/*
118 * There are restrictions the frequencies have to follow:
119 * - only 1 entry in the low fid table ( <= 1.4 GHz )
120 * - lowest entry in the high fid table must be >= 2 * the entry in the
121 *   low fid table
122 * - lowest entry in the high fid table must be <= 200 MHz + 2 * the
123 *   entry in the low fid table
124 * - the parts can only step at <= 200 MHz intervals; odd fid values are
125 *   supported in revision G and later revisions.
126 * - lowest frequency must be >= interprocessor hypertransport link speed
127 * (only applies to MP systems obviously)
128 */
129
130/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
131#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */
132#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
133
134#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
135#define HI_VCOFREQ_TABLE_BOTTOM 1600
136
137#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
138
139#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
140#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */
141
142#define MIN_FREQ 800 /* Min and max freqs, per spec */
143#define MAX_FREQ 5000
144
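/*
 * Hypothetical helper, illustrative only (not part of the original
 * driver): since fids step by 2 for each 200 MHz (MIN_FREQ_RESOLUTION)
 * above MIN_FREQ, a fid maps to roughly MIN_FREQ + 100 * fid MHz,
 * e.g. fid 0x0a -> 800 + 1000 = 1800 MHz.
 */
static inline u32 example_fid_to_mhz(u32 fid)
{
	return MIN_FREQ + 100 * fid;
}
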
145#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */
146#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */
147
148#define VID_OFF 0x3f
149
150#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
151
152#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
153
154#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
155#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */
156
157/*
158 * Most values of interest are encoded in a single field of the _PSS
159 * entries: the "control" value.
160 */
161
162#define IRT_SHIFT 30
163#define RVO_SHIFT 28
164#define EXT_TYPE_SHIFT 27
165#define PLL_L_SHIFT 20
166#define MVS_SHIFT 18
167#define VST_SHIFT 11
168#define VID_SHIFT 6
169#define IRT_MASK 3
170#define RVO_MASK 3
171#define EXT_TYPE_MASK 1
172#define PLL_L_MASK 0x7f
173#define MVS_MASK 3
174#define VST_MASK 0x7f
175#define VID_MASK 0x1f
176#define FID_MASK 0x1f
177#define EXT_VID_MASK 0x3f
178#define EXT_FID_MASK 0x3f
179
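/*
 * Hypothetical sketch (not part of the original driver) of how the
 * shifts and masks above unpack the fields the BIOS encodes into a
 * single ACPI _PSS "control" value.
 */
static inline void example_decode_pss_control(u32 control)
{
	u32 irt = (control >> IRT_SHIFT) & IRT_MASK;
	u32 rvo = (control >> RVO_SHIFT) & RVO_MASK;
	u32 pll = (control >> PLL_L_SHIFT) & PLL_L_MASK;
	u32 mvs = (control >> MVS_SHIFT) & MVS_MASK;
	u32 vst = (control >> VST_SHIFT) & VST_MASK;
	u32 vid = (control >> VID_SHIFT) & VID_MASK;
	u32 fid = control & FID_MASK;

	pr_debug("irt=%u rvo=%u pll=%u mvs=%u vst=%u vid=0x%x fid=0x%x\n",
		 irt, rvo, pll, mvs, vst, vid, fid);
}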
180
181/*
182 * Version 1.4 of the PSB table. This table is constructed by the BIOS
183 * to tell the OS's power management driver which VIDs and FIDs are
184 * supported by this particular processor.
185 * If the data in the PSB / PST is wrong, then this driver will program the
186 * wrong values into hardware, which is very likely to lead to a crash.
187 */
188
189#define PSB_ID_STRING "AMDK7PNOW!"
190#define PSB_ID_STRING_LEN 10
191
192#define PSB_VERSION_1_4 0x14
193
194struct psb_s {
195 u8 signature[10];
196 u8 tableversion;
197 u8 flags1;
198 u16 vstable;
199 u8 flags2;
200 u8 num_tables;
201 u32 cpuid;
202 u8 plllocktime;
203 u8 maxfid;
204 u8 maxvid;
205 u8 numps;
206};
207
208/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
209struct pst_s {
210 u8 fid;
211 u8 vid;
212};
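/*
 * Hypothetical sketch (not the driver's actual find_psb_table()): since
 * the pst_s fid/vid pairs are appended directly behind the psb_s header,
 * they can be walked like this (packing of the raw BIOS image ignored
 * for brevity).
 */
static inline void example_walk_psb(const struct psb_s *psb)
{
	const struct pst_s *pst = (const struct pst_s *)(psb + 1);
	int i;

	for (i = 0; i < psb->numps; i++)
		pr_debug("p-state %d: fid 0x%x, vid 0x%x\n",
			 i, pst[i].fid, pst[i].vid);
}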
213
214#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
215
216static int core_voltage_pre_transition(struct powernow_k8_data *data,
217						u32 reqvid, u32 reqfid);
218static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
219static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
220
221static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
222
223static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
224static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
deleted file mode 100644
index 435a996a613a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ /dev/null
@@ -1,194 +0,0 @@
1/*
2 * sc520_freq.c: cpufreq driver for the AMD Elan sc520
3 *
4 * Copyright (C) 2005 Sean Young <sean@mess.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Based on elanfreq.c
12 *
13 * 2005-03-30: - initial revision
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19
20#include <linux/delay.h>
21#include <linux/cpufreq.h>
22#include <linux/timex.h>
23#include <linux/io.h>
24
25#include <asm/msr.h>
26
27#define MMCR_BASE 0xfffef000 /* The default base address */
28#define OFFS_CPUCTL 0x2 /* CPU Control Register */
29
30static __u8 __iomem *cpuctl;
31
32#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
33 "sc520_freq", msg)
34#define PFX "sc520_freq: "
35
36static struct cpufreq_frequency_table sc520_freq_table[] = {
37 {0x01, 100000},
38 {0x02, 133000},
39 {0, CPUFREQ_TABLE_END},
40};
41
42static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
43{
44 u8 clockspeed_reg = *cpuctl;
45
46 switch (clockspeed_reg & 0x03) {
47 default:
48 printk(KERN_ERR PFX "error: cpuctl register has unexpected "
49 "value %02x\n", clockspeed_reg);
50 case 0x01:
51 return 100000;
52 case 0x02:
53 return 133000;
54 }
55}
56
57static void sc520_freq_set_cpu_state(unsigned int state)
58{
59
60 struct cpufreq_freqs freqs;
61 u8 clockspeed_reg;
62
63 freqs.old = sc520_freq_get_cpu_frequency(0);
64 freqs.new = sc520_freq_table[state].frequency;
65 freqs.cpu = 0; /* AMD Elan is UP */
66
67 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
68
69 dprintk("attempting to set frequency to %i kHz\n",
70 sc520_freq_table[state].frequency);
71
72 local_irq_disable();
73
74 clockspeed_reg = *cpuctl & ~0x03;
75 *cpuctl = clockspeed_reg | sc520_freq_table[state].index;
76
77 local_irq_enable();
78
79 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
80}
81
82static int sc520_freq_verify(struct cpufreq_policy *policy)
83{
84 return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
85}
86
87static int sc520_freq_target(struct cpufreq_policy *policy,
88 unsigned int target_freq,
89 unsigned int relation)
90{
91 unsigned int newstate = 0;
92
93 if (cpufreq_frequency_table_target(policy, sc520_freq_table,
94 target_freq, relation, &newstate))
95 return -EINVAL;
96
97 sc520_freq_set_cpu_state(newstate);
98
99 return 0;
100}
101
102
103/*
104 * Module init and exit code
105 */
106
107static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
108{
109 struct cpuinfo_x86 *c = &cpu_data(0);
110 int result;
111
112 /* capability check */
113 if (c->x86_vendor != X86_VENDOR_AMD ||
114 c->x86 != 4 || c->x86_model != 9)
115 return -ENODEV;
116
117 /* cpuinfo and default policy values */
118 policy->cpuinfo.transition_latency = 1000000; /* 1ms */
119 policy->cur = sc520_freq_get_cpu_frequency(0);
120
121 result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
122 if (result)
123 return result;
124
125 cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
126
127 return 0;
128}
129
130
131static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
132{
133 cpufreq_frequency_table_put_attr(policy->cpu);
134 return 0;
135}
136
137
138static struct freq_attr *sc520_freq_attr[] = {
139 &cpufreq_freq_attr_scaling_available_freqs,
140 NULL,
141};
142
143
144static struct cpufreq_driver sc520_freq_driver = {
145 .get = sc520_freq_get_cpu_frequency,
146 .verify = sc520_freq_verify,
147 .target = sc520_freq_target,
148 .init = sc520_freq_cpu_init,
149 .exit = sc520_freq_cpu_exit,
150 .name = "sc520_freq",
151 .owner = THIS_MODULE,
152 .attr = sc520_freq_attr,
153};
154
155
156static int __init sc520_freq_init(void)
157{
158 struct cpuinfo_x86 *c = &cpu_data(0);
159 int err;
160
161 /* Test if we have the right hardware */
162 if (c->x86_vendor != X86_VENDOR_AMD ||
163 c->x86 != 4 || c->x86_model != 9) {
164 dprintk("no Elan SC520 processor found!\n");
165 return -ENODEV;
166 }
167 cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
168 if (!cpuctl) {
169 printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
170 return -ENOMEM;
171 }
172
173 err = cpufreq_register_driver(&sc520_freq_driver);
174 if (err)
175 iounmap(cpuctl);
176
177 return err;
178}
179
180
181static void __exit sc520_freq_exit(void)
182{
183 cpufreq_unregister_driver(&sc520_freq_driver);
184 iounmap(cpuctl);
185}
186
187
188MODULE_LICENSE("GPL");
189MODULE_AUTHOR("Sean Young <sean@mess.org>");
190MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
191
192module_init(sc520_freq_init);
193module_exit(sc520_freq_exit);
194
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
deleted file mode 100644
index 9b1ff37de46a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ /dev/null
@@ -1,636 +0,0 @@
1/*
2 * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
3 * M (part of the Centrino chipset).
4 *
5 * Since the original Pentium M, most new Intel CPUs support Enhanced
6 * SpeedStep.
7 *
8 * Despite the "SpeedStep" in the name, this is almost entirely unlike
9 * traditional SpeedStep.
10 *
11 * Modelled on speedstep.c
12 *
13 * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/sched.h> /* current */
21#include <linux/delay.h>
22#include <linux/compiler.h>
23#include <linux/gfp.h>
24
25#include <asm/msr.h>
26#include <asm/processor.h>
27#include <asm/cpufeature.h>
28
29#define PFX "speedstep-centrino: "
30#define MAINTAINER "cpufreq@vger.kernel.org"
31
32#define dprintk(msg...) \
33 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
34
35#define INTEL_MSR_RANGE (0xffff)
36
37struct cpu_id
38{
39 __u8 x86; /* CPU family */
40 __u8 x86_model; /* model */
41 __u8 x86_mask; /* stepping */
42};
43
44enum {
45 CPU_BANIAS,
46 CPU_DOTHAN_A1,
47 CPU_DOTHAN_A2,
48 CPU_DOTHAN_B0,
49 CPU_MP4HT_D0,
50 CPU_MP4HT_E0,
51};
52
53static const struct cpu_id cpu_ids[] = {
54 [CPU_BANIAS] = { 6, 9, 5 },
55 [CPU_DOTHAN_A1] = { 6, 13, 1 },
56 [CPU_DOTHAN_A2] = { 6, 13, 2 },
57 [CPU_DOTHAN_B0] = { 6, 13, 6 },
58 [CPU_MP4HT_D0] = {15, 3, 4 },
59 [CPU_MP4HT_E0] = {15, 4, 1 },
60};
61#define N_IDS ARRAY_SIZE(cpu_ids)
62
63struct cpu_model
64{
65 const struct cpu_id *cpu_id;
66 const char *model_name;
67 unsigned max_freq; /* max clock in kHz */
68
69 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
70};
71static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
72 const struct cpu_id *x);
73
74/* Operating points for current CPU */
75static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
76static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
77
78static struct cpufreq_driver centrino_driver;
79
80#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
81
82/* Computes the correct form for IA32_PERF_CTL MSR for a particular
83 frequency/voltage operating point; frequency in MHz, volts in mV.
84 This is stored as "index" in the structure. */
85#define OP(mhz, mv) \
86 { \
87 .frequency = (mhz) * 1000, \
88 .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
89 }
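/*
 * Worked example: OP(600, 844) expands to .frequency = 600000 (kHz) and
 * .index = ((600/100) << 8) | ((844 - 700) / 16) = 0x0609, the value
 * later written into the low 16 bits of IA32_PERF_CTL for that point.
 */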
90
91/*
92 * These voltage tables were derived from the Intel Pentium M
93 * datasheet, document 25261202.pdf, Table 5. I have verified they
94 * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
95 * M.
96 */
97
98/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
99static struct cpufreq_frequency_table banias_900[] =
100{
101 OP(600, 844),
102 OP(800, 988),
103 OP(900, 1004),
104 { .frequency = CPUFREQ_TABLE_END }
105};
106
107/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
108static struct cpufreq_frequency_table banias_1000[] =
109{
110 OP(600, 844),
111 OP(800, 972),
112 OP(900, 988),
113 OP(1000, 1004),
114 { .frequency = CPUFREQ_TABLE_END }
115};
116
117/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
118static struct cpufreq_frequency_table banias_1100[] =
119{
120 OP( 600, 956),
121 OP( 800, 1020),
122 OP( 900, 1100),
123 OP(1000, 1164),
124 OP(1100, 1180),
125 { .frequency = CPUFREQ_TABLE_END }
126};
127
128
129/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
130static struct cpufreq_frequency_table banias_1200[] =
131{
132 OP( 600, 956),
133 OP( 800, 1004),
134 OP( 900, 1020),
135 OP(1000, 1100),
136 OP(1100, 1164),
137 OP(1200, 1180),
138 { .frequency = CPUFREQ_TABLE_END }
139};
140
141/* Intel Pentium M processor 1.30GHz (Banias) */
142static struct cpufreq_frequency_table banias_1300[] =
143{
144 OP( 600, 956),
145 OP( 800, 1260),
146 OP(1000, 1292),
147 OP(1200, 1356),
148 OP(1300, 1388),
149 { .frequency = CPUFREQ_TABLE_END }
150};
151
152/* Intel Pentium M processor 1.40GHz (Banias) */
153static struct cpufreq_frequency_table banias_1400[] =
154{
155 OP( 600, 956),
156 OP( 800, 1180),
157 OP(1000, 1308),
158 OP(1200, 1436),
159 OP(1400, 1484),
160 { .frequency = CPUFREQ_TABLE_END }
161};
162
163/* Intel Pentium M processor 1.50GHz (Banias) */
164static struct cpufreq_frequency_table banias_1500[] =
165{
166 OP( 600, 956),
167 OP( 800, 1116),
168 OP(1000, 1228),
169 OP(1200, 1356),
170 OP(1400, 1452),
171 OP(1500, 1484),
172 { .frequency = CPUFREQ_TABLE_END }
173};
174
175/* Intel Pentium M processor 1.60GHz (Banias) */
176static struct cpufreq_frequency_table banias_1600[] =
177{
178 OP( 600, 956),
179 OP( 800, 1036),
180 OP(1000, 1164),
181 OP(1200, 1276),
182 OP(1400, 1420),
183 OP(1600, 1484),
184 { .frequency = CPUFREQ_TABLE_END }
185};
186
187/* Intel Pentium M processor 1.70GHz (Banias) */
188static struct cpufreq_frequency_table banias_1700[] =
189{
190 OP( 600, 956),
191 OP( 800, 1004),
192 OP(1000, 1116),
193 OP(1200, 1228),
194 OP(1400, 1308),
195 OP(1700, 1484),
196 { .frequency = CPUFREQ_TABLE_END }
197};
198#undef OP
199
200#define _BANIAS(cpuid, max, name) \
201{ .cpu_id = cpuid, \
202 .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
203 .max_freq = (max)*1000, \
204 .op_points = banias_##max, \
205}
206#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
207
208/* CPU models, their operating frequency range, and freq/voltage
209 operating points */
210static struct cpu_model models[] =
211{
212 _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
213 BANIAS(1000),
214 BANIAS(1100),
215 BANIAS(1200),
216 BANIAS(1300),
217 BANIAS(1400),
218 BANIAS(1500),
219 BANIAS(1600),
220 BANIAS(1700),
221
222 /* NULL model_name is a wildcard */
223 { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
224 { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
225 { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
226 { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
227 { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
228
229 { NULL, }
230};
231#undef _BANIAS
232#undef BANIAS
233
234static int centrino_cpu_init_table(struct cpufreq_policy *policy)
235{
236 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
237 struct cpu_model *model;
238
239 for(model = models; model->cpu_id != NULL; model++)
240 if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
241 (model->model_name == NULL ||
242 strcmp(cpu->x86_model_id, model->model_name) == 0))
243 break;
244
245 if (model->cpu_id == NULL) {
246 /* No match at all */
247 dprintk("no support for CPU model \"%s\": "
248 "send /proc/cpuinfo to " MAINTAINER "\n",
249 cpu->x86_model_id);
250 return -ENOENT;
251 }
252
253 if (model->op_points == NULL) {
254 /* Matched a non-match */
255 dprintk("no table support for CPU model \"%s\"\n",
256 cpu->x86_model_id);
257 dprintk("try using the acpi-cpufreq driver\n");
258 return -ENOENT;
259 }
260
261 per_cpu(centrino_model, policy->cpu) = model;
262
263 dprintk("found \"%s\": max frequency: %dkHz\n",
264 model->model_name, model->max_freq);
265
266 return 0;
267}
268
269#else
270static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
271{
272 return -ENODEV;
273}
274#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
275
276static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
277 const struct cpu_id *x)
278{
279 if ((c->x86 == x->x86) &&
280 (c->x86_model == x->x86_model) &&
281 (c->x86_mask == x->x86_mask))
282 return 1;
283 return 0;
284}
285
286/* To be called only after centrino_model is initialized */
287static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
288{
289 int i;
290
291 /*
292 * Extract clock in kHz from PERF_CTL value
293 * for centrino, as some DSDTs are buggy.
294 * Ideally, this can be done using the acpi_data structure.
295 */
296 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
298 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
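		/*
		 * Example: a PERF_STATUS/PERF_CTL value of 0x0609 decodes
		 * here to (0x0609 >> 8) & 0xff = 6, i.e. 6 * 100000 =
		 * 600000 kHz.
		 */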
299 msr = (msr >> 8) & 0xff;
300 return msr * 100000;
301 }
302
303 if ((!per_cpu(centrino_model, cpu)) ||
304 (!per_cpu(centrino_model, cpu)->op_points))
305 return 0;
306
307 msr &= 0xffff;
308 for (i = 0;
309 per_cpu(centrino_model, cpu)->op_points[i].frequency
310 != CPUFREQ_TABLE_END;
311 i++) {
312 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
313 return per_cpu(centrino_model, cpu)->
314 op_points[i].frequency;
315 }
316 if (failsafe)
317 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
318 else
319 return 0;
320}
321
322/* Return the current CPU frequency in kHz */
323static unsigned int get_cur_freq(unsigned int cpu)
324{
325 unsigned l, h;
326 unsigned clock_freq;
327
328 rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
329 clock_freq = extract_clock(l, cpu, 0);
330
331 if (unlikely(clock_freq == 0)) {
332 /*
333 * On some CPUs, we can see transient MSR values (which are
334 * not present in _PSS), while CPU is doing some automatic
335 * P-state transition (like TM2). Get the last freq set
336 * in PERF_CTL.
337 */
338 rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
339 clock_freq = extract_clock(l, cpu, 1);
340 }
341 return clock_freq;
342}
343
344
345static int centrino_cpu_init(struct cpufreq_policy *policy)
346{
347 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
348 unsigned freq;
349 unsigned l, h;
350 int ret;
351 int i;
352
353 /* Only Intel makes Enhanced Speedstep-capable CPUs */
354 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
355 !cpu_has(cpu, X86_FEATURE_EST))
356 return -ENODEV;
357
358 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
359 centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
360
361 if (policy->cpu != 0)
362 return -ENODEV;
363
364 for (i = 0; i < N_IDS; i++)
365 if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
366 break;
367
368 if (i != N_IDS)
369 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
370
371 if (!per_cpu(centrino_cpu, policy->cpu)) {
372 dprintk("found unsupported CPU with "
373 "Enhanced SpeedStep: send /proc/cpuinfo to "
374 MAINTAINER "\n");
375 return -ENODEV;
376 }
377
378 if (centrino_cpu_init_table(policy)) {
379 return -ENODEV;
380 }
381
382 /* Check to see if Enhanced SpeedStep is enabled, and try to
383 enable it if not. */
384 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
385
386 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
387 l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
388 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
389 wrmsr(MSR_IA32_MISC_ENABLE, l, h);
390
391 /* check to see if it stuck */
392 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
393 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
394 printk(KERN_INFO PFX
395 "couldn't enable Enhanced SpeedStep\n");
396 return -ENODEV;
397 }
398 }
399
400 freq = get_cur_freq(policy->cpu);
401 policy->cpuinfo.transition_latency = 10000;
402 /* 10uS transition latency */
403 policy->cur = freq;
404
405 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
406
407 ret = cpufreq_frequency_table_cpuinfo(policy,
408 per_cpu(centrino_model, policy->cpu)->op_points);
409 if (ret)
410 return (ret);
411
412 cpufreq_frequency_table_get_attr(
413 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
414
415 return 0;
416}
417
418static int centrino_cpu_exit(struct cpufreq_policy *policy)
419{
420 unsigned int cpu = policy->cpu;
421
422 if (!per_cpu(centrino_model, cpu))
423 return -ENODEV;
424
425 cpufreq_frequency_table_put_attr(cpu);
426
427 per_cpu(centrino_model, cpu) = NULL;
428
429 return 0;
430}
431
432/**
433 * centrino_verify - verifies a new CPUFreq policy
434 * @policy: new policy
435 *
436 * Limit must be within this model's frequency range, with at least
437 * one border included.
438 */
439static int centrino_verify (struct cpufreq_policy *policy)
440{
441 return cpufreq_frequency_table_verify(policy,
442 per_cpu(centrino_model, policy->cpu)->op_points);
443}
444
445/**
446 * centrino_target - set a new CPUFreq policy
447 * @policy: new policy
448 * @target_freq: the target frequency
449 * @relation: how that frequency relates to achieved frequency
450 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
451 *
452 * Sets a new CPUFreq policy.
453 */
454static int centrino_target (struct cpufreq_policy *policy,
455 unsigned int target_freq,
456 unsigned int relation)
457{
458 unsigned int newstate = 0;
459 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
460 struct cpufreq_freqs freqs;
461 int retval = 0;
462 unsigned int j, k, first_cpu, tmp;
463 cpumask_var_t covered_cpus;
464
465 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
466 return -ENOMEM;
467
468 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
469 retval = -ENODEV;
470 goto out;
471 }
472
473 if (unlikely(cpufreq_frequency_table_target(policy,
474 per_cpu(centrino_model, cpu)->op_points,
475 target_freq,
476 relation,
477 &newstate))) {
478 retval = -EINVAL;
479 goto out;
480 }
481
482 first_cpu = 1;
483 for_each_cpu(j, policy->cpus) {
484 int good_cpu;
485
486 /* cpufreq holds the hotplug lock, so we are safe here */
487 if (!cpu_online(j))
488 continue;
489
490 /*
491 * Support for SMP systems.
492		 * Make sure we are running on a CPU that wants to change freq
493 */
494 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
495 good_cpu = cpumask_any_and(policy->cpus,
496 cpu_online_mask);
497 else
498 good_cpu = j;
499
500 if (good_cpu >= nr_cpu_ids) {
501 dprintk("couldn't limit to CPUs in this domain\n");
502 retval = -EAGAIN;
503 if (first_cpu) {
504 /* We haven't started the transition yet. */
505 goto out;
506 }
507 break;
508 }
509
510 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
511
512 if (first_cpu) {
513 rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
514 if (msr == (oldmsr & 0xffff)) {
515 dprintk("no change needed - msr was and needs "
516 "to be %x\n", oldmsr);
517 retval = 0;
518 goto out;
519 }
520
521 freqs.old = extract_clock(oldmsr, cpu, 0);
522 freqs.new = extract_clock(msr, cpu, 0);
523
524 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
525 target_freq, freqs.old, freqs.new, msr);
526
527 for_each_cpu(k, policy->cpus) {
528 if (!cpu_online(k))
529 continue;
530 freqs.cpu = k;
531 cpufreq_notify_transition(&freqs,
532 CPUFREQ_PRECHANGE);
533 }
534
535 first_cpu = 0;
536 /* all but 16 LSB are reserved, treat them with care */
537 oldmsr &= ~0xffff;
538 msr &= 0xffff;
539 oldmsr |= msr;
540 }
541
542 wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
543 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
544 break;
545
546 cpumask_set_cpu(j, covered_cpus);
547 }
548
549 for_each_cpu(k, policy->cpus) {
550 if (!cpu_online(k))
551 continue;
552 freqs.cpu = k;
553 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
554 }
555
556 if (unlikely(retval)) {
557 /*
558 * We have failed halfway through the frequency change.
559 * We have sent callbacks to policy->cpus and
560			 * MSRs have already been written on covered_cpus.
561			 * Best-effort undo...
562 */
563
564 for_each_cpu(j, covered_cpus)
565 wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
566
567 tmp = freqs.new;
568 freqs.new = freqs.old;
569 freqs.old = tmp;
570 for_each_cpu(j, policy->cpus) {
571 if (!cpu_online(j))
572 continue;
573 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
574 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
575 }
576 }
577 retval = 0;
578
579out:
580 free_cpumask_var(covered_cpus);
581 return retval;
582}
583
584static struct freq_attr* centrino_attr[] = {
585 &cpufreq_freq_attr_scaling_available_freqs,
586 NULL,
587};
588
589static struct cpufreq_driver centrino_driver = {
590 .name = "centrino", /* should be speedstep-centrino,
591 but there's a 16 char limit */
592 .init = centrino_cpu_init,
593 .exit = centrino_cpu_exit,
594 .verify = centrino_verify,
595 .target = centrino_target,
596 .get = get_cur_freq,
597 .attr = centrino_attr,
598 .owner = THIS_MODULE,
599};
600
601
602/**
603 * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
604 *
605 * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
606 * unsupported devices, -ENOENT if there's no voltage table for this
607 * particular CPU model, -EINVAL on problems during initialization,
608 * and zero on success.
609 *
610 * This is quite picky. Not only does the CPU have to advertise the
611 * "est" flag in the cpuid capability flags, but we also look for a specific
612 * CPU model and stepping, and we need to have the exact model name in
613 * our voltage tables. That is, be paranoid about not releasing
614 * someone's valuable magic smoke.
615 */
616static int __init centrino_init(void)
617{
618 struct cpuinfo_x86 *cpu = &cpu_data(0);
619
620 if (!cpu_has(cpu, X86_FEATURE_EST))
621 return -ENODEV;
622
623 return cpufreq_register_driver(&centrino_driver);
624}
625
626static void __exit centrino_exit(void)
627{
628 cpufreq_unregister_driver(&centrino_driver);
629}
630
631MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
632MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
633MODULE_LICENSE ("GPL");
634
635late_initcall(centrino_init);
636module_exit(centrino_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
deleted file mode 100644
index 561758e95180..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ /dev/null
@@ -1,452 +0,0 @@
1/*
2 * (C) 2001 Dave Jones, Arjan van de ven.
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon reverse engineered information, and on Intel documentation
7 * for chipsets ICH2-M and ICH3-M.
8 *
9 * Many thanks to Ducrot Bruno for finding and fixing the last
10 * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
11 * for extensive testing.
12 *
13 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
14 */
15
16
17/*********************************************************************
18 * SPEEDSTEP - DEFINITIONS *
19 *********************************************************************/
20
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/init.h>
24#include <linux/cpufreq.h>
25#include <linux/pci.h>
26#include <linux/sched.h>
27
28#include "speedstep-lib.h"
29
30
31/* speedstep_chipset:
32 * It is necessary to know which chipset is used. As accesses to
33 * this device occur at various places in this module, we need a
34 * static struct pci_dev * pointing to that device.
35 */
36static struct pci_dev *speedstep_chipset_dev;
37
38
39/* speedstep_processor
40 */
41static enum speedstep_processor speedstep_processor;
42
43static u32 pmbase;
44
45/*
46 * There are only two frequency states for each processor. Values
47 * are in kHz for the time being.
48 */
49static struct cpufreq_frequency_table speedstep_freqs[] = {
50 {SPEEDSTEP_HIGH, 0},
51 {SPEEDSTEP_LOW, 0},
52 {0, CPUFREQ_TABLE_END},
53};
54
55
56#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
57 "speedstep-ich", msg)
58
59
60/**
61 * speedstep_find_register - read the PMBASE address
62 *
63 * Returns: -ENODEV if no register could be found
64 */
65static int speedstep_find_register(void)
66{
67 if (!speedstep_chipset_dev)
68 return -ENODEV;
69
70 /* get PMBASE */
71 pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
72 if (!(pmbase & 0x01)) {
73 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
74 return -ENODEV;
75 }
76
77 pmbase &= 0xFFFFFFFE;
78 if (!pmbase) {
79 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
80 return -ENODEV;
81 }
82
83 dprintk("pmbase is 0x%x\n", pmbase);
84 return 0;
85}
86
87/**
88 * speedstep_set_state - set the SpeedStep state
89 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
90 *
91 * Tries to change the SpeedStep state. Can be called from
92 * smp_call_function_single.
93 */
94static void speedstep_set_state(unsigned int state)
95{
96 u8 pm2_blk;
97 u8 value;
98 unsigned long flags;
99
100 if (state > 0x1)
101 return;
102
103 /* Disable IRQs */
104 local_irq_save(flags);
105
106 /* read state */
107 value = inb(pmbase + 0x50);
108
109 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
110
111 /* write new state */
112 value &= 0xFE;
113 value |= state;
114
115 dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
116
117 /* Disable bus master arbitration */
118 pm2_blk = inb(pmbase + 0x20);
119 pm2_blk |= 0x01;
120 outb(pm2_blk, (pmbase + 0x20));
121
122 /* Actual transition */
123 outb(value, (pmbase + 0x50));
124
125 /* Restore bus master arbitration */
126 pm2_blk &= 0xfe;
127 outb(pm2_blk, (pmbase + 0x20));
128
129 /* check if transition was successful */
130 value = inb(pmbase + 0x50);
131
132 /* Enable IRQs */
133 local_irq_restore(flags);
134
135 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
136
137 if (state == (value & 0x1))
138 dprintk("change to %u MHz succeeded\n",
139 speedstep_get_frequency(speedstep_processor) / 1000);
140 else
141 printk(KERN_ERR "cpufreq: change failed - I/O error\n");
142
143 return;
144}
145
146/* Wrapper for smp_call_function_single. */
147static void _speedstep_set_state(void *_state)
148{
149 speedstep_set_state(*(unsigned int *)_state);
150}
151
152/**
153 * speedstep_activate - activate SpeedStep control in the chipset
154 *
155 * Tries to activate the SpeedStep status and control registers.
156 * Returns -EINVAL on an unsupported chipset, and zero on success.
157 */
158static int speedstep_activate(void)
159{
160 u16 value = 0;
161
162 if (!speedstep_chipset_dev)
163 return -EINVAL;
164
165 pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
166 if (!(value & 0x08)) {
167 value |= 0x08;
168 dprintk("activating SpeedStep (TM) registers\n");
169 pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
170 }
171
172 return 0;
173}
174
175
176/**
177 * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
178 *
179 * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
180 * the LPC bridge / PM module which contains all power-management
181 * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
182 * chipset, or zero on failure.
183 */
184static unsigned int speedstep_detect_chipset(void)
185{
186 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
187 PCI_DEVICE_ID_INTEL_82801DB_12,
188 PCI_ANY_ID, PCI_ANY_ID,
189 NULL);
190 if (speedstep_chipset_dev)
191 return 4; /* 4-M */
192
193 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
194 PCI_DEVICE_ID_INTEL_82801CA_12,
195 PCI_ANY_ID, PCI_ANY_ID,
196 NULL);
197 if (speedstep_chipset_dev)
198 return 3; /* 3-M */
199
200
201 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
202 PCI_DEVICE_ID_INTEL_82801BA_10,
203 PCI_ANY_ID, PCI_ANY_ID,
204 NULL);
205 if (speedstep_chipset_dev) {
206 /* speedstep.c causes lockups on Dell Inspirons 8000 and
207 * 8100 which use a pretty old revision of the 82815
208		 * host bridge. Abort on these systems.
209 */
210 static struct pci_dev *hostbridge;
211
212 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
213 PCI_DEVICE_ID_INTEL_82815_MC,
214 PCI_ANY_ID, PCI_ANY_ID,
215 NULL);
216
217 if (!hostbridge)
218 return 2; /* 2-M */
219
220 if (hostbridge->revision < 5) {
221 dprintk("hostbridge does not support speedstep\n");
222 speedstep_chipset_dev = NULL;
223 pci_dev_put(hostbridge);
224 return 0;
225 }
226
227 pci_dev_put(hostbridge);
228 return 2; /* 2-M */
229 }
230
231 return 0;
232}
233
234static void get_freq_data(void *_speed)
235{
236 unsigned int *speed = _speed;
237
238 *speed = speedstep_get_frequency(speedstep_processor);
239}
240
241static unsigned int speedstep_get(unsigned int cpu)
242{
243 unsigned int speed;
244
245 /* You're supposed to ensure CPU is online. */
246 if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
247 BUG();
248
249 dprintk("detected %u kHz as current frequency\n", speed);
250 return speed;
251}
252
253/**
254 * speedstep_target - set a new CPUFreq policy
255 * @policy: new policy
256 * @target_freq: the target frequency
257 * @relation: how that frequency relates to achieved frequency
258 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
259 *
260 * Sets a new CPUFreq policy.
261 */
262static int speedstep_target(struct cpufreq_policy *policy,
263 unsigned int target_freq,
264 unsigned int relation)
265{
266 unsigned int newstate = 0, policy_cpu;
267 struct cpufreq_freqs freqs;
268 int i;
269
270 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
271 target_freq, relation, &newstate))
272 return -EINVAL;
273
274 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
275 freqs.old = speedstep_get(policy_cpu);
276 freqs.new = speedstep_freqs[newstate].frequency;
277 freqs.cpu = policy->cpu;
278
279 dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new);
280
281 /* no transition necessary */
282 if (freqs.old == freqs.new)
283 return 0;
284
285 for_each_cpu(i, policy->cpus) {
286 freqs.cpu = i;
287 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
288 }
289
290 smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
291 true);
292
293 for_each_cpu(i, policy->cpus) {
294 freqs.cpu = i;
295 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
296 }
297
298 return 0;
299}
300
301
302/**
303 * speedstep_verify - verifies a new CPUFreq policy
304 * @policy: new policy
305 *
306 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
307 * at least one border included.
308 */
309static int speedstep_verify(struct cpufreq_policy *policy)
310{
311 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
312}
313
314struct get_freqs {
315 struct cpufreq_policy *policy;
316 int ret;
317};
318
319static void get_freqs_on_cpu(void *_get_freqs)
320{
321 struct get_freqs *get_freqs = _get_freqs;
322
323 get_freqs->ret =
324 speedstep_get_freqs(speedstep_processor,
325 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
326 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
327 &get_freqs->policy->cpuinfo.transition_latency,
328 &speedstep_set_state);
329}
330
331static int speedstep_cpu_init(struct cpufreq_policy *policy)
332{
333 int result;
334 unsigned int policy_cpu, speed;
335 struct get_freqs gf;
336
337 /* only run on CPU to be set, or on its sibling */
338#ifdef CONFIG_SMP
339 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
340#endif
341 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
342
343 /* detect low and high frequency and transition latency */
344 gf.policy = policy;
345 smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
346 if (gf.ret)
347 return gf.ret;
348
349 /* get current speed setting */
350 speed = speedstep_get(policy_cpu);
351 if (!speed)
352 return -EIO;
353
354 dprintk("currently at %s speed setting - %i MHz\n",
355 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
356 ? "low" : "high",
357 (speed / 1000));
358
359 /* cpuinfo and default policy values */
360 policy->cur = speed;
361
362 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
363 if (result)
364 return result;
365
366 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
367
368 return 0;
369}
370
371
372static int speedstep_cpu_exit(struct cpufreq_policy *policy)
373{
374 cpufreq_frequency_table_put_attr(policy->cpu);
375 return 0;
376}
377
378static struct freq_attr *speedstep_attr[] = {
379 &cpufreq_freq_attr_scaling_available_freqs,
380 NULL,
381};
382
383
384static struct cpufreq_driver speedstep_driver = {
385 .name = "speedstep-ich",
386 .verify = speedstep_verify,
387 .target = speedstep_target,
388 .init = speedstep_cpu_init,
389 .exit = speedstep_cpu_exit,
390 .get = speedstep_get,
391 .owner = THIS_MODULE,
392 .attr = speedstep_attr,
393};
394
395
396/**
397 * speedstep_init - initializes the SpeedStep CPUFreq driver
398 *
399 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
400 * devices, -EINVAL on problems during initialization, and zero on
401 * success.
402 */
403static int __init speedstep_init(void)
404{
405 /* detect processor */
406 speedstep_processor = speedstep_detect_processor();
407 if (!speedstep_processor) {
408 dprintk("Intel(R) SpeedStep(TM) capable processor "
409 "not found\n");
410 return -ENODEV;
411 }
412
413 /* detect chipset */
414 if (!speedstep_detect_chipset()) {
415 dprintk("Intel(R) SpeedStep(TM) for this chipset not "
416 "(yet) available.\n");
417 return -ENODEV;
418 }
419
420 /* activate speedstep support */
421 if (speedstep_activate()) {
422 pci_dev_put(speedstep_chipset_dev);
423 return -EINVAL;
424 }
425
426 if (speedstep_find_register())
427 return -ENODEV;
428
429 return cpufreq_register_driver(&speedstep_driver);
430}
431
432
433/**
434 * speedstep_exit - unregisters SpeedStep support
435 *
436 * Unregisters SpeedStep support.
437 */
438static void __exit speedstep_exit(void)
439{
440 pci_dev_put(speedstep_chipset_dev);
441 cpufreq_unregister_driver(&speedstep_driver);
442}
443
444
445MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
446 "Dominik Brodowski <linux@brodo.de>");
447MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
448 "with ICH-M southbridges.");
449MODULE_LICENSE("GPL");
450
451module_init(speedstep_init);
452module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
deleted file mode 100644
index a94ec6be69fa..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ /dev/null
@@ -1,481 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/init.h>
15#include <linux/cpufreq.h>
16
17#include <asm/msr.h>
18#include <asm/tsc.h>
19#include "speedstep-lib.h"
20
21#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
22 "speedstep-lib", msg)
23
24#define PFX "speedstep-lib: "
25
26#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
27static int relaxed_check;
28#else
29#define relaxed_check 0
30#endif
31
32/*********************************************************************
33 * GET PROCESSOR CORE SPEED IN KHZ *
34 *********************************************************************/
35
36static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
37{
38 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
39 struct {
40 unsigned int ratio; /* Frequency Multiplier (x10) */
41 u8 bitmap; /* power on configuration bits
42 [27, 25:22] (in MSR 0x2a) */
43 } msr_decode_mult[] = {
44 { 30, 0x01 },
45 { 35, 0x05 },
46 { 40, 0x02 },
47 { 45, 0x06 },
48 { 50, 0x00 },
49 { 55, 0x04 },
50 { 60, 0x0b },
51 { 65, 0x0f },
52 { 70, 0x09 },
53 { 75, 0x0d },
54 { 80, 0x0a },
55 { 85, 0x26 },
56 { 90, 0x20 },
57 { 100, 0x2b },
58 { 0, 0xff } /* error or unknown value */
59 };
60
61 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
62 struct {
63 unsigned int value; /* Front Side Bus speed in MHz */
64 u8 bitmap; /* power on configuration bits [18: 19]
65 (in MSR 0x2a) */
66 } msr_decode_fsb[] = {
67 { 66, 0x0 },
68 { 100, 0x2 },
69 { 133, 0x1 },
70 { 0, 0xff}
71 };
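	/*
	 * Worked example: a power-on configuration with bitmap 0x06 in
	 * msr_decode_mult (ratio 45, i.e. a 4.5x multiplier) combined with
	 * a 100 MHz FSB yields 45 * 100 * 100 = 450000 kHz (450 MHz) below.
	 */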
72
73 u32 msr_lo, msr_tmp;
74 int i = 0, j = 0;
75
76 /* read MSR 0x2a - we only need the low 32 bits */
77 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
78 dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
79 msr_tmp = msr_lo;
80
81 /* decode the FSB */
82 msr_tmp &= 0x00c0000;
83 msr_tmp >>= 18;
84 while (msr_tmp != msr_decode_fsb[i].bitmap) {
85 if (msr_decode_fsb[i].bitmap == 0xff)
86 return 0;
87 i++;
88 }
89
90 /* decode the multiplier */
91 if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
92 dprintk("workaround for early PIIIs\n");
93 msr_lo &= 0x03c00000;
94 } else
95 msr_lo &= 0x0bc00000;
96 msr_lo >>= 22;
97 while (msr_lo != msr_decode_mult[j].bitmap) {
98 if (msr_decode_mult[j].bitmap == 0xff)
99 return 0;
100 j++;
101 }
102
103 dprintk("speed is %u\n",
104 (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
105
106 return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
107}
108
109
110static unsigned int pentiumM_get_frequency(void)
111{
112 u32 msr_lo, msr_tmp;
113
114 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
115 dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
116
117 /* see table B-2 of 24547212.pdf */
118 if (msr_lo & 0x00040000) {
119 printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
120 msr_lo, msr_tmp);
121 return 0;
122 }
123
124 msr_tmp = (msr_lo >> 22) & 0x1f;
125 dprintk("bits 22-26 are 0x%x, speed is %u\n",
126 msr_tmp, (msr_tmp * 100 * 1000));
127
128 return msr_tmp * 100 * 1000;
129}
130
131static unsigned int pentium_core_get_frequency(void)
132{
133 u32 fsb = 0;
134 u32 msr_lo, msr_tmp;
135 int ret;
136
137 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
138 /* see table B-2 of 25366920.pdf */
139 switch (msr_lo & 0x07) {
140 case 5:
141 fsb = 100000;
142 break;
143 case 1:
144 fsb = 133333;
145 break;
146 case 3:
147 fsb = 166667;
148 break;
149 case 2:
150 fsb = 200000;
151 break;
152 case 0:
153 fsb = 266667;
154 break;
155 case 4:
156 fsb = 333333;
157 break;
158 default:
159		printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value\n");
160 }
161
162 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
163 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
164 msr_lo, msr_tmp);
165
166 msr_tmp = (msr_lo >> 22) & 0x1f;
167 dprintk("bits 22-26 are 0x%x, speed is %u\n",
168 msr_tmp, (msr_tmp * fsb));
169
170 ret = (msr_tmp * fsb);
171 return ret;
172}
173
174
175static unsigned int pentium4_get_frequency(void)
176{
177 struct cpuinfo_x86 *c = &boot_cpu_data;
178 u32 msr_lo, msr_hi, mult;
179 unsigned int fsb = 0;
180 unsigned int ret;
181 u8 fsb_code;
182
183 /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
184 * to System Bus Frequency Ratio Field in the Processor Frequency
185	 * Configuration MSR. Therefore the current
186 * frequency cannot be calculated and has to be measured.
187 */
188 if (c->x86_model < 2)
189 return cpu_khz;
190
191 rdmsr(0x2c, msr_lo, msr_hi);
192
193 dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
194
195 /* decode the FSB: see IA-32 Intel (C) Architecture Software
196	 * Developer's Manual, Volume 3: System Programming Guide,
197 * revision #12 in Table B-1: MSRs in the Pentium 4 and
198 * Intel Xeon Processors, on page B-4 and B-5.
199 */
200 fsb_code = (msr_lo >> 16) & 0x7;
201 switch (fsb_code) {
202 case 0:
203 fsb = 100 * 1000;
204 break;
205 case 1:
206 fsb = 13333 * 10;
207 break;
208 case 2:
209 fsb = 200 * 1000;
210 break;
211 }
212
213 if (!fsb)
214 printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
215 "Please send an e-mail to <linux@brodo.de>\n");
216
217 /* Multiplier. */
218 mult = msr_lo >> 24;
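	/*
	 * Worked example: fsb_code 2 selects a 200000 kHz FSB; with a
	 * multiplier of 14 read from bits 24 and up, the core speed below
	 * comes out as 200000 * 14 = 2800000 kHz (2.8 GHz).
	 */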
219
220 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
221 fsb, mult, (fsb * mult));
222
223 ret = (fsb * mult);
224 return ret;
225}
226
227
228/* Warning: may get called from smp_call_function_single. */
229unsigned int speedstep_get_frequency(enum speedstep_processor processor)
230{
231 switch (processor) {
232 case SPEEDSTEP_CPU_PCORE:
233 return pentium_core_get_frequency();
234 case SPEEDSTEP_CPU_PM:
235 return pentiumM_get_frequency();
236 case SPEEDSTEP_CPU_P4D:
237 case SPEEDSTEP_CPU_P4M:
238 return pentium4_get_frequency();
239 case SPEEDSTEP_CPU_PIII_T:
240 case SPEEDSTEP_CPU_PIII_C:
241 case SPEEDSTEP_CPU_PIII_C_EARLY:
242 return pentium3_get_frequency(processor);
243 default:
244 return 0;
245	}
246 return 0;
247}
248EXPORT_SYMBOL_GPL(speedstep_get_frequency);
249
250
251/*********************************************************************
252 * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
253 *********************************************************************/
254
255unsigned int speedstep_detect_processor(void)
256{
257 struct cpuinfo_x86 *c = &cpu_data(0);
258 u32 ebx, msr_lo, msr_hi;
259
260 dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
261
262 if ((c->x86_vendor != X86_VENDOR_INTEL) ||
263 ((c->x86 != 6) && (c->x86 != 0xF)))
264 return 0;
265
266 if (c->x86 == 0xF) {
267 /* Intel Mobile Pentium 4-M
268 * or Intel Mobile Pentium 4 with 533 MHz FSB */
269 if (c->x86_model != 2)
270 return 0;
271
272 ebx = cpuid_ebx(0x00000001);
273 ebx &= 0x000000FF;
274
275 dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
276
277 switch (c->x86_mask) {
278 case 4:
279 /*
280 * B-stepping [M-P4-M]
281 * sample has ebx = 0x0f, production has 0x0e.
282 */
283 if ((ebx == 0x0e) || (ebx == 0x0f))
284 return SPEEDSTEP_CPU_P4M;
285 break;
286 case 7:
287 /*
288 * C-stepping [M-P4-M]
289 * needs to have ebx=0x0e, else it's a celeron:
290 * cf. 25130917.pdf / page 7, footnote 5 even
291 * though 25072120.pdf / page 7 doesn't say
292 * samples are only of B-stepping...
293 */
294 if (ebx == 0x0e)
295 return SPEEDSTEP_CPU_P4M;
296 break;
297 case 9:
298 /*
299 * D-stepping [M-P4-M or M-P4/533]
300 *
301 * this is totally strange: CPUID 0x0F29 is
302 * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
303 * The latter need to be sorted out as they don't
304 * support speedstep.
305 * Celerons with CPUID 0x0F29 may have either
306 * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
307 * specific.
308 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
309 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
310 * also, M-P4M HTs have ebx=0x8, too
311 * For now, they are distinguished by the model_id
312 * string
313 */
314 if ((ebx == 0x0e) ||
315 (strstr(c->x86_model_id,
316 "Mobile Intel(R) Pentium(R) 4") != NULL))
317 return SPEEDSTEP_CPU_P4M;
318 break;
319 default:
320 break;
321 }
322 return 0;
323 }
324
325 switch (c->x86_model) {
326 case 0x0B: /* Intel PIII [Tualatin] */
327 /* cpuid_ebx(1) is 0x04 for desktop PIII,
328 * 0x06 for mobile PIII-M */
329 ebx = cpuid_ebx(0x00000001);
330 dprintk("ebx is %x\n", ebx);
331
332 ebx &= 0x000000FF;
333
334 if (ebx != 0x06)
335 return 0;
336
337 /* So far all PIII-M processors support SpeedStep. See
338 * Intel's 24540640.pdf of June 2003
339 */
340 return SPEEDSTEP_CPU_PIII_T;
341
342 case 0x08: /* Intel PIII [Coppermine] */
343
344 /* all mobile PIII Coppermines have FSB 100 MHz
345 * ==> sort out a few desktop PIIIs. */
346 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
347 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
348 msr_lo, msr_hi);
349 msr_lo &= 0x00c0000;
350 if (msr_lo != 0x0080000)
351 return 0;
352
353 /*
354 * If the processor is a mobile version,
355 * the platform ID has bit 50 set;
356 * it has SpeedStep technology if either
357 * bit 56 or 57 is set
358 */
359 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
360 dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
361 msr_lo, msr_hi);
362 if ((msr_hi & (1<<18)) &&
363 (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
364 if (c->x86_mask == 0x01) {
365 dprintk("early PIII version\n");
366 return SPEEDSTEP_CPU_PIII_C_EARLY;
367 } else
368 return SPEEDSTEP_CPU_PIII_C;
369 }
370
371 default:
372 return 0;
373 }
374}
375EXPORT_SYMBOL_GPL(speedstep_detect_processor);
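
One detail worth spelling out in the Coppermine branch above: the comment speaks of bits 50, 56 and 57 of the 64-bit MSR_IA32_PLATFORM_ID value, while the code tests msr_hi, i.e. the upper 32 bits, so the masks are simply shifted down by 32:

	/* bit 50 of the 64-bit MSR  ->  bit (50 - 32) = 18 of msr_hi  ->  1 << 18
	 * bits 56/57                ->  bits 24/25 of msr_hi          ->  3 << 24 */
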
376
377
378/*********************************************************************
379 * DETECT SPEEDSTEP SPEEDS *
380 *********************************************************************/
381
382unsigned int speedstep_get_freqs(enum speedstep_processor processor,
383 unsigned int *low_speed,
384 unsigned int *high_speed,
385 unsigned int *transition_latency,
386 void (*set_state) (unsigned int state))
387{
388 unsigned int prev_speed;
389 unsigned int ret = 0;
390 unsigned long flags;
391 struct timeval tv1, tv2;
392
393 if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
394 return -EINVAL;
395
396 dprintk("trying to determine both speeds\n");
397
398 /* get current speed */
399 prev_speed = speedstep_get_frequency(processor);
400 if (!prev_speed)
401 return -EIO;
402
403 dprintk("previous speed is %u\n", prev_speed);
404
405 local_irq_save(flags);
406
407 /* switch to low state */
408 set_state(SPEEDSTEP_LOW);
409 *low_speed = speedstep_get_frequency(processor);
410 if (!*low_speed) {
411 ret = -EIO;
412 goto out;
413 }
414
415 dprintk("low speed is %u\n", *low_speed);
416
417 /* start latency measurement */
418 if (transition_latency)
419 do_gettimeofday(&tv1);
420
421 /* switch to high state */
422 set_state(SPEEDSTEP_HIGH);
423
424 /* end latency measurement */
425 if (transition_latency)
426 do_gettimeofday(&tv2);
427
428 *high_speed = speedstep_get_frequency(processor);
429 if (!*high_speed) {
430 ret = -EIO;
431 goto out;
432 }
433
434 dprintk("high speed is %u\n", *high_speed);
435
436 if (*low_speed == *high_speed) {
437 ret = -ENODEV;
438 goto out;
439 }
440
441 /* switch to previous state, if necessary */
442 if (*high_speed != prev_speed)
443 set_state(SPEEDSTEP_LOW);
444
445 if (transition_latency) {
446 *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
447 tv2.tv_usec - tv1.tv_usec;
448 dprintk("transition latency is %u uSec\n", *transition_latency);
449
450 /* convert uSec to nSec and add 20% for safety reasons */
451 *transition_latency *= 1200;
452
453 /* check if the latency measurement is too high or too low
454 * and set it to a safe value (500uSec) in that case
455 */
456 if (*transition_latency > 10000000 ||
457 *transition_latency < 50000) {
458 printk(KERN_WARNING PFX "frequency transition "
459 "measured seems out of range (%u "
460 "nSec), falling back to a safe one of "
461 "%u nSec.\n",
462 *transition_latency, 500000);
463 *transition_latency = 500000;
464 }
465 }
466
467out:
468 local_irq_restore(flags);
469 return ret;
470}
471EXPORT_SYMBOL_GPL(speedstep_get_freqs);
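
To make the unit handling above concrete (the measured value is hypothetical): a delta of 800 usec between the two gettimeofday() samples is stored as 800 * 1200 = 960000 nsec, i.e. the measured latency converted to nanoseconds with the 20% safety margin added, which also lies inside the accepted 50000..10000000 nsec window.
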
472
473#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
474module_param(relaxed_check, int, 0444);
475MODULE_PARM_DESC(relaxed_check,
476 "Don't do all checks for speedstep capability.");
477#endif
478
479MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
480MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
481MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
deleted file mode 100644
index 70d9cea1219d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ /dev/null
@@ -1,49 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11
12
13/* processors */
14enum speedstep_processor {
15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19/* the following processors are not speedstep-capable and are not auto-detected
20 * in speedstep_detect_processor(). However, their speed can be detected using
21 * the speedstep_get_frequency() call. */
22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26
27/* speedstep states -- only two of them */
28
29#define SPEEDSTEP_HIGH 0x00000000
30#define SPEEDSTEP_LOW 0x00000001
31
32
33/* detect a speedstep-capable processor */
34extern enum speedstep_processor speedstep_detect_processor(void);
35
36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38
39
40/* detect the low and high speeds of the processor. The callback
41 * set_state's first argument is either SPEEDSTEP_HIGH or
42 * SPEEDSTEP_LOW; the second argument is zero so that no
43 * cpufreq_notify_transition calls are initiated.
44 */
45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed,
47 unsigned int *high_speed,
48 unsigned int *transition_latency,
49 void (*set_state) (unsigned int state));
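
For orientation, a minimal sketch of how a SpeedStep driver might use this header follows; my_set_state and my_driver_init are hypothetical names standing in for the concrete driver's state-switching callback and init path:

	/* minimal usage sketch -- my_set_state/my_driver_init are hypothetical */
	static void my_set_state(unsigned int state)
	{
		/* switch the hardware to SPEEDSTEP_HIGH or SPEEDSTEP_LOW */
	}

	static int my_driver_init(void)
	{
		unsigned int low, high, latency;
		enum speedstep_processor proc = speedstep_detect_processor();

		if (!proc)
			return -ENODEV;
		if (speedstep_get_freqs(proc, &low, &high, &latency, my_set_state))
			return -EIO;
		/* low/high are in kHz, latency in nanoseconds (see speedstep-lib.c) */
		return 0;
	}
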
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
deleted file mode 100644
index 91bc25b67bc1..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ /dev/null
@@ -1,467 +0,0 @@
1/*
2 * Intel SpeedStep SMI driver.
3 *
4 * (C) 2003 Hiroshi Miura <miura@da-cha.org>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 */
9
10
11/*********************************************************************
12 * SPEEDSTEP - DEFINITIONS *
13 *********************************************************************/
14
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/moduleparam.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/delay.h>
21#include <linux/io.h>
22#include <asm/ist.h>
23
24#include "speedstep-lib.h"
25
26/* speedstep system management interface port/command.
27 *
28 * These parameters are obtained from the IST-SMI BIOS call.
29 * If the user supplies them as module parameters, those values are used instead.
30 *
31 */
32static int smi_port;
33static int smi_cmd;
34static unsigned int smi_sig;
35
36/* info about the processor */
37static enum speedstep_processor speedstep_processor;
38
39/*
40 * There are only two frequency states for each processor. Values
41 * are in kHz for the time being.
42 */
43static struct cpufreq_frequency_table speedstep_freqs[] = {
44 {SPEEDSTEP_HIGH, 0},
45 {SPEEDSTEP_LOW, 0},
46 {0, CPUFREQ_TABLE_END},
47};
48
49#define GET_SPEEDSTEP_OWNER 0
50#define GET_SPEEDSTEP_STATE 1
51#define SET_SPEEDSTEP_STATE 2
52#define GET_SPEEDSTEP_FREQS 4
53
54/* how often the SMI call shall be retried if it fails, e.g. because
55 * DMA activity is going on */
56#define SMI_TRIES 5
57
58#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
59 "speedstep-smi", msg)
60
61/**
62 * speedstep_smi_ownership - obtain ownership of the SpeedStep SMI interface
63 */
64static int speedstep_smi_ownership(void)
65{
66 u32 command, result, magic, dummy;
67 u32 function = GET_SPEEDSTEP_OWNER;
68 unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
69
70 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
71 magic = virt_to_phys(magic_data);
72
73 dprintk("trying to obtain ownership with command %x at port %x\n",
74 command, smi_port);
75
76 __asm__ __volatile__(
77 "push %%ebp\n"
78 "out %%al, (%%dx)\n"
79 "pop %%ebp\n"
80 : "=D" (result),
81 "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
82 "=S" (dummy)
83 : "a" (command), "b" (function), "c" (0), "d" (smi_port),
84 "D" (0), "S" (magic)
85 : "memory"
86 );
87
88 dprintk("result is %x\n", result);
89
90 return result;
91}
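
The same inline-assembly calling pattern recurs in the other SMI helpers in this file; a sketch of the register convention, derived only from the asm constraints (the underlying BIOS contract is not spelled out here):

	/*
	 * on entry:  EAX = (smi_sig & 0xffffff00) | smi_cmd,  EBX = function,
	 *            ECX = state argument,  EDX = smi_port ("out %al,(%dx)"),
	 *            ESI/EDI = auxiliary data (e.g. the ownership magic address)
	 * on return: the ownership and set-state calls report their result in EDI,
	 *            while the get-freqs and get-state calls report it in EAX
	 *            (with frequencies/state coming back in EBX/ECX).
	 */
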
92
93/**
94 * speedstep_smi_get_freqs - get SpeedStep preferred & current freq.
95 * @low: the low frequency value is placed here
96 * @high: the high frequency value is placed here
97 *
98 * Only available on later SpeedStep-enabled systems, returns false results or
99 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
100 * shows that the latter occurs if !(ist_info.event & 0xFFFF).
101 */
102static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
103{
104 u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
105 u32 state = 0;
106 u32 function = GET_SPEEDSTEP_FREQS;
107
108 if (!(ist_info.event & 0xFFFF)) {
109 dprintk("bug #1422 -- can't read freqs from BIOS\n");
110 return -ENODEV;
111 }
112
113 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
114
115 dprintk("trying to determine frequencies with command %x at port %x\n",
116 command, smi_port);
117
118 __asm__ __volatile__(
119 "push %%ebp\n"
120 "out %%al, (%%dx)\n"
121 "pop %%ebp"
122 : "=a" (result),
123 "=b" (high_mhz),
124 "=c" (low_mhz),
125 "=d" (state), "=D" (edi), "=S" (dummy)
126 : "a" (command),
127 "b" (function),
128 "c" (state),
129 "d" (smi_port), "S" (0), "D" (0)
130 );
131
132 dprintk("result %x, low_freq %u, high_freq %u\n",
133 result, low_mhz, high_mhz);
134
135 /* abort if results are obviously incorrect... */
136 if ((high_mhz + low_mhz) < 600)
137 return -EINVAL;
138
139 *high = high_mhz * 1000;
140 *low = low_mhz * 1000;
141
142 return result;
143}
144
145/**
146 * speedstep_get_state - read the current SpeedStep state
147 * Returns the current processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH).
148 *
149 */
150static int speedstep_get_state(void)
151{
152 u32 function = GET_SPEEDSTEP_STATE;
153 u32 result, state, edi, command, dummy;
154
155 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
156
157 dprintk("trying to determine current setting with command %x "
158 "at port %x\n", command, smi_port);
159
160 __asm__ __volatile__(
161 "push %%ebp\n"
162 "out %%al, (%%dx)\n"
163 "pop %%ebp\n"
164 : "=a" (result),
165 "=b" (state), "=D" (edi),
166 "=c" (dummy), "=d" (dummy), "=S" (dummy)
167 : "a" (command), "b" (function), "c" (0),
168 "d" (smi_port), "S" (0), "D" (0)
169 );
170
171 dprintk("state is %x, result is %x\n", state, result);
172
173 return state & 1;
174}
175
176
177/**
178 * speedstep_set_state - set the SpeedStep state
179 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
180 *
181 */
182static void speedstep_set_state(unsigned int state)
183{
184 unsigned int result = 0, command, new_state, dummy;
185 unsigned long flags;
186 unsigned int function = SET_SPEEDSTEP_STATE;
187 unsigned int retry = 0;
188
189 if (state > 0x1)
190 return;
191
192 /* Disable IRQs */
193 local_irq_save(flags);
194
195 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
196
197 dprintk("trying to set frequency to state %u "
198 "with command %x at port %x\n",
199 state, command, smi_port);
200
201 do {
202 if (retry) {
203 dprintk("retry %u, previous result %u, waiting...\n",
204 retry, result);
205 mdelay(retry * 50);
206 }
207 retry++;
208 __asm__ __volatile__(
209 "push %%ebp\n"
210 "out %%al, (%%dx)\n"
211 "pop %%ebp"
212 : "=b" (new_state), "=D" (result),
213 "=c" (dummy), "=a" (dummy),
214 "=d" (dummy), "=S" (dummy)
215 : "a" (command), "b" (function), "c" (state),
216 "d" (smi_port), "S" (0), "D" (0)
217 );
218 } while ((new_state != state) && (retry <= SMI_TRIES));
219
220 /* enable IRQs */
221 local_irq_restore(flags);
222
223 if (new_state == state)
224 dprintk("change to %u MHz succeeded after %u tries "
225 "with result %u\n",
226 (speedstep_freqs[new_state].frequency / 1000),
227 retry, result);
228 else
229 printk(KERN_ERR "cpufreq: change to state %u "
230 "failed with new_state %u and result %u\n",
231 state, new_state, result);
232
233 return;
234}
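
A quick worked note on the retry loop above: the first attempt is issued immediately, and up to SMI_TRIES (5) further attempts follow with back-off delays of 50, 100, 150, 200 and 250 ms, i.e. roughly 750 ms of delay in the worst case before the failure is reported.
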
235
236
237/**
238 * speedstep_target - set a new CPUFreq policy
239 * @policy: new policy
240 * @target_freq: new freq
241 * @relation: how to round to a table entry (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
242 *
243 * Sets a new CPUFreq policy/freq.
244 */
245static int speedstep_target(struct cpufreq_policy *policy,
246 unsigned int target_freq, unsigned int relation)
247{
248 unsigned int newstate = 0;
249 struct cpufreq_freqs freqs;
250
251 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
252 target_freq, relation, &newstate))
253 return -EINVAL;
254
255 freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
256 freqs.new = speedstep_freqs[newstate].frequency;
257 freqs.cpu = 0; /* speedstep.c is UP only driver */
258
259 if (freqs.old == freqs.new)
260 return 0;
261
262 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
263 speedstep_set_state(newstate);
264 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
265
266 return 0;
267}
268
269
270/**
271 * speedstep_verify - verifies a new CPUFreq policy
272 * @policy: new policy
273 *
274 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
275 * at least one border included.
276 */
277static int speedstep_verify(struct cpufreq_policy *policy)
278{
279 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
280}
281
282
283static int speedstep_cpu_init(struct cpufreq_policy *policy)
284{
285 int result;
286 unsigned int speed, state;
287 unsigned int *low, *high;
288
289 /* capability check */
290 if (policy->cpu != 0)
291 return -ENODEV;
292
293 result = speedstep_smi_ownership();
294 if (result) {
295 dprintk("failed to acquire ownership of the SMI interface.\n");
296 return -EINVAL;
297 }
298
299 /* detect low and high frequency */
300 low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
301 high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
302
303 result = speedstep_smi_get_freqs(low, high);
304 if (result) {
305 /* fall back to the speedstep-lib.c detection mechanism:
306 * try both states out */
307 dprintk("could not detect low and high frequencies "
308 "by SMI call.\n");
309 result = speedstep_get_freqs(speedstep_processor,
310 low, high,
311 NULL,
312 &speedstep_set_state);
313
314 if (result) {
315 dprintk("could not detect two different speeds"
316 " -- aborting.\n");
317 return result;
318 } else
319 dprintk("workaround worked.\n");
320 }
321
322 /* get current speed setting */
323 state = speedstep_get_state();
324 speed = speedstep_freqs[state].frequency;
325
326 dprintk("currently at %s speed setting - %i MHz\n",
327 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
328 ? "low" : "high",
329 (speed / 1000));
330
331 /* cpuinfo and default policy values */
332 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
333 policy->cur = speed;
334
335 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
336 if (result)
337 return result;
338
339 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
340
341 return 0;
342}
343
344static int speedstep_cpu_exit(struct cpufreq_policy *policy)
345{
346 cpufreq_frequency_table_put_attr(policy->cpu);
347 return 0;
348}
349
350static unsigned int speedstep_get(unsigned int cpu)
351{
352 if (cpu)
353 return -ENODEV;
354 return speedstep_get_frequency(speedstep_processor);
355}
356
357
358static int speedstep_resume(struct cpufreq_policy *policy)
359{
360 int result = speedstep_smi_ownership();
361
362 if (result)
363 dprintk("failed to re-acquire ownership of the SMI interface.\n");
364
365 return result;
366}
367
368static struct freq_attr *speedstep_attr[] = {
369 &cpufreq_freq_attr_scaling_available_freqs,
370 NULL,
371};
372
373static struct cpufreq_driver speedstep_driver = {
374 .name = "speedstep-smi",
375 .verify = speedstep_verify,
376 .target = speedstep_target,
377 .init = speedstep_cpu_init,
378 .exit = speedstep_cpu_exit,
379 .get = speedstep_get,
380 .resume = speedstep_resume,
381 .owner = THIS_MODULE,
382 .attr = speedstep_attr,
383};
384
385/**
386 * speedstep_init - initializes the SpeedStep CPUFreq driver
387 *
388 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
389 * BIOS, -EINVAL on problems during initialization, and zero on
390 * success.
391 */
392static int __init speedstep_init(void)
393{
394 speedstep_processor = speedstep_detect_processor();
395
396 switch (speedstep_processor) {
397 case SPEEDSTEP_CPU_PIII_T:
398 case SPEEDSTEP_CPU_PIII_C:
399 case SPEEDSTEP_CPU_PIII_C_EARLY:
400 break;
401 default:
402 speedstep_processor = 0;
403 }
404
405 if (!speedstep_processor) {
406 dprintk("No supported Intel CPU detected.\n");
407 return -ENODEV;
408 }
409
410 dprintk("signature:0x%.8lx, command:0x%.8lx, "
411 "event:0x%.8lx, perf_level:0x%.8lx.\n",
412 ist_info.signature, ist_info.command,
413 ist_info.event, ist_info.perf_level);
414
415 /* Error if no IST-SMI BIOS or no PARM
416 sig= 'ISGE' aka 'Intel Speedstep Gate E' */
417 if ((ist_info.signature != 0x47534943) && (
418 (smi_port == 0) || (smi_cmd == 0)))
419 return -ENODEV;
420
421 if (smi_sig == 1)
422 smi_sig = 0x47534943;
423 else
424 smi_sig = ist_info.signature;
425
426 /* set up smi_port from module parameters or the BIOS */
427 if ((smi_port > 0xff) || (smi_port < 0))
428 return -EINVAL;
429 else if (smi_port == 0)
430 smi_port = ist_info.command & 0xff;
431
432 if ((smi_cmd > 0xff) || (smi_cmd < 0))
433 return -EINVAL;
434 else if (smi_cmd == 0)
435 smi_cmd = (ist_info.command >> 16) & 0xff;
436
437 return cpufreq_register_driver(&speedstep_driver);
438}
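
As a worked example of the port/command extraction above (the ist_info.command value is hypothetical): with ist_info.command == 0x008200b2,

	smi_port = 0x008200b2 & 0xff;		/* = 0xb2 */
	smi_cmd  = (0x008200b2 >> 16) & 0xff;	/* = 0x82 */

which happens to match Intel's documented defaults mentioned in the MODULE_PARM_DESC strings below.
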
439
440
441/**
442 * speedstep_exit - unregisters SpeedStep support
443 *
444 * Unregisters SpeedStep support.
445 */
446static void __exit speedstep_exit(void)
447{
448 cpufreq_unregister_driver(&speedstep_driver);
449}
450
451module_param(smi_port, int, 0444);
452module_param(smi_cmd, int, 0444);
453module_param(smi_sig, uint, 0444);
454
455MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
456 "-- Intel's default setting is 0xb2");
457MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
458 "-- Intel's default setting is 0x82");
459MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
460 "SMI interface.");
461
462MODULE_AUTHOR("Hiroshi Miura");
463MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
464MODULE_LICENSE("GPL");
465
466module_init(speedstep_init);
467module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index df86bc8c859d..1edf5ba4fb2b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,10 +29,10 @@
29 29
30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
31{ 31{
32 u64 misc_enable;
33
32 /* Unmask CPUID levels if masked: */ 34 /* Unmask CPUID levels if masked: */
33 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { 35 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
34 u64 misc_enable;
35
36 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 36 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
37 37
38 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { 38 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
118 * (model 2) with the same problem. 118 * (model 2) with the same problem.
119 */ 119 */
120 if (c->x86 == 15) { 120 if (c->x86 == 15) {
121 u64 misc_enable;
122
123 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 121 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
124 122
125 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { 123 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
130 } 128 }
131 } 129 }
132#endif 130#endif
131
132 /*
133 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
134 * clear the fast string and enhanced fast string CPU capabilities.
135 */
136 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
137 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
138 if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
139 printk(KERN_INFO "Disabled fast string operations\n");
140 setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
141 setup_clear_cpu_cap(X86_FEATURE_ERMS);
142 }
143 }
133} 144}
134 145
135#ifdef CONFIG_X86_32 146#ifdef CONFIG_X86_32
@@ -400,12 +411,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
400 411
401 switch (c->x86_model) { 412 switch (c->x86_model) {
402 case 5: 413 case 5:
403 if (c->x86_mask == 0) { 414 if (l2 == 0)
404 if (l2 == 0) 415 p = "Celeron (Covington)";
405 p = "Celeron (Covington)"; 416 else if (l2 == 256)
406 else if (l2 == 256) 417 p = "Mobile Pentium II (Dixon)";
407 p = "Mobile Pentium II (Dixon)";
408 }
409 break; 418 break;
410 419
411 case 6: 420 case 6:
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1ce1af2899df..c105c533ed94 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -327,7 +327,6 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
327 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); 327 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
328 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); 328 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
329 329
330 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
331 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; 330 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
332} 331}
333 332
@@ -454,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
454{ 453{
455 int ret = 0; 454 int ret = 0;
456 455
457#define SUBCACHE_MASK (3UL << 20) 456 /* check if @slot is already used or the index is already disabled */
458#define SUBCACHE_INDEX 0xfff
459
460 /*
461 * check whether this slot is already used or
462 * the index is already disabled
463 */
464 ret = amd_get_l3_disable_slot(l3, slot); 457 ret = amd_get_l3_disable_slot(l3, slot);
465 if (ret >= 0) 458 if (ret >= 0)
466 return -EINVAL; 459 return -EINVAL;
467 460
468 /* 461 if (index > l3->indices)
469 * check whether the other slot has disabled the
470 * same index already
471 */
472 if (index == amd_get_l3_disable_slot(l3, !slot))
473 return -EINVAL; 462 return -EINVAL;
474 463
475 /* do not allow writes outside of allowed bits */ 464 /* check whether the other slot has disabled the same index already */
476 if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || 465 if (index == amd_get_l3_disable_slot(l3, !slot))
477 ((index & SUBCACHE_INDEX) > l3->indices))
478 return -EINVAL; 466 return -EINVAL;
479 467
480 amd_l3_disable_index(l3, cpu, slot, index); 468 amd_l3_disable_index(l3, cpu, slot, index);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3385ea26f684..ff1ae9b6464d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -105,20 +105,6 @@ static int cpu_missing;
105ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 105ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
106EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); 106EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
107 107
108static int default_decode_mce(struct notifier_block *nb, unsigned long val,
109 void *data)
110{
111 pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
112 pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
113
114 return NOTIFY_STOP;
115}
116
117static struct notifier_block mce_dec_nb = {
118 .notifier_call = default_decode_mce,
119 .priority = -1,
120};
121
122/* MCA banks polled by the period polling timer for corrected events */ 108/* MCA banks polled by the period polling timer for corrected events */
123DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 109DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
124 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 110 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -212,6 +198,8 @@ void mce_log(struct mce *mce)
212 198
213static void print_mce(struct mce *m) 199static void print_mce(struct mce *m)
214{ 200{
201 int ret = 0;
202
215 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 203 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
216 m->extcpu, m->mcgstatus, m->bank, m->status); 204 m->extcpu, m->mcgstatus, m->bank, m->status);
217 205
@@ -239,7 +227,11 @@ static void print_mce(struct mce *m)
239 * Print out human-readable details about the MCE error, 227 * Print out human-readable details about the MCE error,
240 * (if the CPU has an implementation for that) 228 * (if the CPU has an implementation for that)
241 */ 229 */
242 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 230 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
231 if (ret == NOTIFY_STOP)
232 return;
233
234 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
243} 235}
244 236
245#define PANIC_TIMEOUT 5 /* 5 seconds */ 237#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -590,7 +582,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
590 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 582 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
591 mce_log(&m); 583 mce_log(&m);
592 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); 584 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
593 add_taint(TAINT_MACHINE_CHECK);
594 } 585 }
595 586
596 /* 587 /*
@@ -1722,8 +1713,6 @@ __setup("mce", mcheck_enable);
1722 1713
1723int __init mcheck_init(void) 1714int __init mcheck_init(void)
1724{ 1715{
1725 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1726
1727 mcheck_intel_therm_init(); 1716 mcheck_intel_therm_init();
1728 1717
1729 return 0; 1718 return 0;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 167f97b5596e..bb0adad35143 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -509,6 +509,7 @@ recurse:
509out_free: 509out_free:
510 if (b) { 510 if (b) {
511 kobject_put(&b->kobj); 511 kobject_put(&b->kobj);
512 list_del(&b->miscj);
512 kfree(b); 513 kfree(b);
513 } 514 }
514 return err; 515 return err;
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6f8c5e9da97f..27c625178bf1 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -187,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level)
187 this_cpu, 187 this_cpu,
188 level == CORE_LEVEL ? "Core" : "Package", 188 level == CORE_LEVEL ? "Core" : "Package",
189 state->count); 189 state->count);
190
191 add_taint(TAINT_MACHINE_CHECK);
192 return 1; 190 return 1;
193 } 191 }
194 if (old_event) { 192 if (old_event) {
@@ -355,7 +353,6 @@ static void notify_thresholds(__u64 msr_val)
355static void intel_thermal_interrupt(void) 353static void intel_thermal_interrupt(void)
356{ 354{
357 __u64 msr_val; 355 __u64 msr_val;
358 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
359 356
360 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 357 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
361 358
@@ -367,19 +364,19 @@ static void intel_thermal_interrupt(void)
367 CORE_LEVEL) != 0) 364 CORE_LEVEL) != 0)
368 mce_log_therm_throt_event(CORE_THROTTLED | msr_val); 365 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
369 366
370 if (cpu_has(c, X86_FEATURE_PLN)) 367 if (this_cpu_has(X86_FEATURE_PLN))
371 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, 368 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
372 POWER_LIMIT_EVENT, 369 POWER_LIMIT_EVENT,
373 CORE_LEVEL) != 0) 370 CORE_LEVEL) != 0)
374 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); 371 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
375 372
376 if (cpu_has(c, X86_FEATURE_PTS)) { 373 if (this_cpu_has(X86_FEATURE_PTS)) {
377 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 374 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
378 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, 375 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
379 THERMAL_THROTTLING_EVENT, 376 THERMAL_THROTTLING_EVENT,
380 PACKAGE_LEVEL) != 0) 377 PACKAGE_LEVEL) != 0)
381 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); 378 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
382 if (cpu_has(c, X86_FEATURE_PLN)) 379 if (this_cpu_has(X86_FEATURE_PLN))
383 if (therm_throt_process(msr_val & 380 if (therm_throt_process(msr_val &
384 PACKAGE_THERM_STATUS_POWER_LIMIT, 381 PACKAGE_THERM_STATUS_POWER_LIMIT,
385 POWER_LIMIT_EVENT, 382 POWER_LIMIT_EVENT,
@@ -393,7 +390,6 @@ static void unexpected_thermal_interrupt(void)
393{ 390{
394 printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", 391 printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
395 smp_processor_id()); 392 smp_processor_id());
396 add_taint(TAINT_MACHINE_CHECK);
397} 393}
398 394
399static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; 395static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
@@ -446,18 +442,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
446 */ 442 */
447 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 443 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
448 444
445 h = lvtthmr_init;
449 /* 446 /*
450 * The initial value of thermal LVT entries on all APs always reads 447 * The initial value of thermal LVT entries on all APs always reads
451 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI 448 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
452 * sequence to them and LVT registers are reset to 0s except for 449 * sequence to them and LVT registers are reset to 0s except for
453 * the mask bits which are set to 1s when APs receive INIT IPI. 450 * the mask bits which are set to 1s when APs receive INIT IPI.
454 * Always restore the value that BIOS has programmed on AP based on 451 * If BIOS takes over the thermal interrupt and sets its interrupt
455 * BSP's info we saved since BIOS is always setting the same value 452 * delivery mode to SMI (not fixed), it restores the value that the
456 * for all threads/cores 453 * BIOS has programmed on AP based on BSP's info we saved since BIOS
454 * is always setting the same value for all threads/cores.
457 */ 455 */
458 apic_write(APIC_LVTTHMR, lvtthmr_init); 456 if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
457 apic_write(APIC_LVTTHMR, lvtthmr_init);
459 458
460 h = lvtthmr_init;
461 459
462 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 460 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
463 printk(KERN_DEBUG 461 printk(KERN_DEBUG
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index eed3673a8656..3a0338b4b179 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,7 @@
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33#include <asm/smp.h> 33#include <asm/smp.h>
34#include <asm/alternative.h>
34 35
35#if 0 36#if 0
36#undef wrmsrl 37#undef wrmsrl
@@ -363,12 +364,18 @@ again:
363 return new_raw_count; 364 return new_raw_count;
364} 365}
365 366
366/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */
367static inline int x86_pmu_addr_offset(int index) 367static inline int x86_pmu_addr_offset(int index)
368{ 368{
369 if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) 369 int offset;
370 return index << 1; 370
371 return index; 371 /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
372 alternative_io(ASM_NOP2,
373 "shll $1, %%eax",
374 X86_FEATURE_PERFCTR_CORE,
375 "=a" (offset),
376 "a" (index));
377
378 return offset;
372} 379}
373 380
374static inline unsigned int x86_pmu_config_addr(int index) 381static inline unsigned int x86_pmu_config_addr(int index)
@@ -586,8 +593,12 @@ static int x86_setup_perfctr(struct perf_event *event)
586 return -EOPNOTSUPP; 593 return -EOPNOTSUPP;
587 } 594 }
588 595
596 /*
597 * Do not allow config1 (extended registers) to propagate,
598 * there's no sane user-space generalization yet:
599 */
589 if (attr->type == PERF_TYPE_RAW) 600 if (attr->type == PERF_TYPE_RAW)
590 return x86_pmu_extra_regs(event->attr.config, event); 601 return 0;
591 602
592 if (attr->type == PERF_TYPE_HW_CACHE) 603 if (attr->type == PERF_TYPE_HW_CACHE)
593 return set_ext_hw_attr(hwc, event); 604 return set_ext_hw_attr(hwc, event);
@@ -609,8 +620,8 @@ static int x86_setup_perfctr(struct perf_event *event)
609 /* 620 /*
610 * Branch tracing: 621 * Branch tracing:
611 */ 622 */
612 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 623 if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
613 (hwc->sample_period == 1)) { 624 !attr->freq && hwc->sample_period == 1) {
614 /* BTS is not supported by this architecture. */ 625 /* BTS is not supported by this architecture. */
615 if (!x86_pmu.bts_active) 626 if (!x86_pmu.bts_active)
616 return -EOPNOTSUPP; 627 return -EOPNOTSUPP;
@@ -1284,6 +1295,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1284 1295
1285 cpuc = &__get_cpu_var(cpu_hw_events); 1296 cpuc = &__get_cpu_var(cpu_hw_events);
1286 1297
1298 /*
1299 * Some chipsets need to unmask the LVTPC in a particular spot
1300 * inside the nmi handler. As a result, the unmasking was pushed
1301 * into all the nmi handlers.
1302 *
1303 * This generic handler doesn't seem to have any issues where the
1304 * unmasking occurs so it was left at the top.
1305 */
1306 apic_write(APIC_LVTPC, APIC_DM_NMI);
1307
1287 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1308 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1288 if (!test_bit(idx, cpuc->active_mask)) { 1309 if (!test_bit(idx, cpuc->active_mask)) {
1289 /* 1310 /*
@@ -1370,8 +1391,6 @@ perf_event_nmi_handler(struct notifier_block *self,
1370 return NOTIFY_DONE; 1391 return NOTIFY_DONE;
1371 } 1392 }
1372 1393
1373 apic_write(APIC_LVTPC, APIC_DM_NMI);
1374
1375 handled = x86_pmu.handle_irq(args->regs); 1394 handled = x86_pmu.handle_irq(args->regs);
1376 if (!handled) 1395 if (!handled)
1377 return NOTIFY_DONE; 1396 return NOTIFY_DONE;
@@ -1754,17 +1773,6 @@ static struct pmu pmu = {
1754 * callchain support 1773 * callchain support
1755 */ 1774 */
1756 1775
1757static void
1758backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
1759{
1760 /* Ignore warnings */
1761}
1762
1763static void backtrace_warning(void *data, char *msg)
1764{
1765 /* Ignore warnings */
1766}
1767
1768static int backtrace_stack(void *data, char *name) 1776static int backtrace_stack(void *data, char *name)
1769{ 1777{
1770 return 0; 1778 return 0;
@@ -1778,8 +1786,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1778} 1786}
1779 1787
1780static const struct stacktrace_ops backtrace_ops = { 1788static const struct stacktrace_ops backtrace_ops = {
1781 .warning = backtrace_warning,
1782 .warning_symbol = backtrace_warning_symbol,
1783 .stack = backtrace_stack, 1789 .stack = backtrace_stack,
1784 .address = backtrace_address, 1790 .address = backtrace_address,
1785 .walk_stack = print_context_stack_bp, 1791 .walk_stack = print_context_stack_bp,
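
For readability, the alternative_io() based x86_pmu_addr_offset() introduced above is behaviourally equivalent to the following plain-C sketch (the helper name and the has_perfctr_core flag are illustrative; at runtime the alternative patches in the shift only when X86_FEATURE_PERFCTR_CORE is set):

	static inline int addr_offset_sketch(int index, int has_perfctr_core)
	{
		/* AMD family-15h style counters are spaced two MSRs apart */
		return has_perfctr_core ? index << 1 : index;
	}
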
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 461f62bbd774..fe29c1d2219e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -8,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids
8 [ C(L1D) ] = { 8 [ C(L1D) ] = {
9 [ C(OP_READ) ] = { 9 [ C(OP_READ) ] = {
10 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ 10 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
11 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ 11 [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
12 }, 12 },
13 [ C(OP_WRITE) ] = { 13 [ C(OP_WRITE) ] = {
14 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ 14 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
@@ -96,12 +96,14 @@ static __initconst const u64 amd_hw_cache_event_ids
96 */ 96 */
97static const u64 amd_perfmon_event_map[] = 97static const u64 amd_perfmon_event_map[] =
98{ 98{
99 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, 99 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
100 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 100 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
101 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, 101 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
102 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, 102 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
103 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, 103 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
104 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, 104 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
105 [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
106 [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
105}; 107};
106 108
107static u64 amd_pmu_event_map(int hw_event) 109static u64 amd_pmu_event_map(int hw_event)
@@ -427,7 +429,9 @@ static __initconst const struct x86_pmu amd_pmu = {
427 * 429 *
428 * Exceptions: 430 * Exceptions:
429 * 431 *
432 * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
430 * 0x003 FP PERF_CTL[3] 433 * 0x003 FP PERF_CTL[3]
434 * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
431 * 0x00B FP PERF_CTL[3] 435 * 0x00B FP PERF_CTL[3]
432 * 0x00D FP PERF_CTL[3] 436 * 0x00D FP PERF_CTL[3]
433 * 0x023 DE PERF_CTL[2:0] 437 * 0x023 DE PERF_CTL[2:0]
@@ -448,6 +452,8 @@ static __initconst const struct x86_pmu amd_pmu = {
448 * 0x0DF LS PERF_CTL[5:0] 452 * 0x0DF LS PERF_CTL[5:0]
449 * 0x1D6 EX PERF_CTL[5:0] 453 * 0x1D6 EX PERF_CTL[5:0]
450 * 0x1D8 EX PERF_CTL[5:0] 454 * 0x1D8 EX PERF_CTL[5:0]
455 *
456 * (*) depending on the umask all FPU counters may be used
451 */ 457 */
452 458
453static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); 459static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
@@ -460,18 +466,28 @@ static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
460static struct event_constraint * 466static struct event_constraint *
461amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) 467amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
462{ 468{
463 unsigned int event_code = amd_get_event_code(&event->hw); 469 struct hw_perf_event *hwc = &event->hw;
470 unsigned int event_code = amd_get_event_code(hwc);
464 471
465 switch (event_code & AMD_EVENT_TYPE_MASK) { 472 switch (event_code & AMD_EVENT_TYPE_MASK) {
466 case AMD_EVENT_FP: 473 case AMD_EVENT_FP:
467 switch (event_code) { 474 switch (event_code) {
475 case 0x000:
476 if (!(hwc->config & 0x0000F000ULL))
477 break;
478 if (!(hwc->config & 0x00000F00ULL))
479 break;
480 return &amd_f15_PMC3;
481 case 0x004:
482 if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
483 break;
484 return &amd_f15_PMC3;
468 case 0x003: 485 case 0x003:
469 case 0x00B: 486 case 0x00B:
470 case 0x00D: 487 case 0x00D:
471 return &amd_f15_PMC3; 488 return &amd_f15_PMC3;
472 default:
473 return &amd_f15_PMC53;
474 } 489 }
490 return &amd_f15_PMC53;
475 case AMD_EVENT_LS: 491 case AMD_EVENT_LS:
476 case AMD_EVENT_DC: 492 case AMD_EVENT_DC:
477 case AMD_EVENT_EX_LS: 493 case AMD_EVENT_EX_LS:
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 8fc2b2cee1da..41178c826c48 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -25,7 +25,7 @@ struct intel_percore {
25/* 25/*
26 * Intel PerfMon, used on Core and later. 26 * Intel PerfMon, used on Core and later.
27 */ 27 */
28static const u64 intel_perfmon_event_map[] = 28static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
29{ 29{
30 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, 30 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
31 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 31 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
@@ -36,7 +36,7 @@ static const u64 intel_perfmon_event_map[] =
36 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 36 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
37}; 37};
38 38
39static struct event_constraint intel_core_event_constraints[] = 39static struct event_constraint intel_core_event_constraints[] __read_mostly =
40{ 40{
41 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ 41 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
42 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ 42 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -47,7 +47,7 @@ static struct event_constraint intel_core_event_constraints[] =
47 EVENT_CONSTRAINT_END 47 EVENT_CONSTRAINT_END
48}; 48};
49 49
50static struct event_constraint intel_core2_event_constraints[] = 50static struct event_constraint intel_core2_event_constraints[] __read_mostly =
51{ 51{
52 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 52 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
53 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 53 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -70,7 +70,7 @@ static struct event_constraint intel_core2_event_constraints[] =
70 EVENT_CONSTRAINT_END 70 EVENT_CONSTRAINT_END
71}; 71};
72 72
73static struct event_constraint intel_nehalem_event_constraints[] = 73static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
74{ 74{
75 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 75 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
76 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 76 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -86,19 +86,19 @@ static struct event_constraint intel_nehalem_event_constraints[] =
86 EVENT_CONSTRAINT_END 86 EVENT_CONSTRAINT_END
87}; 87};
88 88
89static struct extra_reg intel_nehalem_extra_regs[] = 89static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
90{ 90{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
92 EVENT_EXTRA_END 92 EVENT_EXTRA_END
93}; 93};
94 94
95static struct event_constraint intel_nehalem_percore_constraints[] = 95static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
96{ 96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0), 97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END 98 EVENT_CONSTRAINT_END
99}; 99};
100 100
101static struct event_constraint intel_westmere_event_constraints[] = 101static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
102{ 102{
103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
104 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 104 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -110,7 +110,7 @@ static struct event_constraint intel_westmere_event_constraints[] =
110 EVENT_CONSTRAINT_END 110 EVENT_CONSTRAINT_END
111}; 111};
112 112
113static struct event_constraint intel_snb_event_constraints[] = 113static struct event_constraint intel_snb_event_constraints[] __read_mostly =
114{ 114{
115 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 115 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -123,21 +123,21 @@ static struct event_constraint intel_snb_event_constraints[] =
123 EVENT_CONSTRAINT_END 123 EVENT_CONSTRAINT_END
124}; 124};
125 125
126static struct extra_reg intel_westmere_extra_regs[] = 126static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
127{ 127{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), 129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
130 EVENT_EXTRA_END 130 EVENT_EXTRA_END
131}; 131};
132 132
133static struct event_constraint intel_westmere_percore_constraints[] = 133static struct event_constraint intel_westmere_percore_constraints[] __read_mostly =
134{ 134{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0), 135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0), 136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END 137 EVENT_CONSTRAINT_END
138}; 138};
139 139
140static struct event_constraint intel_gen_event_constraints[] = 140static struct event_constraint intel_gen_event_constraints[] __read_mostly =
141{ 141{
142 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 142 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
143 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 143 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -184,26 +184,23 @@ static __initconst const u64 snb_hw_cache_event_ids
184 }, 184 },
185 }, 185 },
186 [ C(LL ) ] = { 186 [ C(LL ) ] = {
187 /*
188 * TBD: Need Off-core Response Performance Monitoring support
189 */
190 [ C(OP_READ) ] = { 187 [ C(OP_READ) ] = {
191 /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */ 188 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
192 [ C(RESULT_ACCESS) ] = 0x01b7, 189 [ C(RESULT_ACCESS) ] = 0x01b7,
193 /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */ 190 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
194 [ C(RESULT_MISS) ] = 0x01bb, 191 [ C(RESULT_MISS) ] = 0x01b7,
195 }, 192 },
196 [ C(OP_WRITE) ] = { 193 [ C(OP_WRITE) ] = {
197 /* OFFCORE_RESPONSE_0.ANY_RFO.LOCAL_CACHE */ 194 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
198 [ C(RESULT_ACCESS) ] = 0x01b7, 195 [ C(RESULT_ACCESS) ] = 0x01b7,
199 /* OFFCORE_RESPONSE_1.ANY_RFO.ANY_LLC_MISS */ 196 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
200 [ C(RESULT_MISS) ] = 0x01bb, 197 [ C(RESULT_MISS) ] = 0x01b7,
201 }, 198 },
202 [ C(OP_PREFETCH) ] = { 199 [ C(OP_PREFETCH) ] = {
203 /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */ 200 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
204 [ C(RESULT_ACCESS) ] = 0x01b7, 201 [ C(RESULT_ACCESS) ] = 0x01b7,
205 /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */ 202 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
206 [ C(RESULT_MISS) ] = 0x01bb, 203 [ C(RESULT_MISS) ] = 0x01b7,
207 }, 204 },
208 }, 205 },
209 [ C(DTLB) ] = { 206 [ C(DTLB) ] = {
@@ -285,26 +282,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
285 }, 282 },
286 [ C(LL ) ] = { 283 [ C(LL ) ] = {
287 [ C(OP_READ) ] = { 284 [ C(OP_READ) ] = {
288 /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */ 285 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
289 [ C(RESULT_ACCESS) ] = 0x01b7, 286 [ C(RESULT_ACCESS) ] = 0x01b7,
290 /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */ 287 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
291 [ C(RESULT_MISS) ] = 0x01bb, 288 [ C(RESULT_MISS) ] = 0x01b7,
292 }, 289 },
293 /* 290 /*
294 * Use RFO, not WRITEBACK, because a write miss would typically occur 291 * Use RFO, not WRITEBACK, because a write miss would typically occur
295 * on RFO. 292 * on RFO.
296 */ 293 */
297 [ C(OP_WRITE) ] = { 294 [ C(OP_WRITE) ] = {
298 /* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */ 295 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
299 [ C(RESULT_ACCESS) ] = 0x01bb, 296 [ C(RESULT_ACCESS) ] = 0x01b7,
300 /* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */ 297 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
301 [ C(RESULT_MISS) ] = 0x01b7, 298 [ C(RESULT_MISS) ] = 0x01b7,
302 }, 299 },
303 [ C(OP_PREFETCH) ] = { 300 [ C(OP_PREFETCH) ] = {
304 /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */ 301 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
305 [ C(RESULT_ACCESS) ] = 0x01b7, 302 [ C(RESULT_ACCESS) ] = 0x01b7,
306 /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */ 303 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
307 [ C(RESULT_MISS) ] = 0x01bb, 304 [ C(RESULT_MISS) ] = 0x01b7,
308 }, 305 },
309 }, 306 },
310 [ C(DTLB) ] = { 307 [ C(DTLB) ] = {
@@ -352,16 +349,36 @@ static __initconst const u64 westmere_hw_cache_event_ids
352}; 349};
353 350
354/* 351/*
355 * OFFCORE_RESPONSE MSR bits (subset), See IA32 SDM Vol 3 30.6.1.3 352 * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
353 * See IA32 SDM Vol 3B 30.6.1.3
356 */ 354 */
357 355
358#define DMND_DATA_RD (1 << 0) 356#define NHM_DMND_DATA_RD (1 << 0)
359#define DMND_RFO (1 << 1) 357#define NHM_DMND_RFO (1 << 1)
360#define DMND_WB (1 << 3) 358#define NHM_DMND_IFETCH (1 << 2)
361#define PF_DATA_RD (1 << 4) 359#define NHM_DMND_WB (1 << 3)
362#define PF_DATA_RFO (1 << 5) 360#define NHM_PF_DATA_RD (1 << 4)
363#define RESP_UNCORE_HIT (1 << 8) 361#define NHM_PF_DATA_RFO (1 << 5)
364#define RESP_MISS (0xf600) /* non uncore hit */ 362#define NHM_PF_IFETCH (1 << 6)
363#define NHM_OFFCORE_OTHER (1 << 7)
364#define NHM_UNCORE_HIT (1 << 8)
365#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
366#define NHM_OTHER_CORE_HITM (1 << 10)
367 /* reserved */
368#define NHM_REMOTE_CACHE_FWD (1 << 12)
369#define NHM_REMOTE_DRAM (1 << 13)
370#define NHM_LOCAL_DRAM (1 << 14)
371#define NHM_NON_DRAM (1 << 15)
372
373#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
374
375#define NHM_DMND_READ (NHM_DMND_DATA_RD)
376#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
377#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
378
379#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
380#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
381#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
365 382
366static __initconst const u64 nehalem_hw_cache_extra_regs 383static __initconst const u64 nehalem_hw_cache_extra_regs
367 [PERF_COUNT_HW_CACHE_MAX] 384 [PERF_COUNT_HW_CACHE_MAX]
@@ -370,16 +387,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
370{ 387{
371 [ C(LL ) ] = { 388 [ C(LL ) ] = {
372 [ C(OP_READ) ] = { 389 [ C(OP_READ) ] = {
373 [ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT, 390 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
374 [ C(RESULT_MISS) ] = DMND_DATA_RD|RESP_MISS, 391 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
375 }, 392 },
376 [ C(OP_WRITE) ] = { 393 [ C(OP_WRITE) ] = {
377 [ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT, 394 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
378 [ C(RESULT_MISS) ] = DMND_RFO|DMND_WB|RESP_MISS, 395 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
379 }, 396 },
380 [ C(OP_PREFETCH) ] = { 397 [ C(OP_PREFETCH) ] = {
381 [ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT, 398 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
382 [ C(RESULT_MISS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS, 399 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
383 }, 400 },
384 } 401 }
385}; 402};
@@ -391,12 +408,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids
391{ 408{
392 [ C(L1D) ] = { 409 [ C(L1D) ] = {
393 [ C(OP_READ) ] = { 410 [ C(OP_READ) ] = {
394 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ 411 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
395 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ 412 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
396 }, 413 },
397 [ C(OP_WRITE) ] = { 414 [ C(OP_WRITE) ] = {
398 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ 415 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
399 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ 416 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
400 }, 417 },
401 [ C(OP_PREFETCH) ] = { 418 [ C(OP_PREFETCH) ] = {
402 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ 419 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
@@ -933,6 +950,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
933 950
934 cpuc = &__get_cpu_var(cpu_hw_events); 951 cpuc = &__get_cpu_var(cpu_hw_events);
935 952
953 /*
954 * Some chipsets need to unmask the LVTPC in a particular spot
955 * inside the nmi handler. As a result, the unmasking was pushed
956 * into all the nmi handlers.
957 *
958 * This handler doesn't seem to have any issues with the unmasking
959 * so it was left at the top.
960 */
961 apic_write(APIC_LVTPC, APIC_DM_NMI);
962
936 intel_pmu_disable_all(); 963 intel_pmu_disable_all();
937 handled = intel_pmu_drain_bts_buffer(); 964 handled = intel_pmu_drain_bts_buffer();
938 status = intel_pmu_get_status(); 965 status = intel_pmu_get_status();
@@ -998,6 +1025,9 @@ intel_bts_constraints(struct perf_event *event)
998 struct hw_perf_event *hwc = &event->hw; 1025 struct hw_perf_event *hwc = &event->hw;
999 unsigned int hw_event, bts_event; 1026 unsigned int hw_event, bts_event;
1000 1027
1028 if (event->attr.freq)
1029 return NULL;
1030
1001 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; 1031 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
1002 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 1032 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
1003 1033
@@ -1305,7 +1335,7 @@ static void intel_clovertown_quirks(void)
1305 * AJ106 could possibly be worked around by not allowing LBR 1335 * AJ106 could possibly be worked around by not allowing LBR
1306 * usage from PEBS, including the fixup. 1336 * usage from PEBS, including the fixup.
1307 * AJ68 could possibly be worked around by always programming 1337 * AJ68 could possibly be worked around by always programming
1308 * a pebs_event_reset[0] value and coping with the lost events. 1338 * a pebs_event_reset[0] value and coping with the lost events.
1309 * 1339 *
1310 * But taken together it might just make sense to not enable PEBS on 1340 * But taken together it might just make sense to not enable PEBS on
1311 * these chips. 1341 * these chips.
@@ -1409,6 +1439,23 @@ static __init int intel_pmu_init(void)
1409 x86_pmu.percore_constraints = intel_nehalem_percore_constraints; 1439 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1410 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1411 x86_pmu.extra_regs = intel_nehalem_extra_regs; 1441 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1442
1443 /* UOPS_ISSUED.STALLED_CYCLES */
1444 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1445 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1446 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1447
1448 if (ebx & 0x40) {
1449 /*
1450 * Erratum AAJ80 detected, we work it around by using
1451 * the BR_MISP_EXEC.ANY event. This will over-count
1452 * branch-misses, but it's still much better than the
1453 * architectural event which is often completely bogus:
1454 */
1455 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1456
1457 pr_cont("erratum AAJ80 worked around, ");
1458 }
1412 pr_cont("Nehalem events, "); 1459 pr_cont("Nehalem events, ");
1413 break; 1460 break;
1414 1461
@@ -1425,6 +1472,7 @@ static __init int intel_pmu_init(void)
1425 1472
1426 case 37: /* 32 nm nehalem, "Clarkdale" */ 1473 case 37: /* 32 nm nehalem, "Clarkdale" */
1427 case 44: /* 32 nm nehalem, "Gulftown" */ 1474 case 44: /* 32 nm nehalem, "Gulftown" */
1475 case 47: /* 32 nm Xeon E7 */
1428 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 1476 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
1429 sizeof(hw_cache_event_ids)); 1477 sizeof(hw_cache_event_ids));
1430 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, 1478 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -1437,6 +1485,12 @@ static __init int intel_pmu_init(void)
1437 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1485 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1438 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; 1486 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1439 x86_pmu.extra_regs = intel_westmere_extra_regs; 1487 x86_pmu.extra_regs = intel_westmere_extra_regs;
1488
1489 /* UOPS_ISSUED.STALLED_CYCLES */
1490 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1491 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1492 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1493
1440 pr_cont("Westmere events, "); 1494 pr_cont("Westmere events, ");
1441 break; 1495 break;
1442 1496
@@ -1448,6 +1502,12 @@ static __init int intel_pmu_init(void)
1448 1502
1449 x86_pmu.event_constraints = intel_snb_event_constraints; 1503 x86_pmu.event_constraints = intel_snb_event_constraints;
1450 x86_pmu.pebs_constraints = intel_snb_pebs_events; 1504 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1505
1506 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1507 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1508 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
1509 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
1510
1451 pr_cont("SandyBridge events, "); 1511 pr_cont("SandyBridge events, ");
1452 break; 1512 break;
1453 1513
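
The intel_pmu_init() hunks above map the new generic stall-cycle events onto model-specific encodings (UOPS_ISSUED / UOPS_EXECUTED on Nehalem and Westmere, UOPS_ISSUED / UOPS_DISPATCHED on Sandy Bridge), and the BTS constraint now refuses frequency-based sampling events. A minimal userspace sketch of how such a generic event can be requested once a kernel with these patches is running; the wrapper function and error handling are illustrative, not part of the patch:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <sys/ioctl.h>
    #include <sys/types.h>
    #include <unistd.h>
    #include <string.h>
    #include <stdio.h>

    /* perf_event_open() has no glibc wrapper; a raw syscall is the usual idiom */
    static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                               int cpu, int group_fd, unsigned long flags)
    {
            return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int count_frontend_stalls(void)
    {
            struct perf_event_attr attr;
            long long count;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.type = PERF_TYPE_HARDWARE;
            attr.size = sizeof(attr);
            /* resolved by the kernel to the model-specific encoding above */
            attr.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
            attr.disabled = 1;

            fd = perf_event_open(&attr, 0 /* self */, -1 /* any cpu */, -1, 0);
            if (fd < 0)
                    return -1;

            ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
            /* ... workload under measurement ... */
            ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
            read(fd, &count, sizeof(count));
            close(fd);

            printf("frontend stall cycles: %lld\n", count);
            return 0;
    }
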
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index c2520e178d32..ead584fb6a7d 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = {
468 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), 468 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
469 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 469 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
470 .escr_emask = 470 .escr_emask =
471 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), 471 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
472 .cntr = { {12, 13, 16}, {14, 15, 17} }, 472 .cntr = { {12, 13, 16}, {14, 15, 17} },
473 }, 473 },
474 [P4_EVENT_X87_ASSIST] = { 474 [P4_EVENT_X87_ASSIST] = {
@@ -912,8 +912,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
912 int idx, handled = 0; 912 int idx, handled = 0;
913 u64 val; 913 u64 val;
914 914
915 data.addr = 0; 915 perf_sample_data_init(&data, 0);
916 data.raw = NULL;
917 916
918 cpuc = &__get_cpu_var(cpu_hw_events); 917 cpuc = &__get_cpu_var(cpu_hw_events);
919 918
@@ -947,14 +946,23 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
947 if (!x86_perf_event_set_period(event)) 946 if (!x86_perf_event_set_period(event))
948 continue; 947 continue;
949 if (perf_event_overflow(event, 1, &data, regs)) 948 if (perf_event_overflow(event, 1, &data, regs))
950 p4_pmu_disable_event(event); 949 x86_pmu_stop(event, 0);
951 } 950 }
952 951
953 if (handled) { 952 if (handled)
954 /* p4 quirk: unmask it again */
955 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
956 inc_irq_stat(apic_perf_irqs); 953 inc_irq_stat(apic_perf_irqs);
957 } 954
955 /*
956 * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
957 * been observed that the OVF bit flag has to be cleared first _before_
958 * the LVTPC can be unmasked.
959 *
960 * The reason is the NMI line will continue to be asserted while the OVF
 961 * bit is set. This causes a second NMI to be generated if the LVTPC is
 962 * unmasked before the OVF bit is cleared, leading to unknown NMI
 963 * messages.
964 */
965 apic_write(APIC_LVTPC, APIC_DM_NMI);
958 966
959 return handled; 967 return handled;
960} 968}
@@ -1188,7 +1196,7 @@ static __init int p4_pmu_init(void)
1188{ 1196{
1189 unsigned int low, high; 1197 unsigned int low, high;
1190 1198
1191 /* If we get stripped -- indexig fails */ 1199 /* If we get stripped -- indexing fails */
1192 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); 1200 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
1193 1201
1194 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 1202 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
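
Both PMU NMI handlers end up re-arming the LVTPC with apic_write(APIC_LVTPC, APIC_DM_NMI), but the P4 hunk moves that write after the counter loop: the NMI line stays asserted while an OVF bit is set, so unmasking first would immediately raise a second, unclaimed NMI. A condensed sketch of the ordering being enforced; handle_and_clear_overflows() is a stand-in for the per-counter loop, not a real kernel function:

    static int pmu_nmi_handler(struct pt_regs *regs)
    {
            int handled;

            /* ack the counters first: this clears the OVF bits that keep
             * the NMI line asserted */
            handled = handle_and_clear_overflows(regs);
            if (handled)
                    inc_irq_stat(apic_perf_irqs);

            /* only now is it safe to unmask/re-arm the LVTPC entry; doing it
             * earlier would let the still-pending overflow fire a second NMI */
            apic_write(APIC_LVTPC, APIC_DM_NMI);

            return handled;
    }
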
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 706a9fb46a58..690bc8461835 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -369,6 +369,7 @@ static struct of_ioapic_type of_ioapic_type[] =
369static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, 369static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
370 u32 *out_hwirq, u32 *out_type) 370 u32 *out_hwirq, u32 *out_type)
371{ 371{
372 struct mp_ioapic_gsi *gsi_cfg;
372 struct io_apic_irq_attr attr; 373 struct io_apic_irq_attr attr;
373 struct of_ioapic_type *it; 374 struct of_ioapic_type *it;
374 u32 line, idx, type; 375 u32 line, idx, type;
@@ -378,7 +379,8 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
378 379
379 line = *intspec; 380 line = *intspec;
380 idx = (u32) id->priv; 381 idx = (u32) id->priv;
381 *out_hwirq = line + mp_gsi_routing[idx].gsi_base; 382 gsi_cfg = mp_ioapic_gsi_routing(idx);
383 *out_hwirq = line + gsi_cfg->gsi_base;
382 384
383 intspec++; 385 intspec++;
384 type = *intspec; 386 type = *intspec;
@@ -391,7 +393,7 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
391 393
392 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); 394 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
393 395
394 return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr); 396 return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
395} 397}
396 398
397static void __init ioapic_add_ofnode(struct device_node *np) 399static void __init ioapic_add_ofnode(struct device_node *np)
@@ -407,7 +409,7 @@ static void __init ioapic_add_ofnode(struct device_node *np)
407 } 409 }
408 410
409 for (i = 0; i < nr_ioapics; i++) { 411 for (i = 0; i < nr_ioapics; i++) {
410 if (r.start == mp_ioapics[i].apicaddr) { 412 if (r.start == mpc_ioapic_addr(i)) {
411 struct irq_domain *id; 413 struct irq_domain *id;
412 414
413 id = kzalloc(sizeof(*id), GFP_KERNEL); 415 id = kzalloc(sizeof(*id), GFP_KERNEL);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index e2a3f0606da4..1aae78f775fc 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo,
135} 135}
136EXPORT_SYMBOL_GPL(print_context_stack_bp); 136EXPORT_SYMBOL_GPL(print_context_stack_bp);
137 137
138
139static void
140print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
141{
142 printk(data);
143 print_symbol(msg, symbol);
144 printk("\n");
145}
146
147static void print_trace_warning(void *data, char *msg)
148{
149 printk("%s%s\n", (char *)data, msg);
150}
151
152static int print_trace_stack(void *data, char *name) 138static int print_trace_stack(void *data, char *name)
153{ 139{
154 printk("%s <%s> ", (char *)data, name); 140 printk("%s <%s> ", (char *)data, name);
@@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
166} 152}
167 153
168static const struct stacktrace_ops print_trace_ops = { 154static const struct stacktrace_ops print_trace_ops = {
169 .warning = print_trace_warning,
170 .warning_symbol = print_trace_warning_symbol,
171 .stack = print_trace_stack, 155 .stack = print_trace_stack,
172 .address = print_trace_address, 156 .address = print_trace_address,
173 .walk_stack = print_context_stack, 157 .walk_stack = print_context_stack,
@@ -279,7 +263,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
279 printk("DEBUG_PAGEALLOC"); 263 printk("DEBUG_PAGEALLOC");
280#endif 264#endif
281 printk("\n"); 265 printk("\n");
282 sysfs_printk_last_file();
283 if (notify_die(DIE_OOPS, str, regs, err, 266 if (notify_die(DIE_OOPS, str, regs, err,
284 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) 267 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
285 return 1; 268 return 1;
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index a93742a57468..0ba15a6cc57e 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -260,9 +260,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
260 return mod_code_status; 260 return mod_code_status;
261} 261}
262 262
263static unsigned char *ftrace_nop_replace(void) 263static const unsigned char *ftrace_nop_replace(void)
264{ 264{
265 return ideal_nop5; 265 return ideal_nops[NOP_ATOMIC5];
266} 266}
267 267
268static int 268static int
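
ideal_nop5 has become one slot, NOP_ATOMIC5, in an ideal_nops[] table chosen at boot by arch_init_ideal_nops(); ftrace here and the jump-label code further down both patch call sites with it. For illustration, one candidate encoding is the five-byte single-instruction P6 NOP, which is what makes the patch atomic with respect to CPUs that may be executing through the site (the real table also carries K8/K7/generic variants):

    /* "nopl 0x0(%eax,%eax,1)" - one instruction, five bytes, so a racing CPU
     * never observes a partially patched sequence (illustrative only) */
    static const unsigned char p6_nop5[] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };
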
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6d6bb361931..3bb08509a7a1 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -23,7 +23,6 @@
23static void __init i386_default_early_setup(void) 23static void __init i386_default_early_setup(void)
24{ 24{
25 /* Initialize 32bit specific setup functions */ 25 /* Initialize 32bit specific setup functions */
26 x86_init.resources.probe_roms = probe_roms;
27 x86_init.resources.reserve_resources = i386_reserve_resources; 26 x86_init.resources.reserve_resources = i386_reserve_resources;
28 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; 27 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
29 28
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index bfe8f729e086..6781765b3a0d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -217,7 +217,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
217/* 217/*
218 * Common hpet info 218 * Common hpet info
219 */ 219 */
220static unsigned long hpet_period; 220static unsigned long hpet_freq;
221 221
222static void hpet_legacy_set_mode(enum clock_event_mode mode, 222static void hpet_legacy_set_mode(enum clock_event_mode mode,
223 struct clock_event_device *evt); 223 struct clock_event_device *evt);
@@ -232,7 +232,6 @@ static struct clock_event_device hpet_clockevent = {
232 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 232 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
233 .set_mode = hpet_legacy_set_mode, 233 .set_mode = hpet_legacy_set_mode,
234 .set_next_event = hpet_legacy_next_event, 234 .set_next_event = hpet_legacy_next_event,
235 .shift = 32,
236 .irq = 0, 235 .irq = 0,
237 .rating = 50, 236 .rating = 50,
238}; 237};
@@ -290,28 +289,12 @@ static void hpet_legacy_clockevent_register(void)
290 hpet_enable_legacy_int(); 289 hpet_enable_legacy_int();
291 290
292 /* 291 /*
293 * The mult factor is defined as (include/linux/clockchips.h)
294 * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
295 * hpet_period is in units of femtoseconds (per cycle), so
296 * mult/2^shift = cyc/ns = 10^6/hpet_period
297 * mult = (10^6 * 2^shift)/hpet_period
298 * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
299 */
300 hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
301 hpet_period, hpet_clockevent.shift);
302 /* Calculate the min / max delta */
303 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
304 &hpet_clockevent);
305 /* Setup minimum reprogramming delta. */
306 hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA,
307 &hpet_clockevent);
308
309 /*
310 * Start hpet with the boot cpu mask and make it 292 * Start hpet with the boot cpu mask and make it
311 * global after the IO_APIC has been initialized. 293 * global after the IO_APIC has been initialized.
312 */ 294 */
313 hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); 295 hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
314 clockevents_register_device(&hpet_clockevent); 296 clockevents_config_and_register(&hpet_clockevent, hpet_freq,
297 HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
315 global_clock_event = &hpet_clockevent; 298 global_clock_event = &hpet_clockevent;
316 printk(KERN_DEBUG "hpet clockevent registered\n"); 299 printk(KERN_DEBUG "hpet clockevent registered\n");
317} 300}
@@ -549,7 +532,6 @@ static int hpet_setup_irq(struct hpet_dev *dev)
549static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) 532static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
550{ 533{
551 struct clock_event_device *evt = &hdev->evt; 534 struct clock_event_device *evt = &hdev->evt;
552 uint64_t hpet_freq;
553 535
554 WARN_ON(cpu != smp_processor_id()); 536 WARN_ON(cpu != smp_processor_id());
555 if (!(hdev->flags & HPET_DEV_VALID)) 537 if (!(hdev->flags & HPET_DEV_VALID))
@@ -571,24 +553,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
571 553
572 evt->set_mode = hpet_msi_set_mode; 554 evt->set_mode = hpet_msi_set_mode;
573 evt->set_next_event = hpet_msi_next_event; 555 evt->set_next_event = hpet_msi_next_event;
574 evt->shift = 32;
575
576 /*
577 * The period is a femto seconds value. We need to calculate the
578 * scaled math multiplication factor for nanosecond to hpet tick
579 * conversion.
580 */
581 hpet_freq = FSEC_PER_SEC;
582 do_div(hpet_freq, hpet_period);
583 evt->mult = div_sc((unsigned long) hpet_freq,
584 NSEC_PER_SEC, evt->shift);
585 /* Calculate the max delta */
586 evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
587 /* 5 usec minimum reprogramming delta. */
588 evt->min_delta_ns = 5000;
589
590 evt->cpumask = cpumask_of(hdev->cpu); 556 evt->cpumask = cpumask_of(hdev->cpu);
591 clockevents_register_device(evt); 557
558 clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
559 0x7FFFFFFF);
592} 560}
593 561
594#ifdef CONFIG_HPET 562#ifdef CONFIG_HPET
@@ -792,7 +760,6 @@ static struct clocksource clocksource_hpet = {
792static int hpet_clocksource_register(void) 760static int hpet_clocksource_register(void)
793{ 761{
794 u64 start, now; 762 u64 start, now;
795 u64 hpet_freq;
796 cycle_t t1; 763 cycle_t t1;
797 764
798 /* Start the counter */ 765 /* Start the counter */
@@ -819,24 +786,7 @@ static int hpet_clocksource_register(void)
819 return -ENODEV; 786 return -ENODEV;
820 } 787 }
821 788
822 /*
823 * The definition of mult is (include/linux/clocksource.h)
824 * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
825 * so we first need to convert hpet_period to ns/cyc units:
826 * mult/2^shift = ns/cyc = hpet_period/10^6
827 * mult = (hpet_period * 2^shift)/10^6
828 * mult = (hpet_period << shift)/FSEC_PER_NSEC
829 */
830
831 /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
832 *
833 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
834 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
835 */
836 hpet_freq = FSEC_PER_SEC;
837 do_div(hpet_freq, hpet_period);
838 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); 789 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
839
840 return 0; 790 return 0;
841} 791}
842 792
@@ -845,7 +795,9 @@ static int hpet_clocksource_register(void)
845 */ 795 */
846int __init hpet_enable(void) 796int __init hpet_enable(void)
847{ 797{
798 unsigned long hpet_period;
848 unsigned int id; 799 unsigned int id;
800 u64 freq;
849 int i; 801 int i;
850 802
851 if (!is_hpet_capable()) 803 if (!is_hpet_capable())
@@ -884,6 +836,14 @@ int __init hpet_enable(void)
884 goto out_nohpet; 836 goto out_nohpet;
885 837
886 /* 838 /*
839 * The period is a femto seconds value. Convert it to a
840 * frequency.
841 */
842 freq = FSEC_PER_SEC;
843 do_div(freq, hpet_period);
844 hpet_freq = freq;
845
846 /*
887 * Read the HPET ID register to retrieve the IRQ routing 847 * Read the HPET ID register to retrieve the IRQ routing
888 * information and the number of channels 848 * information and the number of channels
889 */ 849 */
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2dfd31597443..fb66dc9e36cb 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -93,7 +93,6 @@ static struct clock_event_device pit_ce = {
93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
94 .set_mode = init_pit_timer, 94 .set_mode = init_pit_timer,
95 .set_next_event = pit_next_event, 95 .set_next_event = pit_next_event,
96 .shift = 32,
97 .irq = 0, 96 .irq = 0,
98}; 97};
99 98
@@ -108,90 +107,12 @@ void __init setup_pit_timer(void)
108 * IO_APIC has been initialized. 107 * IO_APIC has been initialized.
109 */ 108 */
110 pit_ce.cpumask = cpumask_of(smp_processor_id()); 109 pit_ce.cpumask = cpumask_of(smp_processor_id());
111 pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
112 pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
113 pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
114 110
115 clockevents_register_device(&pit_ce); 111 clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
116 global_clock_event = &pit_ce; 112 global_clock_event = &pit_ce;
117} 113}
118 114
119#ifndef CONFIG_X86_64 115#ifndef CONFIG_X86_64
120/*
121 * Since the PIT overflows every tick, its not very useful
122 * to just read by itself. So use jiffies to emulate a free
123 * running counter:
124 */
125static cycle_t pit_read(struct clocksource *cs)
126{
127 static int old_count;
128 static u32 old_jifs;
129 unsigned long flags;
130 int count;
131 u32 jifs;
132
133 raw_spin_lock_irqsave(&i8253_lock, flags);
134 /*
135 * Although our caller may have the read side of xtime_lock,
136 * this is now a seqlock, and we are cheating in this routine
137 * by having side effects on state that we cannot undo if
138 * there is a collision on the seqlock and our caller has to
139 * retry. (Namely, old_jifs and old_count.) So we must treat
140 * jiffies as volatile despite the lock. We read jiffies
141 * before latching the timer count to guarantee that although
142 * the jiffies value might be older than the count (that is,
143 * the counter may underflow between the last point where
144 * jiffies was incremented and the point where we latch the
145 * count), it cannot be newer.
146 */
147 jifs = jiffies;
148 outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
149 count = inb_pit(PIT_CH0); /* read the latched count */
150 count |= inb_pit(PIT_CH0) << 8;
151
152 /* VIA686a test code... reset the latch if count > max + 1 */
153 if (count > LATCH) {
154 outb_pit(0x34, PIT_MODE);
155 outb_pit(LATCH & 0xff, PIT_CH0);
156 outb_pit(LATCH >> 8, PIT_CH0);
157 count = LATCH - 1;
158 }
159
160 /*
161 * It's possible for count to appear to go the wrong way for a
162 * couple of reasons:
163 *
164 * 1. The timer counter underflows, but we haven't handled the
165 * resulting interrupt and incremented jiffies yet.
166 * 2. Hardware problem with the timer, not giving us continuous time,
167 * the counter does small "jumps" upwards on some Pentium systems,
168 * (see c't 95/10 page 335 for Neptun bug.)
169 *
170 * Previous attempts to handle these cases intelligently were
171 * buggy, so we just do the simple thing now.
172 */
173 if (count > old_count && jifs == old_jifs)
174 count = old_count;
175
176 old_count = count;
177 old_jifs = jifs;
178
179 raw_spin_unlock_irqrestore(&i8253_lock, flags);
180
181 count = (LATCH - 1) - count;
182
183 return (cycle_t)(jifs * LATCH) + count;
184}
185
186static struct clocksource pit_cs = {
187 .name = "pit",
188 .rating = 110,
189 .read = pit_read,
190 .mask = CLOCKSOURCE_MASK(32),
191 .mult = 0,
192 .shift = 20,
193};
194
195static int __init init_pit_clocksource(void) 116static int __init init_pit_clocksource(void)
196{ 117{
197 /* 118 /*
@@ -205,10 +126,7 @@ static int __init init_pit_clocksource(void)
205 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) 126 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
206 return 0; 127 return 0;
207 128
208 pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); 129 return clocksource_i8253_init();
209
210 return clocksource_register(&pit_cs);
211} 130}
212arch_initcall(init_pit_clocksource); 131arch_initcall(init_pit_clocksource);
213
214#endif /* !CONFIG_X86_64 */ 132#endif /* !CONFIG_X86_64 */
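
The PIT conversion follows the same clockevents_config_and_register() pattern as the HPET one: the driver passes its tick frequency plus its minimum and maximum programmable delta in raw timer ticks, and the core derives the shift, mult and nanosecond bounds that used to be open-coded. The call as it now reads, with the unit of each argument spelled out:

    /* frequency in Hz (the PIT runs at CLOCK_TICK_RATE, ~1.193182 MHz);
     * min and max delta are given in timer ticks, not nanoseconds */
    clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
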
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 1cb0b9fc78dc..6c0802eb2f7f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -249,7 +249,7 @@ void fixup_irqs(void)
249 249
250 data = irq_desc_get_irq_data(desc); 250 data = irq_desc_get_irq_data(desc);
251 affinity = data->affinity; 251 affinity = data->affinity;
252 if (!irq_has_action(irq) || 252 if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
253 cpumask_subset(affinity, cpu_online_mask)) { 253 cpumask_subset(affinity, cpu_online_mask)) {
254 raw_spin_unlock(&desc->lock); 254 raw_spin_unlock(&desc->lock);
255 continue; 255 continue;
@@ -276,7 +276,8 @@ void fixup_irqs(void)
276 else if (!(warned++)) 276 else if (!(warned++))
277 set_affinity = 0; 277 set_affinity = 0;
278 278
279 if (!irqd_can_move_in_process_context(data) && chip->irq_unmask) 279 if (!irqd_can_move_in_process_context(data) &&
280 !irqd_irq_disabled(data) && chip->irq_unmask)
280 chip->irq_unmask(data); 281 chip->irq_unmask(data);
281 282
282 raw_spin_unlock(&desc->lock); 283 raw_spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 961b6b30ba90..3fee346ef545 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -34,7 +34,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
34 code.offset = entry->target - 34 code.offset = entry->target -
35 (entry->code + JUMP_LABEL_NOP_SIZE); 35 (entry->code + JUMP_LABEL_NOP_SIZE);
36 } else 36 } else
37 memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE); 37 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
38 get_online_cpus(); 38 get_online_cpus();
39 mutex_lock(&text_mutex); 39 mutex_lock(&text_mutex);
40 text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); 40 text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
@@ -44,7 +44,8 @@ void arch_jump_label_transform(struct jump_entry *entry,
44 44
45void arch_jump_label_text_poke_early(jump_label_t addr) 45void arch_jump_label_text_poke_early(jump_label_t addr)
46{ 46{
47 text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE); 47 text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5],
48 JUMP_LABEL_NOP_SIZE);
48} 49}
49 50
50#endif 51#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index c969fd9d1566..f1a6244d7d93 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1183,12 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1183 struct pt_regs *regs) 1183 struct pt_regs *regs)
1184{ 1184{
1185 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 1185 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1186 unsigned long flags;
1186 1187
1187 /* This is possible if op is under delayed unoptimizing */ 1188 /* This is possible if op is under delayed unoptimizing */
1188 if (kprobe_disabled(&op->kp)) 1189 if (kprobe_disabled(&op->kp))
1189 return; 1190 return;
1190 1191
1191 preempt_disable(); 1192 local_irq_save(flags);
1192 if (kprobe_running()) { 1193 if (kprobe_running()) {
1193 kprobes_inc_nmissed_count(&op->kp); 1194 kprobes_inc_nmissed_count(&op->kp);
1194 } else { 1195 } else {
@@ -1207,7 +1208,7 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1207 opt_pre_handler(&op->kp, regs); 1208 opt_pre_handler(&op->kp, regs);
1208 __this_cpu_write(current_kprobe, NULL); 1209 __this_cpu_write(current_kprobe, NULL);
1209 } 1210 }
1210 preempt_enable_no_resched(); 1211 local_irq_restore(flags);
1211} 1212}
1212 1213
1213static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) 1214static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f98d3eafe07a..6389a6bca11b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -26,8 +26,6 @@
26#include <asm/x86_init.h> 26#include <asm/x86_init.h>
27#include <asm/reboot.h> 27#include <asm/reboot.h>
28 28
29#define KVM_SCALE 22
30
31static int kvmclock = 1; 29static int kvmclock = 1;
32static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; 30static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
33static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; 31static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
@@ -120,8 +118,6 @@ static struct clocksource kvm_clock = {
120 .read = kvm_clock_get_cycles, 118 .read = kvm_clock_get_cycles,
121 .rating = 400, 119 .rating = 400,
122 .mask = CLOCKSOURCE_MASK(64), 120 .mask = CLOCKSOURCE_MASK(64),
123 .mult = 1 << KVM_SCALE,
124 .shift = KVM_SCALE,
125 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 121 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
126}; 122};
127 123
@@ -203,7 +199,7 @@ void __init kvmclock_init(void)
203 machine_ops.crash_shutdown = kvm_crash_shutdown; 199 machine_ops.crash_shutdown = kvm_crash_shutdown;
204#endif 200#endif
205 kvm_get_preset_lpj(); 201 kvm_get_preset_lpj();
206 clocksource_register(&kvm_clock); 202 clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
207 pv_info.paravirt_enabled = 1; 203 pv_info.paravirt_enabled = 1;
208 pv_info.name = "KVM"; 204 pv_info.name = "KVM";
209 205
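
kvmclock already reports time in nanoseconds, so registering it with clocksource_register_hz() at NSEC_PER_SEC lets the core choose mult and shift for it, replacing the hand-rolled KVM_SCALE constants. Assuming the usual clocksource relation mult/2^shift = ns per cycle = 10^9/freq, the readout is unchanged:

    /* a counter that already counts nanoseconds is, in effect, a 1 GHz
     * clocksource: ns = (cycles * mult) >> shift with mult/2^shift == 1 */
    clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
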
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index ab23f1ad4bf1..52f256f2cc81 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
24#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/gfp.h> 26#include <linux/gfp.h>
27#include <linux/jump_label.h>
27 28
28#include <asm/system.h> 29#include <asm/system.h>
29#include <asm/page.h> 30#include <asm/page.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5a532ce646bf..9103b89c145a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -285,7 +285,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
285 intsrc.type = MP_INTSRC; 285 intsrc.type = MP_INTSRC;
286 intsrc.irqflag = 0; /* conforming */ 286 intsrc.irqflag = 0; /* conforming */
287 intsrc.srcbus = 0; 287 intsrc.srcbus = 0;
288 intsrc.dstapic = mp_ioapics[0].apicid; 288 intsrc.dstapic = mpc_ioapic_id(0);
289 289
290 intsrc.irqtype = mp_INT; 290 intsrc.irqtype = mp_INT;
291 291
@@ -715,17 +715,15 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
715 } 715 }
716} 716}
717 717
718static int 718static int __init
719check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) 719check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
720{ 720{
721 int ret = 0;
722
723 if (!mpc_new_phys || count <= mpc_new_length) { 721 if (!mpc_new_phys || count <= mpc_new_length) {
724 WARN(1, "update_mptable: No spare slots (length: %x)\n", count); 722 WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
725 return -1; 723 return -1;
726 } 724 }
727 725
728 return ret; 726 return 0;
729} 727}
730#else /* CONFIG_X86_IO_APIC */ 728#else /* CONFIG_X86_IO_APIC */
731static 729static
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9ea999a4dcc1..b49d00da2aed 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -68,74 +68,10 @@ int dma_set_mask(struct device *dev, u64 mask)
68} 68}
69EXPORT_SYMBOL(dma_set_mask); 69EXPORT_SYMBOL(dma_set_mask);
70 70
71#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
72static __initdata void *dma32_bootmem_ptr;
73static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
74
75static int __init parse_dma32_size_opt(char *p)
76{
77 if (!p)
78 return -EINVAL;
79 dma32_bootmem_size = memparse(p, &p);
80 return 0;
81}
82early_param("dma32_size", parse_dma32_size_opt);
83
84void __init dma32_reserve_bootmem(void)
85{
86 unsigned long size, align;
87 if (max_pfn <= MAX_DMA32_PFN)
88 return;
89
90 /*
91 * check aperture_64.c allocate_aperture() for reason about
92 * using 512M as goal
93 */
94 align = 64ULL<<20;
95 size = roundup(dma32_bootmem_size, align);
96 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
97 512ULL<<20);
98 /*
99 * Kmemleak should not scan this block as it may not be mapped via the
100 * kernel direct mapping.
101 */
102 kmemleak_ignore(dma32_bootmem_ptr);
103 if (dma32_bootmem_ptr)
104 dma32_bootmem_size = size;
105 else
106 dma32_bootmem_size = 0;
107}
108static void __init dma32_free_bootmem(void)
109{
110
111 if (max_pfn <= MAX_DMA32_PFN)
112 return;
113
114 if (!dma32_bootmem_ptr)
115 return;
116
117 free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
118
119 dma32_bootmem_ptr = NULL;
120 dma32_bootmem_size = 0;
121}
122#else
123void __init dma32_reserve_bootmem(void)
124{
125}
126static void __init dma32_free_bootmem(void)
127{
128}
129
130#endif
131
132void __init pci_iommu_alloc(void) 71void __init pci_iommu_alloc(void)
133{ 72{
134 struct iommu_table_entry *p; 73 struct iommu_table_entry *p;
135 74
136 /* free the range so iommu could get some range less than 4G */
137 dma32_free_bootmem();
138
139 sort_iommu_table(__iommu_table, __iommu_table_end); 75 sort_iommu_table(__iommu_table, __iommu_table_end);
140 check_iommu_entries(__iommu_table, __iommu_table_end); 76 check_iommu_entries(__iommu_table, __iommu_table_end);
141 77
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 55d745ec1181..35ccf75696eb 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -50,20 +50,14 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
50 struct iommu_table_entry *finish) 50 struct iommu_table_entry *finish)
51{ 51{
52 struct iommu_table_entry *p, *q, *x; 52 struct iommu_table_entry *p, *q, *x;
53 char sym_p[KSYM_SYMBOL_LEN];
54 char sym_q[KSYM_SYMBOL_LEN];
55 53
56 /* Simple cyclic dependency checker. */ 54 /* Simple cyclic dependency checker. */
57 for (p = start; p < finish; p++) { 55 for (p = start; p < finish; p++) {
58 q = find_dependents_of(start, finish, p); 56 q = find_dependents_of(start, finish, p);
59 x = find_dependents_of(start, finish, q); 57 x = find_dependents_of(start, finish, q);
60 if (p == x) { 58 if (p == x) {
61 sprint_symbol(sym_p, (unsigned long)p->detect); 59 printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
62 sprint_symbol(sym_q, (unsigned long)q->detect); 60 p->detect, q->detect);
63
64 printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \
65 " on %s and vice-versa. BREAKING IT.\n",
66 sym_p, sym_q);
67 /* Heavy handed way..*/ 61 /* Heavy handed way..*/
68 x->depend = 0; 62 x->depend = 0;
69 } 63 }
@@ -72,12 +66,8 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
72 for (p = start; p < finish; p++) { 66 for (p = start; p < finish; p++) {
73 q = find_dependents_of(p, finish, p); 67 q = find_dependents_of(p, finish, p);
74 if (q && q > p) { 68 if (q && q > p) {
75 sprint_symbol(sym_p, (unsigned long)p->detect); 69 printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
76 sprint_symbol(sym_q, (unsigned long)q->detect); 70 p->detect, q->detect);
77
78 printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\
79 "should be called before %s!\n",
80 sym_p, sym_q);
81 } 71 }
82 } 72 }
83} 73}
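
The sprint_symbol() calls and their KSYM_SYMBOL_LEN stack buffers are replaced by printk's %pS pointer format, which symbolizes a code address in place. A one-line illustration; the printed symbol name is made up:

    printk(KERN_ERR "detect routine is %pS\n", p->detect);
    /* prints something like: "detect routine is detect_intel_iommu+0x0/0xd0" */
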
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e5..ba0a4cce53be 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -73,6 +73,107 @@ static struct resource video_rom_resource = {
73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM 73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
74}; 74};
75 75
76/* does this oprom support the given pci device, or any of the devices
77 * that the driver supports?
78 */
79static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
80{
81 struct pci_driver *drv = pdev->driver;
82 const struct pci_device_id *id;
83
84 if (pdev->vendor == vendor && pdev->device == device)
85 return true;
86
87 for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
88 if (id->vendor == vendor && id->device == device)
89 break;
90
91 return id && id->vendor;
92}
93
94static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
95 const unsigned char *rom_list)
96{
97 unsigned short device;
98
99 do {
100 if (probe_kernel_address(rom_list, device) != 0)
101 device = 0;
102
103 if (device && match_id(pdev, vendor, device))
104 break;
105
106 rom_list += 2;
107 } while (device);
108
109 return !!device;
110}
111
112static struct resource *find_oprom(struct pci_dev *pdev)
113{
114 struct resource *oprom = NULL;
115 int i;
116
117 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
118 struct resource *res = &adapter_rom_resources[i];
119 unsigned short offset, vendor, device, list, rev;
120 const unsigned char *rom;
121
122 if (res->end == 0)
123 break;
124
125 rom = isa_bus_to_virt(res->start);
126 if (probe_kernel_address(rom + 0x18, offset) != 0)
127 continue;
128
129 if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
130 continue;
131
132 if (probe_kernel_address(rom + offset + 0x6, device) != 0)
133 continue;
134
135 if (match_id(pdev, vendor, device)) {
136 oprom = res;
137 break;
138 }
139
140 if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
141 probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
142 rev >= 3 && list &&
143 probe_list(pdev, vendor, rom + offset + list)) {
144 oprom = res;
145 break;
146 }
147 }
148
149 return oprom;
150}
151
152void *pci_map_biosrom(struct pci_dev *pdev)
153{
154 struct resource *oprom = find_oprom(pdev);
155
156 if (!oprom)
157 return NULL;
158
159 return ioremap(oprom->start, resource_size(oprom));
160}
161EXPORT_SYMBOL(pci_map_biosrom);
162
163void pci_unmap_biosrom(void __iomem *image)
164{
165 iounmap(image);
166}
167EXPORT_SYMBOL(pci_unmap_biosrom);
168
169size_t pci_biosrom_size(struct pci_dev *pdev)
170{
171 struct resource *oprom = find_oprom(pdev);
172
173 return oprom ? resource_size(oprom) : 0;
174}
175EXPORT_SYMBOL(pci_biosrom_size);
176
76#define ROMSIGNATURE 0xaa55 177#define ROMSIGNATURE 0xaa55
77 178
78static int __init romsignature(const unsigned char *rom) 179static int __init romsignature(const unsigned char *rom)
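
The new pci_map_biosrom()/pci_biosrom_size()/pci_unmap_biosrom() exports let a PCI driver read the option ROM image that the BIOS left shadowed in the legacy adapter-ROM region, matched against the device or its driver's id_table. A sketch of how a driver might consume it; the function name and buffer handling are illustrative:

    static int read_oprom_blob(struct pci_dev *pdev, void *buf, size_t buflen)
    {
            void __iomem *rom = pci_map_biosrom(pdev);
            size_t len = pci_biosrom_size(pdev);

            if (!rom)
                    return -ENODEV;         /* no matching shadowed ROM found */

            memcpy_fromio(buf, rom, min(len, buflen));
            pci_unmap_biosrom(rom);
            return 0;
    }
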
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d46cbe46b7ab..88a90a977f8e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -449,7 +449,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
449void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 449void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
450{ 450{
451 if (!need_resched()) { 451 if (!need_resched()) {
452 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) 452 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
453 clflush((void *)&current_thread_info()->flags); 453 clflush((void *)&current_thread_info()->flags);
454 454
455 __monitor((void *)&current_thread_info()->flags, 0, 0); 455 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -465,7 +465,7 @@ static void mwait_idle(void)
465 if (!need_resched()) { 465 if (!need_resched()) {
466 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 466 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
467 trace_cpu_idle(1, smp_processor_id()); 467 trace_cpu_idle(1, smp_processor_id());
468 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) 468 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
469 clflush((void *)&current_thread_info()->flags); 469 clflush((void *)&current_thread_info()->flags);
470 470
471 __monitor((void *)&current_thread_info()->flags, 0, 0); 471 __monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 45892dc4b72a..807c2a2b80f1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
608 unsigned len, type; 608 unsigned len, type;
609 struct perf_event *bp; 609 struct perf_event *bp;
610 610
611 if (ptrace_get_breakpoints(tsk) < 0)
612 return -ESRCH;
613
611 data &= ~DR_CONTROL_RESERVED; 614 data &= ~DR_CONTROL_RESERVED;
612 old_dr7 = ptrace_get_dr7(thread->ptrace_bps); 615 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
613restore: 616restore:
@@ -655,6 +658,9 @@ restore:
655 } 658 }
656 goto restore; 659 goto restore;
657 } 660 }
661
662 ptrace_put_breakpoints(tsk);
663
658 return ((orig_ret < 0) ? orig_ret : rc); 664 return ((orig_ret < 0) ? orig_ret : rc);
659} 665}
660 666
@@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
668 674
669 if (n < HBP_NUM) { 675 if (n < HBP_NUM) {
670 struct perf_event *bp; 676 struct perf_event *bp;
677
678 if (ptrace_get_breakpoints(tsk) < 0)
679 return -ESRCH;
680
671 bp = thread->ptrace_bps[n]; 681 bp = thread->ptrace_bps[n];
672 if (!bp) 682 if (!bp)
673 return 0; 683 val = 0;
674 val = bp->hw.info.address; 684 else
685 val = bp->hw.info.address;
686
687 ptrace_put_breakpoints(tsk);
675 } else if (n == 6) { 688 } else if (n == 6) {
676 val = thread->debugreg6; 689 val = thread->debugreg6;
677 } else if (n == 7) { 690 } else if (n == 7) {
@@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
686 struct perf_event *bp; 699 struct perf_event *bp;
687 struct thread_struct *t = &tsk->thread; 700 struct thread_struct *t = &tsk->thread;
688 struct perf_event_attr attr; 701 struct perf_event_attr attr;
702 int err = 0;
703
704 if (ptrace_get_breakpoints(tsk) < 0)
705 return -ESRCH;
689 706
690 if (!t->ptrace_bps[nr]) { 707 if (!t->ptrace_bps[nr]) {
691 ptrace_breakpoint_init(&attr); 708 ptrace_breakpoint_init(&attr);
@@ -709,24 +726,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
709 * writing for the user. And anyway this is the previous 726 * writing for the user. And anyway this is the previous
710 * behaviour. 727 * behaviour.
711 */ 728 */
712 if (IS_ERR(bp)) 729 if (IS_ERR(bp)) {
713 return PTR_ERR(bp); 730 err = PTR_ERR(bp);
731 goto put;
732 }
714 733
715 t->ptrace_bps[nr] = bp; 734 t->ptrace_bps[nr] = bp;
716 } else { 735 } else {
717 int err;
718
719 bp = t->ptrace_bps[nr]; 736 bp = t->ptrace_bps[nr];
720 737
721 attr = bp->attr; 738 attr = bp->attr;
722 attr.bp_addr = addr; 739 attr.bp_addr = addr;
723 err = modify_user_hw_breakpoint(bp, &attr); 740 err = modify_user_hw_breakpoint(bp, &attr);
724 if (err)
725 return err;
726 } 741 }
727 742
728 743put:
729 return 0; 744 ptrace_put_breakpoints(tsk);
745 return err;
730} 746}
731 747
732/* 748/*
@@ -1347,7 +1363,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1347 * We must return the syscall number to actually look up in the table. 1363 * We must return the syscall number to actually look up in the table.
1348 * This can be -1L to skip running any syscall at all. 1364 * This can be -1L to skip running any syscall at all.
1349 */ 1365 */
1350asmregparm long syscall_trace_enter(struct pt_regs *regs) 1366long syscall_trace_enter(struct pt_regs *regs)
1351{ 1367{
1352 long ret = 0; 1368 long ret = 0;
1353 1369
@@ -1392,7 +1408,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1392 return ret ?: regs->orig_ax; 1408 return ret ?: regs->orig_ax;
1393} 1409}
1394 1410
1395asmregparm void syscall_trace_leave(struct pt_regs *regs) 1411void syscall_trace_leave(struct pt_regs *regs)
1396{ 1412{
1397 bool step; 1413 bool step;
1398 1414
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 08c44b08bf5b..0c016f727695 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off);
36 36
37static const struct desc_ptr no_idt = {}; 37static const struct desc_ptr no_idt = {};
38static int reboot_mode; 38static int reboot_mode;
39enum reboot_type reboot_type = BOOT_KBD; 39enum reboot_type reboot_type = BOOT_ACPI;
40int reboot_force; 40int reboot_force;
41 41
42#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 42#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -478,9 +478,24 @@ void __attribute__((weak)) mach_reboot_fixups(void)
478{ 478{
479} 479}
480 480
481/*
482 * Windows compatible x86 hardware expects the following on reboot:
483 *
484 * 1) If the FADT has the ACPI reboot register flag set, try it
485 * 2) If still alive, write to the keyboard controller
486 * 3) If still alive, write to the ACPI reboot register again
487 * 4) If still alive, write to the keyboard controller again
488 *
489 * If the machine is still alive at this stage, it gives up. We default to
490 * following the same pattern, except that if we're still alive after (4) we'll
491 * try to force a triple fault and then cycle between hitting the keyboard
492 * controller and doing that
493 */
481static void native_machine_emergency_restart(void) 494static void native_machine_emergency_restart(void)
482{ 495{
483 int i; 496 int i;
497 int attempt = 0;
498 int orig_reboot_type = reboot_type;
484 499
485 if (reboot_emergency) 500 if (reboot_emergency)
486 emergency_vmx_disable_all(); 501 emergency_vmx_disable_all();
@@ -502,6 +517,13 @@ static void native_machine_emergency_restart(void)
502 outb(0xfe, 0x64); /* pulse reset low */ 517 outb(0xfe, 0x64); /* pulse reset low */
503 udelay(50); 518 udelay(50);
504 } 519 }
520 if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
521 attempt = 1;
522 reboot_type = BOOT_ACPI;
523 } else {
524 reboot_type = BOOT_TRIPLE;
525 }
526 break;
505 527
506 case BOOT_TRIPLE: 528 case BOOT_TRIPLE:
507 load_idt(&no_idt); 529 load_idt(&no_idt);
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
index 29092b38d816..1d5c46df0d78 100644
--- a/arch/x86/kernel/reboot_32.S
+++ b/arch/x86/kernel/reboot_32.S
@@ -21,26 +21,26 @@ r_base = .
21 /* Get our own relocated address */ 21 /* Get our own relocated address */
22 call 1f 22 call 1f
231: popl %ebx 231: popl %ebx
24 subl $1b, %ebx 24 subl $(1b - r_base), %ebx
25 25
26 /* Compute the equivalent real-mode segment */ 26 /* Compute the equivalent real-mode segment */
27 movl %ebx, %ecx 27 movl %ebx, %ecx
28 shrl $4, %ecx 28 shrl $4, %ecx
29 29
30 /* Patch post-real-mode segment jump */ 30 /* Patch post-real-mode segment jump */
31 movw dispatch_table(%ebx,%eax,2),%ax 31 movw (dispatch_table - r_base)(%ebx,%eax,2),%ax
32 movw %ax, 101f(%ebx) 32 movw %ax, (101f - r_base)(%ebx)
33 movw %cx, 102f(%ebx) 33 movw %cx, (102f - r_base)(%ebx)
34 34
35 /* Set up the IDT for real mode. */ 35 /* Set up the IDT for real mode. */
36 lidtl machine_real_restart_idt(%ebx) 36 lidtl (machine_real_restart_idt - r_base)(%ebx)
37 37
38 /* 38 /*
39 * Set up a GDT from which we can load segment descriptors for real 39 * Set up a GDT from which we can load segment descriptors for real
40 * mode. The GDT is not used in real mode; it is just needed here to 40 * mode. The GDT is not used in real mode; it is just needed here to
41 * prepare the descriptors. 41 * prepare the descriptors.
42 */ 42 */
43 lgdtl machine_real_restart_gdt(%ebx) 43 lgdtl (machine_real_restart_gdt - r_base)(%ebx)
44 44
45 /* 45 /*
46 * Load the data segment registers with 16-bit compatible values 46 * Load the data segment registers with 16-bit compatible values
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4be9b398470e..a3e5948670c2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -691,8 +691,6 @@ early_param("reservelow", parse_reservelow);
691 691
692void __init setup_arch(char **cmdline_p) 692void __init setup_arch(char **cmdline_p)
693{ 693{
694 unsigned long flags;
695
696#ifdef CONFIG_X86_32 694#ifdef CONFIG_X86_32
697 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 695 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
698 visws_early_detect(); 696 visws_early_detect();
@@ -948,6 +946,8 @@ void __init setup_arch(char **cmdline_p)
948 if (init_ohci1394_dma_early) 946 if (init_ohci1394_dma_early)
949 init_ohci1394_dma_on_all_controllers(); 947 init_ohci1394_dma_on_all_controllers();
950#endif 948#endif
949 /* Allocate bigger log buffer */
950 setup_log_buf(1);
951 951
952 reserve_initrd(); 952 reserve_initrd();
953 953
@@ -966,7 +966,6 @@ void __init setup_arch(char **cmdline_p)
966 966
967 initmem_init(); 967 initmem_init();
968 memblock_find_dma_reserve(); 968 memblock_find_dma_reserve();
969 dma32_reserve_bootmem();
970 969
971#ifdef CONFIG_KVM_CLOCK 970#ifdef CONFIG_KVM_CLOCK
972 kvmclock_init(); 971 kvmclock_init();
@@ -1041,9 +1040,7 @@ void __init setup_arch(char **cmdline_p)
1041 1040
1042 mcheck_init(); 1041 mcheck_init();
1043 1042
1044 local_irq_save(flags); 1043 arch_init_ideal_nops();
1045 arch_init_ideal_nop5();
1046 local_irq_restore(flags);
1047} 1044}
1048 1045
1049#ifdef CONFIG_X86_32 1046#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4fd173cd8e57..40a24932a8a1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -601,10 +601,7 @@ long sys_rt_sigreturn(struct pt_regs *regs)
601 goto badframe; 601 goto badframe;
602 602
603 sigdelsetmask(&set, ~_BLOCKABLE); 603 sigdelsetmask(&set, ~_BLOCKABLE);
604 spin_lock_irq(&current->sighand->siglock); 604 set_current_blocked(&set);
605 current->blocked = set;
606 recalc_sigpending();
607 spin_unlock_irq(&current->sighand->siglock);
608 605
609 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 606 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
610 goto badframe; 607 goto badframe;
@@ -682,6 +679,7 @@ static int
682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 679handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
683 sigset_t *oldset, struct pt_regs *regs) 680 sigset_t *oldset, struct pt_regs *regs)
684{ 681{
682 sigset_t blocked;
685 int ret; 683 int ret;
686 684
687 /* Are we from a system call? */ 685 /* Are we from a system call? */
@@ -741,12 +739,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
741 */ 739 */
742 regs->flags &= ~X86_EFLAGS_TF; 740 regs->flags &= ~X86_EFLAGS_TF;
743 741
744 spin_lock_irq(&current->sighand->siglock); 742 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
745 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
746 if (!(ka->sa.sa_flags & SA_NODEFER)) 743 if (!(ka->sa.sa_flags & SA_NODEFER))
747 sigaddset(&current->blocked, sig); 744 sigaddset(&blocked, sig);
748 recalc_sigpending(); 745 set_current_blocked(&blocked);
749 spin_unlock_irq(&current->sighand->siglock);
750 746
751 tracehook_signal_handler(sig, info, ka, regs, 747 tracehook_signal_handler(sig, info, ka, regs,
752 test_thread_flag(TIF_SINGLESTEP)); 748 test_thread_flag(TIF_SINGLESTEP));
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 513deac7228d..013e7eba83bb 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait)
194} 194}
195 195
196/* 196/*
197 * Reschedule call back. Nothing to do, 197 * Reschedule call back.
198 * all the work is done automatically when
199 * we return from the interrupt.
200 */ 198 */
201void smp_reschedule_interrupt(struct pt_regs *regs) 199void smp_reschedule_interrupt(struct pt_regs *regs)
202{ 200{
203 ack_APIC_irq(); 201 ack_APIC_irq();
204 inc_irq_stat(irq_resched_count); 202 inc_irq_stat(irq_resched_count);
203 scheduler_ipi();
205 /* 204 /*
206 * KVM uses this interrupt to force a cpu out of guest mode 205 * KVM uses this interrupt to force a cpu out of guest mode
207 */ 206 */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c2871d3c71b6..a3c430bdfb60 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1332,9 +1332,9 @@ static inline void mwait_play_dead(void)
1332 void *mwait_ptr; 1332 void *mwait_ptr;
1333 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); 1333 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
1334 1334
1335 if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c))) 1335 if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c))
1336 return; 1336 return;
1337 if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) 1337 if (!this_cpu_has(X86_FEATURE_CLFLSH))
1338 return; 1338 return;
1339 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) 1339 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1340 return; 1340 return;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 6515733a289d..55d9bc03f696 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -9,15 +9,6 @@
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
11 11
12static void save_stack_warning(void *data, char *msg)
13{
14}
15
16static void
17save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
18{
19}
20
21static int save_stack_stack(void *data, char *name) 12static int save_stack_stack(void *data, char *name)
22{ 13{
23 return 0; 14 return 0;
@@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
53} 44}
54 45
55static const struct stacktrace_ops save_stack_ops = { 46static const struct stacktrace_ops save_stack_ops = {
56 .warning = save_stack_warning,
57 .warning_symbol = save_stack_warning_symbol,
58 .stack = save_stack_stack, 47 .stack = save_stack_stack,
59 .address = save_stack_address, 48 .address = save_stack_address,
60 .walk_stack = print_context_stack, 49 .walk_stack = print_context_stack,
61}; 50};
62 51
63static const struct stacktrace_ops save_stack_ops_nosched = { 52static const struct stacktrace_ops save_stack_ops_nosched = {
64 .warning = save_stack_warning,
65 .warning_symbol = save_stack_warning_symbol,
66 .stack = save_stack_stack, 53 .stack = save_stack_stack,
67 .address = save_stack_address_nosched, 54 .address = save_stack_address_nosched,
68 .walk_stack = print_context_stack, 55 .walk_stack = print_context_stack,
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index abce34d5c79d..32cbffb0c494 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -344,3 +344,4 @@ ENTRY(sys_call_table)
344 .long sys_open_by_handle_at 344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime 345 .long sys_clock_adjtime
346 .long sys_syncfs 346 .long sys_syncfs
347 .long sys_sendmmsg /* 345 */
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 998e972f3b1a..30ac65df7d4e 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -110,7 +110,6 @@ static struct mm_struct tboot_mm = {
110 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), 110 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
111 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 111 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
112 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 112 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
113 .cpu_vm_mask = CPU_MASK_ALL,
114}; 113};
115 114
116static inline void switch_to_tboot_pt(void) 115static inline void switch_to_tboot_pt(void)
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 787a5e499dd1..3f92ce07e525 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -161,7 +161,7 @@ static int test_NX(void)
161 } 161 }
162 162
163#endif 163#endif
164 return 0; 164 return ret;
165} 165}
166 166
167static void test_exit(void) 167static void test_exit(void)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25a28a245937..00cbb272627f 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -23,7 +23,7 @@
23#include <asm/time.h> 23#include <asm/time.h>
24 24
25#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
26volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; 26DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
27#endif 27#endif
28 28
29unsigned long profile_pc(struct pt_regs *regs) 29unsigned long profile_pc(struct pt_regs *regs)
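
jiffies here, and vgetcpu_mode / vsyscall_gtod_data below, move from ad-hoc __section_* annotations to a common DEFINE_VVAR()/VVAR() scheme: the kernel writes the variable through its ordinary symbol, while vsyscall code such as the new vread_tsc() reads it through a fixed, read-only alias laid out by the EMIT_VVAR hunk in vmlinux.lds.S. A simplified sketch of the idea, not the literal asm/vvar.h contents; the alias address and offset are illustrative:

    /* kernel side: writable instance, placed in its own section so the
     * linker script can pin it at a known offset in the vsyscall area */
    volatile unsigned long jiffies
            __attribute__((section(".vsyscall_var_jiffies"))) = INITIAL_JIFFIES;

    /* reader side: the same storage seen through the fixed user-visible
     * mapping (address and offset illustrative) */
    static volatile const unsigned long * const vvaraddr_jiffies =
            (void *)(0xffffffffff600000UL /* vsyscall base */ + 0x0 /* offset */);

    #define VVAR(name) (*vvaraddr_ ## name)   /* so VVAR(jiffies) reads the alias */
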
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9335bf7dd2e7..6cc6922262af 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,25 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
763 ret : clocksource_tsc.cycle_last; 763 ret : clocksource_tsc.cycle_last;
764} 764}
765 765
766#ifdef CONFIG_X86_64
767static cycle_t __vsyscall_fn vread_tsc(void)
768{
769 cycle_t ret;
770
771 /*
772 * Surround the RDTSC by barriers, to make sure it's not
773 * speculated to outside the seqlock critical section and
774 * does not cause time warps:
775 */
776 rdtsc_barrier();
777 ret = (cycle_t)vget_cycles();
778 rdtsc_barrier();
779
780 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
781 ret : __vsyscall_gtod_data.clock.cycle_last;
782}
783#endif
784
785static void resume_tsc(struct clocksource *cs) 766static void resume_tsc(struct clocksource *cs)
786{ 767{
787 clocksource_tsc.cycle_last = 0; 768 clocksource_tsc.cycle_last = 0;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 624a2016198e..89aed99aafce 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -161,6 +161,12 @@ SECTIONS
161 161
162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) 162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
164#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
165 ADDR(.vsyscall_0) + offset \
166 : AT(VLOAD(.vsyscall_var_ ## x)) { \
167 *(.vsyscall_var_ ## x) \
168 } \
169 x = VVIRT(.vsyscall_var_ ## x);
164 170
165 . = ALIGN(4096); 171 . = ALIGN(4096);
166 __vsyscall_0 = .; 172 __vsyscall_0 = .;
@@ -175,18 +181,6 @@ SECTIONS
175 *(.vsyscall_fn) 181 *(.vsyscall_fn)
176 } 182 }
177 183
178 . = ALIGN(L1_CACHE_BYTES);
179 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
180 *(.vsyscall_gtod_data)
181 }
182
183 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
184 .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
185 *(.vsyscall_clock)
186 }
187 vsyscall_clock = VVIRT(.vsyscall_clock);
188
189
190 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { 184 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
191 *(.vsyscall_1) 185 *(.vsyscall_1)
192 } 186 }
@@ -194,21 +188,14 @@ SECTIONS
194 *(.vsyscall_2) 188 *(.vsyscall_2)
195 } 189 }
196 190
197 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
198 *(.vgetcpu_mode)
199 }
200 vgetcpu_mode = VVIRT(.vgetcpu_mode);
201
202 . = ALIGN(L1_CACHE_BYTES);
203 .jiffies : AT(VLOAD(.jiffies)) {
204 *(.jiffies)
205 }
206 jiffies = VVIRT(.jiffies);
207
208 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { 191 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
209 *(.vsyscall_3) 192 *(.vsyscall_3)
210 } 193 }
211 194
195#define __VVAR_KERNEL_LDS
196#include <asm/vvar.h>
197#undef __VVAR_KERNEL_LDS
198
212 . = __vsyscall_0 + PAGE_SIZE; 199 . = __vsyscall_0 + PAGE_SIZE;
213 200
214#undef VSYSCALL_ADDR 201#undef VSYSCALL_ADDR
@@ -216,6 +203,7 @@ SECTIONS
216#undef VLOAD 203#undef VLOAD
217#undef VVIRT_OFFSET 204#undef VVIRT_OFFSET
218#undef VVIRT 205#undef VVIRT
206#undef EMIT_VVAR
219 207
220#endif /* CONFIG_X86_64 */ 208#endif /* CONFIG_X86_64 */
221 209
@@ -306,6 +294,13 @@ SECTIONS
306 } 294 }
307 295
308 . = ALIGN(8); 296 . = ALIGN(8);
297 .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
298 __apicdrivers = .;
299 *(.apicdrivers);
300 __apicdrivers_end = .;
301 }
302
303 . = ALIGN(8);
309 /* 304 /*
310 * .exit.text is discard at runtime, not link time, to deal with 305 * .exit.text is discard at runtime, not link time, to deal with
311 * references from .altinstructions and .eh_frame 306 * references from .altinstructions and .eh_frame
@@ -319,7 +314,7 @@ SECTIONS
319 } 314 }
320 315
321#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 316#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
322 PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE) 317 PERCPU_SECTION(INTERNODE_CACHE_BYTES)
323#endif 318#endif
324 319
325 . = ALIGN(PAGE_SIZE); 320 . = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
new file mode 100644
index 000000000000..a81aa9e9894c
--- /dev/null
+++ b/arch/x86/kernel/vread_tsc_64.c
@@ -0,0 +1,36 @@
1/* This code runs in userspace. */
2
3#define DISABLE_BRANCH_PROFILING
4#include <asm/vgtod.h>
5
6notrace cycle_t __vsyscall_fn vread_tsc(void)
7{
8 cycle_t ret;
9 u64 last;
10
11 /*
12 * Empirically, a fence (of type that depends on the CPU)
13 * before rdtsc is enough to ensure that rdtsc is ordered
14 * with respect to loads. The various CPU manuals are unclear
15 * as to whether rdtsc can be reordered with later loads,
16 * but no one has ever seen it happen.
17 */
18 rdtsc_barrier();
19 ret = (cycle_t)vget_cycles();
20
21 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
22
23 if (likely(ret >= last))
24 return ret;
25
26 /*
27 * GCC likes to generate cmov here, but this branch is extremely
28 * predictable (it's just a function of time and the likely is
29 * very likely) and there's a data dependence, so force GCC
30 * to generate a branch instead. I don't barrier() because
31 * we don't actually need a barrier, and if this function
32 * ever gets inlined it will generate worse code.
33 */
34 asm volatile ("");
35 return last;
36}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b694..3e682184d76c 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -49,17 +49,10 @@
49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace 49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
50#define __syscall_clobber "r11","cx","memory" 50#define __syscall_clobber "r11","cx","memory"
51 51
52/* 52DEFINE_VVAR(int, vgetcpu_mode);
53 * vsyscall_gtod_data contains data that is : 53DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
54 * - readonly from vsyscalls
55 * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
56 * Try to keep this structure as small as possible to avoid cache line ping pongs
57 */
58int __vgetcpu_mode __section_vgetcpu_mode;
59
60struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
61{ 54{
62 .lock = SEQLOCK_UNLOCKED, 55 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
63 .sysctl_enabled = 1, 56 .sysctl_enabled = 1,
64}; 57};
65 58
@@ -97,7 +90,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
97 */ 90 */
98static __always_inline void do_get_tz(struct timezone * tz) 91static __always_inline void do_get_tz(struct timezone * tz)
99{ 92{
100 *tz = __vsyscall_gtod_data.sys_tz; 93 *tz = VVAR(vsyscall_gtod_data).sys_tz;
101} 94}
102 95
103static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -126,23 +119,24 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
126 unsigned long mult, shift, nsec; 119 unsigned long mult, shift, nsec;
127 cycle_t (*vread)(void); 120 cycle_t (*vread)(void);
128 do { 121 do {
129 seq = read_seqbegin(&__vsyscall_gtod_data.lock); 122 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
130 123
131 vread = __vsyscall_gtod_data.clock.vread; 124 vread = VVAR(vsyscall_gtod_data).clock.vread;
132 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { 125 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
126 !vread)) {
133 gettimeofday(tv,NULL); 127 gettimeofday(tv,NULL);
134 return; 128 return;
135 } 129 }
136 130
137 now = vread(); 131 now = vread();
138 base = __vsyscall_gtod_data.clock.cycle_last; 132 base = VVAR(vsyscall_gtod_data).clock.cycle_last;
139 mask = __vsyscall_gtod_data.clock.mask; 133 mask = VVAR(vsyscall_gtod_data).clock.mask;
140 mult = __vsyscall_gtod_data.clock.mult; 134 mult = VVAR(vsyscall_gtod_data).clock.mult;
141 shift = __vsyscall_gtod_data.clock.shift; 135 shift = VVAR(vsyscall_gtod_data).clock.shift;
142 136
143 tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; 137 tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
144 nsec = __vsyscall_gtod_data.wall_time_nsec; 138 nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
145 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); 139 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
146 140
147 /* calculate interval: */ 141 /* calculate interval: */
148 cycle_delta = (now - base) & mask; 142 cycle_delta = (now - base) & mask;
@@ -171,15 +165,15 @@ time_t __vsyscall(1) vtime(time_t *t)
171{ 165{
172 unsigned seq; 166 unsigned seq;
173 time_t result; 167 time_t result;
174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 168 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
175 return time_syscall(t); 169 return time_syscall(t);
176 170
177 do { 171 do {
178 seq = read_seqbegin(&__vsyscall_gtod_data.lock); 172 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
179 173
180 result = __vsyscall_gtod_data.wall_time_sec; 174 result = VVAR(vsyscall_gtod_data).wall_time_sec;
181 175
182 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); 176 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
183 177
184 if (t) 178 if (t)
185 *t = result; 179 *t = result;
@@ -208,9 +202,9 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
208 We do this here because otherwise user space would do it on 202 We do this here because otherwise user space would do it on
209 its own in a likely inferior way (no access to jiffies). 203 its own in a likely inferior way (no access to jiffies).
210 If you don't like it pass NULL. */ 204 If you don't like it pass NULL. */
211 if (tcache && tcache->blob[0] == (j = __jiffies)) { 205 if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
212 p = tcache->blob[1]; 206 p = tcache->blob[1];
213 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { 207 } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
214 /* Load per CPU data from RDTSCP */ 208 /* Load per CPU data from RDTSCP */
215 native_read_tscp(&p); 209 native_read_tscp(&p);
216 } else { 210 } else {
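
The rewritten do_vgettimeofday() above is the standard clocksource read side: take a seqlock snapshot, mask the TSC delta against cycle_last, scale it by mult/shift, and retry if the writer changed the structure meanwhile. A self-contained sketch of that pattern follows (plain userspace C; all names are illustrative, and the barriers and the ns-to-seconds carry of the real code are omitted).

#include <stdint.h>

struct sample_gtod {
	volatile uint32_t seq;       /* even = stable, odd = writer active */
	uint64_t cycle_last;         /* counter value at the last update   */
	uint64_t mask;               /* valid bits of the counter          */
	uint32_t mult, shift;        /* cycles -> ns scaling factors       */
	uint64_t wall_sec, wall_ns;  /* wall clock at cycle_last           */
};

/* Lockless reader: retry until a stable snapshot, then scale the delta. */
static uint64_t demo_gettime_ns(const struct sample_gtod *g, uint64_t now,
				uint64_t *sec)
{
	uint32_t seq;
	uint64_t delta, ns;

	do {
		seq   = g->seq;                        /* read_seqbegin() analogue */
		delta = (now - g->cycle_last) & g->mask;
		ns    = g->wall_ns + ((delta * g->mult) >> g->shift);
		*sec  = g->wall_sec;
	} while ((seq & 1) || seq != g->seq);          /* read_seqretry() analogue */

	return ns;
}

int main(void)
{
	struct sample_gtod g = { .seq = 0, .cycle_last = 1000, .mask = ~0ULL,
				 .mult = 4, .shift = 2, .wall_sec = 1, .wall_ns = 0 };
	uint64_t sec;
	uint64_t ns = demo_gettime_ns(&g, 2000, &sec); /* (1000 * 4) >> 2 = 1000 ns */

	return (ns == 1000 && sec == 1) ? 0 : 1;
}
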
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c11514e9128b..6f164bd5e14d 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -35,7 +35,7 @@ void iommu_shutdown_noop(void) { }
35struct x86_init_ops x86_init __initdata = { 35struct x86_init_ops x86_init __initdata = {
36 36
37 .resources = { 37 .resources = {
38 .probe_roms = x86_init_noop, 38 .probe_roms = probe_roms,
39 .reserve_resources = reserve_standard_io_resources, 39 .reserve_resources = reserve_standard_io_resources,
40 .memory_setup = default_machine_specific_memory_setup, 40 .memory_setup = default_machine_specific_memory_setup,
41 }, 41 },
@@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
61 .banner = default_banner, 61 .banner = default_banner,
62 }, 62 },
63 63
64 .mapping = {
65 .pagetable_reserve = native_pagetable_reserve,
66 },
67
64 .paging = { 68 .paging = {
65 .pagetable_setup_start = native_pagetable_setup_start, 69 .pagetable_setup_start = native_pagetable_setup_start,
66 .pagetable_setup_done = native_pagetable_setup_done, 70 .pagetable_setup_done = native_pagetable_setup_done,
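
The new x86_init.mapping.pagetable_reserve entry follows the existing x86_init convention: the table ships with the native implementation and a platform overrides the pointer early in boot, with paravirtualized guests being the intended users of this particular hook. A sketch of the override pattern, using a hypothetical platform function and assuming the (u64 start, u64 end) signature of the new hook:

/* Illustration only: a platform swapping in its own hook during early boot. */
extern void example_pagetable_reserve(u64 start, u64 end);	/* hypothetical */

static void __init example_platform_setup(void)
{
	x86_init.mapping.pagetable_reserve = example_pagetable_reserve;
}
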
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0ad47b819a8b..d6e2477feb18 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -73,9 +73,14 @@
73#define MemAbs (1<<11) /* Memory operand is absolute displacement */ 73#define MemAbs (1<<11) /* Memory operand is absolute displacement */
74#define String (1<<12) /* String instruction (rep capable) */ 74#define String (1<<12) /* String instruction (rep capable) */
75#define Stack (1<<13) /* Stack instruction (push/pop) */ 75#define Stack (1<<13) /* Stack instruction (push/pop) */
76#define GroupMask (7<<14) /* Opcode uses one of the group mechanisms */
76#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 77#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
77#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 78#define GroupDual (2<<14) /* Alternate decoding of mod == 3 */
79#define Prefix (3<<14) /* Instruction varies with 66/f2/f3 prefix */
80#define RMExt (4<<14) /* Opcode extension in ModRM r/m if mod == 3 */
81#define Sse (1<<17) /* SSE Vector instruction */
78/* Misc flags */ 82/* Misc flags */
83#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
79#define VendorSpecific (1<<22) /* Vendor specific instruction */ 84#define VendorSpecific (1<<22) /* Vendor specific instruction */
80#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ 85#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
81#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ 86#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
@@ -102,11 +107,14 @@
102 107
103struct opcode { 108struct opcode {
104 u32 flags; 109 u32 flags;
110 u8 intercept;
105 union { 111 union {
106 int (*execute)(struct x86_emulate_ctxt *ctxt); 112 int (*execute)(struct x86_emulate_ctxt *ctxt);
107 struct opcode *group; 113 struct opcode *group;
108 struct group_dual *gdual; 114 struct group_dual *gdual;
115 struct gprefix *gprefix;
109 } u; 116 } u;
117 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
110}; 118};
111 119
112struct group_dual { 120struct group_dual {
@@ -114,6 +122,13 @@ struct group_dual {
114 struct opcode mod3[8]; 122 struct opcode mod3[8];
115}; 123};
116 124
125struct gprefix {
126 struct opcode pfx_no;
127 struct opcode pfx_66;
128 struct opcode pfx_f2;
129 struct opcode pfx_f3;
130};
131
117/* EFLAGS bit definitions. */ 132/* EFLAGS bit definitions. */
118#define EFLG_ID (1<<21) 133#define EFLG_ID (1<<21)
119#define EFLG_VIP (1<<20) 134#define EFLG_VIP (1<<20)
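
The widened GroupMask field above turns the old Group/GroupDual booleans into a four-way selector (Group, GroupDual, Prefix, RMExt), and struct gprefix holds the four decodings an opcode can have under no prefix or a mandatory 66/F2/F3 prefix. A sketch of how a decoder can pick the variant once GroupMask says Prefix; the helper name is invented and it reuses the struct opcode and struct gprefix definitions above.

/* Illustration only: select the opcode variant for a mandatory prefix. */
static struct opcode pick_prefixed_variant(struct gprefix *g, u8 mandatory_prefix)
{
	switch (mandatory_prefix) {
	case 0x66: return g->pfx_66;	/* operand-size prefix variant */
	case 0xf2: return g->pfx_f2;	/* REPNE/F2 variant            */
	case 0xf3: return g->pfx_f3;	/* REP/F3 variant              */
	default:   return g->pfx_no;	/* no mandatory prefix         */
	}
}
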
@@ -248,42 +263,42 @@ struct group_dual {
248 "w", "r", _LO32, "r", "", "r") 263 "w", "r", _LO32, "r", "", "r")
249 264
250/* Instruction has three operands and one operand is stored in ECX register */ 265/* Instruction has three operands and one operand is stored in ECX register */
251#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ 266#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
252 do { \ 267 do { \
253 unsigned long _tmp; \ 268 unsigned long _tmp; \
254 _type _clv = (_cl).val; \ 269 _type _clv = (_cl).val; \
255 _type _srcv = (_src).val; \ 270 _type _srcv = (_src).val; \
256 _type _dstv = (_dst).val; \ 271 _type _dstv = (_dst).val; \
257 \ 272 \
258 __asm__ __volatile__ ( \ 273 __asm__ __volatile__ ( \
259 _PRE_EFLAGS("0", "5", "2") \ 274 _PRE_EFLAGS("0", "5", "2") \
260 _op _suffix " %4,%1 \n" \ 275 _op _suffix " %4,%1 \n" \
261 _POST_EFLAGS("0", "5", "2") \ 276 _POST_EFLAGS("0", "5", "2") \
262 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ 277 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
263 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ 278 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
264 ); \ 279 ); \
265 \ 280 \
266 (_cl).val = (unsigned long) _clv; \ 281 (_cl).val = (unsigned long) _clv; \
267 (_src).val = (unsigned long) _srcv; \ 282 (_src).val = (unsigned long) _srcv; \
268 (_dst).val = (unsigned long) _dstv; \ 283 (_dst).val = (unsigned long) _dstv; \
269 } while (0) 284 } while (0)
270 285
271#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ 286#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
272 do { \ 287 do { \
273 switch ((_dst).bytes) { \ 288 switch ((_dst).bytes) { \
274 case 2: \ 289 case 2: \
275 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 290 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
276 "w", unsigned short); \ 291 "w", unsigned short); \
277 break; \ 292 break; \
278 case 4: \ 293 case 4: \
279 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 294 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
280 "l", unsigned int); \ 295 "l", unsigned int); \
281 break; \ 296 break; \
282 case 8: \ 297 case 8: \
283 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 298 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
284 "q", unsigned long)); \ 299 "q", unsigned long)); \
285 break; \ 300 break; \
286 } \ 301 } \
287 } while (0) 302 } while (0)
288 303
289#define __emulate_1op(_op, _dst, _eflags, _suffix) \ 304#define __emulate_1op(_op, _dst, _eflags, _suffix) \
@@ -346,13 +361,25 @@ struct group_dual {
346 } while (0) 361 } while (0)
347 362
348/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ 363/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
349#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ 364#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
350 do { \ 365 do { \
351 switch((_src).bytes) { \ 366 switch((_src).bytes) { \
352 case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ 367 case 1: \
353 case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ 368 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
354 case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ 369 _eflags, "b"); \
355 case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ 370 break; \
371 case 2: \
372 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
373 _eflags, "w"); \
374 break; \
375 case 4: \
376 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
377 _eflags, "l"); \
378 break; \
379 case 8: \
380 ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
381 _eflags, "q")); \
382 break; \
356 } \ 383 } \
357 } while (0) 384 } while (0)
358 385
@@ -388,13 +415,33 @@ struct group_dual {
388 (_type)_x; \ 415 (_type)_x; \
389}) 416})
390 417
391#define insn_fetch_arr(_arr, _size, _eip) \ 418#define insn_fetch_arr(_arr, _size, _eip) \
392({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ 419({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
393 if (rc != X86EMUL_CONTINUE) \ 420 if (rc != X86EMUL_CONTINUE) \
394 goto done; \ 421 goto done; \
395 (_eip) += (_size); \ 422 (_eip) += (_size); \
396}) 423})
397 424
425static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
426 enum x86_intercept intercept,
427 enum x86_intercept_stage stage)
428{
429 struct x86_instruction_info info = {
430 .intercept = intercept,
431 .rep_prefix = ctxt->decode.rep_prefix,
432 .modrm_mod = ctxt->decode.modrm_mod,
433 .modrm_reg = ctxt->decode.modrm_reg,
434 .modrm_rm = ctxt->decode.modrm_rm,
435 .src_val = ctxt->decode.src.val64,
436 .src_bytes = ctxt->decode.src.bytes,
437 .dst_bytes = ctxt->decode.dst.bytes,
438 .ad_bytes = ctxt->decode.ad_bytes,
439 .next_rip = ctxt->eip,
440 };
441
442 return ctxt->ops->intercept(ctxt, &info, stage);
443}
444
398static inline unsigned long ad_mask(struct decode_cache *c) 445static inline unsigned long ad_mask(struct decode_cache *c)
399{ 446{
400 return (1UL << (c->ad_bytes << 3)) - 1; 447 return (1UL << (c->ad_bytes << 3)) - 1;
@@ -430,6 +477,13 @@ static inline void jmp_rel(struct decode_cache *c, int rel)
430 register_address_increment(c, &c->eip, rel); 477 register_address_increment(c, &c->eip, rel);
431} 478}
432 479
480static u32 desc_limit_scaled(struct desc_struct *desc)
481{
482 u32 limit = get_desc_limit(desc);
483
484 return desc->g ? (limit << 12) | 0xfff : limit;
485}
486
433static void set_seg_override(struct decode_cache *c, int seg) 487static void set_seg_override(struct decode_cache *c, int seg)
434{ 488{
435 c->has_seg_override = true; 489 c->has_seg_override = true;
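
desc_limit_scaled() encodes the hardware granularity rule: with the G bit set, the 20-bit raw limit counts 4 KiB pages and the low 12 bits of the effective limit are forced to one. A standalone check of that arithmetic (userspace C, names invented):

#include <assert.h>
#include <stdint.h>

/* Same rule as desc_limit_scaled(): G=1 means the raw limit counts 4 KiB pages. */
static uint32_t scale_limit(uint32_t raw_limit, int g)
{
	return g ? (raw_limit << 12) | 0xfff : raw_limit;
}

int main(void)
{
	assert(scale_limit(0x9f, 1) == 0x9ffff);	/* 0x9f pages -> 640 KiB - 1 */
	assert(scale_limit(0x9f, 0) == 0x9f);		/* byte granularity          */
	return 0;
}
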
@@ -442,11 +496,10 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
442 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 496 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
443 return 0; 497 return 0;
444 498
445 return ops->get_cached_segment_base(seg, ctxt->vcpu); 499 return ops->get_cached_segment_base(ctxt, seg);
446} 500}
447 501
448static unsigned seg_override(struct x86_emulate_ctxt *ctxt, 502static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
449 struct x86_emulate_ops *ops,
450 struct decode_cache *c) 503 struct decode_cache *c)
451{ 504{
452 if (!c->has_seg_override) 505 if (!c->has_seg_override)
@@ -455,18 +508,6 @@ static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
455 return c->seg_override; 508 return c->seg_override;
456} 509}
457 510
458static ulong linear(struct x86_emulate_ctxt *ctxt,
459 struct segmented_address addr)
460{
461 struct decode_cache *c = &ctxt->decode;
462 ulong la;
463
464 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
465 if (c->ad_bytes != 8)
466 la &= (u32)-1;
467 return la;
468}
469
470static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 511static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
471 u32 error, bool valid) 512 u32 error, bool valid)
472{ 513{
@@ -476,11 +517,21 @@ static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
476 return X86EMUL_PROPAGATE_FAULT; 517 return X86EMUL_PROPAGATE_FAULT;
477} 518}
478 519
520static int emulate_db(struct x86_emulate_ctxt *ctxt)
521{
522 return emulate_exception(ctxt, DB_VECTOR, 0, false);
523}
524
479static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 525static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
480{ 526{
481 return emulate_exception(ctxt, GP_VECTOR, err, true); 527 return emulate_exception(ctxt, GP_VECTOR, err, true);
482} 528}
483 529
530static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
531{
532 return emulate_exception(ctxt, SS_VECTOR, err, true);
533}
534
484static int emulate_ud(struct x86_emulate_ctxt *ctxt) 535static int emulate_ud(struct x86_emulate_ctxt *ctxt)
485{ 536{
486 return emulate_exception(ctxt, UD_VECTOR, 0, false); 537 return emulate_exception(ctxt, UD_VECTOR, 0, false);
@@ -496,6 +547,128 @@ static int emulate_de(struct x86_emulate_ctxt *ctxt)
496 return emulate_exception(ctxt, DE_VECTOR, 0, false); 547 return emulate_exception(ctxt, DE_VECTOR, 0, false);
497} 548}
498 549
550static int emulate_nm(struct x86_emulate_ctxt *ctxt)
551{
552 return emulate_exception(ctxt, NM_VECTOR, 0, false);
553}
554
555static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
556{
557 u16 selector;
558 struct desc_struct desc;
559
560 ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
561 return selector;
562}
563
564static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
565 unsigned seg)
566{
567 u16 dummy;
568 u32 base3;
569 struct desc_struct desc;
570
571 ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
572 ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
573}
574
575static int __linearize(struct x86_emulate_ctxt *ctxt,
576 struct segmented_address addr,
577 unsigned size, bool write, bool fetch,
578 ulong *linear)
579{
580 struct decode_cache *c = &ctxt->decode;
581 struct desc_struct desc;
582 bool usable;
583 ulong la;
584 u32 lim;
585 u16 sel;
586 unsigned cpl, rpl;
587
588 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
589 switch (ctxt->mode) {
590 case X86EMUL_MODE_REAL:
591 break;
592 case X86EMUL_MODE_PROT64:
593 if (((signed long)la << 16) >> 16 != la)
594 return emulate_gp(ctxt, 0);
595 break;
596 default:
597 usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
598 addr.seg);
599 if (!usable)
600 goto bad;
601 /* code segment or read-only data segment */
602 if (((desc.type & 8) || !(desc.type & 2)) && write)
603 goto bad;
604 /* unreadable code segment */
605 if (!fetch && (desc.type & 8) && !(desc.type & 2))
606 goto bad;
607 lim = desc_limit_scaled(&desc);
608 if ((desc.type & 8) || !(desc.type & 4)) {
609 /* expand-up segment */
610 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
611 goto bad;
612 } else {
613   /* expand-down segment */
614 if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
615 goto bad;
616 lim = desc.d ? 0xffffffff : 0xffff;
617 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
618 goto bad;
619 }
620 cpl = ctxt->ops->cpl(ctxt);
621 rpl = sel & 3;
622 cpl = max(cpl, rpl);
623 if (!(desc.type & 8)) {
624 /* data segment */
625 if (cpl > desc.dpl)
626 goto bad;
627 } else if ((desc.type & 8) && !(desc.type & 4)) {
628 /* nonconforming code segment */
629 if (cpl != desc.dpl)
630 goto bad;
631 } else if ((desc.type & 8) && (desc.type & 4)) {
632 /* conforming code segment */
633 if (cpl < desc.dpl)
634 goto bad;
635 }
636 break;
637 }
638 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8)
639 la &= (u32)-1;
640 *linear = la;
641 return X86EMUL_CONTINUE;
642bad:
643 if (addr.seg == VCPU_SREG_SS)
644 return emulate_ss(ctxt, addr.seg);
645 else
646 return emulate_gp(ctxt, addr.seg);
647}
648
649static int linearize(struct x86_emulate_ctxt *ctxt,
650 struct segmented_address addr,
651 unsigned size, bool write,
652 ulong *linear)
653{
654 return __linearize(ctxt, addr, size, write, false, linear);
655}
656
657
658static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
659 struct segmented_address addr,
660 void *data,
661 unsigned size)
662{
663 int rc;
664 ulong linear;
665
666 rc = linearize(ctxt, addr, size, false, &linear);
667 if (rc != X86EMUL_CONTINUE)
668 return rc;
669 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
670}
671
499static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 672static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
500 struct x86_emulate_ops *ops, 673 struct x86_emulate_ops *ops,
501 unsigned long eip, u8 *dest) 674 unsigned long eip, u8 *dest)
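
In the 64-bit case, __linearize() above rejects non-canonical addresses with a shift pair: the linear address survives a left-then-right shift by 16 only when bits 63..47 are sign-extended copies of bit 47. A standalone illustration (userspace C, names invented):

#include <stdint.h>
#include <stdio.h>

/* Canonical iff bits 63..47 are all copies of bit 47; the shift pair is the
 * same trick __linearize() uses (unsigned left shift avoids UB, and the
 * arithmetic right shift sign-extends on the kernel's supported compilers). */
static int is_canonical(uint64_t la)
{
	return (uint64_t)(((int64_t)(la << 16)) >> 16) == la;
}

int main(void)
{
	printf("%d\n", is_canonical(0x00007fffffffffffULL));	/* 1: top of lower half  */
	printf("%d\n", is_canonical(0xffff800000000000ULL));	/* 1: base of upper half */
	printf("%d\n", is_canonical(0x0000800000000000ULL));	/* 0: inside the hole    */
	return 0;
}
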
@@ -505,10 +678,15 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
505 int size, cur_size; 678 int size, cur_size;
506 679
507 if (eip == fc->end) { 680 if (eip == fc->end) {
681 unsigned long linear;
682 struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip};
508 cur_size = fc->end - fc->start; 683 cur_size = fc->end - fc->start;
509 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); 684 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
510 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, 685 rc = __linearize(ctxt, addr, size, false, true, &linear);
511 size, ctxt->vcpu, &ctxt->exception); 686 if (rc != X86EMUL_CONTINUE)
687 return rc;
688 rc = ops->fetch(ctxt, linear, fc->data + cur_size,
689 size, &ctxt->exception);
512 if (rc != X86EMUL_CONTINUE) 690 if (rc != X86EMUL_CONTINUE)
513 return rc; 691 return rc;
514 fc->end += size; 692 fc->end += size;
@@ -551,7 +729,6 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
551} 729}
552 730
553static int read_descriptor(struct x86_emulate_ctxt *ctxt, 731static int read_descriptor(struct x86_emulate_ctxt *ctxt,
554 struct x86_emulate_ops *ops,
555 struct segmented_address addr, 732 struct segmented_address addr,
556 u16 *size, unsigned long *address, int op_bytes) 733 u16 *size, unsigned long *address, int op_bytes)
557{ 734{
@@ -560,13 +737,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
560 if (op_bytes == 2) 737 if (op_bytes == 2)
561 op_bytes = 3; 738 op_bytes = 3;
562 *address = 0; 739 *address = 0;
563 rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2, 740 rc = segmented_read_std(ctxt, addr, size, 2);
564 ctxt->vcpu, &ctxt->exception);
565 if (rc != X86EMUL_CONTINUE) 741 if (rc != X86EMUL_CONTINUE)
566 return rc; 742 return rc;
567 addr.ea += 2; 743 addr.ea += 2;
568 rc = ops->read_std(linear(ctxt, addr), address, op_bytes, 744 rc = segmented_read_std(ctxt, addr, address, op_bytes);
569 ctxt->vcpu, &ctxt->exception);
570 return rc; 745 return rc;
571} 746}
572 747
@@ -623,7 +798,63 @@ static void fetch_register_operand(struct operand *op)
623 } 798 }
624} 799}
625 800
626static void decode_register_operand(struct operand *op, 801static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
802{
803 ctxt->ops->get_fpu(ctxt);
804 switch (reg) {
805 case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break;
806 case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break;
807 case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break;
808 case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break;
809 case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break;
810 case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break;
811 case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break;
812 case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break;
813#ifdef CONFIG_X86_64
814 case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break;
815 case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break;
816 case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break;
817 case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break;
818 case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break;
819 case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break;
820 case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break;
821 case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break;
822#endif
823 default: BUG();
824 }
825 ctxt->ops->put_fpu(ctxt);
826}
827
828static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
829 int reg)
830{
831 ctxt->ops->get_fpu(ctxt);
832 switch (reg) {
833 case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break;
834 case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break;
835 case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break;
836 case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break;
837 case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break;
838 case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break;
839 case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break;
840 case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break;
841#ifdef CONFIG_X86_64
842 case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break;
843 case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break;
844 case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break;
845 case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break;
846 case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break;
847 case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break;
848 case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break;
849 case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break;
850#endif
851 default: BUG();
852 }
853 ctxt->ops->put_fpu(ctxt);
854}
855
856static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
857 struct operand *op,
627 struct decode_cache *c, 858 struct decode_cache *c,
628 int inhibit_bytereg) 859 int inhibit_bytereg)
629{ 860{
@@ -632,6 +863,15 @@ static void decode_register_operand(struct operand *op,
632 863
633 if (!(c->d & ModRM)) 864 if (!(c->d & ModRM))
634 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); 865 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
866
867 if (c->d & Sse) {
868 op->type = OP_XMM;
869 op->bytes = 16;
870 op->addr.xmm = reg;
871 read_sse_reg(ctxt, &op->vec_val, reg);
872 return;
873 }
874
635 op->type = OP_REG; 875 op->type = OP_REG;
636 if ((c->d & ByteOp) && !inhibit_bytereg) { 876 if ((c->d & ByteOp) && !inhibit_bytereg) {
637 op->addr.reg = decode_register(reg, c->regs, highbyte_regs); 877 op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
@@ -671,6 +911,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
671 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 911 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
672 op->addr.reg = decode_register(c->modrm_rm, 912 op->addr.reg = decode_register(c->modrm_rm,
673 c->regs, c->d & ByteOp); 913 c->regs, c->d & ByteOp);
914 if (c->d & Sse) {
915 op->type = OP_XMM;
916 op->bytes = 16;
917 op->addr.xmm = c->modrm_rm;
918 read_sse_reg(ctxt, &op->vec_val, c->modrm_rm);
919 return rc;
920 }
674 fetch_register_operand(op); 921 fetch_register_operand(op);
675 return rc; 922 return rc;
676 } 923 }
@@ -819,8 +1066,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
819 if (mc->pos < mc->end) 1066 if (mc->pos < mc->end)
820 goto read_cached; 1067 goto read_cached;
821 1068
822 rc = ops->read_emulated(addr, mc->data + mc->end, n, 1069 rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
823 &ctxt->exception, ctxt->vcpu); 1070 &ctxt->exception);
824 if (rc != X86EMUL_CONTINUE) 1071 if (rc != X86EMUL_CONTINUE)
825 return rc; 1072 return rc;
826 mc->end += n; 1073 mc->end += n;
@@ -834,6 +1081,50 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
834 return X86EMUL_CONTINUE; 1081 return X86EMUL_CONTINUE;
835} 1082}
836 1083
1084static int segmented_read(struct x86_emulate_ctxt *ctxt,
1085 struct segmented_address addr,
1086 void *data,
1087 unsigned size)
1088{
1089 int rc;
1090 ulong linear;
1091
1092 rc = linearize(ctxt, addr, size, false, &linear);
1093 if (rc != X86EMUL_CONTINUE)
1094 return rc;
1095 return read_emulated(ctxt, ctxt->ops, linear, data, size);
1096}
1097
1098static int segmented_write(struct x86_emulate_ctxt *ctxt,
1099 struct segmented_address addr,
1100 const void *data,
1101 unsigned size)
1102{
1103 int rc;
1104 ulong linear;
1105
1106 rc = linearize(ctxt, addr, size, true, &linear);
1107 if (rc != X86EMUL_CONTINUE)
1108 return rc;
1109 return ctxt->ops->write_emulated(ctxt, linear, data, size,
1110 &ctxt->exception);
1111}
1112
1113static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
1114 struct segmented_address addr,
1115 const void *orig_data, const void *data,
1116 unsigned size)
1117{
1118 int rc;
1119 ulong linear;
1120
1121 rc = linearize(ctxt, addr, size, true, &linear);
1122 if (rc != X86EMUL_CONTINUE)
1123 return rc;
1124 return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
1125 size, &ctxt->exception);
1126}
1127
837static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1128static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
838 struct x86_emulate_ops *ops, 1129 struct x86_emulate_ops *ops,
839 unsigned int size, unsigned short port, 1130 unsigned int size, unsigned short port,
@@ -854,7 +1145,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
854 if (n == 0) 1145 if (n == 0)
855 n = 1; 1146 n = 1;
856 rc->pos = rc->end = 0; 1147 rc->pos = rc->end = 0;
857 if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) 1148 if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n))
858 return 0; 1149 return 0;
859 rc->end = n * size; 1150 rc->end = n * size;
860 } 1151 }
@@ -864,28 +1155,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
864 return 1; 1155 return 1;
865} 1156}
866 1157
867static u32 desc_limit_scaled(struct desc_struct *desc)
868{
869 u32 limit = get_desc_limit(desc);
870
871 return desc->g ? (limit << 12) | 0xfff : limit;
872}
873
874static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, 1158static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
875 struct x86_emulate_ops *ops, 1159 struct x86_emulate_ops *ops,
876 u16 selector, struct desc_ptr *dt) 1160 u16 selector, struct desc_ptr *dt)
877{ 1161{
878 if (selector & 1 << 2) { 1162 if (selector & 1 << 2) {
879 struct desc_struct desc; 1163 struct desc_struct desc;
1164 u16 sel;
1165
880 memset (dt, 0, sizeof *dt); 1166 memset (dt, 0, sizeof *dt);
881 if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR, 1167 if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR))
882 ctxt->vcpu))
883 return; 1168 return;
884 1169
885 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ 1170 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
886 dt->address = get_desc_base(&desc); 1171 dt->address = get_desc_base(&desc);
887 } else 1172 } else
888 ops->get_gdt(dt, ctxt->vcpu); 1173 ops->get_gdt(ctxt, dt);
889} 1174}
890 1175
891/* allowed just for 8 bytes segments */ 1176/* allowed just for 8 bytes segments */
@@ -903,8 +1188,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
903 if (dt.size < index * 8 + 7) 1188 if (dt.size < index * 8 + 7)
904 return emulate_gp(ctxt, selector & 0xfffc); 1189 return emulate_gp(ctxt, selector & 0xfffc);
905 addr = dt.address + index * 8; 1190 addr = dt.address + index * 8;
906 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, 1191 ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
907 &ctxt->exception);
908 1192
909 return ret; 1193 return ret;
910} 1194}
@@ -925,8 +1209,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
925 return emulate_gp(ctxt, selector & 0xfffc); 1209 return emulate_gp(ctxt, selector & 0xfffc);
926 1210
927 addr = dt.address + index * 8; 1211 addr = dt.address + index * 8;
928 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, 1212 ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
929 &ctxt->exception);
930 1213
931 return ret; 1214 return ret;
932} 1215}
@@ -986,7 +1269,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
986 1269
987 rpl = selector & 3; 1270 rpl = selector & 3;
988 dpl = seg_desc.dpl; 1271 dpl = seg_desc.dpl;
989 cpl = ops->cpl(ctxt->vcpu); 1272 cpl = ops->cpl(ctxt);
990 1273
991 switch (seg) { 1274 switch (seg) {
992 case VCPU_SREG_SS: 1275 case VCPU_SREG_SS:
@@ -1042,8 +1325,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1042 return ret; 1325 return ret;
1043 } 1326 }
1044load: 1327load:
1045 ops->set_segment_selector(selector, seg, ctxt->vcpu); 1328 ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
1046 ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
1047 return X86EMUL_CONTINUE; 1329 return X86EMUL_CONTINUE;
1048exception: 1330exception:
1049 emulate_exception(ctxt, err_vec, err_code, true); 1331 emulate_exception(ctxt, err_vec, err_code, true);
@@ -1069,8 +1351,7 @@ static void write_register_operand(struct operand *op)
1069 } 1351 }
1070} 1352}
1071 1353
1072static inline int writeback(struct x86_emulate_ctxt *ctxt, 1354static int writeback(struct x86_emulate_ctxt *ctxt)
1073 struct x86_emulate_ops *ops)
1074{ 1355{
1075 int rc; 1356 int rc;
1076 struct decode_cache *c = &ctxt->decode; 1357 struct decode_cache *c = &ctxt->decode;
@@ -1081,23 +1362,22 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1081 break; 1362 break;
1082 case OP_MEM: 1363 case OP_MEM:
1083 if (c->lock_prefix) 1364 if (c->lock_prefix)
1084 rc = ops->cmpxchg_emulated( 1365 rc = segmented_cmpxchg(ctxt,
1085 linear(ctxt, c->dst.addr.mem), 1366 c->dst.addr.mem,
1086 &c->dst.orig_val, 1367 &c->dst.orig_val,
1087 &c->dst.val, 1368 &c->dst.val,
1088 c->dst.bytes, 1369 c->dst.bytes);
1089 &ctxt->exception,
1090 ctxt->vcpu);
1091 else 1370 else
1092 rc = ops->write_emulated( 1371 rc = segmented_write(ctxt,
1093 linear(ctxt, c->dst.addr.mem), 1372 c->dst.addr.mem,
1094 &c->dst.val, 1373 &c->dst.val,
1095 c->dst.bytes, 1374 c->dst.bytes);
1096 &ctxt->exception,
1097 ctxt->vcpu);
1098 if (rc != X86EMUL_CONTINUE) 1375 if (rc != X86EMUL_CONTINUE)
1099 return rc; 1376 return rc;
1100 break; 1377 break;
1378 case OP_XMM:
1379 write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm);
1380 break;
1101 case OP_NONE: 1381 case OP_NONE:
1102 /* no writeback */ 1382 /* no writeback */
1103 break; 1383 break;
@@ -1107,21 +1387,21 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1107 return X86EMUL_CONTINUE; 1387 return X86EMUL_CONTINUE;
1108} 1388}
1109 1389
1110static inline void emulate_push(struct x86_emulate_ctxt *ctxt, 1390static int em_push(struct x86_emulate_ctxt *ctxt)
1111 struct x86_emulate_ops *ops)
1112{ 1391{
1113 struct decode_cache *c = &ctxt->decode; 1392 struct decode_cache *c = &ctxt->decode;
1393 struct segmented_address addr;
1114 1394
1115 c->dst.type = OP_MEM;
1116 c->dst.bytes = c->op_bytes;
1117 c->dst.val = c->src.val;
1118 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1395 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1119 c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]); 1396 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1120 c->dst.addr.mem.seg = VCPU_SREG_SS; 1397 addr.seg = VCPU_SREG_SS;
1398
1399 /* Disable writeback. */
1400 c->dst.type = OP_NONE;
1401 return segmented_write(ctxt, addr, &c->src.val, c->op_bytes);
1121} 1402}
1122 1403
1123static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1404static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1124 struct x86_emulate_ops *ops,
1125 void *dest, int len) 1405 void *dest, int len)
1126{ 1406{
1127 struct decode_cache *c = &ctxt->decode; 1407 struct decode_cache *c = &ctxt->decode;
@@ -1130,7 +1410,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1130 1410
1131 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); 1411 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1132 addr.seg = VCPU_SREG_SS; 1412 addr.seg = VCPU_SREG_SS;
1133 rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len); 1413 rc = segmented_read(ctxt, addr, dest, len);
1134 if (rc != X86EMUL_CONTINUE) 1414 if (rc != X86EMUL_CONTINUE)
1135 return rc; 1415 return rc;
1136 1416
@@ -1138,6 +1418,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1138 return rc; 1418 return rc;
1139} 1419}
1140 1420
1421static int em_pop(struct x86_emulate_ctxt *ctxt)
1422{
1423 struct decode_cache *c = &ctxt->decode;
1424
1425 return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
1426}
1427
1141static int emulate_popf(struct x86_emulate_ctxt *ctxt, 1428static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1142 struct x86_emulate_ops *ops, 1429 struct x86_emulate_ops *ops,
1143 void *dest, int len) 1430 void *dest, int len)
@@ -1145,9 +1432,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1145 int rc; 1432 int rc;
1146 unsigned long val, change_mask; 1433 unsigned long val, change_mask;
1147 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1434 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1148 int cpl = ops->cpl(ctxt->vcpu); 1435 int cpl = ops->cpl(ctxt);
1149 1436
1150 rc = emulate_pop(ctxt, ops, &val, len); 1437 rc = emulate_pop(ctxt, &val, len);
1151 if (rc != X86EMUL_CONTINUE) 1438 if (rc != X86EMUL_CONTINUE)
1152 return rc; 1439 return rc;
1153 1440
@@ -1179,14 +1466,24 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1179 return rc; 1466 return rc;
1180} 1467}
1181 1468
1182static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, 1469static int em_popf(struct x86_emulate_ctxt *ctxt)
1183 struct x86_emulate_ops *ops, int seg)
1184{ 1470{
1185 struct decode_cache *c = &ctxt->decode; 1471 struct decode_cache *c = &ctxt->decode;
1186 1472
1187 c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); 1473 c->dst.type = OP_REG;
1474 c->dst.addr.reg = &ctxt->eflags;
1475 c->dst.bytes = c->op_bytes;
1476 return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
1477}
1188 1478
1189 emulate_push(ctxt, ops); 1479static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
1480 struct x86_emulate_ops *ops, int seg)
1481{
1482 struct decode_cache *c = &ctxt->decode;
1483
1484 c->src.val = get_segment_selector(ctxt, seg);
1485
1486 return em_push(ctxt);
1190} 1487}
1191 1488
1192static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1489static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1196,7 +1493,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1196 unsigned long selector; 1493 unsigned long selector;
1197 int rc; 1494 int rc;
1198 1495
1199 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); 1496 rc = emulate_pop(ctxt, &selector, c->op_bytes);
1200 if (rc != X86EMUL_CONTINUE) 1497 if (rc != X86EMUL_CONTINUE)
1201 return rc; 1498 return rc;
1202 1499
@@ -1204,8 +1501,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1204 return rc; 1501 return rc;
1205} 1502}
1206 1503
1207static int emulate_pusha(struct x86_emulate_ctxt *ctxt, 1504static int em_pusha(struct x86_emulate_ctxt *ctxt)
1208 struct x86_emulate_ops *ops)
1209{ 1505{
1210 struct decode_cache *c = &ctxt->decode; 1506 struct decode_cache *c = &ctxt->decode;
1211 unsigned long old_esp = c->regs[VCPU_REGS_RSP]; 1507 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
@@ -1216,23 +1512,25 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
1216 (reg == VCPU_REGS_RSP) ? 1512 (reg == VCPU_REGS_RSP) ?
1217 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1513 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1218 1514
1219 emulate_push(ctxt, ops); 1515 rc = em_push(ctxt);
1220
1221 rc = writeback(ctxt, ops);
1222 if (rc != X86EMUL_CONTINUE) 1516 if (rc != X86EMUL_CONTINUE)
1223 return rc; 1517 return rc;
1224 1518
1225 ++reg; 1519 ++reg;
1226 } 1520 }
1227 1521
1228 /* Disable writeback. */
1229 c->dst.type = OP_NONE;
1230
1231 return rc; 1522 return rc;
1232} 1523}
1233 1524
1234static int emulate_popa(struct x86_emulate_ctxt *ctxt, 1525static int em_pushf(struct x86_emulate_ctxt *ctxt)
1235 struct x86_emulate_ops *ops) 1526{
1527 struct decode_cache *c = &ctxt->decode;
1528
1529 c->src.val = (unsigned long)ctxt->eflags;
1530 return em_push(ctxt);
1531}
1532
1533static int em_popa(struct x86_emulate_ctxt *ctxt)
1236{ 1534{
1237 struct decode_cache *c = &ctxt->decode; 1535 struct decode_cache *c = &ctxt->decode;
1238 int rc = X86EMUL_CONTINUE; 1536 int rc = X86EMUL_CONTINUE;
@@ -1245,7 +1543,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1245 --reg; 1543 --reg;
1246 } 1544 }
1247 1545
1248 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); 1546 rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes);
1249 if (rc != X86EMUL_CONTINUE) 1547 if (rc != X86EMUL_CONTINUE)
1250 break; 1548 break;
1251 --reg; 1549 --reg;
@@ -1265,37 +1563,32 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1265 1563
1266 /* TODO: Add limit checks */ 1564 /* TODO: Add limit checks */
1267 c->src.val = ctxt->eflags; 1565 c->src.val = ctxt->eflags;
1268 emulate_push(ctxt, ops); 1566 rc = em_push(ctxt);
1269 rc = writeback(ctxt, ops);
1270 if (rc != X86EMUL_CONTINUE) 1567 if (rc != X86EMUL_CONTINUE)
1271 return rc; 1568 return rc;
1272 1569
1273 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); 1570 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
1274 1571
1275 c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 1572 c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
1276 emulate_push(ctxt, ops); 1573 rc = em_push(ctxt);
1277 rc = writeback(ctxt, ops);
1278 if (rc != X86EMUL_CONTINUE) 1574 if (rc != X86EMUL_CONTINUE)
1279 return rc; 1575 return rc;
1280 1576
1281 c->src.val = c->eip; 1577 c->src.val = c->eip;
1282 emulate_push(ctxt, ops); 1578 rc = em_push(ctxt);
1283 rc = writeback(ctxt, ops);
1284 if (rc != X86EMUL_CONTINUE) 1579 if (rc != X86EMUL_CONTINUE)
1285 return rc; 1580 return rc;
1286 1581
1287 c->dst.type = OP_NONE; 1582 ops->get_idt(ctxt, &dt);
1288
1289 ops->get_idt(&dt, ctxt->vcpu);
1290 1583
1291 eip_addr = dt.address + (irq << 2); 1584 eip_addr = dt.address + (irq << 2);
1292 cs_addr = dt.address + (irq << 2) + 2; 1585 cs_addr = dt.address + (irq << 2) + 2;
1293 1586
1294 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception); 1587 rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception);
1295 if (rc != X86EMUL_CONTINUE) 1588 if (rc != X86EMUL_CONTINUE)
1296 return rc; 1589 return rc;
1297 1590
1298 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception); 1591 rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception);
1299 if (rc != X86EMUL_CONTINUE) 1592 if (rc != X86EMUL_CONTINUE)
1300 return rc; 1593 return rc;
1301 1594
@@ -1339,7 +1632,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1339 1632
1340 /* TODO: Add stack limit check */ 1633 /* TODO: Add stack limit check */
1341 1634
1342 rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); 1635 rc = emulate_pop(ctxt, &temp_eip, c->op_bytes);
1343 1636
1344 if (rc != X86EMUL_CONTINUE) 1637 if (rc != X86EMUL_CONTINUE)
1345 return rc; 1638 return rc;
@@ -1347,12 +1640,12 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1347 if (temp_eip & ~0xffff) 1640 if (temp_eip & ~0xffff)
1348 return emulate_gp(ctxt, 0); 1641 return emulate_gp(ctxt, 0);
1349 1642
1350 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1643 rc = emulate_pop(ctxt, &cs, c->op_bytes);
1351 1644
1352 if (rc != X86EMUL_CONTINUE) 1645 if (rc != X86EMUL_CONTINUE)
1353 return rc; 1646 return rc;
1354 1647
1355 rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); 1648 rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes);
1356 1649
1357 if (rc != X86EMUL_CONTINUE) 1650 if (rc != X86EMUL_CONTINUE)
1358 return rc; 1651 return rc;
@@ -1394,15 +1687,31 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
1394 } 1687 }
1395} 1688}
1396 1689
1397static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1690static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1398 struct x86_emulate_ops *ops) 1691{
1692 struct decode_cache *c = &ctxt->decode;
1693 int rc;
1694 unsigned short sel;
1695
1696 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
1697
1698 rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS);
1699 if (rc != X86EMUL_CONTINUE)
1700 return rc;
1701
1702 c->eip = 0;
1703 memcpy(&c->eip, c->src.valptr, c->op_bytes);
1704 return X86EMUL_CONTINUE;
1705}
1706
1707static int em_grp1a(struct x86_emulate_ctxt *ctxt)
1399{ 1708{
1400 struct decode_cache *c = &ctxt->decode; 1709 struct decode_cache *c = &ctxt->decode;
1401 1710
1402 return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); 1711 return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
1403} 1712}
1404 1713
1405static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) 1714static int em_grp2(struct x86_emulate_ctxt *ctxt)
1406{ 1715{
1407 struct decode_cache *c = &ctxt->decode; 1716 struct decode_cache *c = &ctxt->decode;
1408 switch (c->modrm_reg) { 1717 switch (c->modrm_reg) {
@@ -1429,10 +1738,10 @@ static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
1429 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); 1738 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
1430 break; 1739 break;
1431 } 1740 }
1741 return X86EMUL_CONTINUE;
1432} 1742}
1433 1743
1434static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, 1744static int em_grp3(struct x86_emulate_ctxt *ctxt)
1435 struct x86_emulate_ops *ops)
1436{ 1745{
1437 struct decode_cache *c = &ctxt->decode; 1746 struct decode_cache *c = &ctxt->decode;
1438 unsigned long *rax = &c->regs[VCPU_REGS_RAX]; 1747 unsigned long *rax = &c->regs[VCPU_REGS_RAX];
@@ -1471,10 +1780,10 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1471 return X86EMUL_CONTINUE; 1780 return X86EMUL_CONTINUE;
1472} 1781}
1473 1782
1474static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1783static int em_grp45(struct x86_emulate_ctxt *ctxt)
1475 struct x86_emulate_ops *ops)
1476{ 1784{
1477 struct decode_cache *c = &ctxt->decode; 1785 struct decode_cache *c = &ctxt->decode;
1786 int rc = X86EMUL_CONTINUE;
1478 1787
1479 switch (c->modrm_reg) { 1788 switch (c->modrm_reg) {
1480 case 0: /* inc */ 1789 case 0: /* inc */
@@ -1488,21 +1797,23 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1488 old_eip = c->eip; 1797 old_eip = c->eip;
1489 c->eip = c->src.val; 1798 c->eip = c->src.val;
1490 c->src.val = old_eip; 1799 c->src.val = old_eip;
1491 emulate_push(ctxt, ops); 1800 rc = em_push(ctxt);
1492 break; 1801 break;
1493 } 1802 }
1494 case 4: /* jmp abs */ 1803 case 4: /* jmp abs */
1495 c->eip = c->src.val; 1804 c->eip = c->src.val;
1496 break; 1805 break;
1806 case 5: /* jmp far */
1807 rc = em_jmp_far(ctxt);
1808 break;
1497 case 6: /* push */ 1809 case 6: /* push */
1498 emulate_push(ctxt, ops); 1810 rc = em_push(ctxt);
1499 break; 1811 break;
1500 } 1812 }
1501 return X86EMUL_CONTINUE; 1813 return rc;
1502} 1814}
1503 1815
1504static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, 1816static int em_grp9(struct x86_emulate_ctxt *ctxt)
1505 struct x86_emulate_ops *ops)
1506{ 1817{
1507 struct decode_cache *c = &ctxt->decode; 1818 struct decode_cache *c = &ctxt->decode;
1508 u64 old = c->dst.orig_val64; 1819 u64 old = c->dst.orig_val64;
@@ -1528,12 +1839,12 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1528 int rc; 1839 int rc;
1529 unsigned long cs; 1840 unsigned long cs;
1530 1841
1531 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); 1842 rc = emulate_pop(ctxt, &c->eip, c->op_bytes);
1532 if (rc != X86EMUL_CONTINUE) 1843 if (rc != X86EMUL_CONTINUE)
1533 return rc; 1844 return rc;
1534 if (c->op_bytes == 4) 1845 if (c->op_bytes == 4)
1535 c->eip = (u32)c->eip; 1846 c->eip = (u32)c->eip;
1536 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1847 rc = emulate_pop(ctxt, &cs, c->op_bytes);
1537 if (rc != X86EMUL_CONTINUE) 1848 if (rc != X86EMUL_CONTINUE)
1538 return rc; 1849 return rc;
1539 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); 1850 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
@@ -1562,8 +1873,10 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1562 struct x86_emulate_ops *ops, struct desc_struct *cs, 1873 struct x86_emulate_ops *ops, struct desc_struct *cs,
1563 struct desc_struct *ss) 1874 struct desc_struct *ss)
1564{ 1875{
1876 u16 selector;
1877
1565 memset(cs, 0, sizeof(struct desc_struct)); 1878 memset(cs, 0, sizeof(struct desc_struct));
1566 ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu); 1879 ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
1567 memset(ss, 0, sizeof(struct desc_struct)); 1880 memset(ss, 0, sizeof(struct desc_struct));
1568 1881
1569 cs->l = 0; /* will be adjusted later */ 1882 cs->l = 0; /* will be adjusted later */
@@ -1593,44 +1906,44 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1593 struct desc_struct cs, ss; 1906 struct desc_struct cs, ss;
1594 u64 msr_data; 1907 u64 msr_data;
1595 u16 cs_sel, ss_sel; 1908 u16 cs_sel, ss_sel;
1909 u64 efer = 0;
1596 1910
1597 /* syscall is not available in real mode */ 1911 /* syscall is not available in real mode */
1598 if (ctxt->mode == X86EMUL_MODE_REAL || 1912 if (ctxt->mode == X86EMUL_MODE_REAL ||
1599 ctxt->mode == X86EMUL_MODE_VM86) 1913 ctxt->mode == X86EMUL_MODE_VM86)
1600 return emulate_ud(ctxt); 1914 return emulate_ud(ctxt);
1601 1915
1916 ops->get_msr(ctxt, MSR_EFER, &efer);
1602 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1917 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1603 1918
1604 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1919 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1605 msr_data >>= 32; 1920 msr_data >>= 32;
1606 cs_sel = (u16)(msr_data & 0xfffc); 1921 cs_sel = (u16)(msr_data & 0xfffc);
1607 ss_sel = (u16)(msr_data + 8); 1922 ss_sel = (u16)(msr_data + 8);
1608 1923
1609 if (is_long_mode(ctxt->vcpu)) { 1924 if (efer & EFER_LMA) {
1610 cs.d = 0; 1925 cs.d = 0;
1611 cs.l = 1; 1926 cs.l = 1;
1612 } 1927 }
1613 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); 1928 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
1614 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1929 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
1615 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1616 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1617 1930
1618 c->regs[VCPU_REGS_RCX] = c->eip; 1931 c->regs[VCPU_REGS_RCX] = c->eip;
1619 if (is_long_mode(ctxt->vcpu)) { 1932 if (efer & EFER_LMA) {
1620#ifdef CONFIG_X86_64 1933#ifdef CONFIG_X86_64
1621 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1934 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1622 1935
1623 ops->get_msr(ctxt->vcpu, 1936 ops->get_msr(ctxt,
1624 ctxt->mode == X86EMUL_MODE_PROT64 ? 1937 ctxt->mode == X86EMUL_MODE_PROT64 ?
1625 MSR_LSTAR : MSR_CSTAR, &msr_data); 1938 MSR_LSTAR : MSR_CSTAR, &msr_data);
1626 c->eip = msr_data; 1939 c->eip = msr_data;
1627 1940
1628 ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); 1941 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
1629 ctxt->eflags &= ~(msr_data | EFLG_RF); 1942 ctxt->eflags &= ~(msr_data | EFLG_RF);
1630#endif 1943#endif
1631 } else { 1944 } else {
1632 /* legacy mode */ 1945 /* legacy mode */
1633 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1946 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1634 c->eip = (u32)msr_data; 1947 c->eip = (u32)msr_data;
1635 1948
1636 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1949 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
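
The MSR_STAR handling above matches the hardware SYSCALL convention: bits 47:32 hold the kernel CS selector (with the RPL bits masked off) and SS is that value plus 8. A small worked example with an illustrative STAR value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t star = 0x0023001000000000ULL;			/* illustrative MSR_STAR */
	uint16_t cs   = (uint16_t)((star >> 32) & 0xfffc);
	uint16_t ss   = (uint16_t)(((star >> 32) & 0xffff) + 8);

	printf("SYSCALL loads CS=%#x SS=%#x\n", cs, ss);	/* CS=0x10, SS=0x18 */
	return 0;
}
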
@@ -1646,7 +1959,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1646 struct desc_struct cs, ss; 1959 struct desc_struct cs, ss;
1647 u64 msr_data; 1960 u64 msr_data;
1648 u16 cs_sel, ss_sel; 1961 u16 cs_sel, ss_sel;
1962 u64 efer = 0;
1649 1963
1964 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
1650 /* inject #GP if in real mode */ 1965 /* inject #GP if in real mode */
1651 if (ctxt->mode == X86EMUL_MODE_REAL) 1966 if (ctxt->mode == X86EMUL_MODE_REAL)
1652 return emulate_gp(ctxt, 0); 1967 return emulate_gp(ctxt, 0);
@@ -1659,7 +1974,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1659 1974
1660 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1975 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1661 1976
1662 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 1977 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
1663 switch (ctxt->mode) { 1978 switch (ctxt->mode) {
1664 case X86EMUL_MODE_PROT32: 1979 case X86EMUL_MODE_PROT32:
1665 if ((msr_data & 0xfffc) == 0x0) 1980 if ((msr_data & 0xfffc) == 0x0)
@@ -1676,21 +1991,18 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1676 cs_sel &= ~SELECTOR_RPL_MASK; 1991 cs_sel &= ~SELECTOR_RPL_MASK;
1677 ss_sel = cs_sel + 8; 1992 ss_sel = cs_sel + 8;
1678 ss_sel &= ~SELECTOR_RPL_MASK; 1993 ss_sel &= ~SELECTOR_RPL_MASK;
1679 if (ctxt->mode == X86EMUL_MODE_PROT64 1994 if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) {
1680 || is_long_mode(ctxt->vcpu)) {
1681 cs.d = 0; 1995 cs.d = 0;
1682 cs.l = 1; 1996 cs.l = 1;
1683 } 1997 }
1684 1998
1685 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); 1999 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
1686 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 2000 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
1687 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1688 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1689 2001
1690 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 2002 ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
1691 c->eip = msr_data; 2003 c->eip = msr_data;
1692 2004
1693 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 2005 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
1694 c->regs[VCPU_REGS_RSP] = msr_data; 2006 c->regs[VCPU_REGS_RSP] = msr_data;
1695 2007
1696 return X86EMUL_CONTINUE; 2008 return X86EMUL_CONTINUE;
@@ -1719,7 +2031,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1719 2031
1720 cs.dpl = 3; 2032 cs.dpl = 3;
1721 ss.dpl = 3; 2033 ss.dpl = 3;
1722 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2034 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
1723 switch (usermode) { 2035 switch (usermode) {
1724 case X86EMUL_MODE_PROT32: 2036 case X86EMUL_MODE_PROT32:
1725 cs_sel = (u16)(msr_data + 16); 2037 cs_sel = (u16)(msr_data + 16);
@@ -1739,10 +2051,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1739 cs_sel |= SELECTOR_RPL_MASK; 2051 cs_sel |= SELECTOR_RPL_MASK;
1740 ss_sel |= SELECTOR_RPL_MASK; 2052 ss_sel |= SELECTOR_RPL_MASK;
1741 2053
1742 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); 2054 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
1743 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 2055 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
1744 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1745 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1746 2056
1747 c->eip = c->regs[VCPU_REGS_RDX]; 2057 c->eip = c->regs[VCPU_REGS_RDX];
1748 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; 2058 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
@@ -1759,7 +2069,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
1759 if (ctxt->mode == X86EMUL_MODE_VM86) 2069 if (ctxt->mode == X86EMUL_MODE_VM86)
1760 return true; 2070 return true;
1761 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2071 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1762 return ops->cpl(ctxt->vcpu) > iopl; 2072 return ops->cpl(ctxt) > iopl;
1763} 2073}
1764 2074
1765static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2075static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1769,11 +2079,11 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1769 struct desc_struct tr_seg; 2079 struct desc_struct tr_seg;
1770 u32 base3; 2080 u32 base3;
1771 int r; 2081 int r;
1772 u16 io_bitmap_ptr, perm, bit_idx = port & 0x7; 2082 u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
1773 unsigned mask = (1 << len) - 1; 2083 unsigned mask = (1 << len) - 1;
1774 unsigned long base; 2084 unsigned long base;
1775 2085
1776 ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu); 2086 ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
1777 if (!tr_seg.p) 2087 if (!tr_seg.p)
1778 return false; 2088 return false;
1779 if (desc_limit_scaled(&tr_seg) < 103) 2089 if (desc_limit_scaled(&tr_seg) < 103)
@@ -1782,13 +2092,12 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1782#ifdef CONFIG_X86_64 2092#ifdef CONFIG_X86_64
1783 base |= ((u64)base3) << 32; 2093 base |= ((u64)base3) << 32;
1784#endif 2094#endif
1785 r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL); 2095 r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL);
1786 if (r != X86EMUL_CONTINUE) 2096 if (r != X86EMUL_CONTINUE)
1787 return false; 2097 return false;
1788 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) 2098 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
1789 return false; 2099 return false;
1790 r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu, 2100 r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL);
1791 NULL);
1792 if (r != X86EMUL_CONTINUE) 2101 if (r != X86EMUL_CONTINUE)
1793 return false; 2102 return false;
1794 if ((perm >> bit_idx) & mask) 2103 if ((perm >> bit_idx) & mask)
@@ -1829,11 +2138,11 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
1829 tss->si = c->regs[VCPU_REGS_RSI]; 2138 tss->si = c->regs[VCPU_REGS_RSI];
1830 tss->di = c->regs[VCPU_REGS_RDI]; 2139 tss->di = c->regs[VCPU_REGS_RDI];
1831 2140
1832 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); 2141 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
1833 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 2142 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
1834 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); 2143 tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
1835 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); 2144 tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
1836 tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); 2145 tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
1837} 2146}
1838 2147
1839static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, 2148static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
@@ -1858,11 +2167,11 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
1858 * SDM says that segment selectors are loaded before segment 2167 * SDM says that segment selectors are loaded before segment
1859 * descriptors 2168 * descriptors
1860 */ 2169 */
1861 ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); 2170 set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
1862 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); 2171 set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
1863 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); 2172 set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
1864 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); 2173 set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
1865 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); 2174 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
1866 2175
1867 /* 2176 /*
1868 * Now load segment descriptors. If fault happens at this stage 2177
@@ -1896,7 +2205,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1896 int ret; 2205 int ret;
1897 u32 new_tss_base = get_desc_base(new_desc); 2206 u32 new_tss_base = get_desc_base(new_desc);
1898 2207
1899 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2208 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
1900 &ctxt->exception); 2209 &ctxt->exception);
1901 if (ret != X86EMUL_CONTINUE) 2210 if (ret != X86EMUL_CONTINUE)
1902 /* FIXME: need to provide precise fault address */ 2211 /* FIXME: need to provide precise fault address */
@@ -1904,13 +2213,13 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1904 2213
1905 save_state_to_tss16(ctxt, ops, &tss_seg); 2214 save_state_to_tss16(ctxt, ops, &tss_seg);
1906 2215
1907 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2216 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
1908 &ctxt->exception); 2217 &ctxt->exception);
1909 if (ret != X86EMUL_CONTINUE) 2218 if (ret != X86EMUL_CONTINUE)
1910 /* FIXME: need to provide precise fault address */ 2219 /* FIXME: need to provide precise fault address */
1911 return ret; 2220 return ret;
1912 2221
1913 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2222 ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
1914 &ctxt->exception); 2223 &ctxt->exception);
1915 if (ret != X86EMUL_CONTINUE) 2224 if (ret != X86EMUL_CONTINUE)
1916 /* FIXME: need to provide precise fault address */ 2225 /* FIXME: need to provide precise fault address */
@@ -1919,10 +2228,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1919 if (old_tss_sel != 0xffff) { 2228 if (old_tss_sel != 0xffff) {
1920 tss_seg.prev_task_link = old_tss_sel; 2229 tss_seg.prev_task_link = old_tss_sel;
1921 2230
1922 ret = ops->write_std(new_tss_base, 2231 ret = ops->write_std(ctxt, new_tss_base,
1923 &tss_seg.prev_task_link, 2232 &tss_seg.prev_task_link,
1924 sizeof tss_seg.prev_task_link, 2233 sizeof tss_seg.prev_task_link,
1925 ctxt->vcpu, &ctxt->exception); 2234 &ctxt->exception);
1926 if (ret != X86EMUL_CONTINUE) 2235 if (ret != X86EMUL_CONTINUE)
1927 /* FIXME: need to provide precise fault address */ 2236 /* FIXME: need to provide precise fault address */
1928 return ret; 2237 return ret;
@@ -1937,7 +2246,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
1937{ 2246{
1938 struct decode_cache *c = &ctxt->decode; 2247 struct decode_cache *c = &ctxt->decode;
1939 2248
1940 tss->cr3 = ops->get_cr(3, ctxt->vcpu); 2249 tss->cr3 = ops->get_cr(ctxt, 3);
1941 tss->eip = c->eip; 2250 tss->eip = c->eip;
1942 tss->eflags = ctxt->eflags; 2251 tss->eflags = ctxt->eflags;
1943 tss->eax = c->regs[VCPU_REGS_RAX]; 2252 tss->eax = c->regs[VCPU_REGS_RAX];
@@ -1949,13 +2258,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
1949 tss->esi = c->regs[VCPU_REGS_RSI]; 2258 tss->esi = c->regs[VCPU_REGS_RSI];
1950 tss->edi = c->regs[VCPU_REGS_RDI]; 2259 tss->edi = c->regs[VCPU_REGS_RDI];
1951 2260
1952 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); 2261 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
1953 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 2262 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
1954 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); 2263 tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
1955 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); 2264 tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
1956 tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); 2265 tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
1957 tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); 2266 tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
1958 tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); 2267 tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
1959} 2268}
1960 2269
1961static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, 2270static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
@@ -1965,7 +2274,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
1965 struct decode_cache *c = &ctxt->decode; 2274 struct decode_cache *c = &ctxt->decode;
1966 int ret; 2275 int ret;
1967 2276
1968 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) 2277 if (ops->set_cr(ctxt, 3, tss->cr3))
1969 return emulate_gp(ctxt, 0); 2278 return emulate_gp(ctxt, 0);
1970 c->eip = tss->eip; 2279 c->eip = tss->eip;
1971 ctxt->eflags = tss->eflags | 2; 2280 ctxt->eflags = tss->eflags | 2;
@@ -1982,13 +2291,13 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
1982 * SDM says that segment selectors are loaded before segment 2291 * SDM says that segment selectors are loaded before segment
1983 * descriptors 2292 * descriptors
1984 */ 2293 */
1985 ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); 2294 set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
1986 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); 2295 set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
1987 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); 2296 set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
1988 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); 2297 set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
1989 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); 2298 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
1990 ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); 2299 set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
1991 ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); 2300 set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
1992 2301
1993 /* 2302 /*
1994 * Now load segment descriptors. If fault happens at this stage 2303
@@ -2028,7 +2337,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2028 int ret; 2337 int ret;
2029 u32 new_tss_base = get_desc_base(new_desc); 2338 u32 new_tss_base = get_desc_base(new_desc);
2030 2339
2031 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2340 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2032 &ctxt->exception); 2341 &ctxt->exception);
2033 if (ret != X86EMUL_CONTINUE) 2342 if (ret != X86EMUL_CONTINUE)
2034 /* FIXME: need to provide precise fault address */ 2343 /* FIXME: need to provide precise fault address */
@@ -2036,13 +2345,13 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2036 2345
2037 save_state_to_tss32(ctxt, ops, &tss_seg); 2346 save_state_to_tss32(ctxt, ops, &tss_seg);
2038 2347
2039 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2348 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2040 &ctxt->exception); 2349 &ctxt->exception);
2041 if (ret != X86EMUL_CONTINUE) 2350 if (ret != X86EMUL_CONTINUE)
2042 /* FIXME: need to provide precise fault address */ 2351 /* FIXME: need to provide precise fault address */
2043 return ret; 2352 return ret;
2044 2353
2045 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2354 ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
2046 &ctxt->exception); 2355 &ctxt->exception);
2047 if (ret != X86EMUL_CONTINUE) 2356 if (ret != X86EMUL_CONTINUE)
2048 /* FIXME: need to provide precise fault address */ 2357 /* FIXME: need to provide precise fault address */
@@ -2051,10 +2360,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2051 if (old_tss_sel != 0xffff) { 2360 if (old_tss_sel != 0xffff) {
2052 tss_seg.prev_task_link = old_tss_sel; 2361 tss_seg.prev_task_link = old_tss_sel;
2053 2362
2054 ret = ops->write_std(new_tss_base, 2363 ret = ops->write_std(ctxt, new_tss_base,
2055 &tss_seg.prev_task_link, 2364 &tss_seg.prev_task_link,
2056 sizeof tss_seg.prev_task_link, 2365 sizeof tss_seg.prev_task_link,
2057 ctxt->vcpu, &ctxt->exception); 2366 &ctxt->exception);
2058 if (ret != X86EMUL_CONTINUE) 2367 if (ret != X86EMUL_CONTINUE)
2059 /* FIXME: need to provide precise fault address */ 2368 /* FIXME: need to provide precise fault address */
2060 return ret; 2369 return ret;
@@ -2070,9 +2379,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2070{ 2379{
2071 struct desc_struct curr_tss_desc, next_tss_desc; 2380 struct desc_struct curr_tss_desc, next_tss_desc;
2072 int ret; 2381 int ret;
2073 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); 2382 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
2074 ulong old_tss_base = 2383 ulong old_tss_base =
2075 ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); 2384 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
2076 u32 desc_limit; 2385 u32 desc_limit;
2077 2386
2078 /* FIXME: old_tss_base == ~0 ? */ 2387 /* FIXME: old_tss_base == ~0 ? */
@@ -2088,7 +2397,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2088 2397
2089 if (reason != TASK_SWITCH_IRET) { 2398 if (reason != TASK_SWITCH_IRET) {
2090 if ((tss_selector & 3) > next_tss_desc.dpl || 2399 if ((tss_selector & 3) > next_tss_desc.dpl ||
2091 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) 2400 ops->cpl(ctxt) > next_tss_desc.dpl)
2092 return emulate_gp(ctxt, 0); 2401 return emulate_gp(ctxt, 0);
2093 } 2402 }
2094 2403
@@ -2132,9 +2441,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2132 &next_tss_desc); 2441 &next_tss_desc);
2133 } 2442 }
2134 2443
2135 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); 2444 ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
2136 ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu); 2445 ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
2137 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2138 2446
2139 if (has_error_code) { 2447 if (has_error_code) {
2140 struct decode_cache *c = &ctxt->decode; 2448 struct decode_cache *c = &ctxt->decode;
@@ -2142,7 +2450,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2142 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2450 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2143 c->lock_prefix = 0; 2451 c->lock_prefix = 0;
2144 c->src.val = (unsigned long) error_code; 2452 c->src.val = (unsigned long) error_code;
2145 emulate_push(ctxt, ops); 2453 ret = em_push(ctxt);
2146 } 2454 }
2147 2455
2148 return ret; 2456 return ret;
@@ -2162,13 +2470,10 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2162 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2470 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2163 has_error_code, error_code); 2471 has_error_code, error_code);
2164 2472
2165 if (rc == X86EMUL_CONTINUE) { 2473 if (rc == X86EMUL_CONTINUE)
2166 rc = writeback(ctxt, ops); 2474 ctxt->eip = c->eip;
2167 if (rc == X86EMUL_CONTINUE)
2168 ctxt->eip = c->eip;
2169 }
2170 2475
2171 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2476 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
2172} 2477}
2173 2478
2174static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, 2479static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
@@ -2182,12 +2487,6 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2182 op->addr.mem.seg = seg; 2487 op->addr.mem.seg = seg;
2183} 2488}
2184 2489
2185static int em_push(struct x86_emulate_ctxt *ctxt)
2186{
2187 emulate_push(ctxt, ctxt->ops);
2188 return X86EMUL_CONTINUE;
2189}
2190
2191static int em_das(struct x86_emulate_ctxt *ctxt) 2490static int em_das(struct x86_emulate_ctxt *ctxt)
2192{ 2491{
2193 struct decode_cache *c = &ctxt->decode; 2492 struct decode_cache *c = &ctxt->decode;
@@ -2234,7 +2533,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
2234 ulong old_eip; 2533 ulong old_eip;
2235 int rc; 2534 int rc;
2236 2535
2237 old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 2536 old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
2238 old_eip = c->eip; 2537 old_eip = c->eip;
2239 2538
2240 memcpy(&sel, c->src.valptr + c->op_bytes, 2); 2539 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
@@ -2245,20 +2544,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
2245 memcpy(&c->eip, c->src.valptr, c->op_bytes); 2544 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2246 2545
2247 c->src.val = old_cs; 2546 c->src.val = old_cs;
2248 emulate_push(ctxt, ctxt->ops); 2547 rc = em_push(ctxt);
2249 rc = writeback(ctxt, ctxt->ops);
2250 if (rc != X86EMUL_CONTINUE) 2548 if (rc != X86EMUL_CONTINUE)
2251 return rc; 2549 return rc;
2252 2550
2253 c->src.val = old_eip; 2551 c->src.val = old_eip;
2254 emulate_push(ctxt, ctxt->ops); 2552 return em_push(ctxt);
2255 rc = writeback(ctxt, ctxt->ops);
2256 if (rc != X86EMUL_CONTINUE)
2257 return rc;
2258
2259 c->dst.type = OP_NONE;
2260
2261 return X86EMUL_CONTINUE;
2262} 2553}
2263 2554
2264static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) 2555static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
@@ -2269,13 +2560,79 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2269 c->dst.type = OP_REG; 2560 c->dst.type = OP_REG;
2270 c->dst.addr.reg = &c->eip; 2561 c->dst.addr.reg = &c->eip;
2271 c->dst.bytes = c->op_bytes; 2562 c->dst.bytes = c->op_bytes;
2272 rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); 2563 rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes);
2273 if (rc != X86EMUL_CONTINUE) 2564 if (rc != X86EMUL_CONTINUE)
2274 return rc; 2565 return rc;
2275 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); 2566 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
2276 return X86EMUL_CONTINUE; 2567 return X86EMUL_CONTINUE;
2277} 2568}
2278 2569
2570static int em_add(struct x86_emulate_ctxt *ctxt)
2571{
2572 struct decode_cache *c = &ctxt->decode;
2573
2574 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2575 return X86EMUL_CONTINUE;
2576}
2577
2578static int em_or(struct x86_emulate_ctxt *ctxt)
2579{
2580 struct decode_cache *c = &ctxt->decode;
2581
2582 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2583 return X86EMUL_CONTINUE;
2584}
2585
2586static int em_adc(struct x86_emulate_ctxt *ctxt)
2587{
2588 struct decode_cache *c = &ctxt->decode;
2589
2590 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2591 return X86EMUL_CONTINUE;
2592}
2593
2594static int em_sbb(struct x86_emulate_ctxt *ctxt)
2595{
2596 struct decode_cache *c = &ctxt->decode;
2597
2598 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2599 return X86EMUL_CONTINUE;
2600}
2601
2602static int em_and(struct x86_emulate_ctxt *ctxt)
2603{
2604 struct decode_cache *c = &ctxt->decode;
2605
2606 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
2607 return X86EMUL_CONTINUE;
2608}
2609
2610static int em_sub(struct x86_emulate_ctxt *ctxt)
2611{
2612 struct decode_cache *c = &ctxt->decode;
2613
2614 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
2615 return X86EMUL_CONTINUE;
2616}
2617
2618static int em_xor(struct x86_emulate_ctxt *ctxt)
2619{
2620 struct decode_cache *c = &ctxt->decode;
2621
2622 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
2623 return X86EMUL_CONTINUE;
2624}
2625
2626static int em_cmp(struct x86_emulate_ctxt *ctxt)
2627{
2628 struct decode_cache *c = &ctxt->decode;
2629
2630 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2631 /* Disable writeback. */
2632 c->dst.type = OP_NONE;
2633 return X86EMUL_CONTINUE;
2634}
2635
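/*
 * The eight em_*() helpers above are drop-in replacements for the old
 * "case 0x00 ... 0x05: add:" style switch arms that this patch deletes
 * from x86_emulate_insn() further down.  Each helper just runs the
 * emulate_2op_SrcV() assembler macro on the already-decoded operands and
 * returns, so the ALU opcodes can be dispatched through the opcode
 * table's ->execute callback (em_add, em_cmp, ...).  A minimal sketch of
 * that dispatch, using the names that appear later in this file:
 *
 *	if (c->execute) {
 *		rc = c->execute(ctxt);
 *		if (rc != X86EMUL_CONTINUE)
 *			goto done;
 *	}
 *
 * em_cmp() is the only one that sets dst.type = OP_NONE, because a
 * compare must update EFLAGS without writing the destination back.
 */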
2279static int em_imul(struct x86_emulate_ctxt *ctxt) 2636static int em_imul(struct x86_emulate_ctxt *ctxt)
2280{ 2637{
2281 struct decode_cache *c = &ctxt->decode; 2638 struct decode_cache *c = &ctxt->decode;
@@ -2306,13 +2663,10 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
2306 2663
2307static int em_rdtsc(struct x86_emulate_ctxt *ctxt) 2664static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2308{ 2665{
2309 unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
2310 struct decode_cache *c = &ctxt->decode; 2666 struct decode_cache *c = &ctxt->decode;
2311 u64 tsc = 0; 2667 u64 tsc = 0;
2312 2668
2313 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) 2669 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
2314 return emulate_gp(ctxt, 0);
2315 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
2316 c->regs[VCPU_REGS_RAX] = (u32)tsc; 2670 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2317 c->regs[VCPU_REGS_RDX] = tsc >> 32; 2671 c->regs[VCPU_REGS_RDX] = tsc >> 32;
2318 return X86EMUL_CONTINUE; 2672 return X86EMUL_CONTINUE;
@@ -2325,22 +2679,375 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
2325 return X86EMUL_CONTINUE; 2679 return X86EMUL_CONTINUE;
2326} 2680}
2327 2681
2682static int em_movdqu(struct x86_emulate_ctxt *ctxt)
2683{
2684 struct decode_cache *c = &ctxt->decode;
2685 memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
2686 return X86EMUL_CONTINUE;
2687}
2688
2689static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2690{
2691 struct decode_cache *c = &ctxt->decode;
2692 int rc;
2693 ulong linear;
2694
2695 rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear);
2696 if (rc == X86EMUL_CONTINUE)
2697 ctxt->ops->invlpg(ctxt, linear);
2698 /* Disable writeback. */
2699 c->dst.type = OP_NONE;
2700 return X86EMUL_CONTINUE;
2701}
2702
2703static int em_clts(struct x86_emulate_ctxt *ctxt)
2704{
2705 ulong cr0;
2706
2707 cr0 = ctxt->ops->get_cr(ctxt, 0);
2708 cr0 &= ~X86_CR0_TS;
2709 ctxt->ops->set_cr(ctxt, 0, cr0);
2710 return X86EMUL_CONTINUE;
2711}
2712
2713static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2714{
2715 struct decode_cache *c = &ctxt->decode;
2716 int rc;
2717
2718 if (c->modrm_mod != 3 || c->modrm_rm != 1)
2719 return X86EMUL_UNHANDLEABLE;
2720
2721 rc = ctxt->ops->fix_hypercall(ctxt);
2722 if (rc != X86EMUL_CONTINUE)
2723 return rc;
2724
2725 /* Let the processor re-execute the fixed hypercall */
2726 c->eip = ctxt->eip;
2727 /* Disable writeback. */
2728 c->dst.type = OP_NONE;
2729 return X86EMUL_CONTINUE;
2730}
2731
2732static int em_lgdt(struct x86_emulate_ctxt *ctxt)
2733{
2734 struct decode_cache *c = &ctxt->decode;
2735 struct desc_ptr desc_ptr;
2736 int rc;
2737
2738 rc = read_descriptor(ctxt, c->src.addr.mem,
2739 &desc_ptr.size, &desc_ptr.address,
2740 c->op_bytes);
2741 if (rc != X86EMUL_CONTINUE)
2742 return rc;
2743 ctxt->ops->set_gdt(ctxt, &desc_ptr);
2744 /* Disable writeback. */
2745 c->dst.type = OP_NONE;
2746 return X86EMUL_CONTINUE;
2747}
2748
2749static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
2750{
2751 struct decode_cache *c = &ctxt->decode;
2752 int rc;
2753
2754 rc = ctxt->ops->fix_hypercall(ctxt);
2755
2756 /* Disable writeback. */
2757 c->dst.type = OP_NONE;
2758 return rc;
2759}
2760
2761static int em_lidt(struct x86_emulate_ctxt *ctxt)
2762{
2763 struct decode_cache *c = &ctxt->decode;
2764 struct desc_ptr desc_ptr;
2765 int rc;
2766
2767 rc = read_descriptor(ctxt, c->src.addr.mem,
2768 &desc_ptr.size, &desc_ptr.address,
2769 c->op_bytes);
2770 if (rc != X86EMUL_CONTINUE)
2771 return rc;
2772 ctxt->ops->set_idt(ctxt, &desc_ptr);
2773 /* Disable writeback. */
2774 c->dst.type = OP_NONE;
2775 return X86EMUL_CONTINUE;
2776}
2777
2778static int em_smsw(struct x86_emulate_ctxt *ctxt)
2779{
2780 struct decode_cache *c = &ctxt->decode;
2781
2782 c->dst.bytes = 2;
2783 c->dst.val = ctxt->ops->get_cr(ctxt, 0);
2784 return X86EMUL_CONTINUE;
2785}
2786
2787static int em_lmsw(struct x86_emulate_ctxt *ctxt)
2788{
2789 struct decode_cache *c = &ctxt->decode;
2790 ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
2791 | (c->src.val & 0x0f));
2792 c->dst.type = OP_NONE;
2793 return X86EMUL_CONTINUE;
2794}
2795
2796static bool valid_cr(int nr)
2797{
2798 switch (nr) {
2799 case 0:
2800 case 2 ... 4:
2801 case 8:
2802 return true;
2803 default:
2804 return false;
2805 }
2806}
2807
2808static int check_cr_read(struct x86_emulate_ctxt *ctxt)
2809{
2810 struct decode_cache *c = &ctxt->decode;
2811
2812 if (!valid_cr(c->modrm_reg))
2813 return emulate_ud(ctxt);
2814
2815 return X86EMUL_CONTINUE;
2816}
2817
2818static int check_cr_write(struct x86_emulate_ctxt *ctxt)
2819{
2820 struct decode_cache *c = &ctxt->decode;
2821 u64 new_val = c->src.val64;
2822 int cr = c->modrm_reg;
2823 u64 efer = 0;
2824
2825 static u64 cr_reserved_bits[] = {
2826 0xffffffff00000000ULL,
2827 0, 0, 0, /* CR3 checked later */
2828 CR4_RESERVED_BITS,
2829 0, 0, 0,
2830 CR8_RESERVED_BITS,
2831 };
2832
2833 if (!valid_cr(cr))
2834 return emulate_ud(ctxt);
2835
2836 if (new_val & cr_reserved_bits[cr])
2837 return emulate_gp(ctxt, 0);
2838
2839 switch (cr) {
2840 case 0: {
2841 u64 cr4;
2842 if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
2843 ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
2844 return emulate_gp(ctxt, 0);
2845
2846 cr4 = ctxt->ops->get_cr(ctxt, 4);
2847 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2848
2849 if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
2850 !(cr4 & X86_CR4_PAE))
2851 return emulate_gp(ctxt, 0);
2852
2853 break;
2854 }
2855 case 3: {
2856 u64 rsvd = 0;
2857
2858 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2859 if (efer & EFER_LMA)
2860 rsvd = CR3_L_MODE_RESERVED_BITS;
2861 else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
2862 rsvd = CR3_PAE_RESERVED_BITS;
2863 else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
2864 rsvd = CR3_NONPAE_RESERVED_BITS;
2865
2866 if (new_val & rsvd)
2867 return emulate_gp(ctxt, 0);
2868
2869 break;
2870 }
2871 case 4: {
2872 u64 cr4;
2873
2874 cr4 = ctxt->ops->get_cr(ctxt, 4);
2875 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2876
2877 if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
2878 return emulate_gp(ctxt, 0);
2879
2880 break;
2881 }
2882 }
2883
2884 return X86EMUL_CONTINUE;
2885}
2886
2887static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
2888{
2889 unsigned long dr7;
2890
2891 ctxt->ops->get_dr(ctxt, 7, &dr7);
2892
 2893 /* Check if DR7.GD (general detect enable) is set */
2894 return dr7 & (1 << 13);
2895}
2896
2897static int check_dr_read(struct x86_emulate_ctxt *ctxt)
2898{
2899 struct decode_cache *c = &ctxt->decode;
2900 int dr = c->modrm_reg;
2901 u64 cr4;
2902
2903 if (dr > 7)
2904 return emulate_ud(ctxt);
2905
2906 cr4 = ctxt->ops->get_cr(ctxt, 4);
2907 if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
2908 return emulate_ud(ctxt);
2909
2910 if (check_dr7_gd(ctxt))
2911 return emulate_db(ctxt);
2912
2913 return X86EMUL_CONTINUE;
2914}
2915
2916static int check_dr_write(struct x86_emulate_ctxt *ctxt)
2917{
2918 struct decode_cache *c = &ctxt->decode;
2919 u64 new_val = c->src.val64;
2920 int dr = c->modrm_reg;
2921
2922 if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
2923 return emulate_gp(ctxt, 0);
2924
2925 return check_dr_read(ctxt);
2926}
2927
2928static int check_svme(struct x86_emulate_ctxt *ctxt)
2929{
2930 u64 efer;
2931
2932 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2933
2934 if (!(efer & EFER_SVME))
2935 return emulate_ud(ctxt);
2936
2937 return X86EMUL_CONTINUE;
2938}
2939
2940static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
2941{
2942 u64 rax = ctxt->decode.regs[VCPU_REGS_RAX];
2943
2944 /* Valid physical address? */
2945 if (rax & 0xffff000000000000ULL)
2946 return emulate_gp(ctxt, 0);
2947
2948 return check_svme(ctxt);
2949}
2950
2951static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
2952{
2953 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
2954
2955 if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
2956 return emulate_ud(ctxt);
2957
2958 return X86EMUL_CONTINUE;
2959}
2960
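/*
 * check_rdtsc() reinstates, as a table-driven permission callback, the
 * CR4.TSD/CPL test that the em_rdtsc() hunk above removes: with CR4.TSD
 * set, RDTSC from CPL > 0 now faults before the handler ever runs.  Note
 * that the callback raises #UD where the old inline check raised #GP(0).
 * The entry that ties the two together is the new twobyte_table line
 *
 *	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
 *
 * further down in this patch.
 */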
2961static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
2962{
2963 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
2964 u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX];
2965
2966 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
2967 (rcx > 3))
2968 return emulate_gp(ctxt, 0);
2969
2970 return X86EMUL_CONTINUE;
2971}
2972
2973static int check_perm_in(struct x86_emulate_ctxt *ctxt)
2974{
2975 struct decode_cache *c = &ctxt->decode;
2976
2977 c->dst.bytes = min(c->dst.bytes, 4u);
2978 if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
2979 return emulate_gp(ctxt, 0);
2980
2981 return X86EMUL_CONTINUE;
2982}
2983
2984static int check_perm_out(struct x86_emulate_ctxt *ctxt)
2985{
2986 struct decode_cache *c = &ctxt->decode;
2987
2988 c->src.bytes = min(c->src.bytes, 4u);
2989 if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
2990 return emulate_gp(ctxt, 0);
2991
2992 return X86EMUL_CONTINUE;
2993}
2994
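/*
 * check_perm_in()/check_perm_out() clamp the transfer size to 4 bytes and
 * defer to emulator_io_permited(), which walks the TSS I/O permission
 * bitmap through emulator_io_port_access_allowed() earlier in this file.
 * They are attached to the IN/OUT/INS/OUTS entries via the D2bvIP() macro
 * below; a sketch using two entries that appear verbatim in this patch:
 *
 *	D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),	e4/e5: in al/eax, imm8
 *	D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),	e6/e7: out imm8, al/eax
 *
 * The callback is invoked from x86_emulate_insn() before the instruction
 * body executes, so a denied port raises #GP(0) without touching the port.
 */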
2328#define D(_y) { .flags = (_y) } 2995#define D(_y) { .flags = (_y) }
2996#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
2997#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
2998 .check_perm = (_p) }
2329#define N D(0) 2999#define N D(0)
3000#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
2330#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } 3001#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
2331#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } 3002#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) }
2332#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } 3003#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
3004#define II(_f, _e, _i) \
3005 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
3006#define IIP(_f, _e, _i, _p) \
3007 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
3008 .check_perm = (_p) }
3009#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
2333 3010
2334#define D2bv(_f) D((_f) | ByteOp), D(_f) 3011#define D2bv(_f) D((_f) | ByteOp), D(_f)
3012#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
2335#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) 3013#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
2336 3014
2337#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ 3015#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
2338 D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ 3016 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
2339 D2bv(((_f) & ~Lock) | DstAcc | SrcImm) 3017 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
2340 3018
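/*
 * A sketch of what one table line expands to, assuming only the macros
 * defined in this hunk (I, I2bv, I6ALU):
 *
 *	I6ALU(Lock, em_add)
 *	  => I2bv(Lock | DstMem | SrcReg | ModRM, em_add),
 *	     I2bv((Lock | DstReg | SrcMem | ModRM) & ~Lock, em_add),
 *	     I2bv((Lock & ~Lock) | DstAcc | SrcImm, em_add)
 *	  => { .flags = Lock | DstMem | SrcReg | ModRM | ByteOp, .u.execute = em_add },
 *	     { .flags = Lock | DstMem | SrcReg | ModRM,          .u.execute = em_add },
 *	     { .flags = DstReg | SrcMem | ModRM | ByteOp,        .u.execute = em_add },
 *	     { .flags = DstReg | SrcMem | ModRM,                 .u.execute = em_add },
 *	     { .flags = DstAcc | SrcImm | ByteOp,                .u.execute = em_add },
 *	     { .flags = DstAcc | SrcImm,                         .u.execute = em_add },
 *
 * i.e. the six "classic" ALU encodings 00-05 (r/m,r; r,r/m; al/eax,imm)
 * share a single execute callback, which is what lets D6ALU() be retired
 * in favour of I6ALU() throughout opcode_table[].
 */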
3019static struct opcode group7_rm1[] = {
3020 DI(SrcNone | ModRM | Priv, monitor),
3021 DI(SrcNone | ModRM | Priv, mwait),
3022 N, N, N, N, N, N,
3023};
3024
3025static struct opcode group7_rm3[] = {
3026 DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa),
3027 II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall),
3028 DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa),
3029 DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa),
3030 DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme),
3031 DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme),
3032 DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme),
3033 DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
3034};
3035
3036static struct opcode group7_rm7[] = {
3037 N,
3038 DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
3039 N, N, N, N, N, N,
3040};
2341 3041
2342static struct opcode group1[] = { 3042static struct opcode group1[] = {
2343 X7(D(Lock)), N 3043 I(Lock, em_add),
3044 I(Lock, em_or),
3045 I(Lock, em_adc),
3046 I(Lock, em_sbb),
3047 I(Lock, em_and),
3048 I(Lock, em_sub),
3049 I(Lock, em_xor),
3050 I(0, em_cmp),
2344}; 3051};
2345 3052
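/*
 * group1 now maps the ModRM /reg field of opcodes 80-83 directly onto the
 * new ALU handlers:
 *
 *	/0 em_add   /1 em_or    /2 em_adc   /3 em_sbb
 *	/4 em_and   /5 em_sub   /6 em_xor   /7 em_cmp (no Lock)
 *
 * which is why the "case 0x80 ... 0x83: Grp1" switch and its add/or/adc/...
 * goto labels can be deleted from x86_emulate_insn() later in this patch.
 * Only /7 drops the Lock flag, since a lock prefix on cmp is not a valid
 * encoding.
 */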
2346static struct opcode group1A[] = { 3053static struct opcode group1A[] = {
@@ -2366,16 +3073,28 @@ static struct opcode group5[] = {
2366 D(SrcMem | ModRM | Stack), N, 3073 D(SrcMem | ModRM | Stack), N,
2367}; 3074};
2368 3075
3076static struct opcode group6[] = {
3077 DI(ModRM | Prot, sldt),
3078 DI(ModRM | Prot, str),
3079 DI(ModRM | Prot | Priv, lldt),
3080 DI(ModRM | Prot | Priv, ltr),
3081 N, N, N, N,
3082};
3083
2369static struct group_dual group7 = { { 3084static struct group_dual group7 = { {
2370 N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), 3085 DI(ModRM | Mov | DstMem | Priv, sgdt),
2371 D(SrcNone | ModRM | DstMem | Mov), N, 3086 DI(ModRM | Mov | DstMem | Priv, sidt),
2372 D(SrcMem16 | ModRM | Mov | Priv), 3087 II(ModRM | SrcMem | Priv, em_lgdt, lgdt),
2373 D(SrcMem | ModRM | ByteOp | Priv | NoAccess), 3088 II(ModRM | SrcMem | Priv, em_lidt, lidt),
3089 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
3090 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw),
3091 II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
2374}, { 3092}, {
2375 D(SrcNone | ModRM | Priv | VendorSpecific), N, 3093 I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall),
2376 N, D(SrcNone | ModRM | Priv | VendorSpecific), 3094 EXT(0, group7_rm1),
2377 D(SrcNone | ModRM | DstMem | Mov), N, 3095 N, EXT(0, group7_rm3),
2378 D(SrcMem16 | ModRM | Mov | Priv), N, 3096 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
3097 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7),
2379} }; 3098} };
2380 3099
2381static struct opcode group8[] = { 3100static struct opcode group8[] = {
@@ -2394,35 +3113,40 @@ static struct opcode group11[] = {
2394 I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), 3113 I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
2395}; 3114};
2396 3115
3116static struct gprefix pfx_0f_6f_0f_7f = {
3117 N, N, N, I(Sse, em_movdqu),
3118};
3119
2397static struct opcode opcode_table[256] = { 3120static struct opcode opcode_table[256] = {
2398 /* 0x00 - 0x07 */ 3121 /* 0x00 - 0x07 */
2399 D6ALU(Lock), 3122 I6ALU(Lock, em_add),
2400 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3123 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2401 /* 0x08 - 0x0F */ 3124 /* 0x08 - 0x0F */
2402 D6ALU(Lock), 3125 I6ALU(Lock, em_or),
2403 D(ImplicitOps | Stack | No64), N, 3126 D(ImplicitOps | Stack | No64), N,
2404 /* 0x10 - 0x17 */ 3127 /* 0x10 - 0x17 */
2405 D6ALU(Lock), 3128 I6ALU(Lock, em_adc),
2406 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3129 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2407 /* 0x18 - 0x1F */ 3130 /* 0x18 - 0x1F */
2408 D6ALU(Lock), 3131 I6ALU(Lock, em_sbb),
2409 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3132 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2410 /* 0x20 - 0x27 */ 3133 /* 0x20 - 0x27 */
2411 D6ALU(Lock), N, N, 3134 I6ALU(Lock, em_and), N, N,
2412 /* 0x28 - 0x2F */ 3135 /* 0x28 - 0x2F */
2413 D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), 3136 I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
2414 /* 0x30 - 0x37 */ 3137 /* 0x30 - 0x37 */
2415 D6ALU(Lock), N, N, 3138 I6ALU(Lock, em_xor), N, N,
2416 /* 0x38 - 0x3F */ 3139 /* 0x38 - 0x3F */
2417 D6ALU(0), N, N, 3140 I6ALU(0, em_cmp), N, N,
2418 /* 0x40 - 0x4F */ 3141 /* 0x40 - 0x4F */
2419 X16(D(DstReg)), 3142 X16(D(DstReg)),
2420 /* 0x50 - 0x57 */ 3143 /* 0x50 - 0x57 */
2421 X8(I(SrcReg | Stack, em_push)), 3144 X8(I(SrcReg | Stack, em_push)),
2422 /* 0x58 - 0x5F */ 3145 /* 0x58 - 0x5F */
2423 X8(D(DstReg | Stack)), 3146 X8(I(DstReg | Stack, em_pop)),
2424 /* 0x60 - 0x67 */ 3147 /* 0x60 - 0x67 */
2425 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3148 I(ImplicitOps | Stack | No64, em_pusha),
3149 I(ImplicitOps | Stack | No64, em_popa),
2426 N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , 3150 N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
2427 N, N, N, N, 3151 N, N, N, N,
2428 /* 0x68 - 0x6F */ 3152 /* 0x68 - 0x6F */
@@ -2430,8 +3154,8 @@ static struct opcode opcode_table[256] = {
2430 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), 3154 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
2431 I(SrcImmByte | Mov | Stack, em_push), 3155 I(SrcImmByte | Mov | Stack, em_push),
2432 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), 3156 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
2433 D2bv(DstDI | Mov | String), /* insb, insw/insd */ 3157 D2bvIP(DstDI | Mov | String, ins, check_perm_in), /* insb, insw/insd */
2434 D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ 3158 D2bvIP(SrcSI | ImplicitOps | String, outs, check_perm_out), /* outsb, outsw/outsd */
2435 /* 0x70 - 0x7F */ 3159 /* 0x70 - 0x7F */
2436 X16(D(SrcImmByte)), 3160 X16(D(SrcImmByte)),
2437 /* 0x80 - 0x87 */ 3161 /* 0x80 - 0x87 */
@@ -2446,21 +3170,22 @@ static struct opcode opcode_table[256] = {
2446 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), 3170 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
2447 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), 3171 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
2448 /* 0x90 - 0x97 */ 3172 /* 0x90 - 0x97 */
2449 X8(D(SrcAcc | DstReg)), 3173 DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
2450 /* 0x98 - 0x9F */ 3174 /* 0x98 - 0x9F */
2451 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), 3175 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
2452 I(SrcImmFAddr | No64, em_call_far), N, 3176 I(SrcImmFAddr | No64, em_call_far), N,
2453 D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, 3177 II(ImplicitOps | Stack, em_pushf, pushf),
3178 II(ImplicitOps | Stack, em_popf, popf), N, N,
2454 /* 0xA0 - 0xA7 */ 3179 /* 0xA0 - 0xA7 */
2455 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3180 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
2456 I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), 3181 I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
2457 I2bv(SrcSI | DstDI | Mov | String, em_mov), 3182 I2bv(SrcSI | DstDI | Mov | String, em_mov),
2458 D2bv(SrcSI | DstDI | String), 3183 I2bv(SrcSI | DstDI | String, em_cmp),
2459 /* 0xA8 - 0xAF */ 3184 /* 0xA8 - 0xAF */
2460 D2bv(DstAcc | SrcImm), 3185 D2bv(DstAcc | SrcImm),
2461 I2bv(SrcAcc | DstDI | Mov | String, em_mov), 3186 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
2462 I2bv(SrcSI | DstAcc | Mov | String, em_mov), 3187 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
2463 D2bv(SrcAcc | DstDI | String), 3188 I2bv(SrcAcc | DstDI | String, em_cmp),
2464 /* 0xB0 - 0xB7 */ 3189 /* 0xB0 - 0xB7 */
2465 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), 3190 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
2466 /* 0xB8 - 0xBF */ 3191 /* 0xB8 - 0xBF */
@@ -2473,7 +3198,8 @@ static struct opcode opcode_table[256] = {
2473 G(ByteOp, group11), G(0, group11), 3198 G(ByteOp, group11), G(0, group11),
2474 /* 0xC8 - 0xCF */ 3199 /* 0xC8 - 0xCF */
2475 N, N, N, D(ImplicitOps | Stack), 3200 N, N, N, D(ImplicitOps | Stack),
2476 D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), 3201 D(ImplicitOps), DI(SrcImmByte, intn),
3202 D(ImplicitOps | No64), DI(ImplicitOps, iret),
2477 /* 0xD0 - 0xD7 */ 3203 /* 0xD0 - 0xD7 */
2478 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), 3204 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
2479 N, N, N, N, 3205 N, N, N, N,
@@ -2481,14 +3207,17 @@ static struct opcode opcode_table[256] = {
2481 N, N, N, N, N, N, N, N, 3207 N, N, N, N, N, N, N, N,
2482 /* 0xE0 - 0xE7 */ 3208 /* 0xE0 - 0xE7 */
2483 X4(D(SrcImmByte)), 3209 X4(D(SrcImmByte)),
2484 D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), 3210 D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
3211 D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
2485 /* 0xE8 - 0xEF */ 3212 /* 0xE8 - 0xEF */
2486 D(SrcImm | Stack), D(SrcImm | ImplicitOps), 3213 D(SrcImm | Stack), D(SrcImm | ImplicitOps),
2487 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), 3214 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
2488 D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), 3215 D2bvIP(SrcNone | DstAcc, in, check_perm_in),
3216 D2bvIP(SrcAcc | ImplicitOps, out, check_perm_out),
2489 /* 0xF0 - 0xF7 */ 3217 /* 0xF0 - 0xF7 */
2490 N, N, N, N, 3218 N, DI(ImplicitOps, icebp), N, N,
2491 D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), 3219 DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
3220 G(ByteOp, group3), G(0, group3),
2492 /* 0xF8 - 0xFF */ 3221 /* 0xF8 - 0xFF */
2493 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), 3222 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
2494 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), 3223 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
@@ -2496,20 +3225,24 @@ static struct opcode opcode_table[256] = {
2496 3225
2497static struct opcode twobyte_table[256] = { 3226static struct opcode twobyte_table[256] = {
2498 /* 0x00 - 0x0F */ 3227 /* 0x00 - 0x0F */
2499 N, GD(0, &group7), N, N, 3228 G(0, group6), GD(0, &group7), N, N,
2500 N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N, 3229 N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N,
2501 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, 3230 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
2502 N, D(ImplicitOps | ModRM), N, N, 3231 N, D(ImplicitOps | ModRM), N, N,
2503 /* 0x10 - 0x1F */ 3232 /* 0x10 - 0x1F */
2504 N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, 3233 N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
2505 /* 0x20 - 0x2F */ 3234 /* 0x20 - 0x2F */
2506 D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), 3235 DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
2507 D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), 3236 DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
3237 DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write),
3238 DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write),
2508 N, N, N, N, 3239 N, N, N, N,
2509 N, N, N, N, N, N, N, N, 3240 N, N, N, N, N, N, N, N,
2510 /* 0x30 - 0x3F */ 3241 /* 0x30 - 0x3F */
2511 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), 3242 DI(ImplicitOps | Priv, wrmsr),
2512 D(ImplicitOps | Priv), N, 3243 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
3244 DI(ImplicitOps | Priv, rdmsr),
3245 DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
2513 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), 3246 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
2514 N, N, 3247 N, N,
2515 N, N, N, N, N, N, N, N, 3248 N, N, N, N, N, N, N, N,
@@ -2518,21 +3251,27 @@ static struct opcode twobyte_table[256] = {
2518 /* 0x50 - 0x5F */ 3251 /* 0x50 - 0x5F */
2519 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 3252 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2520 /* 0x60 - 0x6F */ 3253 /* 0x60 - 0x6F */
2521 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 3254 N, N, N, N,
3255 N, N, N, N,
3256 N, N, N, N,
3257 N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
2522 /* 0x70 - 0x7F */ 3258 /* 0x70 - 0x7F */
2523 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 3259 N, N, N, N,
3260 N, N, N, N,
3261 N, N, N, N,
3262 N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
2524 /* 0x80 - 0x8F */ 3263 /* 0x80 - 0x8F */
2525 X16(D(SrcImm)), 3264 X16(D(SrcImm)),
2526 /* 0x90 - 0x9F */ 3265 /* 0x90 - 0x9F */
2527 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3266 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
2528 /* 0xA0 - 0xA7 */ 3267 /* 0xA0 - 0xA7 */
2529 D(ImplicitOps | Stack), D(ImplicitOps | Stack), 3268 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
2530 N, D(DstMem | SrcReg | ModRM | BitOp), 3269 DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
2531 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3270 D(DstMem | SrcReg | Src2ImmByte | ModRM),
2532 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 3271 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
2533 /* 0xA8 - 0xAF */ 3272 /* 0xA8 - 0xAF */
2534 D(ImplicitOps | Stack), D(ImplicitOps | Stack), 3273 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
2535 N, D(DstMem | SrcReg | ModRM | BitOp | Lock), 3274 DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
2536 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3275 D(DstMem | SrcReg | Src2ImmByte | ModRM),
2537 D(DstMem | SrcReg | Src2CL | ModRM), 3276 D(DstMem | SrcReg | Src2CL | ModRM),
2538 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), 3277 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
@@ -2564,10 +3303,13 @@ static struct opcode twobyte_table[256] = {
2564#undef G 3303#undef G
2565#undef GD 3304#undef GD
2566#undef I 3305#undef I
3306#undef GP
3307#undef EXT
2567 3308
2568#undef D2bv 3309#undef D2bv
3310#undef D2bvIP
2569#undef I2bv 3311#undef I2bv
2570#undef D6ALU 3312#undef I6ALU
2571 3313
2572static unsigned imm_size(struct decode_cache *c) 3314static unsigned imm_size(struct decode_cache *c)
2573{ 3315{
@@ -2625,8 +3367,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2625 struct decode_cache *c = &ctxt->decode; 3367 struct decode_cache *c = &ctxt->decode;
2626 int rc = X86EMUL_CONTINUE; 3368 int rc = X86EMUL_CONTINUE;
2627 int mode = ctxt->mode; 3369 int mode = ctxt->mode;
2628 int def_op_bytes, def_ad_bytes, dual, goffset; 3370 int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
2629 struct opcode opcode, *g_mod012, *g_mod3; 3371 bool op_prefix = false;
3372 struct opcode opcode;
2630 struct operand memop = { .type = OP_NONE }; 3373 struct operand memop = { .type = OP_NONE };
2631 3374
2632 c->eip = ctxt->eip; 3375 c->eip = ctxt->eip;
@@ -2634,7 +3377,6 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2634 c->fetch.end = c->fetch.start + insn_len; 3377 c->fetch.end = c->fetch.start + insn_len;
2635 if (insn_len > 0) 3378 if (insn_len > 0)
2636 memcpy(c->fetch.data, insn, insn_len); 3379 memcpy(c->fetch.data, insn, insn_len);
2637 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
2638 3380
2639 switch (mode) { 3381 switch (mode) {
2640 case X86EMUL_MODE_REAL: 3382 case X86EMUL_MODE_REAL:
@@ -2662,6 +3404,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2662 for (;;) { 3404 for (;;) {
2663 switch (c->b = insn_fetch(u8, 1, c->eip)) { 3405 switch (c->b = insn_fetch(u8, 1, c->eip)) {
2664 case 0x66: /* operand-size override */ 3406 case 0x66: /* operand-size override */
3407 op_prefix = true;
2665 /* switch between 2/4 bytes */ 3408 /* switch between 2/4 bytes */
2666 c->op_bytes = def_op_bytes ^ 6; 3409 c->op_bytes = def_op_bytes ^ 6;
2667 break; 3410 break;
@@ -2692,10 +3435,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2692 c->lock_prefix = 1; 3435 c->lock_prefix = 1;
2693 break; 3436 break;
2694 case 0xf2: /* REPNE/REPNZ */ 3437 case 0xf2: /* REPNE/REPNZ */
2695 c->rep_prefix = REPNE_PREFIX;
2696 break;
2697 case 0xf3: /* REP/REPE/REPZ */ 3438 case 0xf3: /* REP/REPE/REPZ */
2698 c->rep_prefix = REPE_PREFIX; 3439 c->rep_prefix = c->b;
2699 break; 3440 break;
2700 default: 3441 default:
2701 goto done_prefixes; 3442 goto done_prefixes;
@@ -2722,29 +3463,49 @@ done_prefixes:
2722 } 3463 }
2723 c->d = opcode.flags; 3464 c->d = opcode.flags;
2724 3465
2725 if (c->d & Group) { 3466 while (c->d & GroupMask) {
2726 dual = c->d & GroupDual; 3467 switch (c->d & GroupMask) {
2727 c->modrm = insn_fetch(u8, 1, c->eip); 3468 case Group:
2728 --c->eip; 3469 c->modrm = insn_fetch(u8, 1, c->eip);
2729 3470 --c->eip;
2730 if (c->d & GroupDual) { 3471 goffset = (c->modrm >> 3) & 7;
2731 g_mod012 = opcode.u.gdual->mod012; 3472 opcode = opcode.u.group[goffset];
2732 g_mod3 = opcode.u.gdual->mod3; 3473 break;
2733 } else 3474 case GroupDual:
2734 g_mod012 = g_mod3 = opcode.u.group; 3475 c->modrm = insn_fetch(u8, 1, c->eip);
2735 3476 --c->eip;
2736 c->d &= ~(Group | GroupDual); 3477 goffset = (c->modrm >> 3) & 7;
2737 3478 if ((c->modrm >> 6) == 3)
2738 goffset = (c->modrm >> 3) & 7; 3479 opcode = opcode.u.gdual->mod3[goffset];
3480 else
3481 opcode = opcode.u.gdual->mod012[goffset];
3482 break;
3483 case RMExt:
3484 goffset = c->modrm & 7;
3485 opcode = opcode.u.group[goffset];
3486 break;
3487 case Prefix:
3488 if (c->rep_prefix && op_prefix)
3489 return X86EMUL_UNHANDLEABLE;
3490 simd_prefix = op_prefix ? 0x66 : c->rep_prefix;
3491 switch (simd_prefix) {
3492 case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
3493 case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
3494 case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
3495 case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
3496 }
3497 break;
3498 default:
3499 return X86EMUL_UNHANDLEABLE;
3500 }
2739 3501
2740 if ((c->modrm >> 6) == 3) 3502 c->d &= ~GroupMask;
2741 opcode = g_mod3[goffset];
2742 else
2743 opcode = g_mod012[goffset];
2744 c->d |= opcode.flags; 3503 c->d |= opcode.flags;
2745 } 3504 }
2746 3505
2747 c->execute = opcode.u.execute; 3506 c->execute = opcode.u.execute;
3507 c->check_perm = opcode.check_perm;
3508 c->intercept = opcode.intercept;
2748 3509
2749 /* Unrecognised? */ 3510 /* Unrecognised? */
2750 if (c->d == 0 || (c->d & Undefined)) 3511 if (c->d == 0 || (c->d & Undefined))
@@ -2763,6 +3524,9 @@ done_prefixes:
2763 c->op_bytes = 4; 3524 c->op_bytes = 4;
2764 } 3525 }
2765 3526
3527 if (c->d & Sse)
3528 c->op_bytes = 16;
3529
2766 /* ModRM and SIB bytes. */ 3530 /* ModRM and SIB bytes. */
2767 if (c->d & ModRM) { 3531 if (c->d & ModRM) {
2768 rc = decode_modrm(ctxt, ops, &memop); 3532 rc = decode_modrm(ctxt, ops, &memop);
@@ -2776,7 +3540,7 @@ done_prefixes:
2776 if (!c->has_seg_override) 3540 if (!c->has_seg_override)
2777 set_seg_override(c, VCPU_SREG_DS); 3541 set_seg_override(c, VCPU_SREG_DS);
2778 3542
2779 memop.addr.mem.seg = seg_override(ctxt, ops, c); 3543 memop.addr.mem.seg = seg_override(ctxt, c);
2780 3544
2781 if (memop.type == OP_MEM && c->ad_bytes != 8) 3545 if (memop.type == OP_MEM && c->ad_bytes != 8)
2782 memop.addr.mem.ea = (u32)memop.addr.mem.ea; 3546 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
@@ -2792,7 +3556,7 @@ done_prefixes:
2792 case SrcNone: 3556 case SrcNone:
2793 break; 3557 break;
2794 case SrcReg: 3558 case SrcReg:
2795 decode_register_operand(&c->src, c, 0); 3559 decode_register_operand(ctxt, &c->src, c, 0);
2796 break; 3560 break;
2797 case SrcMem16: 3561 case SrcMem16:
2798 memop.bytes = 2; 3562 memop.bytes = 2;
@@ -2836,7 +3600,7 @@ done_prefixes:
2836 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2837 c->src.addr.mem.ea = 3601 c->src.addr.mem.ea =
2838 register_address(c, c->regs[VCPU_REGS_RSI]); 3602 register_address(c, c->regs[VCPU_REGS_RSI]);
2839 c->src.addr.mem.seg = seg_override(ctxt, ops, c), 3603 c->src.addr.mem.seg = seg_override(ctxt, c);
2840 c->src.val = 0; 3604 c->src.val = 0;
2841 break; 3605 break;
2842 case SrcImmFAddr: 3606 case SrcImmFAddr:
@@ -2883,7 +3647,7 @@ done_prefixes:
2883 /* Decode and fetch the destination operand: register or memory. */ 3647 /* Decode and fetch the destination operand: register or memory. */
2884 switch (c->d & DstMask) { 3648 switch (c->d & DstMask) {
2885 case DstReg: 3649 case DstReg:
2886 decode_register_operand(&c->dst, c, 3650 decode_register_operand(ctxt, &c->dst, c,
2887 c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); 3651 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
2888 break; 3652 break;
2889 case DstImmUByte: 3653 case DstImmUByte:
@@ -2926,7 +3690,7 @@ done_prefixes:
2926 } 3690 }
2927 3691
2928done: 3692done:
2929 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 3693 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
2930} 3694}
2931 3695
2932static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) 3696static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
@@ -2979,12 +3743,51 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
2979 goto done; 3743 goto done;
2980 } 3744 }
2981 3745
3746 if ((c->d & Sse)
3747 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
3748 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
3749 rc = emulate_ud(ctxt);
3750 goto done;
3751 }
3752
3753 if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
3754 rc = emulate_nm(ctxt);
3755 goto done;
3756 }
3757
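/*
 * These two tests are the architectural gates for the new Sse flag:
 * CR0.EM = 1 or CR4.OSFXSR = 0 turns an SSE instruction into #UD, and
 * CR0.TS = 1 turns it into #NM so the guest's lazy-FPU trap handler can
 * run first.  Together with the "c->op_bytes = 16" case in the decoder,
 * this is what makes the new movdqu emulation (pfx_0f_6f_0f_7f) behave
 * like the real instruction.
 */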
3758 if (unlikely(ctxt->guest_mode) && c->intercept) {
3759 rc = emulator_check_intercept(ctxt, c->intercept,
3760 X86_ICPT_PRE_EXCEPT);
3761 if (rc != X86EMUL_CONTINUE)
3762 goto done;
3763 }
3764
2982 /* Privileged instruction can be executed only in CPL=0 */ 3765 /* Privileged instruction can be executed only in CPL=0 */
2983 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 3766 if ((c->d & Priv) && ops->cpl(ctxt)) {
2984 rc = emulate_gp(ctxt, 0); 3767 rc = emulate_gp(ctxt, 0);
2985 goto done; 3768 goto done;
2986 } 3769 }
2987 3770
3771 /* Instruction can only be executed in protected mode */
3772 if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
3773 rc = emulate_ud(ctxt);
3774 goto done;
3775 }
3776
3777 /* Do instruction specific permission checks */
3778 if (c->check_perm) {
3779 rc = c->check_perm(ctxt);
3780 if (rc != X86EMUL_CONTINUE)
3781 goto done;
3782 }
3783
3784 if (unlikely(ctxt->guest_mode) && c->intercept) {
3785 rc = emulator_check_intercept(ctxt, c->intercept,
3786 X86_ICPT_POST_EXCEPT);
3787 if (rc != X86EMUL_CONTINUE)
3788 goto done;
3789 }
3790
2988 if (c->rep_prefix && (c->d & String)) { 3791 if (c->rep_prefix && (c->d & String)) {
2989 /* All REP prefixes have the same first termination condition */ 3792 /* All REP prefixes have the same first termination condition */
2990 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 3793 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
@@ -2994,16 +3797,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
2994 } 3797 }
2995 3798
2996 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { 3799 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
2997 rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem), 3800 rc = segmented_read(ctxt, c->src.addr.mem,
2998 c->src.valptr, c->src.bytes); 3801 c->src.valptr, c->src.bytes);
2999 if (rc != X86EMUL_CONTINUE) 3802 if (rc != X86EMUL_CONTINUE)
3000 goto done; 3803 goto done;
3001 c->src.orig_val64 = c->src.val64; 3804 c->src.orig_val64 = c->src.val64;
3002 } 3805 }
3003 3806
3004 if (c->src2.type == OP_MEM) { 3807 if (c->src2.type == OP_MEM) {
3005 rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem), 3808 rc = segmented_read(ctxt, c->src2.addr.mem,
3006 &c->src2.val, c->src2.bytes); 3809 &c->src2.val, c->src2.bytes);
3007 if (rc != X86EMUL_CONTINUE) 3810 if (rc != X86EMUL_CONTINUE)
3008 goto done; 3811 goto done;
3009 } 3812 }
@@ -3014,7 +3817,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3014 3817
3015 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3818 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
3016 /* optimisation - avoid slow emulated read if Mov */ 3819 /* optimisation - avoid slow emulated read if Mov */
3017 rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem), 3820 rc = segmented_read(ctxt, c->dst.addr.mem,
3018 &c->dst.val, c->dst.bytes); 3821 &c->dst.val, c->dst.bytes);
3019 if (rc != X86EMUL_CONTINUE) 3822 if (rc != X86EMUL_CONTINUE)
3020 goto done; 3823 goto done;
@@ -3023,6 +3826,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3023 3826
3024special_insn: 3827special_insn:
3025 3828
3829 if (unlikely(ctxt->guest_mode) && c->intercept) {
3830 rc = emulator_check_intercept(ctxt, c->intercept,
3831 X86_ICPT_POST_MEMACCESS);
3832 if (rc != X86EMUL_CONTINUE)
3833 goto done;
3834 }
3835
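/*
 * This is the third and last intercept checkpoint; the same pattern
 * appears twice earlier in x86_emulate_insn(), giving three stages that
 * roughly mirror where hardware would recognise the intercept:
 *
 *	X86_ICPT_PRE_EXCEPT	before privilege/permission checks
 *	X86_ICPT_POST_EXCEPT	after CPL, mode and check_perm() pass
 *	X86_ICPT_POST_MEMACCESS	after memory operands have been read
 *
 * so a nested hypervisor that asked to intercept the instruction gets the
 * exit at (approximately) the point the real CPU would deliver it.
 */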
3026 if (c->execute) { 3836 if (c->execute) {
3027 rc = c->execute(ctxt); 3837 rc = c->execute(ctxt);
3028 if (rc != X86EMUL_CONTINUE) 3838 if (rc != X86EMUL_CONTINUE)
@@ -3034,75 +3844,33 @@ special_insn:
3034 goto twobyte_insn; 3844 goto twobyte_insn;
3035 3845
3036 switch (c->b) { 3846 switch (c->b) {
3037 case 0x00 ... 0x05:
3038 add: /* add */
3039 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
3040 break;
3041 case 0x06: /* push es */ 3847 case 0x06: /* push es */
3042 emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); 3848 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
3043 break; 3849 break;
3044 case 0x07: /* pop es */ 3850 case 0x07: /* pop es */
3045 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 3851 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
3046 break; 3852 break;
3047 case 0x08 ... 0x0d:
3048 or: /* or */
3049 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
3050 break;
3051 case 0x0e: /* push cs */ 3853 case 0x0e: /* push cs */
3052 emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); 3854 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
3053 break;
3054 case 0x10 ... 0x15:
3055 adc: /* adc */
3056 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
3057 break; 3855 break;
3058 case 0x16: /* push ss */ 3856 case 0x16: /* push ss */
3059 emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); 3857 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
3060 break; 3858 break;
3061 case 0x17: /* pop ss */ 3859 case 0x17: /* pop ss */
3062 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 3860 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
3063 break; 3861 break;
3064 case 0x18 ... 0x1d:
3065 sbb: /* sbb */
3066 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
3067 break;
3068 case 0x1e: /* push ds */ 3862 case 0x1e: /* push ds */
3069 emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); 3863 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
3070 break; 3864 break;
3071 case 0x1f: /* pop ds */ 3865 case 0x1f: /* pop ds */
3072 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 3866 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
3073 break; 3867 break;
3074 case 0x20 ... 0x25:
3075 and: /* and */
3076 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
3077 break;
3078 case 0x28 ... 0x2d:
3079 sub: /* sub */
3080 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
3081 break;
3082 case 0x30 ... 0x35:
3083 xor: /* xor */
3084 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
3085 break;
3086 case 0x38 ... 0x3d:
3087 cmp: /* cmp */
3088 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
3089 break;
3090 case 0x40 ... 0x47: /* inc r16/r32 */ 3868 case 0x40 ... 0x47: /* inc r16/r32 */
3091 emulate_1op("inc", c->dst, ctxt->eflags); 3869 emulate_1op("inc", c->dst, ctxt->eflags);
3092 break; 3870 break;
3093 case 0x48 ... 0x4f: /* dec r16/r32 */ 3871 case 0x48 ... 0x4f: /* dec r16/r32 */
3094 emulate_1op("dec", c->dst, ctxt->eflags); 3872 emulate_1op("dec", c->dst, ctxt->eflags);
3095 break; 3873 break;
3096 case 0x58 ... 0x5f: /* pop reg */
3097 pop_instruction:
3098 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
3099 break;
3100 case 0x60: /* pusha */
3101 rc = emulate_pusha(ctxt, ops);
3102 break;
3103 case 0x61: /* popa */
3104 rc = emulate_popa(ctxt, ops);
3105 break;
3106 case 0x63: /* movsxd */ 3874 case 0x63: /* movsxd */
3107 if (ctxt->mode != X86EMUL_MODE_PROT64) 3875 if (ctxt->mode != X86EMUL_MODE_PROT64)
3108 goto cannot_emulate; 3876 goto cannot_emulate;
@@ -3121,26 +3889,6 @@ special_insn:
3121 if (test_cc(c->b, ctxt->eflags)) 3889 if (test_cc(c->b, ctxt->eflags))
3122 jmp_rel(c, c->src.val); 3890 jmp_rel(c, c->src.val);
3123 break; 3891 break;
3124 case 0x80 ... 0x83: /* Grp1 */
3125 switch (c->modrm_reg) {
3126 case 0:
3127 goto add;
3128 case 1:
3129 goto or;
3130 case 2:
3131 goto adc;
3132 case 3:
3133 goto sbb;
3134 case 4:
3135 goto and;
3136 case 5:
3137 goto sub;
3138 case 6:
3139 goto xor;
3140 case 7:
3141 goto cmp;
3142 }
3143 break;
3144 case 0x84 ... 0x85: 3892 case 0x84 ... 0x85:
3145 test: 3893 test:
3146 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 3894 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
@@ -3162,7 +3910,7 @@ special_insn:
3162 rc = emulate_ud(ctxt); 3910 rc = emulate_ud(ctxt);
3163 goto done; 3911 goto done;
3164 } 3912 }
3165 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3913 c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
3166 break; 3914 break;
3167 case 0x8d: /* lea r16/r32, m */ 3915 case 0x8d: /* lea r16/r32, m */
3168 c->dst.val = c->src.addr.mem.ea; 3916 c->dst.val = c->src.addr.mem.ea;
@@ -3187,7 +3935,7 @@ special_insn:
3187 break; 3935 break;
3188 } 3936 }
3189 case 0x8f: /* pop (sole member of Grp1a) */ 3937 case 0x8f: /* pop (sole member of Grp1a) */
3190 rc = emulate_grp1a(ctxt, ops); 3938 rc = em_grp1a(ctxt);
3191 break; 3939 break;
3192 case 0x90 ... 0x97: /* nop / xchg reg, rax */ 3940 case 0x90 ... 0x97: /* nop / xchg reg, rax */
3193 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) 3941 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
@@ -3200,31 +3948,17 @@ special_insn:
3200 case 8: c->dst.val = (s32)c->dst.val; break; 3948 case 8: c->dst.val = (s32)c->dst.val; break;
3201 } 3949 }
3202 break; 3950 break;
3203 case 0x9c: /* pushf */
3204 c->src.val = (unsigned long) ctxt->eflags;
3205 emulate_push(ctxt, ops);
3206 break;
3207 case 0x9d: /* popf */
3208 c->dst.type = OP_REG;
3209 c->dst.addr.reg = &ctxt->eflags;
3210 c->dst.bytes = c->op_bytes;
3211 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
3212 break;
3213 case 0xa6 ... 0xa7: /* cmps */
3214 c->dst.type = OP_NONE; /* Disable writeback. */
3215 goto cmp;
3216 case 0xa8 ... 0xa9: /* test ax, imm */ 3951 case 0xa8 ... 0xa9: /* test ax, imm */
3217 goto test; 3952 goto test;
3218 case 0xae ... 0xaf: /* scas */
3219 goto cmp;
3220 case 0xc0 ... 0xc1: 3953 case 0xc0 ... 0xc1:
3221 emulate_grp2(ctxt); 3954 rc = em_grp2(ctxt);
3222 break; 3955 break;
3223 case 0xc3: /* ret */ 3956 case 0xc3: /* ret */
3224 c->dst.type = OP_REG; 3957 c->dst.type = OP_REG;
3225 c->dst.addr.reg = &c->eip; 3958 c->dst.addr.reg = &c->eip;
3226 c->dst.bytes = c->op_bytes; 3959 c->dst.bytes = c->op_bytes;
3227 goto pop_instruction; 3960 rc = em_pop(ctxt);
3961 break;
3228 case 0xc4: /* les */ 3962 case 0xc4: /* les */
3229 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); 3963 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
3230 break; 3964 break;
@@ -3252,11 +3986,11 @@ special_insn:
3252 rc = emulate_iret(ctxt, ops); 3986 rc = emulate_iret(ctxt, ops);
3253 break; 3987 break;
3254 case 0xd0 ... 0xd1: /* Grp2 */ 3988 case 0xd0 ... 0xd1: /* Grp2 */
3255 emulate_grp2(ctxt); 3989 rc = em_grp2(ctxt);
3256 break; 3990 break;
3257 case 0xd2 ... 0xd3: /* Grp2 */ 3991 case 0xd2 ... 0xd3: /* Grp2 */
3258 c->src.val = c->regs[VCPU_REGS_RCX]; 3992 c->src.val = c->regs[VCPU_REGS_RCX];
3259 emulate_grp2(ctxt); 3993 rc = em_grp2(ctxt);
3260 break; 3994 break;
3261 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ 3995 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
3262 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); 3996 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
@@ -3278,23 +4012,14 @@ special_insn:
3278 long int rel = c->src.val; 4012 long int rel = c->src.val;
3279 c->src.val = (unsigned long) c->eip; 4013 c->src.val = (unsigned long) c->eip;
3280 jmp_rel(c, rel); 4014 jmp_rel(c, rel);
3281 emulate_push(ctxt, ops); 4015 rc = em_push(ctxt);
3282 break; 4016 break;
3283 } 4017 }
3284 case 0xe9: /* jmp rel */ 4018 case 0xe9: /* jmp rel */
3285 goto jmp; 4019 goto jmp;
3286 case 0xea: { /* jmp far */ 4020 case 0xea: /* jmp far */
3287 unsigned short sel; 4021 rc = em_jmp_far(ctxt);
3288 jump_far:
3289 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
3290
3291 if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
3292 goto done;
3293
3294 c->eip = 0;
3295 memcpy(&c->eip, c->src.valptr, c->op_bytes);
3296 break; 4022 break;
3297 }
3298 case 0xeb: 4023 case 0xeb:
3299 jmp: /* jmp rel short */ 4024 jmp: /* jmp rel short */
3300 jmp_rel(c, c->src.val); 4025 jmp_rel(c, c->src.val);
@@ -3304,11 +4029,6 @@ special_insn:
3304 case 0xed: /* in (e/r)ax,dx */ 4029 case 0xed: /* in (e/r)ax,dx */
3305 c->src.val = c->regs[VCPU_REGS_RDX]; 4030 c->src.val = c->regs[VCPU_REGS_RDX];
3306 do_io_in: 4031 do_io_in:
3307 c->dst.bytes = min(c->dst.bytes, 4u);
3308 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
3309 rc = emulate_gp(ctxt, 0);
3310 goto done;
3311 }
3312 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 4032 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
3313 &c->dst.val)) 4033 &c->dst.val))
3314 goto done; /* IO is needed */ 4034 goto done; /* IO is needed */
@@ -3317,25 +4037,19 @@ special_insn:
3317 case 0xef: /* out dx,(e/r)ax */ 4037 case 0xef: /* out dx,(e/r)ax */
3318 c->dst.val = c->regs[VCPU_REGS_RDX]; 4038 c->dst.val = c->regs[VCPU_REGS_RDX];
3319 do_io_out: 4039 do_io_out:
3320 c->src.bytes = min(c->src.bytes, 4u); 4040 ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val,
3321 if (!emulator_io_permited(ctxt, ops, c->dst.val, 4041 &c->src.val, 1);
3322 c->src.bytes)) {
3323 rc = emulate_gp(ctxt, 0);
3324 goto done;
3325 }
3326 ops->pio_out_emulated(c->src.bytes, c->dst.val,
3327 &c->src.val, 1, ctxt->vcpu);
3328 c->dst.type = OP_NONE; /* Disable writeback. */ 4042 c->dst.type = OP_NONE; /* Disable writeback. */
3329 break; 4043 break;
3330 case 0xf4: /* hlt */ 4044 case 0xf4: /* hlt */
3331 ctxt->vcpu->arch.halt_request = 1; 4045 ctxt->ops->halt(ctxt);
3332 break; 4046 break;
3333 case 0xf5: /* cmc */ 4047 case 0xf5: /* cmc */
3334 /* complement carry flag from eflags reg */ 4048 /* complement carry flag from eflags reg */
3335 ctxt->eflags ^= EFLG_CF; 4049 ctxt->eflags ^= EFLG_CF;
3336 break; 4050 break;
3337 case 0xf6 ... 0xf7: /* Grp3 */ 4051 case 0xf6 ... 0xf7: /* Grp3 */
3338 rc = emulate_grp3(ctxt, ops); 4052 rc = em_grp3(ctxt);
3339 break; 4053 break;
3340 case 0xf8: /* clc */ 4054 case 0xf8: /* clc */
3341 ctxt->eflags &= ~EFLG_CF; 4055 ctxt->eflags &= ~EFLG_CF;
@@ -3366,13 +4080,11 @@ special_insn:
3366 ctxt->eflags |= EFLG_DF; 4080 ctxt->eflags |= EFLG_DF;
3367 break; 4081 break;
3368 case 0xfe: /* Grp4 */ 4082 case 0xfe: /* Grp4 */
3369 grp45: 4083 rc = em_grp45(ctxt);
3370 rc = emulate_grp45(ctxt, ops);
3371 break; 4084 break;
3372 case 0xff: /* Grp5 */ 4085 case 0xff: /* Grp5 */
3373 if (c->modrm_reg == 5) 4086 rc = em_grp45(ctxt);
3374 goto jump_far; 4087 break;
3375 goto grp45;
3376 default: 4088 default:
3377 goto cannot_emulate; 4089 goto cannot_emulate;
3378 } 4090 }
@@ -3381,7 +4093,7 @@ special_insn:
3381 goto done; 4093 goto done;
3382 4094
3383writeback: 4095writeback:
3384 rc = writeback(ctxt, ops); 4096 rc = writeback(ctxt);
3385 if (rc != X86EMUL_CONTINUE) 4097 if (rc != X86EMUL_CONTINUE)
3386 goto done; 4098 goto done;
3387 4099
@@ -3392,7 +4104,7 @@ writeback:
3392 c->dst.type = saved_dst_type; 4104 c->dst.type = saved_dst_type;
3393 4105
3394 if ((c->d & SrcMask) == SrcSI) 4106 if ((c->d & SrcMask) == SrcSI)
3395 string_addr_inc(ctxt, seg_override(ctxt, ops, c), 4107 string_addr_inc(ctxt, seg_override(ctxt, c),
3396 VCPU_REGS_RSI, &c->src); 4108 VCPU_REGS_RSI, &c->src);
3397 4109
3398 if ((c->d & DstMask) == DstDI) 4110 if ((c->d & DstMask) == DstDI)
@@ -3427,115 +4139,34 @@ writeback:
3427done: 4139done:
3428 if (rc == X86EMUL_PROPAGATE_FAULT) 4140 if (rc == X86EMUL_PROPAGATE_FAULT)
3429 ctxt->have_exception = true; 4141 ctxt->have_exception = true;
4142 if (rc == X86EMUL_INTERCEPTED)
4143 return EMULATION_INTERCEPTED;
4144
3430 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 4145 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3431 4146
3432twobyte_insn: 4147twobyte_insn:
3433 switch (c->b) { 4148 switch (c->b) {
3434 case 0x01: /* lgdt, lidt, lmsw */
3435 switch (c->modrm_reg) {
3436 u16 size;
3437 unsigned long address;
3438
3439 case 0: /* vmcall */
3440 if (c->modrm_mod != 3 || c->modrm_rm != 1)
3441 goto cannot_emulate;
3442
3443 rc = kvm_fix_hypercall(ctxt->vcpu);
3444 if (rc != X86EMUL_CONTINUE)
3445 goto done;
3446
3447 /* Let the processor re-execute the fixed hypercall */
3448 c->eip = ctxt->eip;
3449 /* Disable writeback. */
3450 c->dst.type = OP_NONE;
3451 break;
3452 case 2: /* lgdt */
3453 rc = read_descriptor(ctxt, ops, c->src.addr.mem,
3454 &size, &address, c->op_bytes);
3455 if (rc != X86EMUL_CONTINUE)
3456 goto done;
3457 realmode_lgdt(ctxt->vcpu, size, address);
3458 /* Disable writeback. */
3459 c->dst.type = OP_NONE;
3460 break;
3461 case 3: /* lidt/vmmcall */
3462 if (c->modrm_mod == 3) {
3463 switch (c->modrm_rm) {
3464 case 1:
3465 rc = kvm_fix_hypercall(ctxt->vcpu);
3466 break;
3467 default:
3468 goto cannot_emulate;
3469 }
3470 } else {
3471 rc = read_descriptor(ctxt, ops, c->src.addr.mem,
3472 &size, &address,
3473 c->op_bytes);
3474 if (rc != X86EMUL_CONTINUE)
3475 goto done;
3476 realmode_lidt(ctxt->vcpu, size, address);
3477 }
3478 /* Disable writeback. */
3479 c->dst.type = OP_NONE;
3480 break;
3481 case 4: /* smsw */
3482 c->dst.bytes = 2;
3483 c->dst.val = ops->get_cr(0, ctxt->vcpu);
3484 break;
3485 case 6: /* lmsw */
3486 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
3487 (c->src.val & 0x0f), ctxt->vcpu);
3488 c->dst.type = OP_NONE;
3489 break;
3490 case 5: /* not defined */
3491 emulate_ud(ctxt);
3492 rc = X86EMUL_PROPAGATE_FAULT;
3493 goto done;
3494 case 7: /* invlpg*/
3495 emulate_invlpg(ctxt->vcpu,
3496 linear(ctxt, c->src.addr.mem));
3497 /* Disable writeback. */
3498 c->dst.type = OP_NONE;
3499 break;
3500 default:
3501 goto cannot_emulate;
3502 }
3503 break;
3504 case 0x05: /* syscall */ 4149 case 0x05: /* syscall */
3505 rc = emulate_syscall(ctxt, ops); 4150 rc = emulate_syscall(ctxt, ops);
3506 break; 4151 break;
3507 case 0x06: 4152 case 0x06:
3508 emulate_clts(ctxt->vcpu); 4153 rc = em_clts(ctxt);
3509 break; 4154 break;
3510 case 0x09: /* wbinvd */ 4155 case 0x09: /* wbinvd */
3511 kvm_emulate_wbinvd(ctxt->vcpu); 4156 (ctxt->ops->wbinvd)(ctxt);
3512 break; 4157 break;
3513 case 0x08: /* invd */ 4158 case 0x08: /* invd */
3514 case 0x0d: /* GrpP (prefetch) */ 4159 case 0x0d: /* GrpP (prefetch) */
3515 case 0x18: /* Grp16 (prefetch/nop) */ 4160 case 0x18: /* Grp16 (prefetch/nop) */
3516 break; 4161 break;
3517 case 0x20: /* mov cr, reg */ 4162 case 0x20: /* mov cr, reg */
3518 switch (c->modrm_reg) { 4163 c->dst.val = ops->get_cr(ctxt, c->modrm_reg);
3519 case 1:
3520 case 5 ... 7:
3521 case 9 ... 15:
3522 emulate_ud(ctxt);
3523 rc = X86EMUL_PROPAGATE_FAULT;
3524 goto done;
3525 }
3526 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
3527 break; 4164 break;
3528 case 0x21: /* mov from dr to reg */ 4165 case 0x21: /* mov from dr to reg */
3529 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 4166 ops->get_dr(ctxt, c->modrm_reg, &c->dst.val);
3530 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3531 emulate_ud(ctxt);
3532 rc = X86EMUL_PROPAGATE_FAULT;
3533 goto done;
3534 }
3535 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
3536 break; 4167 break;
3537 case 0x22: /* mov reg, cr */ 4168 case 0x22: /* mov reg, cr */
3538 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { 4169 if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) {
3539 emulate_gp(ctxt, 0); 4170 emulate_gp(ctxt, 0);
3540 rc = X86EMUL_PROPAGATE_FAULT; 4171 rc = X86EMUL_PROPAGATE_FAULT;
3541 goto done; 4172 goto done;
@@ -3543,16 +4174,9 @@ twobyte_insn:
3543 c->dst.type = OP_NONE; 4174 c->dst.type = OP_NONE;
3544 break; 4175 break;
3545 case 0x23: /* mov from reg to dr */ 4176 case 0x23: /* mov from reg to dr */
3546 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 4177 if (ops->set_dr(ctxt, c->modrm_reg, c->src.val &
3547 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3548 emulate_ud(ctxt);
3549 rc = X86EMUL_PROPAGATE_FAULT;
3550 goto done;
3551 }
3552
3553 if (ops->set_dr(c->modrm_reg, c->src.val &
3554 ((ctxt->mode == X86EMUL_MODE_PROT64) ? 4178 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3555 ~0ULL : ~0U), ctxt->vcpu) < 0) { 4179 ~0ULL : ~0U)) < 0) {
3556 /* #UD condition is already handled by the code above */ 4180 /* #UD condition is already handled by the code above */
3557 emulate_gp(ctxt, 0); 4181 emulate_gp(ctxt, 0);
3558 rc = X86EMUL_PROPAGATE_FAULT; 4182 rc = X86EMUL_PROPAGATE_FAULT;
@@ -3565,7 +4189,7 @@ twobyte_insn:
3565 /* wrmsr */ 4189 /* wrmsr */
3566 msr_data = (u32)c->regs[VCPU_REGS_RAX] 4190 msr_data = (u32)c->regs[VCPU_REGS_RAX]
3567 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 4191 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3568 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 4192 if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) {
3569 emulate_gp(ctxt, 0); 4193 emulate_gp(ctxt, 0);
3570 rc = X86EMUL_PROPAGATE_FAULT; 4194 rc = X86EMUL_PROPAGATE_FAULT;
3571 goto done; 4195 goto done;
@@ -3574,7 +4198,7 @@ twobyte_insn:
3574 break; 4198 break;
3575 case 0x32: 4199 case 0x32:
3576 /* rdmsr */ 4200 /* rdmsr */
3577 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 4201 if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) {
3578 emulate_gp(ctxt, 0); 4202 emulate_gp(ctxt, 0);
3579 rc = X86EMUL_PROPAGATE_FAULT; 4203 rc = X86EMUL_PROPAGATE_FAULT;
3580 goto done; 4204 goto done;
@@ -3603,7 +4227,7 @@ twobyte_insn:
3603 c->dst.val = test_cc(c->b, ctxt->eflags); 4227 c->dst.val = test_cc(c->b, ctxt->eflags);
3604 break; 4228 break;
3605 case 0xa0: /* push fs */ 4229 case 0xa0: /* push fs */
3606 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); 4230 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3607 break; 4231 break;
3608 case 0xa1: /* pop fs */ 4232 case 0xa1: /* pop fs */
3609 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 4233 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3620,7 +4244,7 @@ twobyte_insn:
3620 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 4244 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
3621 break; 4245 break;
3622 case 0xa8: /* push gs */ 4246 case 0xa8: /* push gs */
3623 emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); 4247 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
3624 break; 4248 break;
3625 case 0xa9: /* pop gs */ 4249 case 0xa9: /* pop gs */
3626 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 4250 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
@@ -3727,7 +4351,7 @@ twobyte_insn:
3727 (u64) c->src.val; 4351 (u64) c->src.val;
3728 break; 4352 break;
3729 case 0xc7: /* Grp9 (cmpxchg8b) */ 4353 case 0xc7: /* Grp9 (cmpxchg8b) */
3730 rc = emulate_grp9(ctxt, ops); 4354 rc = em_grp9(ctxt);
3731 break; 4355 break;
3732 default: 4356 default:
3733 goto cannot_emulate; 4357 goto cannot_emulate;
@@ -3739,5 +4363,5 @@ twobyte_insn:
3739 goto writeback; 4363 goto writeback;
3740 4364
3741cannot_emulate: 4365cannot_emulate:
3742 return -1; 4366 return EMULATION_FAILED;
3743} 4367}
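The net effect of the emulate.c changes above is that helper return values are no longer dropped (every push/pop/group helper now feeds rc) and that the single exit path at "done:" translates the emulator's internal codes into results for the caller, including the new EMULATION_INTERCEPTED case, while "cannot_emulate" now returns EMULATION_FAILED instead of a bare -1. The standalone sketch below mirrors that mapping; the enum values are illustrative stand-ins, not the kernel's actual numeric constants.

        #include <stdio.h>

        enum { X86EMUL_CONTINUE, X86EMUL_UNHANDLEABLE,
               X86EMUL_PROPAGATE_FAULT, X86EMUL_INTERCEPTED };
        enum { EMULATION_OK, EMULATION_FAILED, EMULATION_INTERCEPTED };

        static int emulation_result(int rc, int *have_exception)
        {
                if (rc == X86EMUL_PROPAGATE_FAULT)
                        *have_exception = 1;            /* caller injects the fault */
                if (rc == X86EMUL_INTERCEPTED)
                        return EMULATION_INTERCEPTED;   /* nested hypervisor takes over */
                return rc == X86EMUL_UNHANDLEABLE ? EMULATION_FAILED : EMULATION_OK;
        }

        int main(void)
        {
                int have_exception = 0;

                printf("%d %d %d\n",
                       emulation_result(X86EMUL_CONTINUE, &have_exception),
                       emulation_result(X86EMUL_INTERCEPTED, &have_exception),
                       emulation_result(X86EMUL_UNHANDLEABLE, &have_exception));
                return 0;
        }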
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 46d08ca0b48f..51a97426e791 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -33,7 +33,6 @@ struct kvm_kpit_state {
33}; 33};
34 34
35struct kvm_pit { 35struct kvm_pit {
36 unsigned long base_addresss;
37 struct kvm_io_device dev; 36 struct kvm_io_device dev;
38 struct kvm_io_device speaker_dev; 37 struct kvm_io_device speaker_dev;
39 struct kvm *kvm; 38 struct kvm *kvm;
@@ -51,7 +50,6 @@ struct kvm_pit {
51#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 50#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
52#define KVM_PIT_CHANNEL_MASK 0x3 51#define KVM_PIT_CHANNEL_MASK 0x3
53 52
54void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
55void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); 53void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
56struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); 54struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
57void kvm_free_pit(struct kvm *kvm); 55void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index ba910d149410..53e2d084bffb 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -75,7 +75,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm);
75void kvm_destroy_pic(struct kvm *kvm); 75void kvm_destroy_pic(struct kvm *kvm);
76int kvm_pic_read_irq(struct kvm *kvm); 76int kvm_pic_read_irq(struct kvm *kvm);
77void kvm_pic_update_irq(struct kvm_pic *s); 77void kvm_pic_update_irq(struct kvm_pic *s);
78void kvm_pic_clear_isr_ack(struct kvm *kvm);
79 78
80static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) 79static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
81{ 80{
@@ -100,7 +99,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
100void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); 99void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
101void __kvm_migrate_timers(struct kvm_vcpu *vcpu); 100void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
102 101
103int pit_has_pending_timer(struct kvm_vcpu *vcpu);
104int apic_has_pending_timer(struct kvm_vcpu *vcpu); 102int apic_has_pending_timer(struct kvm_vcpu *vcpu);
105 103
106#endif 104#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 22fae7593ee7..bd14bb4c8594 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1206,7 +1206,7 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1206 1206
1207static void nonpaging_update_pte(struct kvm_vcpu *vcpu, 1207static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
1208 struct kvm_mmu_page *sp, u64 *spte, 1208 struct kvm_mmu_page *sp, u64 *spte,
1209 const void *pte, unsigned long mmu_seq) 1209 const void *pte)
1210{ 1210{
1211 WARN_ON(1); 1211 WARN_ON(1);
1212} 1212}
@@ -3163,9 +3163,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3163} 3163}
3164 3164
3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3166 struct kvm_mmu_page *sp, 3166 struct kvm_mmu_page *sp, u64 *spte,
3167 u64 *spte, 3167 const void *new)
3168 const void *new, unsigned long mmu_seq)
3169{ 3168{
3170 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 3169 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
3171 ++vcpu->kvm->stat.mmu_pde_zapped; 3170 ++vcpu->kvm->stat.mmu_pde_zapped;
@@ -3173,7 +3172,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3173 } 3172 }
3174 3173
3175 ++vcpu->kvm->stat.mmu_pte_updated; 3174 ++vcpu->kvm->stat.mmu_pte_updated;
3176 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq); 3175 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
3177} 3176}
3178 3177
3179static bool need_remote_flush(u64 old, u64 new) 3178static bool need_remote_flush(u64 old, u64 new)
@@ -3229,7 +3228,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3229 struct kvm_mmu_page *sp; 3228 struct kvm_mmu_page *sp;
3230 struct hlist_node *node; 3229 struct hlist_node *node;
3231 LIST_HEAD(invalid_list); 3230 LIST_HEAD(invalid_list);
3232 unsigned long mmu_seq;
3233 u64 entry, gentry, *spte; 3231 u64 entry, gentry, *spte;
3234 unsigned pte_size, page_offset, misaligned, quadrant, offset; 3232 unsigned pte_size, page_offset, misaligned, quadrant, offset;
3235 int level, npte, invlpg_counter, r, flooded = 0; 3233 int level, npte, invlpg_counter, r, flooded = 0;
@@ -3271,9 +3269,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3271 break; 3269 break;
3272 } 3270 }
3273 3271
3274 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3275 smp_rmb();
3276
3277 spin_lock(&vcpu->kvm->mmu_lock); 3272 spin_lock(&vcpu->kvm->mmu_lock);
3278 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) 3273 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3279 gentry = 0; 3274 gentry = 0;
@@ -3345,8 +3340,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3345 if (gentry && 3340 if (gentry &&
3346 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3341 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3347 & mask.word)) 3342 & mask.word))
3348 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry, 3343 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
3349 mmu_seq);
3350 if (!remote_flush && need_remote_flush(entry, *spte)) 3344 if (!remote_flush && need_remote_flush(entry, *spte))
3351 remote_flush = true; 3345 remote_flush = true;
3352 ++spte; 3346 ++spte;
@@ -3551,10 +3545,11 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3551 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 3545 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3552} 3546}
3553 3547
3554static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 3548static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3555{ 3549{
3556 struct kvm *kvm; 3550 struct kvm *kvm;
3557 struct kvm *kvm_freed = NULL; 3551 struct kvm *kvm_freed = NULL;
3552 int nr_to_scan = sc->nr_to_scan;
3558 3553
3559 if (nr_to_scan == 0) 3554 if (nr_to_scan == 0)
3560 goto out; 3555 goto out;
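The mmu_shrink() change tracks a tree-wide API change: shrinker callbacks now receive a struct shrink_control instead of a bare nr_to_scan/gfp_mask pair. Below is a minimal sketch of the new callback shape; the "demo_*" cache and its counters are invented for illustration and the code assumes the shrinker API of this kernel generation (.shrink callback, DEFAULT_SEEKS).

        #include <linux/mm.h>

        static int demo_cache_objects;          /* stand-in for a real object cache */

        static int demo_shrink(struct shrinker *shrink, struct shrink_control *sc)
        {
                int nr_to_scan = sc->nr_to_scan;

                if (nr_to_scan == 0)            /* query pass: only report the size */
                        return demo_cache_objects;

                /* reclaim pass: drop up to nr_to_scan entries, report what is left */
                demo_cache_objects -= min(nr_to_scan, demo_cache_objects);
                return demo_cache_objects;
        }

        static struct shrinker demo_shrinker = {
                .shrink = demo_shrink,
                .seeks  = DEFAULT_SEEKS,
        };

        /* registered once during init: register_shrinker(&demo_shrinker); */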
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index c6397795d865..6c4dc010c4cb 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -78,15 +78,19 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
78 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; 78 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
79} 79}
80 80
81static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, 81static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
82 gfn_t table_gfn, unsigned index, 82 pt_element_t __user *ptep_user, unsigned index,
83 pt_element_t orig_pte, pt_element_t new_pte) 83 pt_element_t orig_pte, pt_element_t new_pte)
84{ 84{
85 int npages;
85 pt_element_t ret; 86 pt_element_t ret;
86 pt_element_t *table; 87 pt_element_t *table;
87 struct page *page; 88 struct page *page;
88 89
89 page = gfn_to_page(kvm, table_gfn); 90 npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
91 /* Check if the user is doing something meaningless. */
92 if (unlikely(npages != 1))
93 return -EFAULT;
90 94
91 table = kmap_atomic(page, KM_USER0); 95 table = kmap_atomic(page, KM_USER0);
92 ret = CMPXCHG(&table[index], orig_pte, new_pte); 96 ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -117,6 +121,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
117 gva_t addr, u32 access) 121 gva_t addr, u32 access)
118{ 122{
119 pt_element_t pte; 123 pt_element_t pte;
124 pt_element_t __user *ptep_user;
120 gfn_t table_gfn; 125 gfn_t table_gfn;
121 unsigned index, pt_access, uninitialized_var(pte_access); 126 unsigned index, pt_access, uninitialized_var(pte_access);
122 gpa_t pte_gpa; 127 gpa_t pte_gpa;
@@ -152,6 +157,9 @@ walk:
152 pt_access = ACC_ALL; 157 pt_access = ACC_ALL;
153 158
154 for (;;) { 159 for (;;) {
160 gfn_t real_gfn;
161 unsigned long host_addr;
162
155 index = PT_INDEX(addr, walker->level); 163 index = PT_INDEX(addr, walker->level);
156 164
157 table_gfn = gpte_to_gfn(pte); 165 table_gfn = gpte_to_gfn(pte);
@@ -160,43 +168,64 @@ walk:
160 walker->table_gfn[walker->level - 1] = table_gfn; 168 walker->table_gfn[walker->level - 1] = table_gfn;
161 walker->pte_gpa[walker->level - 1] = pte_gpa; 169 walker->pte_gpa[walker->level - 1] = pte_gpa;
162 170
163 if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, 171 real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
164 offset, sizeof(pte), 172 PFERR_USER_MASK|PFERR_WRITE_MASK);
165 PFERR_USER_MASK|PFERR_WRITE_MASK)) { 173 if (unlikely(real_gfn == UNMAPPED_GVA)) {
174 present = false;
175 break;
176 }
177 real_gfn = gpa_to_gfn(real_gfn);
178
179 host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
180 if (unlikely(kvm_is_error_hva(host_addr))) {
181 present = false;
182 break;
183 }
184
185 ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
186 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
166 present = false; 187 present = false;
167 break; 188 break;
168 } 189 }
169 190
170 trace_kvm_mmu_paging_element(pte, walker->level); 191 trace_kvm_mmu_paging_element(pte, walker->level);
171 192
172 if (!is_present_gpte(pte)) { 193 if (unlikely(!is_present_gpte(pte))) {
173 present = false; 194 present = false;
174 break; 195 break;
175 } 196 }
176 197
177 if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { 198 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
199 walker->level))) {
178 rsvd_fault = true; 200 rsvd_fault = true;
179 break; 201 break;
180 } 202 }
181 203
182 if (write_fault && !is_writable_pte(pte)) 204 if (unlikely(write_fault && !is_writable_pte(pte)
183 if (user_fault || is_write_protection(vcpu)) 205 && (user_fault || is_write_protection(vcpu))))
184 eperm = true; 206 eperm = true;
185 207
186 if (user_fault && !(pte & PT_USER_MASK)) 208 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
187 eperm = true; 209 eperm = true;
188 210
189#if PTTYPE == 64 211#if PTTYPE == 64
190 if (fetch_fault && (pte & PT64_NX_MASK)) 212 if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
191 eperm = true; 213 eperm = true;
192#endif 214#endif
193 215
194 if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { 216 if (!eperm && !rsvd_fault
217 && unlikely(!(pte & PT_ACCESSED_MASK))) {
218 int ret;
195 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 219 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
196 sizeof(pte)); 220 sizeof(pte));
197 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 221 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
198 index, pte, pte|PT_ACCESSED_MASK)) 222 pte, pte|PT_ACCESSED_MASK);
223 if (unlikely(ret < 0)) {
224 present = false;
225 break;
226 } else if (ret)
199 goto walk; 227 goto walk;
228
200 mark_page_dirty(vcpu->kvm, table_gfn); 229 mark_page_dirty(vcpu->kvm, table_gfn);
201 pte |= PT_ACCESSED_MASK; 230 pte |= PT_ACCESSED_MASK;
202 } 231 }
@@ -241,17 +270,21 @@ walk:
241 --walker->level; 270 --walker->level;
242 } 271 }
243 272
244 if (!present || eperm || rsvd_fault) 273 if (unlikely(!present || eperm || rsvd_fault))
245 goto error; 274 goto error;
246 275
247 if (write_fault && !is_dirty_gpte(pte)) { 276 if (write_fault && unlikely(!is_dirty_gpte(pte))) {
248 bool ret; 277 int ret;
249 278
250 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 279 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
251 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 280 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
252 pte|PT_DIRTY_MASK); 281 pte, pte|PT_DIRTY_MASK);
253 if (ret) 282 if (unlikely(ret < 0)) {
283 present = false;
284 goto error;
285 } else if (ret)
254 goto walk; 286 goto walk;
287
255 mark_page_dirty(vcpu->kvm, table_gfn); 288 mark_page_dirty(vcpu->kvm, table_gfn);
256 pte |= PT_DIRTY_MASK; 289 pte |= PT_DIRTY_MASK;
257 walker->ptes[walker->level - 1] = pte; 290 walker->ptes[walker->level - 1] = pte;
@@ -325,7 +358,7 @@ no_present:
325} 358}
326 359
327static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 360static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
328 u64 *spte, const void *pte, unsigned long mmu_seq) 361 u64 *spte, const void *pte)
329{ 362{
330 pt_element_t gpte; 363 pt_element_t gpte;
331 unsigned pte_access; 364 unsigned pte_access;
@@ -342,8 +375,6 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
342 kvm_release_pfn_clean(pfn); 375 kvm_release_pfn_clean(pfn);
343 return; 376 return;
344 } 377 }
345 if (mmu_notifier_retry(vcpu, mmu_seq))
346 return;
347 378
348 /* 379 /*
349 * we call mmu_set_spte() with host_writable = true because that 380 * we call mmu_set_spte() with host_writable = true because that
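cmpxchg_gpte() no longer maps the guest page table page via gfn_to_page(); the walker now carries the host user-space address of the guest PTE (ptep_user), pins it with get_user_pages_fast(), and performs the compare-and-exchange through a kmap of the pinned page. The fragment below is a reduced, self-contained version of that pattern for a naturally aligned unsigned long slot at a user address; the name update_user_long is invented and error handling is trimmed to the essentials.

        #include <linux/mm.h>
        #include <linux/highmem.h>

        /*
         * Atomically replace *uptr with 'new' if it still holds 'old', going
         * through a pinned page rather than copy_to_user() so a concurrent
         * update of the same slot is not silently overwritten.
         * Returns -EFAULT if the address is not writable, 1 on success,
         * 0 if the compare failed (caller retries).
         */
        static int update_user_long(unsigned long __user *uptr,
                                    unsigned long old, unsigned long new)
        {
                struct page *page;
                unsigned long *slot;
                unsigned long seen;
                void *kaddr;

                if (get_user_pages_fast((unsigned long)uptr, 1, 1, &page) != 1)
                        return -EFAULT;

                kaddr = kmap_atomic(page, KM_USER0);
                /* the slot is assumed naturally aligned, so it cannot span pages */
                slot  = kaddr + ((unsigned long)uptr & ~PAGE_MASK);
                seen  = cmpxchg(slot, old, new);
                kunmap_atomic(kaddr, KM_USER0);
                put_page(page);

                return seen == old;
        }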
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6bb15d583e47..506e4fe23adc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -63,6 +63,10 @@ MODULE_LICENSE("GPL");
63 63
64#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 64#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
65 65
66#define TSC_RATIO_RSVD 0xffffff0000000000ULL
67#define TSC_RATIO_MIN 0x0000000000000001ULL
68#define TSC_RATIO_MAX 0x000000ffffffffffULL
69
66static bool erratum_383_found __read_mostly; 70static bool erratum_383_found __read_mostly;
67 71
68static const u32 host_save_user_msrs[] = { 72static const u32 host_save_user_msrs[] = {
@@ -93,14 +97,6 @@ struct nested_state {
93 /* A VMEXIT is required but not yet emulated */ 97 /* A VMEXIT is required but not yet emulated */
94 bool exit_required; 98 bool exit_required;
95 99
96 /*
97 * If we vmexit during an instruction emulation we need this to restore
98 * the l1 guest rip after the emulation
99 */
100 unsigned long vmexit_rip;
101 unsigned long vmexit_rsp;
102 unsigned long vmexit_rax;
103
104 /* cache for intercepts of the guest */ 100 /* cache for intercepts of the guest */
105 u32 intercept_cr; 101 u32 intercept_cr;
106 u32 intercept_dr; 102 u32 intercept_dr;
@@ -144,8 +140,13 @@ struct vcpu_svm {
144 unsigned int3_injected; 140 unsigned int3_injected;
145 unsigned long int3_rip; 141 unsigned long int3_rip;
146 u32 apf_reason; 142 u32 apf_reason;
143
144 u64 tsc_ratio;
147}; 145};
148 146
147static DEFINE_PER_CPU(u64, current_tsc_ratio);
148#define TSC_RATIO_DEFAULT 0x0100000000ULL
149
149#define MSR_INVALID 0xffffffffU 150#define MSR_INVALID 0xffffffffU
150 151
151static struct svm_direct_access_msrs { 152static struct svm_direct_access_msrs {
@@ -190,6 +191,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm);
190static int nested_svm_vmexit(struct vcpu_svm *svm); 191static int nested_svm_vmexit(struct vcpu_svm *svm);
191static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 192static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
192 bool has_error_code, u32 error_code); 193 bool has_error_code, u32 error_code);
194static u64 __scale_tsc(u64 ratio, u64 tsc);
193 195
194enum { 196enum {
195 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, 197 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
@@ -376,7 +378,6 @@ struct svm_cpu_data {
376}; 378};
377 379
378static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); 380static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
379static uint32_t svm_features;
380 381
381struct svm_init_data { 382struct svm_init_data {
382 int cpu; 383 int cpu;
@@ -569,6 +570,10 @@ static int has_svm(void)
569 570
570static void svm_hardware_disable(void *garbage) 571static void svm_hardware_disable(void *garbage)
571{ 572{
573 /* Make sure we clean up behind us */
574 if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
575 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
576
572 cpu_svm_disable(); 577 cpu_svm_disable();
573} 578}
574 579
@@ -610,6 +615,11 @@ static int svm_hardware_enable(void *garbage)
610 615
611 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); 616 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
612 617
618 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
619 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
620 __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
621 }
622
613 svm_init_erratum_383(); 623 svm_init_erratum_383();
614 624
615 return 0; 625 return 0;
@@ -791,6 +801,23 @@ static __init int svm_hardware_setup(void)
791 if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) 801 if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
792 kvm_enable_efer_bits(EFER_FFXSR); 802 kvm_enable_efer_bits(EFER_FFXSR);
793 803
804 if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
805 u64 max;
806
807 kvm_has_tsc_control = true;
808
809 /*
810 * Make sure the user can only configure tsc_khz values that
811 * fit into a signed integer.
812 * A min value is not calculated needed because it will always
 812 * A min value is not needed because it will always
813 * be 1 on all machines and a value of 0 is used to disable
814 * tsc-scaling for the vcpu.
815 */
816 max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
817
818 kvm_max_guest_tsc_khz = max;
819 }
820
794 if (nested) { 821 if (nested) {
795 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 822 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
796 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 823 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
@@ -802,8 +829,6 @@ static __init int svm_hardware_setup(void)
802 goto err; 829 goto err;
803 } 830 }
804 831
805 svm_features = cpuid_edx(SVM_CPUID_FUNC);
806
807 if (!boot_cpu_has(X86_FEATURE_NPT)) 832 if (!boot_cpu_has(X86_FEATURE_NPT))
808 npt_enabled = false; 833 npt_enabled = false;
809 834
@@ -854,6 +879,64 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
854 seg->base = 0; 879 seg->base = 0;
855} 880}
856 881
882static u64 __scale_tsc(u64 ratio, u64 tsc)
883{
884 u64 mult, frac, _tsc;
885
886 mult = ratio >> 32;
887 frac = ratio & ((1ULL << 32) - 1);
888
889 _tsc = tsc;
890 _tsc *= mult;
891 _tsc += (tsc >> 32) * frac;
892 _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
893
894 return _tsc;
895}
896
897static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
898{
899 struct vcpu_svm *svm = to_svm(vcpu);
900 u64 _tsc = tsc;
901
902 if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
903 _tsc = __scale_tsc(svm->tsc_ratio, tsc);
904
905 return _tsc;
906}
907
908static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
909{
910 struct vcpu_svm *svm = to_svm(vcpu);
911 u64 ratio;
912 u64 khz;
913
914 /* TSC scaling supported? */
915 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
916 return;
917
918 /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
919 if (user_tsc_khz == 0) {
920 vcpu->arch.virtual_tsc_khz = 0;
921 svm->tsc_ratio = TSC_RATIO_DEFAULT;
922 return;
923 }
924
925 khz = user_tsc_khz;
926
927 /* TSC scaling required - calculate ratio */
928 ratio = khz << 32;
929 do_div(ratio, tsc_khz);
930
931 if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
932 WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
933 user_tsc_khz);
934 return;
935 }
936 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
937 svm->tsc_ratio = ratio;
938}
939
857static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 940static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
858{ 941{
859 struct vcpu_svm *svm = to_svm(vcpu); 942 struct vcpu_svm *svm = to_svm(vcpu);
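__scale_tsc() multiplies a 64-bit TSC value by an 8.32 fixed-point ratio (integer part in bits 39:32, fraction in bits 31:0) without overflowing 64 bits by splitting the TSC into high and low halves, and svm_set_tsc_khz() builds that ratio as (guest_khz << 32) / host_khz. The user-space program below reproduces the arithmetic with made-up host and guest frequencies so the split multiplication can be checked against a plain division.

        #include <stdio.h>
        #include <stdint.h>

        /* Same shape as __scale_tsc(): 8.32 fixed-point multiply without overflow. */
        static uint64_t scale_tsc(uint64_t ratio, uint64_t tsc)
        {
                uint64_t mult = ratio >> 32;            /* integer part of the ratio      */
                uint64_t frac = ratio & 0xffffffffULL;  /* fractional part, 1/2^32 units  */
                uint64_t out;

                out  = tsc * mult;
                out += (tsc >> 32) * frac;
                out += ((tsc & 0xffffffffULL) * frac) >> 32;
                return out;
        }

        int main(void)
        {
                uint64_t host_khz  = 2400000;           /* assumed host TSC: 2.4 GHz       */
                uint64_t guest_khz = 1600000;           /* requested guest TSC: 1.6 GHz    */
                uint64_t ratio = (guest_khz << 32) / host_khz;
                uint64_t tsc = 10000000000ULL;          /* arbitrary raw TSC reading       */

                printf("ratio      = %#llx\n", (unsigned long long)ratio);
                printf("scaled tsc = %llu (plain division gives %llu)\n",
                       (unsigned long long)scale_tsc(ratio, tsc),
                       (unsigned long long)(tsc * guest_khz / host_khz));
                return 0;
        }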
@@ -880,6 +963,15 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
880 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 963 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
881} 964}
882 965
966static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
967{
968 u64 tsc;
969
970 tsc = svm_scale_tsc(vcpu, native_read_tsc());
971
972 return target_tsc - tsc;
973}
974
883static void init_vmcb(struct vcpu_svm *svm) 975static void init_vmcb(struct vcpu_svm *svm)
884{ 976{
885 struct vmcb_control_area *control = &svm->vmcb->control; 977 struct vmcb_control_area *control = &svm->vmcb->control;
@@ -975,7 +1067,7 @@ static void init_vmcb(struct vcpu_svm *svm)
975 svm_set_efer(&svm->vcpu, 0); 1067 svm_set_efer(&svm->vcpu, 0);
976 save->dr6 = 0xffff0ff0; 1068 save->dr6 = 0xffff0ff0;
977 save->dr7 = 0x400; 1069 save->dr7 = 0x400;
978 save->rflags = 2; 1070 kvm_set_rflags(&svm->vcpu, 2);
979 save->rip = 0x0000fff0; 1071 save->rip = 0x0000fff0;
980 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 1072 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
981 1073
@@ -1048,6 +1140,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1048 goto out; 1140 goto out;
1049 } 1141 }
1050 1142
1143 svm->tsc_ratio = TSC_RATIO_DEFAULT;
1144
1051 err = kvm_vcpu_init(&svm->vcpu, kvm, id); 1145 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1052 if (err) 1146 if (err)
1053 goto free_svm; 1147 goto free_svm;
@@ -1141,6 +1235,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1141 1235
1142 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1236 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1143 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1237 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1238
1239 if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
1240 svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) {
1241 __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
1242 wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
1243 }
1144} 1244}
1145 1245
1146static void svm_vcpu_put(struct kvm_vcpu *vcpu) 1246static void svm_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1365,31 +1465,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1365{ 1465{
1366 struct vcpu_svm *svm = to_svm(vcpu); 1466 struct vcpu_svm *svm = to_svm(vcpu);
1367 1467
1368 if (is_guest_mode(vcpu)) {
1369 /*
1370 * We are here because we run in nested mode, the host kvm
1371 * intercepts cr0 writes but the l1 hypervisor does not.
1372 * But the L1 hypervisor may intercept selective cr0 writes.
1373 * This needs to be checked here.
1374 */
1375 unsigned long old, new;
1376
1377 /* Remove bits that would trigger a real cr0 write intercept */
1378 old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
1379 new = cr0 & SVM_CR0_SELECTIVE_MASK;
1380
1381 if (old == new) {
1382 /* cr0 write with ts and mp unchanged */
1383 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1384 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
1385 svm->nested.vmexit_rip = kvm_rip_read(vcpu);
1386 svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
1387 svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
1388 return;
1389 }
1390 }
1391 }
1392
1393#ifdef CONFIG_X86_64 1468#ifdef CONFIG_X86_64
1394 if (vcpu->arch.efer & EFER_LME) { 1469 if (vcpu->arch.efer & EFER_LME) {
1395 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1470 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -2127,7 +2202,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2127 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); 2202 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
2128 nested_vmcb->save.cr2 = vmcb->save.cr2; 2203 nested_vmcb->save.cr2 = vmcb->save.cr2;
2129 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; 2204 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
2130 nested_vmcb->save.rflags = vmcb->save.rflags; 2205 nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
2131 nested_vmcb->save.rip = vmcb->save.rip; 2206 nested_vmcb->save.rip = vmcb->save.rip;
2132 nested_vmcb->save.rsp = vmcb->save.rsp; 2207 nested_vmcb->save.rsp = vmcb->save.rsp;
2133 nested_vmcb->save.rax = vmcb->save.rax; 2208 nested_vmcb->save.rax = vmcb->save.rax;
@@ -2184,7 +2259,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2184 svm->vmcb->save.ds = hsave->save.ds; 2259 svm->vmcb->save.ds = hsave->save.ds;
2185 svm->vmcb->save.gdtr = hsave->save.gdtr; 2260 svm->vmcb->save.gdtr = hsave->save.gdtr;
2186 svm->vmcb->save.idtr = hsave->save.idtr; 2261 svm->vmcb->save.idtr = hsave->save.idtr;
2187 svm->vmcb->save.rflags = hsave->save.rflags; 2262 kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
2188 svm_set_efer(&svm->vcpu, hsave->save.efer); 2263 svm_set_efer(&svm->vcpu, hsave->save.efer);
2189 svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); 2264 svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
2190 svm_set_cr4(&svm->vcpu, hsave->save.cr4); 2265 svm_set_cr4(&svm->vcpu, hsave->save.cr4);
@@ -2312,7 +2387,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2312 hsave->save.efer = svm->vcpu.arch.efer; 2387 hsave->save.efer = svm->vcpu.arch.efer;
2313 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); 2388 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
2314 hsave->save.cr4 = svm->vcpu.arch.cr4; 2389 hsave->save.cr4 = svm->vcpu.arch.cr4;
2315 hsave->save.rflags = vmcb->save.rflags; 2390 hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2316 hsave->save.rip = kvm_rip_read(&svm->vcpu); 2391 hsave->save.rip = kvm_rip_read(&svm->vcpu);
2317 hsave->save.rsp = vmcb->save.rsp; 2392 hsave->save.rsp = vmcb->save.rsp;
2318 hsave->save.rax = vmcb->save.rax; 2393 hsave->save.rax = vmcb->save.rax;
@@ -2323,7 +2398,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2323 2398
2324 copy_vmcb_control_area(hsave, vmcb); 2399 copy_vmcb_control_area(hsave, vmcb);
2325 2400
2326 if (svm->vmcb->save.rflags & X86_EFLAGS_IF) 2401 if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2327 svm->vcpu.arch.hflags |= HF_HIF_MASK; 2402 svm->vcpu.arch.hflags |= HF_HIF_MASK;
2328 else 2403 else
2329 svm->vcpu.arch.hflags &= ~HF_HIF_MASK; 2404 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
@@ -2341,7 +2416,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2341 svm->vmcb->save.ds = nested_vmcb->save.ds; 2416 svm->vmcb->save.ds = nested_vmcb->save.ds;
2342 svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; 2417 svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2343 svm->vmcb->save.idtr = nested_vmcb->save.idtr; 2418 svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2344 svm->vmcb->save.rflags = nested_vmcb->save.rflags; 2419 kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
2345 svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); 2420 svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2346 svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); 2421 svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2347 svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); 2422 svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
@@ -2443,13 +2518,13 @@ static int vmload_interception(struct vcpu_svm *svm)
2443 if (nested_svm_check_permissions(svm)) 2518 if (nested_svm_check_permissions(svm))
2444 return 1; 2519 return 1;
2445 2520
2446 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2447 skip_emulated_instruction(&svm->vcpu);
2448
2449 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 2521 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2450 if (!nested_vmcb) 2522 if (!nested_vmcb)
2451 return 1; 2523 return 1;
2452 2524
2525 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2526 skip_emulated_instruction(&svm->vcpu);
2527
2453 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 2528 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2454 nested_svm_unmap(page); 2529 nested_svm_unmap(page);
2455 2530
@@ -2464,13 +2539,13 @@ static int vmsave_interception(struct vcpu_svm *svm)
2464 if (nested_svm_check_permissions(svm)) 2539 if (nested_svm_check_permissions(svm))
2465 return 1; 2540 return 1;
2466 2541
2467 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2468 skip_emulated_instruction(&svm->vcpu);
2469
2470 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 2542 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2471 if (!nested_vmcb) 2543 if (!nested_vmcb)
2472 return 1; 2544 return 1;
2473 2545
2546 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2547 skip_emulated_instruction(&svm->vcpu);
2548
2474 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 2549 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2475 nested_svm_unmap(page); 2550 nested_svm_unmap(page);
2476 2551
@@ -2676,6 +2751,29 @@ static int emulate_on_interception(struct vcpu_svm *svm)
2676 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; 2751 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2677} 2752}
2678 2753
2754bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
2755{
2756 unsigned long cr0 = svm->vcpu.arch.cr0;
2757 bool ret = false;
2758 u64 intercept;
2759
2760 intercept = svm->nested.intercept;
2761
2762 if (!is_guest_mode(&svm->vcpu) ||
2763 (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
2764 return false;
2765
2766 cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2767 val &= ~SVM_CR0_SELECTIVE_MASK;
2768
2769 if (cr0 ^ val) {
2770 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2771 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2772 }
2773
2774 return ret;
2775}
2776
2679#define CR_VALID (1ULL << 63) 2777#define CR_VALID (1ULL << 63)
2680 2778
2681static int cr_interception(struct vcpu_svm *svm) 2779static int cr_interception(struct vcpu_svm *svm)
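check_selective_cr0_intercepted() masks CR0.TS and CR0.MP out of both the current and the proposed CR0 value; only when some other bit changes does the nested SVM_EXIT_CR0_SEL_WRITE exit fire, matching the semantics of SVM's selective CR0 write intercept. The small user-space demo below exercises that masking; the bit positions are the architectural CR0 bits, and SVM_CR0_SELECTIVE_MASK is assumed to cover TS|MP as in the kernel headers.

        #include <stdio.h>

        #define CR0_MP (1UL << 1)
        #define CR0_TS (1UL << 3)
        #define CR0_PG (1UL << 31)
        #define CR0_SELECTIVE_MASK (CR0_TS | CR0_MP)    /* bits ignored by the intercept */

        /* Mirrors the masking in check_selective_cr0_intercepted(): the selective
         * CR0 write intercept fires only when a bit other than TS/MP changes. */
        static int selective_cr0_write_intercepted(unsigned long old_cr0,
                                                   unsigned long new_cr0)
        {
                old_cr0 &= ~CR0_SELECTIVE_MASK;
                new_cr0 &= ~CR0_SELECTIVE_MASK;
                return (old_cr0 ^ new_cr0) != 0;
        }

        int main(void)
        {
                unsigned long cr0 = 0x80000011UL;       /* PG | ET | PE, for example */

                printf("toggle TS only:   %d\n",
                       selective_cr0_write_intercepted(cr0, cr0 ^ CR0_TS));
                printf("clear PG as well: %d\n",
                       selective_cr0_write_intercepted(cr0, (cr0 ^ CR0_TS) & ~CR0_PG));
                return 0;
        }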
@@ -2699,7 +2797,11 @@ static int cr_interception(struct vcpu_svm *svm)
2699 val = kvm_register_read(&svm->vcpu, reg); 2797 val = kvm_register_read(&svm->vcpu, reg);
2700 switch (cr) { 2798 switch (cr) {
2701 case 0: 2799 case 0:
2702 err = kvm_set_cr0(&svm->vcpu, val); 2800 if (!check_selective_cr0_intercepted(svm, val))
2801 err = kvm_set_cr0(&svm->vcpu, val);
2802 else
2803 return 1;
2804
2703 break; 2805 break;
2704 case 3: 2806 case 3:
2705 err = kvm_set_cr3(&svm->vcpu, val); 2807 err = kvm_set_cr3(&svm->vcpu, val);
@@ -2744,23 +2846,6 @@ static int cr_interception(struct vcpu_svm *svm)
2744 return 1; 2846 return 1;
2745} 2847}
2746 2848
2747static int cr0_write_interception(struct vcpu_svm *svm)
2748{
2749 struct kvm_vcpu *vcpu = &svm->vcpu;
2750 int r;
2751
2752 r = cr_interception(svm);
2753
2754 if (svm->nested.vmexit_rip) {
2755 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
2756 kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
2757 kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
2758 svm->nested.vmexit_rip = 0;
2759 }
2760
2761 return r;
2762}
2763
2764static int dr_interception(struct vcpu_svm *svm) 2849static int dr_interception(struct vcpu_svm *svm)
2765{ 2850{
2766 int reg, dr; 2851 int reg, dr;
@@ -2813,7 +2898,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2813 case MSR_IA32_TSC: { 2898 case MSR_IA32_TSC: {
2814 struct vmcb *vmcb = get_host_vmcb(svm); 2899 struct vmcb *vmcb = get_host_vmcb(svm);
2815 2900
2816 *data = vmcb->control.tsc_offset + native_read_tsc(); 2901 *data = vmcb->control.tsc_offset +
2902 svm_scale_tsc(vcpu, native_read_tsc());
2903
2817 break; 2904 break;
2818 } 2905 }
2819 case MSR_STAR: 2906 case MSR_STAR:
@@ -3048,7 +3135,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
3048 [SVM_EXIT_READ_CR4] = cr_interception, 3135 [SVM_EXIT_READ_CR4] = cr_interception,
3049 [SVM_EXIT_READ_CR8] = cr_interception, 3136 [SVM_EXIT_READ_CR8] = cr_interception,
3050 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 3137 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
3051 [SVM_EXIT_WRITE_CR0] = cr0_write_interception, 3138 [SVM_EXIT_WRITE_CR0] = cr_interception,
3052 [SVM_EXIT_WRITE_CR3] = cr_interception, 3139 [SVM_EXIT_WRITE_CR3] = cr_interception,
3053 [SVM_EXIT_WRITE_CR4] = cr_interception, 3140 [SVM_EXIT_WRITE_CR4] = cr_interception,
3054 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3141 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
@@ -3104,97 +3191,109 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
3104 [SVM_EXIT_NPF] = pf_interception, 3191 [SVM_EXIT_NPF] = pf_interception,
3105}; 3192};
3106 3193
3107void dump_vmcb(struct kvm_vcpu *vcpu) 3194static void dump_vmcb(struct kvm_vcpu *vcpu)
3108{ 3195{
3109 struct vcpu_svm *svm = to_svm(vcpu); 3196 struct vcpu_svm *svm = to_svm(vcpu);
3110 struct vmcb_control_area *control = &svm->vmcb->control; 3197 struct vmcb_control_area *control = &svm->vmcb->control;
3111 struct vmcb_save_area *save = &svm->vmcb->save; 3198 struct vmcb_save_area *save = &svm->vmcb->save;
3112 3199
3113 pr_err("VMCB Control Area:\n"); 3200 pr_err("VMCB Control Area:\n");
3114 pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff); 3201 pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
3115 pr_err("cr_write: %04x\n", control->intercept_cr >> 16); 3202 pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
3116 pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff); 3203 pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
3117 pr_err("dr_write: %04x\n", control->intercept_dr >> 16); 3204 pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
3118 pr_err("exceptions: %08x\n", control->intercept_exceptions); 3205 pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
3119 pr_err("intercepts: %016llx\n", control->intercept); 3206 pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
3120 pr_err("pause filter count: %d\n", control->pause_filter_count); 3207 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3121 pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); 3208 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3122 pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); 3209 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3123 pr_err("tsc_offset: %016llx\n", control->tsc_offset); 3210 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3124 pr_err("asid: %d\n", control->asid); 3211 pr_err("%-20s%d\n", "asid:", control->asid);
3125 pr_err("tlb_ctl: %d\n", control->tlb_ctl); 3212 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3126 pr_err("int_ctl: %08x\n", control->int_ctl); 3213 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3127 pr_err("int_vector: %08x\n", control->int_vector); 3214 pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3128 pr_err("int_state: %08x\n", control->int_state); 3215 pr_err("%-20s%08x\n", "int_state:", control->int_state);
3129 pr_err("exit_code: %08x\n", control->exit_code); 3216 pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3130 pr_err("exit_info1: %016llx\n", control->exit_info_1); 3217 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3131 pr_err("exit_info2: %016llx\n", control->exit_info_2); 3218 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3132 pr_err("exit_int_info: %08x\n", control->exit_int_info); 3219 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3133 pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); 3220 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3134 pr_err("nested_ctl: %lld\n", control->nested_ctl); 3221 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3135 pr_err("nested_cr3: %016llx\n", control->nested_cr3); 3222 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3136 pr_err("event_inj: %08x\n", control->event_inj); 3223 pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3137 pr_err("event_inj_err: %08x\n", control->event_inj_err); 3224 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3138 pr_err("lbr_ctl: %lld\n", control->lbr_ctl); 3225 pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
3139 pr_err("next_rip: %016llx\n", control->next_rip); 3226 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3140 pr_err("VMCB State Save Area:\n"); 3227 pr_err("VMCB State Save Area:\n");
3141 pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", 3228 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3142 save->es.selector, save->es.attrib, 3229 "es:",
3143 save->es.limit, save->es.base); 3230 save->es.selector, save->es.attrib,
3144 pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", 3231 save->es.limit, save->es.base);
3145 save->cs.selector, save->cs.attrib, 3232 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3146 save->cs.limit, save->cs.base); 3233 "cs:",
3147 pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", 3234 save->cs.selector, save->cs.attrib,
3148 save->ss.selector, save->ss.attrib, 3235 save->cs.limit, save->cs.base);
3149 save->ss.limit, save->ss.base); 3236 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3150 pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", 3237 "ss:",
3151 save->ds.selector, save->ds.attrib, 3238 save->ss.selector, save->ss.attrib,
3152 save->ds.limit, save->ds.base); 3239 save->ss.limit, save->ss.base);
3153 pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", 3240 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3154 save->fs.selector, save->fs.attrib, 3241 "ds:",
3155 save->fs.limit, save->fs.base); 3242 save->ds.selector, save->ds.attrib,
3156 pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", 3243 save->ds.limit, save->ds.base);
3157 save->gs.selector, save->gs.attrib, 3244 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3158 save->gs.limit, save->gs.base); 3245 "fs:",
3159 pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", 3246 save->fs.selector, save->fs.attrib,
3160 save->gdtr.selector, save->gdtr.attrib, 3247 save->fs.limit, save->fs.base);
3161 save->gdtr.limit, save->gdtr.base); 3248 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3162 pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", 3249 "gs:",
3163 save->ldtr.selector, save->ldtr.attrib, 3250 save->gs.selector, save->gs.attrib,
3164 save->ldtr.limit, save->ldtr.base); 3251 save->gs.limit, save->gs.base);
3165 pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", 3252 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3166 save->idtr.selector, save->idtr.attrib, 3253 "gdtr:",
3167 save->idtr.limit, save->idtr.base); 3254 save->gdtr.selector, save->gdtr.attrib,
3168 pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", 3255 save->gdtr.limit, save->gdtr.base);
3169 save->tr.selector, save->tr.attrib, 3256 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3170 save->tr.limit, save->tr.base); 3257 "ldtr:",
3258 save->ldtr.selector, save->ldtr.attrib,
3259 save->ldtr.limit, save->ldtr.base);
3260 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3261 "idtr:",
3262 save->idtr.selector, save->idtr.attrib,
3263 save->idtr.limit, save->idtr.base);
3264 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3265 "tr:",
3266 save->tr.selector, save->tr.attrib,
3267 save->tr.limit, save->tr.base);
3171 pr_err("cpl: %d efer: %016llx\n", 3268 pr_err("cpl: %d efer: %016llx\n",
3172 save->cpl, save->efer); 3269 save->cpl, save->efer);
3173 pr_err("cr0: %016llx cr2: %016llx\n", 3270 pr_err("%-15s %016llx %-13s %016llx\n",
3174 save->cr0, save->cr2); 3271 "cr0:", save->cr0, "cr2:", save->cr2);
3175 pr_err("cr3: %016llx cr4: %016llx\n", 3272 pr_err("%-15s %016llx %-13s %016llx\n",
3176 save->cr3, save->cr4); 3273 "cr3:", save->cr3, "cr4:", save->cr4);
3177 pr_err("dr6: %016llx dr7: %016llx\n", 3274 pr_err("%-15s %016llx %-13s %016llx\n",
3178 save->dr6, save->dr7); 3275 "dr6:", save->dr6, "dr7:", save->dr7);
3179 pr_err("rip: %016llx rflags: %016llx\n", 3276 pr_err("%-15s %016llx %-13s %016llx\n",
3180 save->rip, save->rflags); 3277 "rip:", save->rip, "rflags:", save->rflags);
3181 pr_err("rsp: %016llx rax: %016llx\n", 3278 pr_err("%-15s %016llx %-13s %016llx\n",
3182 save->rsp, save->rax); 3279 "rsp:", save->rsp, "rax:", save->rax);
3183 pr_err("star: %016llx lstar: %016llx\n", 3280 pr_err("%-15s %016llx %-13s %016llx\n",
3184 save->star, save->lstar); 3281 "star:", save->star, "lstar:", save->lstar);
3185 pr_err("cstar: %016llx sfmask: %016llx\n", 3282 pr_err("%-15s %016llx %-13s %016llx\n",
3186 save->cstar, save->sfmask); 3283 "cstar:", save->cstar, "sfmask:", save->sfmask);
3187 pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", 3284 pr_err("%-15s %016llx %-13s %016llx\n",
3188 save->kernel_gs_base, save->sysenter_cs); 3285 "kernel_gs_base:", save->kernel_gs_base,
3189 pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", 3286 "sysenter_cs:", save->sysenter_cs);
3190 save->sysenter_esp, save->sysenter_eip); 3287 pr_err("%-15s %016llx %-13s %016llx\n",
3191 pr_err("gpat: %016llx dbgctl: %016llx\n", 3288 "sysenter_esp:", save->sysenter_esp,
3192 save->g_pat, save->dbgctl); 3289 "sysenter_eip:", save->sysenter_eip);
3193 pr_err("br_from: %016llx br_to: %016llx\n", 3290 pr_err("%-15s %016llx %-13s %016llx\n",
3194 save->br_from, save->br_to); 3291 "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3195 pr_err("excp_from: %016llx excp_to: %016llx\n", 3292 pr_err("%-15s %016llx %-13s %016llx\n",
3196 save->last_excp_from, save->last_excp_to); 3293 "br_from:", save->br_from, "br_to:", save->br_to);
3197 3294 pr_err("%-15s %016llx %-13s %016llx\n",
3295 "excp_from:", save->last_excp_from,
3296 "excp_to:", save->last_excp_to);
3198} 3297}
3199 3298
3200static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 3299static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
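The dump_vmcb() rework replaces hand-padded field names with "%-20s" / "%-15s" style width specifiers so the values line up in columns regardless of label length. A two-line user-space illustration of the effect, with a made-up value:

        #include <stdio.h>

        int main(void)
        {
                unsigned long long tsc_offset = 0xfffff0123456789aULL;

                /* old style: padding baked into the label, misaligns when labels grow */
                printf("tsc_offset:            %016llx\n", tsc_offset);
                /* new style: the width specifier aligns every label the same way */
                printf("%-20s%016llx\n", "tsc_offset:", tsc_offset);
                return 0;
        }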
@@ -3384,7 +3483,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3384 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) 3483 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
3385 return 0; 3484 return 0;
3386 3485
3387 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); 3486 ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
3388 3487
3389 if (is_guest_mode(vcpu)) 3488 if (is_guest_mode(vcpu))
3390 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); 3489 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
@@ -3871,6 +3970,186 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3871 update_cr0_intercept(svm); 3970 update_cr0_intercept(svm);
3872} 3971}
3873 3972
3973#define PRE_EX(exit) { .exit_code = (exit), \
3974 .stage = X86_ICPT_PRE_EXCEPT, }
3975#define POST_EX(exit) { .exit_code = (exit), \
3976 .stage = X86_ICPT_POST_EXCEPT, }
3977#define POST_MEM(exit) { .exit_code = (exit), \
3978 .stage = X86_ICPT_POST_MEMACCESS, }
3979
3980static struct __x86_intercept {
3981 u32 exit_code;
3982 enum x86_intercept_stage stage;
3983} x86_intercept_map[] = {
3984 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
3985 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
3986 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
3987 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
3988 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
3989 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
3990 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
3991 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
3992 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
3993 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
3994 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
3995 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
3996 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
3997 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
3998 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
3999 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
4000 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
4001 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
4002 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
4003 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
4004 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
4005 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
4006 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
4007 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
4008 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
4009 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
4010 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
4011 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
4012 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
4013 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
4014 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
4015 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
4016 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
4017 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
4018 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
4019 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
4020 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
4021 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
4022 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
4023 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
4024 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
4025 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
4026 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
4027 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
4028 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
4029 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
4030};
4031
4032#undef PRE_EX
4033#undef POST_EX
4034#undef POST_MEM
4035
4036static int svm_check_intercept(struct kvm_vcpu *vcpu,
4037 struct x86_instruction_info *info,
4038 enum x86_intercept_stage stage)
4039{
4040 struct vcpu_svm *svm = to_svm(vcpu);
4041 int vmexit, ret = X86EMUL_CONTINUE;
4042 struct __x86_intercept icpt_info;
4043 struct vmcb *vmcb = svm->vmcb;
4044
4045 if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4046 goto out;
4047
4048 icpt_info = x86_intercept_map[info->intercept];
4049
4050 if (stage != icpt_info.stage)
4051 goto out;
4052
4053 switch (icpt_info.exit_code) {
4054 case SVM_EXIT_READ_CR0:
4055 if (info->intercept == x86_intercept_cr_read)
4056 icpt_info.exit_code += info->modrm_reg;
4057 break;
4058 case SVM_EXIT_WRITE_CR0: {
4059 unsigned long cr0, val;
4060 u64 intercept;
4061
4062 if (info->intercept == x86_intercept_cr_write)
4063 icpt_info.exit_code += info->modrm_reg;
4064
4065 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
4066 break;
4067
4068 intercept = svm->nested.intercept;
4069
4070 if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
4071 break;
4072
4073 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4074 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
4075
4076 if (info->intercept == x86_intercept_lmsw) {
4077 cr0 &= 0xfUL;
4078 val &= 0xfUL;
4079 /* lmsw can't clear PE - catch this here */
4080 if (cr0 & X86_CR0_PE)
4081 val |= X86_CR0_PE;
4082 }
4083
4084 if (cr0 ^ val)
4085 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4086
4087 break;
4088 }
4089 case SVM_EXIT_READ_DR0:
4090 case SVM_EXIT_WRITE_DR0:
4091 icpt_info.exit_code += info->modrm_reg;
4092 break;
4093 case SVM_EXIT_MSR:
4094 if (info->intercept == x86_intercept_wrmsr)
4095 vmcb->control.exit_info_1 = 1;
4096 else
4097 vmcb->control.exit_info_1 = 0;
4098 break;
4099 case SVM_EXIT_PAUSE:
4100 /*
4101 * We get this for NOP only, but pause
4102 * is really rep nop; check the rep prefix here
4103 */
4104 if (info->rep_prefix != REPE_PREFIX)
4105 goto out;
4106 case SVM_EXIT_IOIO: {
4107 u64 exit_info;
4108 u32 bytes;
4109
4110 exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
4111
4112 if (info->intercept == x86_intercept_in ||
4113 info->intercept == x86_intercept_ins) {
4114 exit_info |= SVM_IOIO_TYPE_MASK;
4115 bytes = info->src_bytes;
4116 } else {
4117 bytes = info->dst_bytes;
4118 }
4119
4120 if (info->intercept == x86_intercept_outs ||
4121 info->intercept == x86_intercept_ins)
4122 exit_info |= SVM_IOIO_STR_MASK;
4123
4124 if (info->rep_prefix)
4125 exit_info |= SVM_IOIO_REP_MASK;
4126
4127 bytes = min(bytes, 4u);
4128
4129 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4130
4131 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4132
4133 vmcb->control.exit_info_1 = exit_info;
4134 vmcb->control.exit_info_2 = info->next_rip;
4135
4136 break;
4137 }
4138 default:
4139 break;
4140 }
4141
4142 vmcb->control.next_rip = info->next_rip;
4143 vmcb->control.exit_code = icpt_info.exit_code;
4144 vmexit = nested_svm_exit_handled(svm);
4145
4146 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4147 : X86EMUL_CONTINUE;
4148
4149out:
4150 return ret;
4151}
4152
3874static struct kvm_x86_ops svm_x86_ops = { 4153static struct kvm_x86_ops svm_x86_ops = {
3875 .cpu_has_kvm_support = has_svm, 4154 .cpu_has_kvm_support = has_svm,
3876 .disabled_by_bios = is_disabled, 4155 .disabled_by_bios = is_disabled,
@@ -3952,10 +4231,14 @@ static struct kvm_x86_ops svm_x86_ops = {
3952 4231
3953 .has_wbinvd_exit = svm_has_wbinvd_exit, 4232 .has_wbinvd_exit = svm_has_wbinvd_exit,
3954 4233
4234 .set_tsc_khz = svm_set_tsc_khz,
3955 .write_tsc_offset = svm_write_tsc_offset, 4235 .write_tsc_offset = svm_write_tsc_offset,
3956 .adjust_tsc_offset = svm_adjust_tsc_offset, 4236 .adjust_tsc_offset = svm_adjust_tsc_offset,
4237 .compute_tsc_offset = svm_compute_tsc_offset,
3957 4238
3958 .set_tdp_cr3 = set_tdp_cr3, 4239 .set_tdp_cr3 = set_tdp_cr3,
4240
4241 .check_intercept = svm_check_intercept,
3959}; 4242};
3960 4243
3961static int __init svm_init(void) 4244static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5b4cdcbd154c..4c3fa0f67469 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -128,8 +128,11 @@ struct vcpu_vmx {
128 unsigned long host_rsp; 128 unsigned long host_rsp;
129 int launched; 129 int launched;
130 u8 fail; 130 u8 fail;
131 u8 cpl;
132 bool nmi_known_unmasked;
131 u32 exit_intr_info; 133 u32 exit_intr_info;
132 u32 idt_vectoring_info; 134 u32 idt_vectoring_info;
135 ulong rflags;
133 struct shared_msr_entry *guest_msrs; 136 struct shared_msr_entry *guest_msrs;
134 int nmsrs; 137 int nmsrs;
135 int save_nmsrs; 138 int save_nmsrs;
@@ -159,6 +162,10 @@ struct vcpu_vmx {
159 u32 ar; 162 u32 ar;
160 } tr, es, ds, fs, gs; 163 } tr, es, ds, fs, gs;
161 } rmode; 164 } rmode;
165 struct {
166 u32 bitmask; /* 4 bits per segment (1 bit per field) */
167 struct kvm_save_segment seg[8];
168 } segment_cache;
162 int vpid; 169 int vpid;
163 bool emulation_required; 170 bool emulation_required;
164 171
@@ -171,6 +178,15 @@ struct vcpu_vmx {
171 bool rdtscp_enabled; 178 bool rdtscp_enabled;
172}; 179};
173 180
181enum segment_cache_field {
182 SEG_FIELD_SEL = 0,
183 SEG_FIELD_BASE = 1,
184 SEG_FIELD_LIMIT = 2,
185 SEG_FIELD_AR = 3,
186
187 SEG_FIELD_NR = 4
188};
189
174static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 190static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
175{ 191{
176 return container_of(vcpu, struct vcpu_vmx, vcpu); 192 return container_of(vcpu, struct vcpu_vmx, vcpu);
@@ -643,6 +659,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
643 vmcs_writel(field, vmcs_readl(field) | mask); 659 vmcs_writel(field, vmcs_readl(field) | mask);
644} 660}
645 661
662static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
663{
664 vmx->segment_cache.bitmask = 0;
665}
666
667static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
668 unsigned field)
669{
670 bool ret;
671 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
672
673 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
674 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
675 vmx->segment_cache.bitmask = 0;
676 }
677 ret = vmx->segment_cache.bitmask & mask;
678 vmx->segment_cache.bitmask |= mask;
679 return ret;
680}
681
682static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
683{
684 u16 *p = &vmx->segment_cache.seg[seg].selector;
685
686 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
687 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
688 return *p;
689}
690
691static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
692{
693 ulong *p = &vmx->segment_cache.seg[seg].base;
694
695 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
696 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
697 return *p;
698}
699
700static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
701{
702 u32 *p = &vmx->segment_cache.seg[seg].limit;
703
704 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
705 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
706 return *p;
707}
708
709static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
710{
711 u32 *p = &vmx->segment_cache.seg[seg].ar;
712
713 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
714 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
715 return *p;
716}
717
646static void update_exception_bitmap(struct kvm_vcpu *vcpu) 718static void update_exception_bitmap(struct kvm_vcpu *vcpu)
647{ 719{
648 u32 eb; 720 u32 eb;
@@ -970,17 +1042,24 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
970{ 1042{
971 unsigned long rflags, save_rflags; 1043 unsigned long rflags, save_rflags;
972 1044
973 rflags = vmcs_readl(GUEST_RFLAGS); 1045 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
974 if (to_vmx(vcpu)->rmode.vm86_active) { 1046 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
975 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1047 rflags = vmcs_readl(GUEST_RFLAGS);
976 save_rflags = to_vmx(vcpu)->rmode.save_rflags; 1048 if (to_vmx(vcpu)->rmode.vm86_active) {
977 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1049 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1050 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
1051 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1052 }
1053 to_vmx(vcpu)->rflags = rflags;
978 } 1054 }
979 return rflags; 1055 return to_vmx(vcpu)->rflags;
980} 1056}
981 1057
982static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1058static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
983{ 1059{
1060 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1061 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
1062 to_vmx(vcpu)->rflags = rflags;
984 if (to_vmx(vcpu)->rmode.vm86_active) { 1063 if (to_vmx(vcpu)->rmode.vm86_active) {
985 to_vmx(vcpu)->rmode.save_rflags = rflags; 1064 to_vmx(vcpu)->rmode.save_rflags = rflags;
986 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1065 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1053,7 +1132,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1053 } 1132 }
1054 1133
1055 if (vmx->rmode.vm86_active) { 1134 if (vmx->rmode.vm86_active) {
1056 if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) 1135 int inc_eip = 0;
1136 if (kvm_exception_is_soft(nr))
1137 inc_eip = vcpu->arch.event_exit_inst_len;
1138 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
1057 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 1139 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1058 return; 1140 return;
1059 } 1141 }
@@ -1151,6 +1233,16 @@ static u64 guest_read_tsc(void)
1151} 1233}
1152 1234
1153/* 1235/*
1236 * Empty callback. Needs to be implemented when VMX enables the SET_TSC_KHZ
1237 * ioctl. In that case the callback should update internal vmx state to make
1238 * the changes effective.
1239 */
1240static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1241{
1242 /* Nothing to do here */
1243}
1244
1245/*
1154 * writes 'offset' into guest's timestamp counter offset register 1246 * writes 'offset' into guest's timestamp counter offset register
1155 */ 1247 */
1156static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1248static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -1164,6 +1256,11 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1164 vmcs_write64(TSC_OFFSET, offset + adjustment); 1256 vmcs_write64(TSC_OFFSET, offset + adjustment);
1165} 1257}
1166 1258
1259static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1260{
1261 return target_tsc - native_read_tsc();
1262}
1263
1167/* 1264/*
1168 * Reads an msr value (of 'msr_index') into 'pdata'. 1265 * Reads an msr value (of 'msr_index') into 'pdata'.
1169 * Returns 0 on success, non-0 otherwise. 1266 * Returns 0 on success, non-0 otherwise.
@@ -1243,9 +1340,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1243 break; 1340 break;
1244#ifdef CONFIG_X86_64 1341#ifdef CONFIG_X86_64
1245 case MSR_FS_BASE: 1342 case MSR_FS_BASE:
1343 vmx_segment_cache_clear(vmx);
1246 vmcs_writel(GUEST_FS_BASE, data); 1344 vmcs_writel(GUEST_FS_BASE, data);
1247 break; 1345 break;
1248 case MSR_GS_BASE: 1346 case MSR_GS_BASE:
1347 vmx_segment_cache_clear(vmx);
1249 vmcs_writel(GUEST_GS_BASE, data); 1348 vmcs_writel(GUEST_GS_BASE, data);
1250 break; 1349 break;
1251 case MSR_KERNEL_GS_BASE: 1350 case MSR_KERNEL_GS_BASE:
@@ -1689,6 +1788,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1689 vmx->emulation_required = 1; 1788 vmx->emulation_required = 1;
1690 vmx->rmode.vm86_active = 0; 1789 vmx->rmode.vm86_active = 0;
1691 1790
1791 vmx_segment_cache_clear(vmx);
1792
1692 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); 1793 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
1693 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); 1794 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
1694 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); 1795 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
@@ -1712,6 +1813,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1712 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); 1813 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
1713 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); 1814 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
1714 1815
1816 vmx_segment_cache_clear(vmx);
1817
1715 vmcs_write16(GUEST_SS_SELECTOR, 0); 1818 vmcs_write16(GUEST_SS_SELECTOR, 0);
1716 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 1819 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1717 1820
@@ -1775,6 +1878,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1775 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 1878 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1776 } 1879 }
1777 1880
1881 vmx_segment_cache_clear(vmx);
1882
1778 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); 1883 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
1779 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1884 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1780 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1885 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1851,6 +1956,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1851{ 1956{
1852 u32 guest_tr_ar; 1957 u32 guest_tr_ar;
1853 1958
1959 vmx_segment_cache_clear(to_vmx(vcpu));
1960
1854 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 1961 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1855 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { 1962 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1856 printk(KERN_DEBUG "%s: tss fixup for long mode. \n", 1963 printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
@@ -1998,6 +2105,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1998 vmcs_writel(CR0_READ_SHADOW, cr0); 2105 vmcs_writel(CR0_READ_SHADOW, cr0);
1999 vmcs_writel(GUEST_CR0, hw_cr0); 2106 vmcs_writel(GUEST_CR0, hw_cr0);
2000 vcpu->arch.cr0 = cr0; 2107 vcpu->arch.cr0 = cr0;
2108 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2001} 2109}
2002 2110
2003static u64 construct_eptp(unsigned long root_hpa) 2111static u64 construct_eptp(unsigned long root_hpa)
@@ -2053,7 +2161,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
2053 struct kvm_segment *var, int seg) 2161 struct kvm_segment *var, int seg)
2054{ 2162{
2055 struct vcpu_vmx *vmx = to_vmx(vcpu); 2163 struct vcpu_vmx *vmx = to_vmx(vcpu);
2056 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2057 struct kvm_save_segment *save; 2164 struct kvm_save_segment *save;
2058 u32 ar; 2165 u32 ar;
2059 2166
@@ -2075,13 +2182,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
2075 var->limit = save->limit; 2182 var->limit = save->limit;
2076 ar = save->ar; 2183 ar = save->ar;
2077 if (seg == VCPU_SREG_TR 2184 if (seg == VCPU_SREG_TR
2078 || var->selector == vmcs_read16(sf->selector)) 2185 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
2079 goto use_saved_rmode_seg; 2186 goto use_saved_rmode_seg;
2080 } 2187 }
2081 var->base = vmcs_readl(sf->base); 2188 var->base = vmx_read_guest_seg_base(vmx, seg);
2082 var->limit = vmcs_read32(sf->limit); 2189 var->limit = vmx_read_guest_seg_limit(vmx, seg);
2083 var->selector = vmcs_read16(sf->selector); 2190 var->selector = vmx_read_guest_seg_selector(vmx, seg);
2084 ar = vmcs_read32(sf->ar_bytes); 2191 ar = vmx_read_guest_seg_ar(vmx, seg);
2085use_saved_rmode_seg: 2192use_saved_rmode_seg:
2086 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) 2193 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
2087 ar = 0; 2194 ar = 0;
@@ -2098,27 +2205,37 @@ use_saved_rmode_seg:
2098 2205
2099static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 2206static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2100{ 2207{
2101 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2102 struct kvm_segment s; 2208 struct kvm_segment s;
2103 2209
2104 if (to_vmx(vcpu)->rmode.vm86_active) { 2210 if (to_vmx(vcpu)->rmode.vm86_active) {
2105 vmx_get_segment(vcpu, &s, seg); 2211 vmx_get_segment(vcpu, &s, seg);
2106 return s.base; 2212 return s.base;
2107 } 2213 }
2108 return vmcs_readl(sf->base); 2214 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
2109} 2215}
2110 2216
2111static int vmx_get_cpl(struct kvm_vcpu *vcpu) 2217static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
2112{ 2218{
2113 if (!is_protmode(vcpu)) 2219 if (!is_protmode(vcpu))
2114 return 0; 2220 return 0;
2115 2221
2116 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ 2222 if (!is_long_mode(vcpu)
2223 && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
2117 return 3; 2224 return 3;
2118 2225
2119 return vmcs_read16(GUEST_CS_SELECTOR) & 3; 2226 return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
2120} 2227}
2121 2228
2229static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2230{
2231 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
2232 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2233 to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
2234 }
2235 return to_vmx(vcpu)->cpl;
2236}
2237
2238
2122static u32 vmx_segment_access_rights(struct kvm_segment *var) 2239static u32 vmx_segment_access_rights(struct kvm_segment *var)
2123{ 2240{
2124 u32 ar; 2241 u32 ar;
@@ -2148,6 +2265,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2148 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2265 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2149 u32 ar; 2266 u32 ar;
2150 2267
2268 vmx_segment_cache_clear(vmx);
2269
2151 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 2270 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
2152 vmcs_write16(sf->selector, var->selector); 2271 vmcs_write16(sf->selector, var->selector);
2153 vmx->rmode.tr.selector = var->selector; 2272 vmx->rmode.tr.selector = var->selector;
@@ -2184,11 +2303,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2184 ar |= 0x1; /* Accessed */ 2303 ar |= 0x1; /* Accessed */
2185 2304
2186 vmcs_write32(sf->ar_bytes, ar); 2305 vmcs_write32(sf->ar_bytes, ar);
2306 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2187} 2307}
2188 2308
2189static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 2309static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2190{ 2310{
2191 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); 2311 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
2192 2312
2193 *db = (ar >> 14) & 1; 2313 *db = (ar >> 14) & 1;
2194 *l = (ar >> 13) & 1; 2314 *l = (ar >> 13) & 1;
@@ -2775,6 +2895,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2775 if (ret != 0) 2895 if (ret != 0)
2776 goto out; 2896 goto out;
2777 2897
2898 vmx_segment_cache_clear(vmx);
2899
2778 seg_setup(VCPU_SREG_CS); 2900 seg_setup(VCPU_SREG_CS);
2779 /* 2901 /*
2780 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 2902 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
@@ -2904,7 +3026,10 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2904 3026
2905 ++vcpu->stat.irq_injections; 3027 ++vcpu->stat.irq_injections;
2906 if (vmx->rmode.vm86_active) { 3028 if (vmx->rmode.vm86_active) {
2907 if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) 3029 int inc_eip = 0;
3030 if (vcpu->arch.interrupt.soft)
3031 inc_eip = vcpu->arch.event_exit_inst_len;
3032 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
2908 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 3033 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2909 return; 3034 return;
2910 } 3035 }
@@ -2937,8 +3062,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2937 } 3062 }
2938 3063
2939 ++vcpu->stat.nmi_injections; 3064 ++vcpu->stat.nmi_injections;
3065 vmx->nmi_known_unmasked = false;
2940 if (vmx->rmode.vm86_active) { 3066 if (vmx->rmode.vm86_active) {
2941 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) 3067 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
2942 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 3068 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2943 return; 3069 return;
2944 } 3070 }
@@ -2961,6 +3087,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2961{ 3087{
2962 if (!cpu_has_virtual_nmis()) 3088 if (!cpu_has_virtual_nmis())
2963 return to_vmx(vcpu)->soft_vnmi_blocked; 3089 return to_vmx(vcpu)->soft_vnmi_blocked;
3090 if (to_vmx(vcpu)->nmi_known_unmasked)
3091 return false;
2964 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 3092 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
2965} 3093}
2966 3094
@@ -2974,6 +3102,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2974 vmx->vnmi_blocked_time = 0; 3102 vmx->vnmi_blocked_time = 0;
2975 } 3103 }
2976 } else { 3104 } else {
3105 vmx->nmi_known_unmasked = !masked;
2977 if (masked) 3106 if (masked)
2978 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3107 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2979 GUEST_INTR_STATE_NMI); 3108 GUEST_INTR_STATE_NMI);
@@ -3091,7 +3220,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3091 enum emulation_result er; 3220 enum emulation_result er;
3092 3221
3093 vect_info = vmx->idt_vectoring_info; 3222 vect_info = vmx->idt_vectoring_info;
3094 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3223 intr_info = vmx->exit_intr_info;
3095 3224
3096 if (is_machine_check(intr_info)) 3225 if (is_machine_check(intr_info))
3097 return handle_machine_check(vcpu); 3226 return handle_machine_check(vcpu);
@@ -3122,7 +3251,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3122 } 3251 }
3123 3252
3124 error_code = 0; 3253 error_code = 0;
3125 rip = kvm_rip_read(vcpu);
3126 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 3254 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
3127 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 3255 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
3128 if (is_page_fault(intr_info)) { 3256 if (is_page_fault(intr_info)) {
@@ -3169,6 +3297,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3169 vmx->vcpu.arch.event_exit_inst_len = 3297 vmx->vcpu.arch.event_exit_inst_len =
3170 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3298 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3171 kvm_run->exit_reason = KVM_EXIT_DEBUG; 3299 kvm_run->exit_reason = KVM_EXIT_DEBUG;
3300 rip = kvm_rip_read(vcpu);
3172 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 3301 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
3173 kvm_run->debug.arch.exception = ex_no; 3302 kvm_run->debug.arch.exception = ex_no;
3174 break; 3303 break;
@@ -3505,9 +3634,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3505 switch (type) { 3634 switch (type) {
3506 case INTR_TYPE_NMI_INTR: 3635 case INTR_TYPE_NMI_INTR:
3507 vcpu->arch.nmi_injected = false; 3636 vcpu->arch.nmi_injected = false;
3508 if (cpu_has_virtual_nmis()) 3637 vmx_set_nmi_mask(vcpu, true);
3509 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3510 GUEST_INTR_STATE_NMI);
3511 break; 3638 break;
3512 case INTR_TYPE_EXT_INTR: 3639 case INTR_TYPE_EXT_INTR:
3513 case INTR_TYPE_SOFT_INTR: 3640 case INTR_TYPE_SOFT_INTR:
@@ -3867,12 +3994,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3867 3994
3868static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 3995static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
3869{ 3996{
3870 u32 exit_intr_info = vmx->exit_intr_info; 3997 u32 exit_intr_info;
3998
3999 if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
4000 || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
4001 return;
4002
4003 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
4004 exit_intr_info = vmx->exit_intr_info;
3871 4005
3872 /* Handle machine checks before interrupts are enabled */ 4006 /* Handle machine checks before interrupts are enabled */
3873 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) 4007 if (is_machine_check(exit_intr_info))
3874 || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3875 && is_machine_check(exit_intr_info)))
3876 kvm_machine_check(); 4008 kvm_machine_check();
3877 4009
3878 /* We need to handle NMIs before interrupts are enabled */ 4010 /* We need to handle NMIs before interrupts are enabled */
@@ -3886,7 +4018,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
3886 4018
3887static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 4019static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
3888{ 4020{
3889 u32 exit_intr_info = vmx->exit_intr_info; 4021 u32 exit_intr_info;
3890 bool unblock_nmi; 4022 bool unblock_nmi;
3891 u8 vector; 4023 u8 vector;
3892 bool idtv_info_valid; 4024 bool idtv_info_valid;
@@ -3894,6 +4026,13 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
3894 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 4026 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3895 4027
3896 if (cpu_has_virtual_nmis()) { 4028 if (cpu_has_virtual_nmis()) {
4029 if (vmx->nmi_known_unmasked)
4030 return;
4031 /*
4032 * Can't use vmx->exit_intr_info since we're not sure what
4033 * the exit reason is.
4034 */
4035 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3897 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 4036 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3898 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 4037 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3899 /* 4038 /*
@@ -3910,6 +4049,10 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
3910 vector != DF_VECTOR && !idtv_info_valid) 4049 vector != DF_VECTOR && !idtv_info_valid)
3911 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 4050 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3912 GUEST_INTR_STATE_NMI); 4051 GUEST_INTR_STATE_NMI);
4052 else
4053 vmx->nmi_known_unmasked =
4054 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
4055 & GUEST_INTR_STATE_NMI);
3913 } else if (unlikely(vmx->soft_vnmi_blocked)) 4056 } else if (unlikely(vmx->soft_vnmi_blocked))
3914 vmx->vnmi_blocked_time += 4057 vmx->vnmi_blocked_time +=
3915 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 4058 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
@@ -3946,8 +4089,7 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
3946 * Clear bit "block by NMI" before VM entry if an NMI 4089 * Clear bit "block by NMI" before VM entry if an NMI
3947 * delivery faulted. 4090 * delivery faulted.
3948 */ 4091 */
3949 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 4092 vmx_set_nmi_mask(&vmx->vcpu, false);
3950 GUEST_INTR_STATE_NMI);
3951 break; 4093 break;
3952 case INTR_TYPE_SOFT_EXCEPTION: 4094 case INTR_TYPE_SOFT_EXCEPTION:
3953 vmx->vcpu.arch.event_exit_inst_len = 4095 vmx->vcpu.arch.event_exit_inst_len =
@@ -4124,7 +4266,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4124 ); 4266 );
4125 4267
4126 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 4268 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
4269 | (1 << VCPU_EXREG_RFLAGS)
4270 | (1 << VCPU_EXREG_CPL)
4127 | (1 << VCPU_EXREG_PDPTR) 4271 | (1 << VCPU_EXREG_PDPTR)
4272 | (1 << VCPU_EXREG_SEGMENTS)
4128 | (1 << VCPU_EXREG_CR3)); 4273 | (1 << VCPU_EXREG_CR3));
4129 vcpu->arch.regs_dirty = 0; 4274 vcpu->arch.regs_dirty = 0;
4130 4275
@@ -4134,7 +4279,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4134 vmx->launched = 1; 4279 vmx->launched = 1;
4135 4280
4136 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 4281 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4137 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
4138 4282
4139 vmx_complete_atomic_exit(vmx); 4283 vmx_complete_atomic_exit(vmx);
4140 vmx_recover_nmi_blocking(vmx); 4284 vmx_recover_nmi_blocking(vmx);
@@ -4195,8 +4339,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4195 goto free_vcpu; 4339 goto free_vcpu;
4196 4340
4197 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 4341 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
4342 err = -ENOMEM;
4198 if (!vmx->guest_msrs) { 4343 if (!vmx->guest_msrs) {
4199 err = -ENOMEM;
4200 goto uninit_vcpu; 4344 goto uninit_vcpu;
4201 } 4345 }
4202 4346
@@ -4215,7 +4359,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4215 if (err) 4359 if (err)
4216 goto free_vmcs; 4360 goto free_vmcs;
4217 if (vm_need_virtualize_apic_accesses(kvm)) 4361 if (vm_need_virtualize_apic_accesses(kvm))
4218 if (alloc_apic_access_page(kvm) != 0) 4362 err = alloc_apic_access_page(kvm);
4363 if (err)
4219 goto free_vmcs; 4364 goto free_vmcs;
4220 4365
4221 if (enable_ept) { 4366 if (enable_ept) {
@@ -4368,6 +4513,13 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4368{ 4513{
4369} 4514}
4370 4515
4516static int vmx_check_intercept(struct kvm_vcpu *vcpu,
4517 struct x86_instruction_info *info,
4518 enum x86_intercept_stage stage)
4519{
4520 return X86EMUL_CONTINUE;
4521}
4522
4371static struct kvm_x86_ops vmx_x86_ops = { 4523static struct kvm_x86_ops vmx_x86_ops = {
4372 .cpu_has_kvm_support = cpu_has_kvm_support, 4524 .cpu_has_kvm_support = cpu_has_kvm_support,
4373 .disabled_by_bios = vmx_disabled_by_bios, 4525 .disabled_by_bios = vmx_disabled_by_bios,
@@ -4449,10 +4601,14 @@ static struct kvm_x86_ops vmx_x86_ops = {
4449 4601
4450 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 4602 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
4451 4603
4604 .set_tsc_khz = vmx_set_tsc_khz,
4452 .write_tsc_offset = vmx_write_tsc_offset, 4605 .write_tsc_offset = vmx_write_tsc_offset,
4453 .adjust_tsc_offset = vmx_adjust_tsc_offset, 4606 .adjust_tsc_offset = vmx_adjust_tsc_offset,
4607 .compute_tsc_offset = vmx_compute_tsc_offset,
4454 4608
4455 .set_tdp_cr3 = vmx_set_cr3, 4609 .set_tdp_cr3 = vmx_set_cr3,
4610
4611 .check_intercept = vmx_check_intercept,
4456}; 4612};
4457 4613
4458static int __init vmx_init(void) 4614static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 934b4c6b0bf9..77c9d8673dc4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -60,22 +60,12 @@
60#include <asm/div64.h> 60#include <asm/div64.h>
61 61
62#define MAX_IO_MSRS 256 62#define MAX_IO_MSRS 256
63#define CR0_RESERVED_BITS \
64 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
65 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
66 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
67#define CR4_RESERVED_BITS \
68 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
69 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
70 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
71 | X86_CR4_OSXSAVE \
72 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
73
74#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
75
76#define KVM_MAX_MCE_BANKS 32 63#define KVM_MAX_MCE_BANKS 32
77#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) 64#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
78 65
66#define emul_to_vcpu(ctxt) \
67 container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
68
79/* EFER defaults: 69/* EFER defaults:
80 * - enable syscall by default because it's emulated by KVM 70 * - enable syscall by default because it's emulated by KVM
81 * - enable LME and LMA by default on 64-bit KVM 71 * - enable LME and LMA by default on 64-bit KVM
@@ -100,6 +90,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
100int ignore_msrs = 0; 90int ignore_msrs = 0;
101module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 91module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
102 92
93bool kvm_has_tsc_control;
94EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
95u32 kvm_max_guest_tsc_khz;
96EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
97
103#define KVM_NR_SHARED_MSRS 16 98#define KVM_NR_SHARED_MSRS 16
104 99
105struct kvm_shared_msrs_global { 100struct kvm_shared_msrs_global {
@@ -157,6 +152,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
157 152
158u64 __read_mostly host_xcr0; 153u64 __read_mostly host_xcr0;
159 154
155int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
156
160static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 157static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
161{ 158{
162 int i; 159 int i;
@@ -361,8 +358,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
361 358
362void kvm_inject_nmi(struct kvm_vcpu *vcpu) 359void kvm_inject_nmi(struct kvm_vcpu *vcpu)
363{ 360{
364 kvm_make_request(KVM_REQ_NMI, vcpu);
365 kvm_make_request(KVM_REQ_EVENT, vcpu); 361 kvm_make_request(KVM_REQ_EVENT, vcpu);
362 vcpu->arch.nmi_pending = 1;
366} 363}
367EXPORT_SYMBOL_GPL(kvm_inject_nmi); 364EXPORT_SYMBOL_GPL(kvm_inject_nmi);
368 365
@@ -982,7 +979,15 @@ static inline int kvm_tsc_changes_freq(void)
982 return ret; 979 return ret;
983} 980}
984 981
985static inline u64 nsec_to_cycles(u64 nsec) 982static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
983{
984 if (vcpu->arch.virtual_tsc_khz)
985 return vcpu->arch.virtual_tsc_khz;
986 else
987 return __this_cpu_read(cpu_tsc_khz);
988}
989
990static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
986{ 991{
987 u64 ret; 992 u64 ret;
988 993
@@ -990,25 +995,24 @@ static inline u64 nsec_to_cycles(u64 nsec)
990 if (kvm_tsc_changes_freq()) 995 if (kvm_tsc_changes_freq())
991 printk_once(KERN_WARNING 996 printk_once(KERN_WARNING
992 "kvm: unreliable cycle conversion on adjustable rate TSC\n"); 997 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
993 ret = nsec * __this_cpu_read(cpu_tsc_khz); 998 ret = nsec * vcpu_tsc_khz(vcpu);
994 do_div(ret, USEC_PER_SEC); 999 do_div(ret, USEC_PER_SEC);
995 return ret; 1000 return ret;
996} 1001}
997 1002
998static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) 1003static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
999{ 1004{
1000 /* Compute a scale to convert nanoseconds in TSC cycles */ 1005 /* Compute a scale to convert nanoseconds in TSC cycles */
1001 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, 1006 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1002 &kvm->arch.virtual_tsc_shift, 1007 &vcpu->arch.tsc_catchup_shift,
1003 &kvm->arch.virtual_tsc_mult); 1008 &vcpu->arch.tsc_catchup_mult);
1004 kvm->arch.virtual_tsc_khz = this_tsc_khz;
1005} 1009}
1006 1010
1007static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) 1011static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1008{ 1012{
1009 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, 1013 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
1010 vcpu->kvm->arch.virtual_tsc_mult, 1014 vcpu->arch.tsc_catchup_mult,
1011 vcpu->kvm->arch.virtual_tsc_shift); 1015 vcpu->arch.tsc_catchup_shift);
1012 tsc += vcpu->arch.last_tsc_write; 1016 tsc += vcpu->arch.last_tsc_write;
1013 return tsc; 1017 return tsc;
1014} 1018}
@@ -1021,7 +1025,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1021 s64 sdiff; 1025 s64 sdiff;
1022 1026
1023 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1027 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1024 offset = data - native_read_tsc(); 1028 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1025 ns = get_kernel_ns(); 1029 ns = get_kernel_ns();
1026 elapsed = ns - kvm->arch.last_tsc_nsec; 1030 elapsed = ns - kvm->arch.last_tsc_nsec;
1027 sdiff = data - kvm->arch.last_tsc_write; 1031 sdiff = data - kvm->arch.last_tsc_write;
@@ -1037,13 +1041,13 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1037 * In that case, for a reliable TSC, we can match TSC offsets, 1041 * In that case, for a reliable TSC, we can match TSC offsets,
1038 * or make a best guess using the elapsed value. 1042 * or make a best guess using the elapsed value.
1039 */ 1043 */
1040 if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && 1044 if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
1041 elapsed < 5ULL * NSEC_PER_SEC) { 1045 elapsed < 5ULL * NSEC_PER_SEC) {
1042 if (!check_tsc_unstable()) { 1046 if (!check_tsc_unstable()) {
1043 offset = kvm->arch.last_tsc_offset; 1047 offset = kvm->arch.last_tsc_offset;
1044 pr_debug("kvm: matched tsc offset for %llu\n", data); 1048 pr_debug("kvm: matched tsc offset for %llu\n", data);
1045 } else { 1049 } else {
1046 u64 delta = nsec_to_cycles(elapsed); 1050 u64 delta = nsec_to_cycles(vcpu, elapsed);
1047 offset += delta; 1051 offset += delta;
1048 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1052 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1049 } 1053 }
@@ -1075,8 +1079,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1075 local_irq_save(flags); 1079 local_irq_save(flags);
1076 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); 1080 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
1077 kernel_ns = get_kernel_ns(); 1081 kernel_ns = get_kernel_ns();
1078 this_tsc_khz = __this_cpu_read(cpu_tsc_khz); 1082 this_tsc_khz = vcpu_tsc_khz(v);
1079
1080 if (unlikely(this_tsc_khz == 0)) { 1083 if (unlikely(this_tsc_khz == 0)) {
1081 local_irq_restore(flags); 1084 local_irq_restore(flags);
1082 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 1085 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1993,6 +1996,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1993 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1996 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1994 case KVM_CAP_XSAVE: 1997 case KVM_CAP_XSAVE:
1995 case KVM_CAP_ASYNC_PF: 1998 case KVM_CAP_ASYNC_PF:
1999 case KVM_CAP_GET_TSC_KHZ:
1996 r = 1; 2000 r = 1;
1997 break; 2001 break;
1998 case KVM_CAP_COALESCED_MMIO: 2002 case KVM_CAP_COALESCED_MMIO:
@@ -2019,6 +2023,9 @@ int kvm_dev_ioctl_check_extension(long ext)
2019 case KVM_CAP_XCRS: 2023 case KVM_CAP_XCRS:
2020 r = cpu_has_xsave; 2024 r = cpu_has_xsave;
2021 break; 2025 break;
2026 case KVM_CAP_TSC_CONTROL:
2027 r = kvm_has_tsc_control;
2028 break;
2022 default: 2029 default:
2023 r = 0; 2030 r = 0;
2024 break; 2031 break;
@@ -2120,8 +2127,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2120 kvm_x86_ops->vcpu_load(vcpu, cpu); 2127 kvm_x86_ops->vcpu_load(vcpu, cpu);
2121 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { 2128 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2122 /* Make sure TSC doesn't go backwards */ 2129 /* Make sure TSC doesn't go backwards */
2123 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : 2130 s64 tsc_delta;
2124 native_read_tsc() - vcpu->arch.last_host_tsc; 2131 u64 tsc;
2132
2133 kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
2134 tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
2135 tsc - vcpu->arch.last_guest_tsc;
2136
2125 if (tsc_delta < 0) 2137 if (tsc_delta < 0)
2126 mark_tsc_unstable("KVM discovered backwards TSC"); 2138 mark_tsc_unstable("KVM discovered backwards TSC");
2127 if (check_tsc_unstable()) { 2139 if (check_tsc_unstable()) {
@@ -2139,7 +2151,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2139{ 2151{
2140 kvm_x86_ops->vcpu_put(vcpu); 2152 kvm_x86_ops->vcpu_put(vcpu);
2141 kvm_put_guest_fpu(vcpu); 2153 kvm_put_guest_fpu(vcpu);
2142 vcpu->arch.last_host_tsc = native_read_tsc(); 2154 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
2143} 2155}
2144 2156
2145static int is_efer_nx(void) 2157static int is_efer_nx(void)
@@ -2324,6 +2336,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2324 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 2336 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
2325 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); 2337 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
2326 2338
2339 /* cpuid 0xC0000001.edx */
2340 const u32 kvm_supported_word5_x86_features =
2341 F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
2342 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
2343 F(PMM) | F(PMM_EN);
2344
2327 /* all calls to cpuid_count() should be made on the same cpu */ 2345 /* all calls to cpuid_count() should be made on the same cpu */
2328 get_cpu(); 2346 get_cpu();
2329 do_cpuid_1_ent(entry, function, index); 2347 do_cpuid_1_ent(entry, function, index);
@@ -2418,6 +2436,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2418 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | 2436 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
2419 (1 << KVM_FEATURE_NOP_IO_DELAY) | 2437 (1 << KVM_FEATURE_NOP_IO_DELAY) |
2420 (1 << KVM_FEATURE_CLOCKSOURCE2) | 2438 (1 << KVM_FEATURE_CLOCKSOURCE2) |
2439 (1 << KVM_FEATURE_ASYNC_PF) |
2421 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 2440 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
2422 entry->ebx = 0; 2441 entry->ebx = 0;
2423 entry->ecx = 0; 2442 entry->ecx = 0;
@@ -2432,6 +2451,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2432 entry->ecx &= kvm_supported_word6_x86_features; 2451 entry->ecx &= kvm_supported_word6_x86_features;
2433 cpuid_mask(&entry->ecx, 6); 2452 cpuid_mask(&entry->ecx, 6);
2434 break; 2453 break;
2454 /* Add support for Centaur's CPUID instruction */
2455 case 0xC0000000:
2456 /* Just support up to 0xC0000004 now */
2457 entry->eax = min(entry->eax, 0xC0000004);
2458 break;
2459 case 0xC0000001:
2460 entry->edx &= kvm_supported_word5_x86_features;
2461 cpuid_mask(&entry->edx, 5);
2462 break;
2463 case 0xC0000002:
2464 case 0xC0000003:
2465 case 0xC0000004:
2466 /* Nothing to do for now, reserved for the future */
2467 break;
2435 } 2468 }
2436 2469
2437 kvm_x86_ops->set_supported_cpuid(function, entry); 2470 kvm_x86_ops->set_supported_cpuid(function, entry);
@@ -2478,6 +2511,26 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
2478 if (nent >= cpuid->nent) 2511 if (nent >= cpuid->nent)
2479 goto out_free; 2512 goto out_free;
2480 2513
2514 /* Add support for Centaur's CPUID instruction. */
2515 if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
2516 do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
2517 &nent, cpuid->nent);
2518
2519 r = -E2BIG;
2520 if (nent >= cpuid->nent)
2521 goto out_free;
2522
2523 limit = cpuid_entries[nent - 1].eax;
2524 for (func = 0xC0000001;
2525 func <= limit && nent < cpuid->nent; ++func)
2526 do_cpuid_ent(&cpuid_entries[nent], func, 0,
2527 &nent, cpuid->nent);
2528
2529 r = -E2BIG;
2530 if (nent >= cpuid->nent)
2531 goto out_free;
2532 }
2533
2481 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, 2534 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
2482 cpuid->nent); 2535 cpuid->nent);
2483 2536
@@ -3046,6 +3099,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
3046 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3099 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
3047 break; 3100 break;
3048 } 3101 }
3102 case KVM_SET_TSC_KHZ: {
3103 u32 user_tsc_khz;
3104
3105 r = -EINVAL;
3106 if (!kvm_has_tsc_control)
3107 break;
3108
3109 user_tsc_khz = (u32)arg;
3110
3111 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
3112 goto out;
3113
3114 kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
3115
3116 r = 0;
3117 goto out;
3118 }
3119 case KVM_GET_TSC_KHZ: {
3120 r = -EIO;
3121 if (check_tsc_unstable())
3122 goto out;
3123
3124 r = vcpu_tsc_khz(vcpu);
3125
3126 goto out;
3127 }
3049 default: 3128 default:
3050 r = -EINVAL; 3129 r = -EINVAL;
3051 } 3130 }
@@ -3595,20 +3674,43 @@ static void kvm_init_msr_list(void)
3595static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 3674static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
3596 const void *v) 3675 const void *v)
3597{ 3676{
3598 if (vcpu->arch.apic && 3677 int handled = 0;
3599 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 3678 int n;
3600 return 0;
3601 3679
3602 return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3680 do {
3681 n = min(len, 8);
3682 if (!(vcpu->arch.apic &&
3683 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
3684 && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3685 break;
3686 handled += n;
3687 addr += n;
3688 len -= n;
3689 v += n;
3690 } while (len);
3691
3692 return handled;
3603} 3693}
3604 3694
3605static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3695static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3606{ 3696{
3607 if (vcpu->arch.apic && 3697 int handled = 0;
3608 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 3698 int n;
3609 return 0; 3699
3700 do {
3701 n = min(len, 8);
3702 if (!(vcpu->arch.apic &&
3703 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
3704 && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3705 break;
3706 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
3707 handled += n;
3708 addr += n;
3709 len -= n;
3710 v += n;
3711 } while (len);
3610 3712
3611 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3713 return handled;
3612} 3714}
3613 3715
3614static void kvm_set_segment(struct kvm_vcpu *vcpu, 3716static void kvm_set_segment(struct kvm_vcpu *vcpu,
@@ -3703,37 +3805,43 @@ out:
3703} 3805}
3704 3806
3705/* used for instruction fetching */ 3807/* used for instruction fetching */
3706static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, 3808static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
3707 struct kvm_vcpu *vcpu, 3809 gva_t addr, void *val, unsigned int bytes,
3708 struct x86_exception *exception) 3810 struct x86_exception *exception)
3709{ 3811{
3812 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3710 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3813 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3814
3711 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3815 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3712 access | PFERR_FETCH_MASK, 3816 access | PFERR_FETCH_MASK,
3713 exception); 3817 exception);
3714} 3818}
3715 3819
3716static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3820static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3717 struct kvm_vcpu *vcpu, 3821 gva_t addr, void *val, unsigned int bytes,
3718 struct x86_exception *exception) 3822 struct x86_exception *exception)
3719{ 3823{
3824 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3720 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3825 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3826
3721 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3827 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3722 exception); 3828 exception);
3723} 3829}
3724 3830
3725static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, 3831static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3726 struct kvm_vcpu *vcpu, 3832 gva_t addr, void *val, unsigned int bytes,
3727 struct x86_exception *exception) 3833 struct x86_exception *exception)
3728{ 3834{
3835 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3729 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); 3836 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3730} 3837}
3731 3838
3732static int kvm_write_guest_virt_system(gva_t addr, void *val, 3839static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3840 gva_t addr, void *val,
3733 unsigned int bytes, 3841 unsigned int bytes,
3734 struct kvm_vcpu *vcpu,
3735 struct x86_exception *exception) 3842 struct x86_exception *exception)
3736{ 3843{
3844 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3737 void *data = val; 3845 void *data = val;
3738 int r = X86EMUL_CONTINUE; 3846 int r = X86EMUL_CONTINUE;
3739 3847
@@ -3761,13 +3869,15 @@ out:
3761 return r; 3869 return r;
3762} 3870}
3763 3871
3764static int emulator_read_emulated(unsigned long addr, 3872static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3873 unsigned long addr,
3765 void *val, 3874 void *val,
3766 unsigned int bytes, 3875 unsigned int bytes,
3767 struct x86_exception *exception, 3876 struct x86_exception *exception)
3768 struct kvm_vcpu *vcpu)
3769{ 3877{
3878 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3770 gpa_t gpa; 3879 gpa_t gpa;
3880 int handled;
3771 3881
3772 if (vcpu->mmio_read_completed) { 3882 if (vcpu->mmio_read_completed) {
3773 memcpy(val, vcpu->mmio_data, bytes); 3883 memcpy(val, vcpu->mmio_data, bytes);
@@ -3786,7 +3896,7 @@ static int emulator_read_emulated(unsigned long addr,
3786 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3896 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3787 goto mmio; 3897 goto mmio;
3788 3898
3789 if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) 3899 if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
3790 == X86EMUL_CONTINUE) 3900 == X86EMUL_CONTINUE)
3791 return X86EMUL_CONTINUE; 3901 return X86EMUL_CONTINUE;
3792 3902
@@ -3794,18 +3904,24 @@ mmio:
3794 /* 3904 /*
3795 * Is this MMIO handled locally? 3905 * Is this MMIO handled locally?
3796 */ 3906 */
3797 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { 3907 handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
3798 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); 3908
3909 if (handled == bytes)
3799 return X86EMUL_CONTINUE; 3910 return X86EMUL_CONTINUE;
3800 } 3911
3912 gpa += handled;
3913 bytes -= handled;
3914 val += handled;
3801 3915
3802 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 3916 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
3803 3917
3804 vcpu->mmio_needed = 1; 3918 vcpu->mmio_needed = 1;
3805 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3919 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3806 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3920 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3807 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3921 vcpu->mmio_size = bytes;
3922 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
3808 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; 3923 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
3924 vcpu->mmio_index = 0;
3809 3925
3810 return X86EMUL_IO_NEEDED; 3926 return X86EMUL_IO_NEEDED;
3811} 3927}
@@ -3829,6 +3945,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
3829 struct kvm_vcpu *vcpu) 3945 struct kvm_vcpu *vcpu)
3830{ 3946{
3831 gpa_t gpa; 3947 gpa_t gpa;
3948 int handled;
3832 3949
3833 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); 3950 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
3834 3951
@@ -3847,25 +3964,35 @@ mmio:
3847 /* 3964 /*
3848 * Is this MMIO handled locally? 3965 * Is this MMIO handled locally?
3849 */ 3966 */
3850 if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) 3967 handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
3968 if (handled == bytes)
3851 return X86EMUL_CONTINUE; 3969 return X86EMUL_CONTINUE;
3852 3970
3971 gpa += handled;
3972 bytes -= handled;
3973 val += handled;
3974
3853 vcpu->mmio_needed = 1; 3975 vcpu->mmio_needed = 1;
3976 memcpy(vcpu->mmio_data, val, bytes);
3854 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3977 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3855 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3978 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3856 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3979 vcpu->mmio_size = bytes;
3980 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
3857 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; 3981 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
3858 memcpy(vcpu->run->mmio.data, val, bytes); 3982 memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
3983 vcpu->mmio_index = 0;
3859 3984
3860 return X86EMUL_CONTINUE; 3985 return X86EMUL_CONTINUE;
3861} 3986}
3862 3987
3863int emulator_write_emulated(unsigned long addr, 3988int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
3989 unsigned long addr,
3864 const void *val, 3990 const void *val,
3865 unsigned int bytes, 3991 unsigned int bytes,
3866 struct x86_exception *exception, 3992 struct x86_exception *exception)
3867 struct kvm_vcpu *vcpu)
3868{ 3993{
3994 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3995
3869 /* Crossing a page boundary? */ 3996 /* Crossing a page boundary? */
3870 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3997 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
3871 int rc, now; 3998 int rc, now;
@@ -3893,13 +4020,14 @@ int emulator_write_emulated(unsigned long addr,
3893 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) 4020 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
3894#endif 4021#endif
3895 4022
3896static int emulator_cmpxchg_emulated(unsigned long addr, 4023static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4024 unsigned long addr,
3897 const void *old, 4025 const void *old,
3898 const void *new, 4026 const void *new,
3899 unsigned int bytes, 4027 unsigned int bytes,
3900 struct x86_exception *exception, 4028 struct x86_exception *exception)
3901 struct kvm_vcpu *vcpu)
3902{ 4029{
4030 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3903 gpa_t gpa; 4031 gpa_t gpa;
3904 struct page *page; 4032 struct page *page;
3905 char *kaddr; 4033 char *kaddr;
@@ -3955,7 +4083,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3955emul_write: 4083emul_write:
3956 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 4084 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3957 4085
3958 return emulator_write_emulated(addr, new, bytes, exception, vcpu); 4086 return emulator_write_emulated(ctxt, addr, new, bytes, exception);
3959} 4087}
3960 4088
3961static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 4089static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3974,9 +4102,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3974} 4102}
3975 4103
3976 4104
3977static int emulator_pio_in_emulated(int size, unsigned short port, void *val, 4105static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
3978 unsigned int count, struct kvm_vcpu *vcpu) 4106 int size, unsigned short port, void *val,
4107 unsigned int count)
3979{ 4108{
4109 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4110
3980 if (vcpu->arch.pio.count) 4111 if (vcpu->arch.pio.count)
3981 goto data_avail; 4112 goto data_avail;
3982 4113
@@ -4004,10 +4135,12 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
4004 return 0; 4135 return 0;
4005} 4136}
4006 4137
4007static int emulator_pio_out_emulated(int size, unsigned short port, 4138static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
4008 const void *val, unsigned int count, 4139 int size, unsigned short port,
4009 struct kvm_vcpu *vcpu) 4140 const void *val, unsigned int count)
4010{ 4141{
4142 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4143
4011 trace_kvm_pio(1, port, size, count); 4144 trace_kvm_pio(1, port, size, count);
4012 4145
4013 vcpu->arch.pio.port = port; 4146 vcpu->arch.pio.port = port;
@@ -4037,10 +4170,9 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
4037 return kvm_x86_ops->get_segment_base(vcpu, seg); 4170 return kvm_x86_ops->get_segment_base(vcpu, seg);
4038} 4171}
4039 4172
4040int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 4173static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
4041{ 4174{
4042 kvm_mmu_invlpg(vcpu, address); 4175 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
4043 return X86EMUL_CONTINUE;
4044} 4176}
4045 4177
4046int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 4178int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
@@ -4062,22 +4194,20 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4062} 4194}
4063EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4195EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
4064 4196
4065int emulate_clts(struct kvm_vcpu *vcpu) 4197static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
4066{ 4198{
4067 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 4199 kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
4068 kvm_x86_ops->fpu_activate(vcpu);
4069 return X86EMUL_CONTINUE;
4070} 4200}
4071 4201
4072int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) 4202int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
4073{ 4203{
4074 return _kvm_get_dr(vcpu, dr, dest); 4204 return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
4075} 4205}
4076 4206
4077int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) 4207int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
4078{ 4208{
4079 4209
4080 return __kvm_set_dr(vcpu, dr, value); 4210 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
4081} 4211}
4082 4212
4083static u64 mk_cr_64(u64 curr_cr, u32 new_val) 4213static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -4085,8 +4215,9 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val)
4085 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 4215 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
4086} 4216}
4087 4217
4088static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) 4218static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4089{ 4219{
4220 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4090 unsigned long value; 4221 unsigned long value;
4091 4222
4092 switch (cr) { 4223 switch (cr) {
@@ -4113,8 +4244,9 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
4113 return value; 4244 return value;
4114} 4245}
4115 4246
4116static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 4247static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4117{ 4248{
4249 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4118 int res = 0; 4250 int res = 0;
4119 4251
4120 switch (cr) { 4252 switch (cr) {
@@ -4141,33 +4273,45 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
4141 return res; 4273 return res;
4142} 4274}
4143 4275
4144static int emulator_get_cpl(struct kvm_vcpu *vcpu) 4276static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4277{
4278 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
4279}
4280
4281static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4282{
4283 kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
4284}
4285
4286static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4145{ 4287{
4146 return kvm_x86_ops->get_cpl(vcpu); 4288 kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
4147} 4289}
4148 4290
4149static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) 4291static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4150{ 4292{
4151 kvm_x86_ops->get_gdt(vcpu, dt); 4293 kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
4152} 4294}
4153 4295
4154static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) 4296static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4155{ 4297{
4156 kvm_x86_ops->get_idt(vcpu, dt); 4298 kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
4157} 4299}
4158 4300
4159static unsigned long emulator_get_cached_segment_base(int seg, 4301static unsigned long emulator_get_cached_segment_base(
4160 struct kvm_vcpu *vcpu) 4302 struct x86_emulate_ctxt *ctxt, int seg)
4161{ 4303{
4162 return get_segment_base(vcpu, seg); 4304 return get_segment_base(emul_to_vcpu(ctxt), seg);
4163} 4305}
4164 4306
4165static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, 4307static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4166 int seg, struct kvm_vcpu *vcpu) 4308 struct desc_struct *desc, u32 *base3,
4309 int seg)
4167{ 4310{
4168 struct kvm_segment var; 4311 struct kvm_segment var;
4169 4312
4170 kvm_get_segment(vcpu, &var, seg); 4313 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4314 *selector = var.selector;
4171 4315
4172 if (var.unusable) 4316 if (var.unusable)
4173 return false; 4317 return false;
@@ -4192,14 +4336,14 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
4192 return true; 4336 return true;
4193} 4337}
4194 4338
4195static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, 4339static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
4196 int seg, struct kvm_vcpu *vcpu) 4340 struct desc_struct *desc, u32 base3,
4341 int seg)
4197{ 4342{
4343 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4198 struct kvm_segment var; 4344 struct kvm_segment var;
4199 4345
4200 /* needed to preserve selector */ 4346 var.selector = selector;
4201 kvm_get_segment(vcpu, &var, seg);
4202
4203 var.base = get_desc_base(desc); 4347 var.base = get_desc_base(desc);
4204#ifdef CONFIG_X86_64 4348#ifdef CONFIG_X86_64
4205 var.base |= ((u64)base3) << 32; 4349 var.base |= ((u64)base3) << 32;
@@ -4223,22 +4367,44 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
4223 return; 4367 return;
4224} 4368}
4225 4369
4226static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) 4370static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4371 u32 msr_index, u64 *pdata)
4227{ 4372{
4228 struct kvm_segment kvm_seg; 4373 return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
4374}
4229 4375
4230 kvm_get_segment(vcpu, &kvm_seg, seg); 4376static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4231 return kvm_seg.selector; 4377 u32 msr_index, u64 data)
4378{
4379 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
4232} 4380}
4233 4381
4234static void emulator_set_segment_selector(u16 sel, int seg, 4382static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4235 struct kvm_vcpu *vcpu)
4236{ 4383{
4237 struct kvm_segment kvm_seg; 4384 emul_to_vcpu(ctxt)->arch.halt_request = 1;
4385}
4238 4386
4239 kvm_get_segment(vcpu, &kvm_seg, seg); 4387static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
4240 kvm_seg.selector = sel; 4388{
4241 kvm_set_segment(vcpu, &kvm_seg, seg); 4389 preempt_disable();
4390 kvm_load_guest_fpu(emul_to_vcpu(ctxt));
4391 /*
4392 * CR0.TS may reference the host fpu state, not the guest fpu state,
4393 * so it may be clear at this point.
4394 */
4395 clts();
4396}
4397
4398static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
4399{
4400 preempt_enable();
4401}
4402
4403static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4404 struct x86_instruction_info *info,
4405 enum x86_intercept_stage stage)
4406{
4407 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4242} 4408}
4243 4409
4244static struct x86_emulate_ops emulate_ops = { 4410static struct x86_emulate_ops emulate_ops = {
@@ -4248,22 +4414,29 @@ static struct x86_emulate_ops emulate_ops = {
4248 .read_emulated = emulator_read_emulated, 4414 .read_emulated = emulator_read_emulated,
4249 .write_emulated = emulator_write_emulated, 4415 .write_emulated = emulator_write_emulated,
4250 .cmpxchg_emulated = emulator_cmpxchg_emulated, 4416 .cmpxchg_emulated = emulator_cmpxchg_emulated,
4417 .invlpg = emulator_invlpg,
4251 .pio_in_emulated = emulator_pio_in_emulated, 4418 .pio_in_emulated = emulator_pio_in_emulated,
4252 .pio_out_emulated = emulator_pio_out_emulated, 4419 .pio_out_emulated = emulator_pio_out_emulated,
4253 .get_cached_descriptor = emulator_get_cached_descriptor, 4420 .get_segment = emulator_get_segment,
4254 .set_cached_descriptor = emulator_set_cached_descriptor, 4421 .set_segment = emulator_set_segment,
4255 .get_segment_selector = emulator_get_segment_selector,
4256 .set_segment_selector = emulator_set_segment_selector,
4257 .get_cached_segment_base = emulator_get_cached_segment_base, 4422 .get_cached_segment_base = emulator_get_cached_segment_base,
4258 .get_gdt = emulator_get_gdt, 4423 .get_gdt = emulator_get_gdt,
4259 .get_idt = emulator_get_idt, 4424 .get_idt = emulator_get_idt,
4425 .set_gdt = emulator_set_gdt,
4426 .set_idt = emulator_set_idt,
4260 .get_cr = emulator_get_cr, 4427 .get_cr = emulator_get_cr,
4261 .set_cr = emulator_set_cr, 4428 .set_cr = emulator_set_cr,
4262 .cpl = emulator_get_cpl, 4429 .cpl = emulator_get_cpl,
4263 .get_dr = emulator_get_dr, 4430 .get_dr = emulator_get_dr,
4264 .set_dr = emulator_set_dr, 4431 .set_dr = emulator_set_dr,
4265 .set_msr = kvm_set_msr, 4432 .set_msr = emulator_set_msr,
4266 .get_msr = kvm_get_msr, 4433 .get_msr = emulator_get_msr,
4434 .halt = emulator_halt,
4435 .wbinvd = emulator_wbinvd,
4436 .fix_hypercall = emulator_fix_hypercall,
4437 .get_fpu = emulator_get_fpu,
4438 .put_fpu = emulator_put_fpu,
4439 .intercept = emulator_intercept,
4267}; 4440};
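With every x86_emulate_ops callback now taking the emulate context instead of a vcpu, each helper recovers the vcpu through emul_to_vcpu(ctxt); presumably this is the usual container_of() over the emulate_ctxt embedded in the vcpu, though the helper's definition is outside this hunk, so treat that as an assumption. A self-contained illustration of the pattern with stand-in struct names:

/*
 * Stand-in types, not the real kvm definitions: recover the enclosing
 * object from a pointer to one of its embedded members.
 */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct emulate_ctxt { unsigned long eip; };

struct vcpu {
    int id;
    struct emulate_ctxt emulate_ctxt;
};

static struct vcpu *emul_to_vcpu(struct emulate_ctxt *ctxt)
{
    return container_of(ctxt, struct vcpu, emulate_ctxt);
}

int main(void)
{
    struct vcpu v = { .id = 3 };

    printf("vcpu id = %d\n", emul_to_vcpu(&v.emulate_ctxt)->id);
    return 0;
}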
4268 4441
4269static void cache_all_regs(struct kvm_vcpu *vcpu) 4442static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -4305,12 +4478,17 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4305 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4478 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4306 int cs_db, cs_l; 4479 int cs_db, cs_l;
4307 4480
4481 /*
4482 * TODO: fix emulate.c to use guest_read/write_register
 4483 * instead of direct ->regs accesses, can save hundreds of cycles
 4484 * on Intel for instructions that don't read/change RSP,
 4485 * for example.
4486 */
4308 cache_all_regs(vcpu); 4487 cache_all_regs(vcpu);
4309 4488
4310 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4489 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4311 4490
4312 vcpu->arch.emulate_ctxt.vcpu = vcpu; 4491 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
4313 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
4314 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); 4492 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4315 vcpu->arch.emulate_ctxt.mode = 4493 vcpu->arch.emulate_ctxt.mode =
4316 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 4494 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
@@ -4318,11 +4496,13 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4318 ? X86EMUL_MODE_VM86 : cs_l 4496 ? X86EMUL_MODE_VM86 : cs_l
4319 ? X86EMUL_MODE_PROT64 : cs_db 4497 ? X86EMUL_MODE_PROT64 : cs_db
4320 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 4498 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4499 vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu);
4321 memset(c, 0, sizeof(struct decode_cache)); 4500 memset(c, 0, sizeof(struct decode_cache));
4322 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4501 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4502 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4323} 4503}
4324 4504
4325int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) 4505int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4326{ 4506{
4327 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4507 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4328 int ret; 4508 int ret;
@@ -4331,7 +4511,8 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
4331 4511
4332 vcpu->arch.emulate_ctxt.decode.op_bytes = 2; 4512 vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
4333 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; 4513 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
4334 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; 4514 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip +
4515 inc_eip;
4335 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); 4516 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
4336 4517
4337 if (ret != X86EMUL_CONTINUE) 4518 if (ret != X86EMUL_CONTINUE)
@@ -4340,7 +4521,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
4340 vcpu->arch.emulate_ctxt.eip = c->eip; 4521 vcpu->arch.emulate_ctxt.eip = c->eip;
4341 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4522 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4342 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 4523 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4343 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 4524 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4344 4525
4345 if (irq == NMI_VECTOR) 4526 if (irq == NMI_VECTOR)
4346 vcpu->arch.nmi_pending = false; 4527 vcpu->arch.nmi_pending = false;
@@ -4402,16 +4583,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4402{ 4583{
4403 int r; 4584 int r;
4404 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4585 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4586 bool writeback = true;
4405 4587
4406 kvm_clear_exception_queue(vcpu); 4588 kvm_clear_exception_queue(vcpu);
4407 vcpu->arch.mmio_fault_cr2 = cr2;
4408 /*
4409 * TODO: fix emulate.c to use guest_read/write_register
4410 * instead of direct ->regs accesses, can save hundred cycles
4411 * on Intel for instructions that don't read/change RSP, for
4412 * for example.
4413 */
4414 cache_all_regs(vcpu);
4415 4589
4416 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4590 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4417 init_emulate_ctxt(vcpu); 4591 init_emulate_ctxt(vcpu);
@@ -4442,13 +4616,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4442 return EMULATE_DONE; 4616 return EMULATE_DONE;
4443 } 4617 }
4444 4618
4445 /* this is needed for vmware backdor interface to work since it 4619 /* this is needed for vmware backdoor interface to work since it
4446 changes registers values during IO operation */ 4620 changes registers values during IO operation */
4447 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4621 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4622 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4623 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4624 }
4448 4625
4449restart: 4626restart:
4450 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); 4627 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
4451 4628
4629 if (r == EMULATION_INTERCEPTED)
4630 return EMULATE_DONE;
4631
4452 if (r == EMULATION_FAILED) { 4632 if (r == EMULATION_FAILED) {
4453 if (reexecute_instruction(vcpu, cr2)) 4633 if (reexecute_instruction(vcpu, cr2))
4454 return EMULATE_DONE; 4634 return EMULATE_DONE;
@@ -4462,21 +4642,28 @@ restart:
4462 } else if (vcpu->arch.pio.count) { 4642 } else if (vcpu->arch.pio.count) {
4463 if (!vcpu->arch.pio.in) 4643 if (!vcpu->arch.pio.in)
4464 vcpu->arch.pio.count = 0; 4644 vcpu->arch.pio.count = 0;
4645 else
4646 writeback = false;
4465 r = EMULATE_DO_MMIO; 4647 r = EMULATE_DO_MMIO;
4466 } else if (vcpu->mmio_needed) { 4648 } else if (vcpu->mmio_needed) {
4467 if (vcpu->mmio_is_write) 4649 if (!vcpu->mmio_is_write)
4468 vcpu->mmio_needed = 0; 4650 writeback = false;
4469 r = EMULATE_DO_MMIO; 4651 r = EMULATE_DO_MMIO;
4470 } else if (r == EMULATION_RESTART) 4652 } else if (r == EMULATION_RESTART)
4471 goto restart; 4653 goto restart;
4472 else 4654 else
4473 r = EMULATE_DONE; 4655 r = EMULATE_DONE;
4474 4656
4475 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); 4657 if (writeback) {
4476 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 4658 toggle_interruptibility(vcpu,
4477 kvm_make_request(KVM_REQ_EVENT, vcpu); 4659 vcpu->arch.emulate_ctxt.interruptibility);
4478 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4660 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4479 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 4661 kvm_make_request(KVM_REQ_EVENT, vcpu);
4662 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4663 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4664 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4665 } else
4666 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
4480 4667
4481 return r; 4668 return r;
4482} 4669}
@@ -4485,7 +4672,8 @@ EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4485int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4672int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4486{ 4673{
4487 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); 4674 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
4488 int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); 4675 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
4676 size, port, &val, 1);
4489 /* do not return to emulator after return from userspace */ 4677 /* do not return to emulator after return from userspace */
4490 vcpu->arch.pio.count = 0; 4678 vcpu->arch.pio.count = 0;
4491 return ret; 4679 return ret;
@@ -4879,8 +5067,9 @@ out:
4879} 5067}
4880EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 5068EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
4881 5069
4882int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 5070int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
4883{ 5071{
5072 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4884 char instruction[3]; 5073 char instruction[3];
4885 unsigned long rip = kvm_rip_read(vcpu); 5074 unsigned long rip = kvm_rip_read(vcpu);
4886 5075
@@ -4893,21 +5082,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4893 5082
4894 kvm_x86_ops->patch_hypercall(vcpu, instruction); 5083 kvm_x86_ops->patch_hypercall(vcpu, instruction);
4895 5084
4896 return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); 5085 return emulator_write_emulated(&vcpu->arch.emulate_ctxt,
4897} 5086 rip, instruction, 3, NULL);
4898
4899void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4900{
4901 struct desc_ptr dt = { limit, base };
4902
4903 kvm_x86_ops->set_gdt(vcpu, &dt);
4904}
4905
4906void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4907{
4908 struct desc_ptr dt = { limit, base };
4909
4910 kvm_x86_ops->set_idt(vcpu, &dt);
4911} 5087}
4912 5088
4913static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 5089static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -5170,6 +5346,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
5170static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5346static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5171{ 5347{
5172 int r; 5348 int r;
5349 bool nmi_pending;
5173 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 5350 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
5174 vcpu->run->request_interrupt_window; 5351 vcpu->run->request_interrupt_window;
5175 5352
@@ -5207,19 +5384,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5207 r = 1; 5384 r = 1;
5208 goto out; 5385 goto out;
5209 } 5386 }
5210 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5211 vcpu->arch.nmi_pending = true;
5212 } 5387 }
5213 5388
5214 r = kvm_mmu_reload(vcpu); 5389 r = kvm_mmu_reload(vcpu);
5215 if (unlikely(r)) 5390 if (unlikely(r))
5216 goto out; 5391 goto out;
5217 5392
5393 /*
5394 * An NMI can be injected between local nmi_pending read and
5395 * vcpu->arch.nmi_pending read inside inject_pending_event().
5396 * But in that case, KVM_REQ_EVENT will be set, which makes
5397 * the race described above benign.
5398 */
5399 nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
5400
5218 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5401 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5219 inject_pending_event(vcpu); 5402 inject_pending_event(vcpu);
5220 5403
5221 /* enable NMI/IRQ window open exits if needed */ 5404 /* enable NMI/IRQ window open exits if needed */
5222 if (vcpu->arch.nmi_pending) 5405 if (nmi_pending)
5223 kvm_x86_ops->enable_nmi_window(vcpu); 5406 kvm_x86_ops->enable_nmi_window(vcpu);
5224 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 5407 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
5225 kvm_x86_ops->enable_irq_window(vcpu); 5408 kvm_x86_ops->enable_irq_window(vcpu);
@@ -5399,6 +5582,41 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5399 return r; 5582 return r;
5400} 5583}
5401 5584
5585static int complete_mmio(struct kvm_vcpu *vcpu)
5586{
5587 struct kvm_run *run = vcpu->run;
5588 int r;
5589
5590 if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
5591 return 1;
5592
5593 if (vcpu->mmio_needed) {
5594 vcpu->mmio_needed = 0;
5595 if (!vcpu->mmio_is_write)
5596 memcpy(vcpu->mmio_data + vcpu->mmio_index,
5597 run->mmio.data, 8);
5598 vcpu->mmio_index += 8;
5599 if (vcpu->mmio_index < vcpu->mmio_size) {
5600 run->exit_reason = KVM_EXIT_MMIO;
5601 run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
5602 memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
5603 run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
5604 run->mmio.is_write = vcpu->mmio_is_write;
5605 vcpu->mmio_needed = 1;
5606 return 0;
5607 }
5608 if (vcpu->mmio_is_write)
5609 return 1;
5610 vcpu->mmio_read_completed = 1;
5611 }
5612 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5613 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5614 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5615 if (r != EMULATE_DONE)
5616 return 0;
5617 return 1;
5618}
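complete_mmio() resumes a chunked MMIO access each time userspace finishes one KVM_EXIT_MMIO round trip: the index advances by 8 until it reaches mmio_size, and only then does emulation restart with EMULTYPE_NO_DECODE. A rough standalone model of that resume loop (plain C, not kernel code):

/*
 * Model only: each call consumes the 8-byte chunk userspace just
 * serviced and either schedules the next chunk (return 0, i.e. exit
 * to userspace again) or reports the whole access finished (return 1).
 */
#include <stdio.h>

struct mmio {
    unsigned int size;   /* total bytes of the guest access */
    unsigned int index;  /* bytes already relayed to userspace */
    int needed;
};

static int complete_chunk(struct mmio *m)
{
    if (!m->needed)
        return 1;
    m->needed = 0;
    m->index += 8;
    if (m->index < m->size) {
        m->needed = 1;   /* one more KVM_EXIT_MMIO round trip */
        return 0;
    }
    return 1;            /* done; emulation can continue */
}

int main(void)
{
    struct mmio m = { .size = 24, .index = 0, .needed = 1 };
    int rounds = 0;

    while (!complete_chunk(&m))
        rounds++;
    printf("userspace exits needed: %d\n", rounds + 1);  /* 3 for 24 bytes */
    return 0;
}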
5619
5402int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 5620int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5403{ 5621{
5404 int r; 5622 int r;
@@ -5425,20 +5643,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5425 } 5643 }
5426 } 5644 }
5427 5645
5428 if (vcpu->arch.pio.count || vcpu->mmio_needed) { 5646 r = complete_mmio(vcpu);
5429 if (vcpu->mmio_needed) { 5647 if (r <= 0)
5430 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 5648 goto out;
5431 vcpu->mmio_read_completed = 1; 5649
5432 vcpu->mmio_needed = 0;
5433 }
5434 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5435 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5436 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5437 if (r != EMULATE_DONE) {
5438 r = 0;
5439 goto out;
5440 }
5441 }
5442 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 5650 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
5443 kvm_register_write(vcpu, VCPU_REGS_RAX, 5651 kvm_register_write(vcpu, VCPU_REGS_RAX,
5444 kvm_run->hypercall.ret); 5652 kvm_run->hypercall.ret);
@@ -5455,6 +5663,18 @@ out:
5455 5663
5456int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5664int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5457{ 5665{
5666 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
5667 /*
5668 * We are here if userspace calls get_regs() in the middle of
 5669 * instruction emulation. Register state needs to be copied
 5670 * back from the emulation context to the vcpu. Userspace shouldn't
 5671 * usually do that, but some badly designed PV devices (vmware
 5672 * backdoor interface) need this to work.
5673 */
5674 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
5675 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5676 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5677 }
5458 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5678 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
5459 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 5679 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
5460 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 5680 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -5482,6 +5702,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5482 5702
5483int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5703int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5484{ 5704{
5705 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
5706 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5707
5485 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 5708 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
5486 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 5709 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
5487 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 5710 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -5592,7 +5815,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5592 5815
5593 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5816 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5594 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 5817 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
5595 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5818 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5596 kvm_make_request(KVM_REQ_EVENT, vcpu); 5819 kvm_make_request(KVM_REQ_EVENT, vcpu);
5597 return EMULATE_DONE; 5820 return EMULATE_DONE;
5598} 5821}
@@ -5974,8 +6197,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5974 } 6197 }
5975 vcpu->arch.pio_data = page_address(page); 6198 vcpu->arch.pio_data = page_address(page);
5976 6199
5977 if (!kvm->arch.virtual_tsc_khz) 6200 kvm_init_tsc_catchup(vcpu, max_tsc_khz);
5978 kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
5979 6201
5980 r = kvm_mmu_create(vcpu); 6202 r = kvm_mmu_create(vcpu);
5981 if (r < 0) 6203 if (r < 0)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c600da830ce0..e407ed3df817 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -77,7 +77,7 @@ static inline u32 bit(int bitno)
77 77
78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); 80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
81 81
82void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); 82void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
83 83
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 1cd608973ce5..e191c096ab90 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -7,7 +7,7 @@
7 * kernel and insert a module (lg.ko) which allows us to run other Linux 7 * kernel and insert a module (lg.ko) which allows us to run other Linux
8 * kernels the same way we'd run processes. We call the first kernel the Host, 8 * kernels the same way we'd run processes. We call the first kernel the Host,
9 * and the others the Guests. The program which sets up and configures Guests 9 * and the others the Guests. The program which sets up and configures Guests
10 * (such as the example in Documentation/lguest/lguest.c) is called the 10 * (such as the example in Documentation/virtual/lguest/lguest.c) is called the
11 * Launcher. 11 * Launcher.
12 * 12 *
13 * Secondly, we only run specially modified Guests, not normal kernels: setting 13 * Secondly, we only run specially modified Guests, not normal kernels: setting
@@ -913,8 +913,6 @@ static struct clocksource lguest_clock = {
913 .rating = 200, 913 .rating = 200,
914 .read = lguest_clock_read, 914 .read = lguest_clock_read,
915 .mask = CLOCKSOURCE_MASK(64), 915 .mask = CLOCKSOURCE_MASK(64),
916 .mult = 1 << 22,
917 .shift = 22,
918 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 916 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
919}; 917};
920 918
@@ -997,7 +995,7 @@ static void lguest_time_init(void)
997 /* Set up the timer interrupt (0) to go to our simple timer routine */ 995 /* Set up the timer interrupt (0) to go to our simple timer routine */
998 irq_set_handler(0, lguest_time_irq); 996 irq_set_handler(0, lguest_time_irq);
999 997
1000 clocksource_register(&lguest_clock); 998 clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
1001 999
1002 /* We can't set cpumask in the initializer: damn C limitations! Set it 1000 /* We can't set cpumask in the initializer: damn C limitations! Set it
1003 * here and register our timer device. */ 1001 * here and register our timer device. */
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index aa4326bfb24a..f2145cfa12a6 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
1#include <linux/linkage.h> 1#include <linux/linkage.h>
2#include <asm/dwarf2.h> 2#include <asm/dwarf2.h>
3#include <asm/alternative-asm.h>
3 4
4/* 5/*
5 * Zero a page. 6 * Zero a page.
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
14 CFI_ENDPROC 15 CFI_ENDPROC
15ENDPROC(clear_page_c) 16ENDPROC(clear_page_c)
16 17
18ENTRY(clear_page_c_e)
19 CFI_STARTPROC
20 movl $4096,%ecx
21 xorl %eax,%eax
22 rep stosb
23 ret
24 CFI_ENDPROC
25ENDPROC(clear_page_c_e)
26
17ENTRY(clear_page) 27ENTRY(clear_page)
18 CFI_STARTPROC 28 CFI_STARTPROC
19 xorl %eax,%eax 29 xorl %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
38.Lclear_page_end: 48.Lclear_page_end:
39ENDPROC(clear_page) 49ENDPROC(clear_page)
40 50
41 /* Some CPUs run faster using the string instructions. 51 /*
42 It is also a lot simpler. Use this when possible */ 52 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
53 * It is recommended to use this when possible.
54 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
55 * Otherwise, use original function.
56 *
57 */
43 58
44#include <asm/cpufeature.h> 59#include <asm/cpufeature.h>
45 60
46 .section .altinstr_replacement,"ax" 61 .section .altinstr_replacement,"ax"
471: .byte 0xeb /* jmp <disp8> */ 621: .byte 0xeb /* jmp <disp8> */
48 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ 63 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
492: 642: .byte 0xeb /* jmp <disp8> */
65 .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
663:
50 .previous 67 .previous
51 .section .altinstructions,"a" 68 .section .altinstructions,"a"
52 .align 8 69 altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
53 .quad clear_page 70 .Lclear_page_end-clear_page, 2b-1b
54 .quad 1b 71 altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
55 .word X86_FEATURE_REP_GOOD 72 .Lclear_page_end-clear_page,3b-2b
56 .byte .Lclear_page_end - clear_page
57 .byte 2b - 1b
58 .previous 73 .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 99e482615195..024840266ba0 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,30 @@
15#include <asm/asm-offsets.h> 15#include <asm/asm-offsets.h>
16#include <asm/thread_info.h> 16#include <asm/thread_info.h>
17#include <asm/cpufeature.h> 17#include <asm/cpufeature.h>
18#include <asm/alternative-asm.h>
18 19
19 .macro ALTERNATIVE_JUMP feature,orig,alt 20/*
21 * By placing feature2 after feature1 in altinstructions section, we logically
22 * implement:
23 * If CPU has feature2, jmp to alt2 is used
24 * else if CPU has feature1, jmp to alt1 is used
25 * else jmp to orig is used.
26 */
27 .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
200: 280:
21 .byte 0xe9 /* 32bit jump */ 29 .byte 0xe9 /* 32bit jump */
22 .long \orig-1f /* by default jump to orig */ 30 .long \orig-1f /* by default jump to orig */
231: 311:
24 .section .altinstr_replacement,"ax" 32 .section .altinstr_replacement,"ax"
252: .byte 0xe9 /* near jump with 32bit immediate */ 332: .byte 0xe9 /* near jump with 32bit immediate */
26 .long \alt-1b /* offset */ /* or alternatively to alt */ 34 .long \alt1-1b /* offset */ /* or alternatively to alt1 */
353: .byte 0xe9 /* near jump with 32bit immediate */
36 .long \alt2-1b /* offset */ /* or alternatively to alt2 */
27 .previous 37 .previous
38
28 .section .altinstructions,"a" 39 .section .altinstructions,"a"
29 .align 8 40 altinstruction_entry 0b,2b,\feature1,5,5
30 .quad 0b 41 altinstruction_entry 0b,3b,\feature2,5,5
31 .quad 2b
32 .word \feature /* when feature is set */
33 .byte 5
34 .byte 5
35 .previous 42 .previous
36 .endm 43 .endm
37 44
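The comment above the macro spells out the precedence the two altinstructions entries produce: ERMS beats REP_GOOD, which beats the unrolled fallback. A rough C rendering of that selection order; the boolean feature tests stand in for the one-time CPUID-based patching, they are not per-call checks the kernel makes:

/* Model of the patch-order precedence described in the macro comment. */
#include <stdio.h>
#include <stdbool.h>

enum copy_variant { GENERIC_UNROLLED, GENERIC_STRING, ENHANCED_FAST_STRING };

static enum copy_variant pick_copy(bool has_rep_good, bool has_erms)
{
    if (has_erms)             /* feature2: ERMS wins if present */
        return ENHANCED_FAST_STRING;
    if (has_rep_good)         /* feature1: fast string otherwise */
        return GENERIC_STRING;
    return GENERIC_UNROLLED;  /* fallback: original unrolled copy */
}

int main(void)
{
    printf("%d %d %d\n",
           pick_copy(false, false),   /* 0: unrolled */
           pick_copy(true,  false),   /* 1: string   */
           pick_copy(true,  true));   /* 2: ERMS     */
    return 0;
}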
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
72 addq %rdx,%rcx 79 addq %rdx,%rcx
73 jc bad_to_user 80 jc bad_to_user
74 cmpq TI_addr_limit(%rax),%rcx 81 cmpq TI_addr_limit(%rax),%rcx
75 jae bad_to_user 82 ja bad_to_user
76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 83 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
84 copy_user_generic_unrolled,copy_user_generic_string, \
85 copy_user_enhanced_fast_string
77 CFI_ENDPROC 86 CFI_ENDPROC
78ENDPROC(_copy_to_user) 87ENDPROC(_copy_to_user)
79 88
@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
85 addq %rdx,%rcx 94 addq %rdx,%rcx
86 jc bad_from_user 95 jc bad_from_user
87 cmpq TI_addr_limit(%rax),%rcx 96 cmpq TI_addr_limit(%rax),%rcx
88 jae bad_from_user 97 ja bad_from_user
89 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 98 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
99 copy_user_generic_unrolled,copy_user_generic_string, \
100 copy_user_enhanced_fast_string
90 CFI_ENDPROC 101 CFI_ENDPROC
91ENDPROC(_copy_from_user) 102ENDPROC(_copy_from_user)
92 103
@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
255 .previous 266 .previous
256 CFI_ENDPROC 267 CFI_ENDPROC
257ENDPROC(copy_user_generic_string) 268ENDPROC(copy_user_generic_string)
269
270/*
271 * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
272 * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
273 *
274 * Input:
275 * rdi destination
276 * rsi source
277 * rdx count
278 *
279 * Output:
280 * eax uncopied bytes or 0 if successful.
281 */
282ENTRY(copy_user_enhanced_fast_string)
283 CFI_STARTPROC
284 andl %edx,%edx
285 jz 2f
286 movl %edx,%ecx
2871: rep
288 movsb
2892: xorl %eax,%eax
290 ret
291
292 .section .fixup,"ax"
29312: movl %ecx,%edx /* ecx is zerorest also */
294 jmp copy_user_handle_tail
295 .previous
296
297 .section __ex_table,"a"
298 .align 8
299 .quad 1b,12b
300 .previous
301 CFI_ENDPROC
302ENDPROC(copy_user_enhanced_fast_string)
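copy_user_enhanced_fast_string boils down to `rep movsb` with the byte count in %ecx, plus an exception-table entry for faulting user accesses. A minimal user-space illustration of the same copy idiom (GNU C, x86-64 only, with none of the kernel's fault handling):

/* Sketch only: copy `len` bytes with `rep movsb`. */
#include <stdio.h>
#include <string.h>

static void *movsb_copy(void *dst, const void *src, size_t len)
{
    void *ret = dst;

    asm volatile("rep movsb"
                 : "+D" (dst), "+S" (src), "+c" (len)  /* rdi, rsi, rcx */
                 :
                 : "memory");
    return ret;
}

int main(void)
{
    char src[] = "enhanced fast string";
    char dst[sizeof(src)];

    movsb_copy(dst, src, sizeof(src));
    printf("%s\n", dst);
    return 0;
}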
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e35e38..efbf2a0ecdea 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
4 4
5#include <asm/cpufeature.h> 5#include <asm/cpufeature.h>
6#include <asm/dwarf2.h> 6#include <asm/dwarf2.h>
7#include <asm/alternative-asm.h>
7 8
8/* 9/*
9 * memcpy - Copy a memory block. 10 * memcpy - Copy a memory block.
@@ -37,6 +38,23 @@
37.Lmemcpy_e: 38.Lmemcpy_e:
38 .previous 39 .previous
39 40
41/*
42 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
43 * memcpy_c. Use memcpy_c_e when possible.
44 *
45 * This gets patched over the unrolled variant (below) via the
46 * alternative instructions framework:
47 */
48 .section .altinstr_replacement, "ax", @progbits
49.Lmemcpy_c_e:
50 movq %rdi, %rax
51
52 movl %edx, %ecx
53 rep movsb
54 ret
55.Lmemcpy_e_e:
56 .previous
57
40ENTRY(__memcpy) 58ENTRY(__memcpy)
41ENTRY(memcpy) 59ENTRY(memcpy)
42 CFI_STARTPROC 60 CFI_STARTPROC
@@ -49,7 +67,7 @@ ENTRY(memcpy)
49 jb .Lhandle_tail 67 jb .Lhandle_tail
50 68
51 /* 69 /*
52 * We check whether memory false dependece could occur, 70 * We check whether memory false dependence could occur,
53 * then jump to corresponding copy mode. 71 * then jump to corresponding copy mode.
54 */ 72 */
55 cmp %dil, %sil 73 cmp %dil, %sil
@@ -171,21 +189,22 @@ ENDPROC(memcpy)
171ENDPROC(__memcpy) 189ENDPROC(__memcpy)
172 190
173 /* 191 /*
174 * Some CPUs run faster using the string copy instructions. 192 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
175 * It is also a lot simpler. Use this when possible: 193 * If the feature is supported, memcpy_c_e() is the first choice.
176 */ 194 * If enhanced rep movsb copy is not available, use fast string copy
177 195 * memcpy_c() when possible. This is faster and code is simpler than
178 .section .altinstructions, "a" 196 * original memcpy().
 179 .align 8 197 * In .altinstructions section, ERMS feature is placed after REP_GOOD
180 .quad memcpy 198 * In .altinstructions section, ERMS feature is placed after REG_GOOD
181 .quad .Lmemcpy_c 199 * feature to implement the right patch order.
182 .word X86_FEATURE_REP_GOOD 200 *
183
184 /*
185 * Replace only beginning, memcpy is used to apply alternatives, 201 * Replace only beginning, memcpy is used to apply alternatives,
186 * so it is silly to overwrite itself with nops - reboot is the 202 * so it is silly to overwrite itself with nops - reboot is the
187 * only outcome... 203 * only outcome...
188 */ 204 */
189 .byte .Lmemcpy_e - .Lmemcpy_c 205 .section .altinstructions, "a"
190 .byte .Lmemcpy_e - .Lmemcpy_c 206 altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
207 .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
208 altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
209 .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
191 .previous 210 .previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0ecb8433e5a8..d0ec9c2936d7 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,6 +8,7 @@
8#define _STRING_C 8#define _STRING_C
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/dwarf2.h> 10#include <asm/dwarf2.h>
11#include <asm/cpufeature.h>
11 12
12#undef memmove 13#undef memmove
13 14
@@ -24,6 +25,7 @@
24 */ 25 */
25ENTRY(memmove) 26ENTRY(memmove)
26 CFI_STARTPROC 27 CFI_STARTPROC
28
27 /* Handle more 32bytes in loop */ 29 /* Handle more 32bytes in loop */
28 mov %rdi, %rax 30 mov %rdi, %rax
29 cmp $0x20, %rdx 31 cmp $0x20, %rdx
@@ -31,8 +33,13 @@ ENTRY(memmove)
31 33
32 /* Decide forward/backward copy mode */ 34 /* Decide forward/backward copy mode */
33 cmp %rdi, %rsi 35 cmp %rdi, %rsi
34 jb 2f 36 jge .Lmemmove_begin_forward
37 mov %rsi, %r8
38 add %rdx, %r8
39 cmp %rdi, %r8
40 jg 2f
35 41
42.Lmemmove_begin_forward:
36 /* 43 /*
37 * movsq instruction have many startup latency 44 * movsq instruction have many startup latency
38 * so we handle small size by general register. 45 * so we handle small size by general register.
@@ -78,6 +85,8 @@ ENTRY(memmove)
78 rep movsq 85 rep movsq
79 movq %r11, (%r10) 86 movq %r11, (%r10)
80 jmp 13f 87 jmp 13f
88.Lmemmove_end_forward:
89
81 /* 90 /*
82 * Handle data backward by movsq. 91 * Handle data backward by movsq.
83 */ 92 */
@@ -194,4 +203,22 @@ ENTRY(memmove)
19413: 20313:
195 retq 204 retq
196 CFI_ENDPROC 205 CFI_ENDPROC
206
207 .section .altinstr_replacement,"ax"
208.Lmemmove_begin_forward_efs:
209 /* Forward moving data. */
210 movq %rdx, %rcx
211 rep movsb
212 retq
213.Lmemmove_end_forward_efs:
214 .previous
215
216 .section .altinstructions,"a"
217 .align 8
218 .quad .Lmemmove_begin_forward
219 .quad .Lmemmove_begin_forward_efs
220 .word X86_FEATURE_ERMS
221 .byte .Lmemmove_end_forward-.Lmemmove_begin_forward
222 .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
223 .previous
197ENDPROC(memmove) 224ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 09d344269652..79bd454b78a3 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4#include <asm/dwarf2.h>
5#include <asm/cpufeature.h>
6#include <asm/alternative-asm.h>
5 7
6/* 8/*
7 * ISO C memset - set a memory block to a byte value. 9 * ISO C memset - set a memory block to a byte value. This function uses fast
10 * string to get better performance than the original function. The code is
 11 * simpler and shorter than the original function as well.
8 * 12 *
9 * rdi destination 13 * rdi destination
10 * rsi value (char) 14 * rsi value (char)
@@ -31,6 +35,28 @@
31.Lmemset_e: 35.Lmemset_e:
32 .previous 36 .previous
33 37
38/*
39 * ISO C memset - set a memory block to a byte value. This function uses
40 * enhanced rep stosb to override the fast string function.
41 * The code is simpler and shorter than the fast string function as well.
42 *
43 * rdi destination
44 * rsi value (char)
45 * rdx count (bytes)
46 *
47 * rax original destination
48 */
49 .section .altinstr_replacement, "ax", @progbits
50.Lmemset_c_e:
51 movq %rdi,%r9
52 movb %sil,%al
53 movl %edx,%ecx
54 rep stosb
55 movq %r9,%rax
56 ret
57.Lmemset_e_e:
58 .previous
59
34ENTRY(memset) 60ENTRY(memset)
35ENTRY(__memset) 61ENTRY(__memset)
36 CFI_STARTPROC 62 CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset)
112ENDPROC(memset) 138ENDPROC(memset)
113ENDPROC(__memset) 139ENDPROC(__memset)
114 140
115 /* Some CPUs run faster using the string instructions. 141 /* Some CPUs support enhanced REP MOVSB/STOSB feature.
116 It is also a lot simpler. Use this when possible */ 142 * It is recommended to use this when possible.
117 143 *
118#include <asm/cpufeature.h> 144 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
119 145 * instructions.
146 *
147 * Otherwise, use original memset function.
148 *
 149 * In .altinstructions section, ERMS feature is placed after REP_GOOD
150 * feature to implement the right patch order.
151 */
120 .section .altinstructions,"a" 152 .section .altinstructions,"a"
121 .align 8 153 altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
122 .quad memset 154 .Lfinal-memset,.Lmemset_e-.Lmemset_c
123 .quad .Lmemset_c 155 altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
124 .word X86_FEATURE_REP_GOOD 156 .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
125 .byte .Lfinal - memset
126 .byte .Lmemset_e - .Lmemset_c
127 .previous 157 .previous
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 3e608edf9958..3d11327c9ab4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,8 +23,8 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
24 24
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat.o
28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o 28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
29 29
30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology.c
index 0919c26820d4..5247d01329ca 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/memblock.h> 14#include <linux/memblock.h>
15#include <linux/bootmem.h>
15 16
16#include <asm/io.h> 17#include <asm/io.h>
17#include <linux/pci_ids.h> 18#include <linux/pci_ids.h>
@@ -69,10 +70,10 @@ static __init void early_get_boot_cpu_id(void)
69 70
70int __init amd_numa_init(void) 71int __init amd_numa_init(void)
71{ 72{
72 unsigned long start = PFN_PHYS(0); 73 u64 start = PFN_PHYS(0);
73 unsigned long end = PFN_PHYS(max_pfn); 74 u64 end = PFN_PHYS(max_pfn);
74 unsigned numnodes; 75 unsigned numnodes;
75 unsigned long prevbase; 76 u64 prevbase;
76 int i, j, nb; 77 int i, j, nb;
77 u32 nodeid, reg; 78 u32 nodeid, reg;
78 unsigned int bits, cores, apicid_base; 79 unsigned int bits, cores, apicid_base;
@@ -95,7 +96,7 @@ int __init amd_numa_init(void)
95 96
96 prevbase = 0; 97 prevbase = 0;
97 for (i = 0; i < 8; i++) { 98 for (i = 0; i < 8; i++) {
98 unsigned long base, limit; 99 u64 base, limit;
99 100
100 base = read_pci_config(0, nb, 1, 0x40 + i*8); 101 base = read_pci_config(0, nb, 1, 0x40 + i*8);
101 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 102 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -107,18 +108,18 @@ int __init amd_numa_init(void)
107 continue; 108 continue;
108 } 109 }
109 if (nodeid >= numnodes) { 110 if (nodeid >= numnodes) {
110 pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, 111 pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
111 base, limit); 112 base, limit);
112 continue; 113 continue;
113 } 114 }
114 115
115 if (!limit) { 116 if (!limit) {
116 pr_info("Skipping node entry %d (base %lx)\n", 117 pr_info("Skipping node entry %d (base %Lx)\n",
117 i, base); 118 i, base);
118 continue; 119 continue;
119 } 120 }
120 if ((base >> 8) & 3 || (limit >> 8) & 3) { 121 if ((base >> 8) & 3 || (limit >> 8) & 3) {
121 pr_err("Node %d using interleaving mode %lx/%lx\n", 122 pr_err("Node %d using interleaving mode %Lx/%Lx\n",
122 nodeid, (base >> 8) & 3, (limit >> 8) & 3); 123 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
123 return -EINVAL; 124 return -EINVAL;
124 } 125 }
@@ -150,19 +151,19 @@ int __init amd_numa_init(void)
150 continue; 151 continue;
151 } 152 }
152 if (limit < base) { 153 if (limit < base) {
153 pr_err("Node %d bogus settings %lx-%lx.\n", 154 pr_err("Node %d bogus settings %Lx-%Lx.\n",
154 nodeid, base, limit); 155 nodeid, base, limit);
155 continue; 156 continue;
156 } 157 }
157 158
158 /* Could sort here, but pun for now. Should not happen anyroads. */ 159 /* Could sort here, but pun for now. Should not happen anyroads. */
159 if (prevbase > base) { 160 if (prevbase > base) {
160 pr_err("Node map not sorted %lx,%lx\n", 161 pr_err("Node map not sorted %Lx,%Lx\n",
161 prevbase, base); 162 prevbase, base);
162 return -EINVAL; 163 return -EINVAL;
163 } 164 }
164 165
165 pr_info("Node %d MemBase %016lx Limit %016lx\n", 166 pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
166 nodeid, base, limit); 167 nodeid, base, limit);
167 168
168 prevbase = base; 169 prevbase = base;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 20e3f8702d1e..f7a2a054a3c0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -12,6 +12,7 @@
12#include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/perf_event.h> /* perf_sw_event */ 13#include <linux/perf_event.h> /* perf_sw_event */
14#include <linux/hugetlb.h> /* hstate_index_to_shift */ 14#include <linux/hugetlb.h> /* hstate_index_to_shift */
15#include <linux/prefetch.h> /* prefetchw */
15 16
16#include <asm/traps.h> /* dotraplinkage, ... */ 17#include <asm/traps.h> /* dotraplinkage, ... */
17#include <asm/pgalloc.h> /* pgd_*(), ... */ 18#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -964,7 +965,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
964 struct mm_struct *mm; 965 struct mm_struct *mm;
965 int fault; 966 int fault;
966 int write = error_code & PF_WRITE; 967 int write = error_code & PF_WRITE;
967 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | 968 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
968 (write ? FAULT_FLAG_WRITE : 0); 969 (write ? FAULT_FLAG_WRITE : 0);
969 970
970 tsk = current; 971 tsk = current;
@@ -1138,6 +1139,16 @@ good_area:
1138 } 1139 }
1139 1140
1140 /* 1141 /*
1142 * Pagefault was interrupted by SIGKILL. We have no reason to
1143 * continue pagefault.
1144 */
1145 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
1146 if (!(error_code & PF_USER))
1147 no_context(regs, error_code, address);
1148 return;
1149 }
1150
1151 /*
1141 * Major/minor page fault accounting is only done on the 1152 * Major/minor page fault accounting is only done on the
1142 * initial attempt. If we go through a retry, it is extremely 1153 * initial attempt. If we go through a retry, it is extremely
1143 * likely that the page will be found in page cache at that point. 1154 * likely that the page will be found in page cache at that point.
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index d4203988504a..f581a18c0d4d 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
72 if (!vma_shareable(vma, addr)) 72 if (!vma_shareable(vma, addr))
73 return; 73 return;
74 74
75 spin_lock(&mapping->i_mmap_lock); 75 mutex_lock(&mapping->i_mmap_mutex);
76 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { 76 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
77 if (svma == vma) 77 if (svma == vma)
78 continue; 78 continue;
@@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
97 put_page(virt_to_page(spte)); 97 put_page(virt_to_page(spte));
98 spin_unlock(&mm->page_table_lock); 98 spin_unlock(&mm->page_table_lock);
99out: 99out:
100 spin_unlock(&mapping->i_mmap_lock); 100 mutex_unlock(&mapping->i_mmap_mutex);
101} 101}
102 102
103/* 103/*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 286d289b039b..30326443ab81 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -16,8 +16,6 @@
16#include <asm/tlb.h> 16#include <asm/tlb.h>
17#include <asm/proto.h> 17#include <asm/proto.h>
18 18
19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
20
21unsigned long __initdata pgt_buf_start; 19unsigned long __initdata pgt_buf_start;
22unsigned long __meminitdata pgt_buf_end; 20unsigned long __meminitdata pgt_buf_end;
23unsigned long __meminitdata pgt_buf_top; 21unsigned long __meminitdata pgt_buf_top;
@@ -81,6 +79,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
81 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); 79 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
82} 80}
83 81
82void __init native_pagetable_reserve(u64 start, u64 end)
83{
84 memblock_x86_reserve_range(start, end, "PGTABLE");
85}
86
84struct map_range { 87struct map_range {
85 unsigned long start; 88 unsigned long start;
86 unsigned long end; 89 unsigned long end;
@@ -272,9 +275,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
272 275
273 __flush_tlb_all(); 276 __flush_tlb_all();
274 277
278 /*
279 * Reserve the kernel pagetable pages we used (pgt_buf_start -
280 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
281 * so that they can be reused for other purposes.
282 *
283 * On native it just means calling memblock_x86_reserve_range, on Xen it
284 * also means marking RW the pagetable pages that we allocated before
285 * but that haven't been used.
286 *
287 * In fact on xen we mark RO the whole range pgt_buf_start -
288 * pgt_buf_top, because we have to make sure that when
289 * init_memory_mapping reaches the pagetable pages area, it maps
290 * RO all the pagetable pages, including the ones that are beyond
291 * pgt_buf_end at that time.
292 */
275 if (!after_bootmem && pgt_buf_end > pgt_buf_start) 293 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
276 memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, 294 x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
277 pgt_buf_end << PAGE_SHIFT, "PGTABLE"); 295 PFN_PHYS(pgt_buf_end));
278 296
279 if (!after_bootmem) 297 if (!after_bootmem)
280 early_memtest(start, end); 298 early_memtest(start, end);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 80088f994193..29f7c6d98179 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -678,8 +678,10 @@ static void __init zone_sizes_init(void)
678{ 678{
679 unsigned long max_zone_pfns[MAX_NR_ZONES]; 679 unsigned long max_zone_pfns[MAX_NR_ZONES];
680 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 680 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
681#ifdef CONFIG_ZONE_DMA
681 max_zone_pfns[ZONE_DMA] = 682 max_zone_pfns[ZONE_DMA] =
682 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 683 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
684#endif
683 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 685 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
684#ifdef CONFIG_HIGHMEM 686#ifdef CONFIG_HIGHMEM
685 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; 687 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
@@ -716,6 +718,7 @@ void __init paging_init(void)
716 * NOTE: at this point the bootmem allocator is fully available. 718 * NOTE: at this point the bootmem allocator is fully available.
717 */ 719 */
718 olpc_dt_build_devicetree(); 720 olpc_dt_build_devicetree();
721 sparse_memory_present_with_active_regions(MAX_NUMNODES);
719 sparse_init(); 722 sparse_init();
720 zone_sizes_init(); 723 zone_sizes_init();
721} 724}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 794233587287..d865c4aeec55 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -616,7 +616,9 @@ void __init paging_init(void)
616 unsigned long max_zone_pfns[MAX_NR_ZONES]; 616 unsigned long max_zone_pfns[MAX_NR_ZONES];
617 617
618 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 618 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
619#ifdef CONFIG_ZONE_DMA
619 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 620 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
621#endif
620 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 622 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
621 max_zone_pfns[ZONE_NORMAL] = max_pfn; 623 max_zone_pfns[ZONE_NORMAL] = max_pfn;
622 624
@@ -679,14 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size)
679} 681}
680EXPORT_SYMBOL_GPL(arch_add_memory); 682EXPORT_SYMBOL_GPL(arch_add_memory);
681 683
682#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
683int memory_add_physaddr_to_nid(u64 start)
684{
685 return 0;
686}
687EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
688#endif
689
690#endif /* CONFIG_MEMORY_HOTPLUG */ 684#endif /* CONFIG_MEMORY_HOTPLUG */
691 685
692static struct kcore_list kcore_vsyscall; 686static struct kcore_list kcore_vsyscall;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0369843511dc..be1ef574ce9a 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
91 return (__force void __iomem *)phys_to_virt(phys_addr); 91 return (__force void __iomem *)phys_to_virt(phys_addr);
92 92
93 /* 93 /*
94 * Check if the request spans more than any BAR in the iomem resource
95 * tree.
96 */
97 WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
98 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
99
100 /*
101 * Don't allow anybody to remap normal RAM that we're using.. 94 * Don't allow anybody to remap normal RAM that we're using..
102 */ 95 */
103 last_pfn = last_addr >> PAGE_SHIFT; 96 last_pfn = last_addr >> PAGE_SHIFT;
@@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
170 ret_addr = (void __iomem *) (vaddr + offset); 163 ret_addr = (void __iomem *) (vaddr + offset);
171 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); 164 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
172 165
166 /*
167 * Check if the request spans more than any BAR in the iomem resource
168 * tree.
169 */
170 WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
171 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
172
173 return ret_addr; 173 return ret_addr;
174err_free_area: 174err_free_area:
175 free_vm_area(area); 175 free_vm_area(area);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 9559d360fde7..f5510d889a22 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -1,11 +1,39 @@
1/* Common code for 32 and 64-bit NUMA */ 1/* Common code for 32 and 64-bit NUMA */
2#include <linux/topology.h> 2#include <linux/kernel.h>
3#include <linux/module.h> 3#include <linux/mm.h>
4#include <linux/string.h>
5#include <linux/init.h>
4#include <linux/bootmem.h> 6#include <linux/bootmem.h>
5#include <asm/numa.h> 7#include <linux/memblock.h>
8#include <linux/mmzone.h>
9#include <linux/ctype.h>
10#include <linux/module.h>
11#include <linux/nodemask.h>
12#include <linux/sched.h>
13#include <linux/topology.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
6#include <asm/acpi.h> 18#include <asm/acpi.h>
19#include <asm/amd_nb.h>
20
21#include "numa_internal.h"
7 22
8int __initdata numa_off; 23int __initdata numa_off;
24nodemask_t numa_nodes_parsed __initdata;
25
26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
27EXPORT_SYMBOL(node_data);
28
29static struct numa_meminfo numa_meminfo
30#ifndef CONFIG_MEMORY_HOTPLUG
31__initdata
32#endif
33;
34
35static int numa_distance_cnt;
36static u8 *numa_distance;
9 37
10static __init int numa_setup(char *opt) 38static __init int numa_setup(char *opt)
11{ 39{
@@ -32,6 +60,15 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
32 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 60 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
33}; 61};
34 62
63int __cpuinit numa_cpu_node(int cpu)
64{
65 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
66
67 if (apicid != BAD_APICID)
68 return __apicid_to_node[apicid];
69 return NUMA_NO_NODE;
70}
71
35cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 72cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
36EXPORT_SYMBOL(node_to_cpumask_map); 73EXPORT_SYMBOL(node_to_cpumask_map);
37 74
@@ -95,6 +132,407 @@ void __init setup_node_to_cpumask_map(void)
95 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 132 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
96} 133}
97 134
135static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
136 struct numa_meminfo *mi)
137{
138 /* ignore zero length blks */
139 if (start == end)
140 return 0;
141
142 /* whine about and ignore invalid blks */
143 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
144 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
145 nid, start, end);
146 return 0;
147 }
148
149 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
150 pr_err("NUMA: too many memblk ranges\n");
151 return -EINVAL;
152 }
153
154 mi->blk[mi->nr_blks].start = start;
155 mi->blk[mi->nr_blks].end = end;
156 mi->blk[mi->nr_blks].nid = nid;
157 mi->nr_blks++;
158 return 0;
159}
160
161/**
162 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
163 * @idx: Index of memblk to remove
164 * @mi: numa_meminfo to remove memblk from
165 *
166 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
167 * decrementing @mi->nr_blks.
168 */
169void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
170{
171 mi->nr_blks--;
172 memmove(&mi->blk[idx], &mi->blk[idx + 1],
173 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
174}
175
176/**
177 * numa_add_memblk - Add one numa_memblk to numa_meminfo
178 * @nid: NUMA node ID of the new memblk
179 * @start: Start address of the new memblk
180 * @end: End address of the new memblk
181 *
182 * Add a new memblk to the default numa_meminfo.
183 *
184 * RETURNS:
185 * 0 on success, -errno on failure.
186 */
187int __init numa_add_memblk(int nid, u64 start, u64 end)
188{
189 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
190}
191
192/* Initialize NODE_DATA for a node on the local memory */
193static void __init setup_node_data(int nid, u64 start, u64 end)
194{
195 const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
196 const u64 nd_high = PFN_PHYS(max_pfn_mapped);
197 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
198 bool remapped = false;
199 u64 nd_pa;
200 void *nd;
201 int tnid;
202
203 /*
204 * Don't confuse VM with a node that doesn't have the
205 * minimum amount of memory:
206 */
207 if (end && (end - start) < NODE_MIN_SIZE)
208 return;
209
210 /* initialize remap allocator before aligning to ZONE_ALIGN */
211 init_alloc_remap(nid, start, end);
212
213 start = roundup(start, ZONE_ALIGN);
214
215 printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
216 nid, start, end);
217
218 /*
219 * Allocate node data. Try remap allocator first, node-local
220 * memory and then any node. Never allocate in DMA zone.
221 */
222 nd = alloc_remap(nid, nd_size);
223 if (nd) {
224 nd_pa = __pa(nd);
225 remapped = true;
226 } else {
227 nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
228 nd_size, SMP_CACHE_BYTES);
229 if (nd_pa == MEMBLOCK_ERROR)
230 nd_pa = memblock_find_in_range(nd_low, nd_high,
231 nd_size, SMP_CACHE_BYTES);
232 if (nd_pa == MEMBLOCK_ERROR) {
233 pr_err("Cannot find %zu bytes in node %d\n",
234 nd_size, nid);
235 return;
236 }
237 memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
238 nd = __va(nd_pa);
239 }
240
241 /* report and initialize */
242 printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
243 nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
244 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
245 if (!remapped && tnid != nid)
246 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
247
248 node_data[nid] = nd;
249 memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
250 NODE_DATA(nid)->node_id = nid;
251 NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
252 NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
253
254 node_set_online(nid);
255}
256
257/**
258 * numa_cleanup_meminfo - Cleanup a numa_meminfo
259 * @mi: numa_meminfo to clean up
260 *
 261 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
262 * conflicts and clear unused memblks.
263 *
264 * RETURNS:
265 * 0 on success, -errno on failure.
266 */
267int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
268{
269 const u64 low = 0;
270 const u64 high = PFN_PHYS(max_pfn);
271 int i, j, k;
272
273 /* first, trim all entries */
274 for (i = 0; i < mi->nr_blks; i++) {
275 struct numa_memblk *bi = &mi->blk[i];
276
277 /* make sure all blocks are inside the limits */
278 bi->start = max(bi->start, low);
279 bi->end = min(bi->end, high);
280
281 /* and there's no empty block */
282 if (bi->start >= bi->end)
283 numa_remove_memblk_from(i--, mi);
284 }
285
286 /* merge neighboring / overlapping entries */
287 for (i = 0; i < mi->nr_blks; i++) {
288 struct numa_memblk *bi = &mi->blk[i];
289
290 for (j = i + 1; j < mi->nr_blks; j++) {
291 struct numa_memblk *bj = &mi->blk[j];
292 u64 start, end;
293
294 /*
295 * See whether there are overlapping blocks. Whine
296 * about but allow overlaps of the same nid. They
297 * will be merged below.
298 */
299 if (bi->end > bj->start && bi->start < bj->end) {
300 if (bi->nid != bj->nid) {
301 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
302 bi->nid, bi->start, bi->end,
303 bj->nid, bj->start, bj->end);
304 return -EINVAL;
305 }
306 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
307 bi->nid, bi->start, bi->end,
308 bj->start, bj->end);
309 }
310
311 /*
312 * Join together blocks on the same node, holes
313 * between which don't overlap with memory on other
314 * nodes.
315 */
316 if (bi->nid != bj->nid)
317 continue;
318 start = min(bi->start, bj->start);
319 end = max(bi->end, bj->end);
320 for (k = 0; k < mi->nr_blks; k++) {
321 struct numa_memblk *bk = &mi->blk[k];
322
323 if (bi->nid == bk->nid)
324 continue;
325 if (start < bk->end && end > bk->start)
326 break;
327 }
328 if (k < mi->nr_blks)
329 continue;
330 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
331 bi->nid, bi->start, bi->end, bj->start, bj->end,
332 start, end);
333 bi->start = start;
334 bi->end = end;
335 numa_remove_memblk_from(j--, mi);
336 }
337 }
338
339 /* clear unused ones */
340 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
341 mi->blk[i].start = mi->blk[i].end = 0;
342 mi->blk[i].nid = NUMA_NO_NODE;
343 }
344
345 return 0;
346}
347
348/*
349 * Set nodes, which have memory in @mi, in *@nodemask.
350 */
351static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
352 const struct numa_meminfo *mi)
353{
354 int i;
355
356 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
357 if (mi->blk[i].start != mi->blk[i].end &&
358 mi->blk[i].nid != NUMA_NO_NODE)
359 node_set(mi->blk[i].nid, *nodemask);
360}
361
362/**
363 * numa_reset_distance - Reset NUMA distance table
364 *
365 * The current table is freed. The next numa_set_distance() call will
366 * create a new one.
367 */
368void __init numa_reset_distance(void)
369{
370 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
371
372 /* numa_distance could be 1LU marking allocation failure, test cnt */
373 if (numa_distance_cnt)
374 memblock_x86_free_range(__pa(numa_distance),
375 __pa(numa_distance) + size);
376 numa_distance_cnt = 0;
377 numa_distance = NULL; /* enable table creation */
378}
379
380static int __init numa_alloc_distance(void)
381{
382 nodemask_t nodes_parsed;
383 size_t size;
384 int i, j, cnt = 0;
385 u64 phys;
386
387 /* size the new table and allocate it */
388 nodes_parsed = numa_nodes_parsed;
389 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
390
391 for_each_node_mask(i, nodes_parsed)
392 cnt = i;
393 cnt++;
394 size = cnt * cnt * sizeof(numa_distance[0]);
395
396 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
397 size, PAGE_SIZE);
398 if (phys == MEMBLOCK_ERROR) {
399 pr_warning("NUMA: Warning: can't allocate distance table!\n");
400 /* don't retry until explicitly reset */
401 numa_distance = (void *)1LU;
402 return -ENOMEM;
403 }
404 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
405
406 numa_distance = __va(phys);
407 numa_distance_cnt = cnt;
408
409 /* fill with the default distances */
410 for (i = 0; i < cnt; i++)
411 for (j = 0; j < cnt; j++)
412 numa_distance[i * cnt + j] = i == j ?
413 LOCAL_DISTANCE : REMOTE_DISTANCE;
414 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
415
416 return 0;
417}
418
419/**
 420 * numa_set_distance - Set NUMA distance from one NUMA node to another
421 * @from: the 'from' node to set distance
422 * @to: the 'to' node to set distance
423 * @distance: NUMA distance
424 *
425 * Set the distance from node @from to @to to @distance. If distance table
426 * doesn't exist, one which is large enough to accommodate all the currently
427 * known nodes will be created.
428 *
429 * If such table cannot be allocated, a warning is printed and further
430 * calls are ignored until the distance table is reset with
431 * numa_reset_distance().
432 *
433 * If @from or @to is higher than the highest known node at the time of
434 * table creation or @distance doesn't make sense, the call is ignored.
435 * This is to allow simplification of specific NUMA config implementations.
436 */
437void __init numa_set_distance(int from, int to, int distance)
438{
439 if (!numa_distance && numa_alloc_distance() < 0)
440 return;
441
442 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
443 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
444 from, to, distance);
445 return;
446 }
447
448 if ((u8)distance != distance ||
449 (from == to && distance != LOCAL_DISTANCE)) {
450 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
451 from, to, distance);
452 return;
453 }
454
455 numa_distance[from * numa_distance_cnt + to] = distance;
456}
457
458int __node_distance(int from, int to)
459{
460 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
461 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
462 return numa_distance[from * numa_distance_cnt + to];
463}
464EXPORT_SYMBOL(__node_distance);
465
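The table set up above is a flat cnt-by-cnt array of u8 distances indexed row-major; for example, with two nodes and a made-up remote distance of 21:

/*
 *                to node 0   to node 1
 *   from node 0     10          21        numa_distance[0 * 2 + 1] == 21
 *   from node 1     21          10        numa_distance[1 * 2 + 0] == 21
 *
 * 10 is LOCAL_DISTANCE; __node_distance(0, 1) returns 21, and any node
 * index >= cnt falls back to the LOCAL/REMOTE_DISTANCE defaults.
 */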
466/*
467 * Sanity check to catch more bad NUMA configurations (they are amazingly
468 * common). Make sure the nodes cover all memory.
469 */
470static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
471{
472 u64 numaram, e820ram;
473 int i;
474
475 numaram = 0;
476 for (i = 0; i < mi->nr_blks; i++) {
477 u64 s = mi->blk[i].start >> PAGE_SHIFT;
478 u64 e = mi->blk[i].end >> PAGE_SHIFT;
479 numaram += e - s;
480 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
481 if ((s64)numaram < 0)
482 numaram = 0;
483 }
484
485 e820ram = max_pfn - (memblock_x86_hole_size(0,
486 PFN_PHYS(max_pfn)) >> PAGE_SHIFT);
487 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
488 if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
489 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
490 (numaram << PAGE_SHIFT) >> 20,
491 (e820ram << PAGE_SHIFT) >> 20);
492 return false;
493 }
494 return true;
495}
496
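The slack allowed in the comparison above is one megabyte expressed in pages:

/* with PAGE_SHIFT == 12: 1 << (20 - PAGE_SHIFT) == 256 pages == 1 MiB */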
497static int __init numa_register_memblks(struct numa_meminfo *mi)
498{
499 int i, nid;
500
501 /* Account for nodes with cpus and no memory */
502 node_possible_map = numa_nodes_parsed;
503 numa_nodemask_from_meminfo(&node_possible_map, mi);
504 if (WARN_ON(nodes_empty(node_possible_map)))
505 return -EINVAL;
506
507 for (i = 0; i < mi->nr_blks; i++)
508 memblock_x86_register_active_regions(mi->blk[i].nid,
509 mi->blk[i].start >> PAGE_SHIFT,
510 mi->blk[i].end >> PAGE_SHIFT);
511
512 /* for out of order entries */
513 sort_node_map();
514 if (!numa_meminfo_cover_memory(mi))
515 return -EINVAL;
516
517 /* Finally register nodes. */
518 for_each_node_mask(nid, node_possible_map) {
519 u64 start = PFN_PHYS(max_pfn);
520 u64 end = 0;
521
522 for (i = 0; i < mi->nr_blks; i++) {
523 if (nid != mi->blk[i].nid)
524 continue;
525 start = min(mi->blk[i].start, start);
526 end = max(mi->blk[i].end, end);
527 }
528
529 if (start < end)
530 setup_node_data(nid, start, end);
531 }
532
533 return 0;
534}
535
98/* 536/*
99 * There are unfortunately some poorly designed mainboards around that 537 * There are unfortunately some poorly designed mainboards around that
100 * only connect memory to a single CPU. This breaks the 1:1 cpu->node 538 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
@@ -102,7 +540,7 @@ void __init setup_node_to_cpumask_map(void)
102 * as the number of CPUs is not known yet. We round robin the existing 540 * as the number of CPUs is not known yet. We round robin the existing
103 * nodes. 541 * nodes.
104 */ 542 */
105void __init numa_init_array(void) 543static void __init numa_init_array(void)
106{ 544{
107 int rr, i; 545 int rr, i;
108 546
@@ -117,6 +555,95 @@ void __init numa_init_array(void)
117 } 555 }
118} 556}
119 557
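numa_init_array() itself is unchanged apart from becoming static; the round-robin the comment above describes looks roughly like this (sketch, not quoted from the patch):

static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;		/* CPU already has a node */
		numa_set_node(i, rr);		/* hand out online nodes in turn */
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
	}
}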
558static int __init numa_init(int (*init_func)(void))
559{
560 int i;
561 int ret;
562
563 for (i = 0; i < MAX_LOCAL_APIC; i++)
564 set_apicid_to_node(i, NUMA_NO_NODE);
565
566 nodes_clear(numa_nodes_parsed);
567 nodes_clear(node_possible_map);
568 nodes_clear(node_online_map);
569 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
570 remove_all_active_ranges();
571 numa_reset_distance();
572
573 ret = init_func();
574 if (ret < 0)
575 return ret;
576 ret = numa_cleanup_meminfo(&numa_meminfo);
577 if (ret < 0)
578 return ret;
579
580 numa_emulation(&numa_meminfo, numa_distance_cnt);
581
582 ret = numa_register_memblks(&numa_meminfo);
583 if (ret < 0)
584 return ret;
585
586 for (i = 0; i < nr_cpu_ids; i++) {
587 int nid = early_cpu_to_node(i);
588
589 if (nid == NUMA_NO_NODE)
590 continue;
591 if (!node_online(nid))
592 numa_clear_node(i);
593 }
594 numa_init_array();
595 return 0;
596}
597
598/**
599 * dummy_numa_init - Fallback dummy NUMA init
600 *
601 * Used if there's no underlying NUMA architecture, NUMA initialization
602 * fails, or NUMA is disabled on the command line.
603 *
604 * Must online at least one node and add memory blocks that cover all
605 * allowed memory. This function must not fail.
606 */
607static int __init dummy_numa_init(void)
608{
609 printk(KERN_INFO "%s\n",
610 numa_off ? "NUMA turned off" : "No NUMA configuration found");
611 printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
612 0LLU, PFN_PHYS(max_pfn));
613
614 node_set(0, numa_nodes_parsed);
615 numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
616
617 return 0;
618}
619
620/**
621 * x86_numa_init - Initialize NUMA
622 *
623 * Try each configured NUMA initialization method until one succeeds. The
 624 * last fallback is dummy single node config encompassing whole memory and
625 * never fails.
626 */
627void __init x86_numa_init(void)
628{
629 if (!numa_off) {
630#ifdef CONFIG_X86_NUMAQ
631 if (!numa_init(numaq_numa_init))
632 return;
633#endif
634#ifdef CONFIG_ACPI_NUMA
635 if (!numa_init(x86_acpi_numa_init))
636 return;
637#endif
638#ifdef CONFIG_AMD_NUMA
639 if (!numa_init(amd_numa_init))
640 return;
641#endif
642 }
643
644 numa_init(dummy_numa_init);
645}
646
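The registration interfaces consolidated above (numa_nodes_parsed, numa_add_memblk(), numa_set_distance()) are what the platform parsers tried by x86_numa_init() call to describe the machine. A minimal, entirely hypothetical init_func illustrating that contract (node layout and distance values are made up):

static int __init example_numa_init(void)
{
	/* two made-up nodes: [0, 2G) on node 0, [2G, 4G) on node 1 */
	node_set(0, numa_nodes_parsed);
	node_set(1, numa_nodes_parsed);

	if (numa_add_memblk(0, 0, 0x80000000ULL) < 0)
		return -EINVAL;
	if (numa_add_memblk(1, 0x80000000ULL, 0x100000000ULL) < 0)
		return -EINVAL;

	/* optional SLIT-style distances; the table is sized on first use */
	numa_set_distance(0, 1, 21);
	numa_set_distance(1, 0, 21);

	return 0;
}

/* would be tried from x86_numa_init() like the real parsers:
 *	if (!numa_init(example_numa_init))
 *		return;
 */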
120static __init int find_near_online_node(int node) 647static __init int find_near_online_node(int node)
121{ 648{
122 int n, val; 649 int n, val;
@@ -213,53 +740,48 @@ int early_cpu_to_node(int cpu)
213 return per_cpu(x86_cpu_to_node_map, cpu); 740 return per_cpu(x86_cpu_to_node_map, cpu);
214} 741}
215 742
216struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) 743void debug_cpumask_set_cpu(int cpu, int node, bool enable)
217{ 744{
218 int node = early_cpu_to_node(cpu);
219 struct cpumask *mask; 745 struct cpumask *mask;
220 char buf[64]; 746 char buf[64];
221 747
222 if (node == NUMA_NO_NODE) { 748 if (node == NUMA_NO_NODE) {
223 /* early_cpu_to_node() already emits a warning and trace */ 749 /* early_cpu_to_node() already emits a warning and trace */
224 return NULL; 750 return;
225 } 751 }
226 mask = node_to_cpumask_map[node]; 752 mask = node_to_cpumask_map[node];
227 if (!mask) { 753 if (!mask) {
228 pr_err("node_to_cpumask_map[%i] NULL\n", node); 754 pr_err("node_to_cpumask_map[%i] NULL\n", node);
229 dump_stack(); 755 dump_stack();
230 return NULL; 756 return;
231 } 757 }
232 758
759 if (enable)
760 cpumask_set_cpu(cpu, mask);
761 else
762 cpumask_clear_cpu(cpu, mask);
763
233 cpulist_scnprintf(buf, sizeof(buf), mask); 764 cpulist_scnprintf(buf, sizeof(buf), mask);
234 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 765 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
235 enable ? "numa_add_cpu" : "numa_remove_cpu", 766 enable ? "numa_add_cpu" : "numa_remove_cpu",
236 cpu, node, buf); 767 cpu, node, buf);
237 return mask; 768 return;
238} 769}
239 770
240# ifndef CONFIG_NUMA_EMU 771# ifndef CONFIG_NUMA_EMU
241static void __cpuinit numa_set_cpumask(int cpu, int enable) 772static void __cpuinit numa_set_cpumask(int cpu, bool enable)
242{ 773{
243 struct cpumask *mask; 774 debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
244
245 mask = debug_cpumask_set_cpu(cpu, enable);
246 if (!mask)
247 return;
248
249 if (enable)
250 cpumask_set_cpu(cpu, mask);
251 else
252 cpumask_clear_cpu(cpu, mask);
253} 775}
254 776
255void __cpuinit numa_add_cpu(int cpu) 777void __cpuinit numa_add_cpu(int cpu)
256{ 778{
257 numa_set_cpumask(cpu, 1); 779 numa_set_cpumask(cpu, true);
258} 780}
259 781
260void __cpuinit numa_remove_cpu(int cpu) 782void __cpuinit numa_remove_cpu(int cpu)
261{ 783{
262 numa_set_cpumask(cpu, 0); 784 numa_set_cpumask(cpu, false);
263} 785}
264# endif /* !CONFIG_NUMA_EMU */ 786# endif /* !CONFIG_NUMA_EMU */
265 787
@@ -287,3 +809,18 @@ const struct cpumask *cpumask_of_node(int node)
287EXPORT_SYMBOL(cpumask_of_node); 809EXPORT_SYMBOL(cpumask_of_node);
288 810
289#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 811#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
812
813#ifdef CONFIG_MEMORY_HOTPLUG
814int memory_add_physaddr_to_nid(u64 start)
815{
816 struct numa_meminfo *mi = &numa_meminfo;
817 int nid = mi->blk[0].nid;
818 int i;
819
820 for (i = 0; i < mi->nr_blks; i++)
821 if (mi->blk[i].start <= start && mi->blk[i].end > start)
822 nid = mi->blk[i].nid;
823 return nid;
824}
825EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
826#endif
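Worked example for the lookup above (block layout made up): with node 0 covering [0, 2G) and node 1 covering [2G, 4G) in numa_meminfo:

/*
 *   memory_add_physaddr_to_nid(0x90000000ULL)  -> 1   (inside blk[1])
 *   memory_add_physaddr_to_nid(0x200000000ULL) -> 0   (no matching block;
 *                                                      falls back to blk[0]'s nid)
 */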
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index bde3906420df..849a975d3fa0 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -22,39 +22,11 @@
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 23 */
24 24
25#include <linux/mm.h>
26#include <linux/bootmem.h> 25#include <linux/bootmem.h>
27#include <linux/memblock.h> 26#include <linux/memblock.h>
28#include <linux/mmzone.h>
29#include <linux/highmem.h>
30#include <linux/initrd.h>
31#include <linux/nodemask.h>
32#include <linux/module.h> 27#include <linux/module.h>
33#include <linux/kexec.h>
34#include <linux/pfn.h>
35#include <linux/swap.h>
36#include <linux/acpi.h>
37
38#include <asm/e820.h>
39#include <asm/setup.h>
40#include <asm/mmzone.h>
41#include <asm/bios_ebda.h>
42#include <asm/proto.h>
43
44struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
45EXPORT_SYMBOL(node_data);
46
47/*
48 * numa interface - we expect the numa architecture specific code to have
49 * populated the following initialisation.
50 *
51 * 1) node_online_map - the map of all nodes configured (online) in the system
52 * 2) node_start_pfn - the starting page frame number for a node
53 * 3) node_end_pfn - the ending page fram number for a node
54 */
55unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
56unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
57 28
29#include "numa_internal.h"
58 30
59#ifdef CONFIG_DISCONTIGMEM 31#ifdef CONFIG_DISCONTIGMEM
60/* 32/*
@@ -99,108 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
99} 71}
100#endif 72#endif
101 73
102extern unsigned long find_max_low_pfn(void);
103extern unsigned long highend_pfn, highstart_pfn; 74extern unsigned long highend_pfn, highstart_pfn;
104 75
105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 76#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
106 77
107unsigned long node_remap_size[MAX_NUMNODES];
108static void *node_remap_start_vaddr[MAX_NUMNODES]; 78static void *node_remap_start_vaddr[MAX_NUMNODES];
109void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 79void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
110 80
111static unsigned long kva_start_pfn;
112static unsigned long kva_pages;
113
114int __cpuinit numa_cpu_node(int cpu)
115{
116 return apic->x86_32_numa_cpu_node(cpu);
117}
118
119/*
120 * FLAT - support for basic PC memory model with discontig enabled, essentially
121 * a single node with all available processors in it with a flat
122 * memory map.
123 */
124int __init get_memcfg_numa_flat(void)
125{
126 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
127
128 node_start_pfn[0] = 0;
129 node_end_pfn[0] = max_pfn;
130 memblock_x86_register_active_regions(0, 0, max_pfn);
131 memory_present(0, 0, max_pfn);
132 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
133
134 /* Indicate there is one node available. */
135 nodes_clear(node_online_map);
136 node_set_online(0);
137 return 1;
138}
139
140/*
141 * Find the highest page frame number we have available for the node
142 */
143static void __init propagate_e820_map_node(int nid)
144{
145 if (node_end_pfn[nid] > max_pfn)
146 node_end_pfn[nid] = max_pfn;
147 /*
148 * if a user has given mem=XXXX, then we need to make sure
149 * that the node _starts_ before that, too, not just ends
150 */
151 if (node_start_pfn[nid] > max_pfn)
152 node_start_pfn[nid] = max_pfn;
153 BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
154}
155
156/*
157 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
158 * method. For node zero take this from the bottom of memory, for
159 * subsequent nodes place them at node_remap_start_vaddr which contains
160 * node local data in physically node local memory. See setup_memory()
161 * for details.
162 */
163static void __init allocate_pgdat(int nid)
164{
165 char buf[16];
166
167 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
168 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
169 else {
170 unsigned long pgdat_phys;
171 pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
172 max_pfn_mapped<<PAGE_SHIFT,
173 sizeof(pg_data_t),
174 PAGE_SIZE);
175 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
176 memset(buf, 0, sizeof(buf));
177 sprintf(buf, "NODE_DATA %d", nid);
178 memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
179 }
180 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
181 nid, (unsigned long)NODE_DATA(nid));
182}
183
184/* 81/*
185 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel 82 * Remap memory allocator
186 * virtual address space (KVA) is reserved and portions of nodes are mapped
187 * using it. This is to allow node-local memory to be allocated for
188 * structures that would normally require ZONE_NORMAL. The memory is
189 * allocated with alloc_remap() and callers should be prepared to allocate
190 * from the bootmem allocator instead.
191 */ 83 */
192static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 84static unsigned long node_remap_start_pfn[MAX_NUMNODES];
193static void *node_remap_end_vaddr[MAX_NUMNODES]; 85static void *node_remap_end_vaddr[MAX_NUMNODES];
194static void *node_remap_alloc_vaddr[MAX_NUMNODES]; 86static void *node_remap_alloc_vaddr[MAX_NUMNODES];
195static unsigned long node_remap_offset[MAX_NUMNODES];
196 87
88/**
89 * alloc_remap - Allocate remapped memory
90 * @nid: NUMA node to allocate memory from
91 * @size: The size of allocation
92 *
93 * Allocate @size bytes from the remap area of NUMA node @nid. The
94 * size of the remap area is predetermined by init_alloc_remap() and
95 * only the callers considered there should call this function. For
96 * more info, please read the comment on top of init_alloc_remap().
97 *
98 * The caller must be ready to handle allocation failure from this
99 * function and fall back to regular memory allocator in such cases.
100 *
101 * CONTEXT:
102 * Single CPU early boot context.
103 *
104 * RETURNS:
105 * Pointer to the allocated memory on success, %NULL on failure.
106 */
197void *alloc_remap(int nid, unsigned long size) 107void *alloc_remap(int nid, unsigned long size)
198{ 108{
199 void *allocation = node_remap_alloc_vaddr[nid]; 109 void *allocation = node_remap_alloc_vaddr[nid];
200 110
201 size = ALIGN(size, L1_CACHE_BYTES); 111 size = ALIGN(size, L1_CACHE_BYTES);
202 112
203 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) 113 if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
204 return NULL; 114 return NULL;
205 115
206 node_remap_alloc_vaddr[nid] += size; 116 node_remap_alloc_vaddr[nid] += size;
@@ -209,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size)
209 return allocation; 119 return allocation;
210} 120}
211 121
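The fallback the comment above asks for is exactly what the new setup_node_data() in arch/x86/mm/numa.c does; condensed from that function:

	nd = alloc_remap(nid, nd_size);
	if (nd) {
		nd_pa = __pa(nd);			/* node-local, already mapped */
	} else {
		nd_pa = memblock_find_in_range(nd_low, nd_high,
					       nd_size, SMP_CACHE_BYTES);
		if (nd_pa == MEMBLOCK_ERROR)
			return;				/* give up on this node */
		memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
		nd = __va(nd_pa);
	}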
212static void __init remap_numa_kva(void)
213{
214 void *vaddr;
215 unsigned long pfn;
216 int node;
217
218 for_each_online_node(node) {
219 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
220 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
221 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
222 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
223 (unsigned long)vaddr,
224 node_remap_start_pfn[node] + pfn);
225 set_pmd_pfn((ulong) vaddr,
226 node_remap_start_pfn[node] + pfn,
227 PAGE_KERNEL_LARGE);
228 }
229 }
230}
231
232#ifdef CONFIG_HIBERNATION 122#ifdef CONFIG_HIBERNATION
233/** 123/**
234 * resume_map_numa_kva - add KVA mapping to the temporary page tables created 124 * resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -240,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
240 int node; 130 int node;
241 131
242 for_each_online_node(node) { 132 for_each_online_node(node) {
243 unsigned long start_va, start_pfn, size, pfn; 133 unsigned long start_va, start_pfn, nr_pages, pfn;
244 134
245 start_va = (unsigned long)node_remap_start_vaddr[node]; 135 start_va = (unsigned long)node_remap_start_vaddr[node];
246 start_pfn = node_remap_start_pfn[node]; 136 start_pfn = node_remap_start_pfn[node];
247 size = node_remap_size[node]; 137 nr_pages = (node_remap_end_vaddr[node] -
138 node_remap_start_vaddr[node]) >> PAGE_SHIFT;
248 139
249 printk(KERN_DEBUG "%s: node %d\n", __func__, node); 140 printk(KERN_DEBUG "%s: node %d\n", __func__, node);
250 141
251 for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { 142 for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
252 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); 143 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
253 pgd_t *pgd = pgd_base + pgd_index(vaddr); 144 pgd_t *pgd = pgd_base + pgd_index(vaddr);
254 pud_t *pud = pud_offset(pgd, vaddr); 145 pud_t *pud = pud_offset(pgd, vaddr);
@@ -264,132 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base)
264} 155}
265#endif 156#endif
266 157
267static __init unsigned long calculate_numa_remap_pages(void) 158/**
159 * init_alloc_remap - Initialize remap allocator for a NUMA node
 160 * @nid: NUMA node to initialize remap allocator for
161 *
162 * NUMA nodes may end up without any lowmem. As allocating pgdat and
163 * memmap on a different node with lowmem is inefficient, a special
164 * remap allocator is implemented which can be used by alloc_remap().
165 *
166 * For each node, the amount of memory which will be necessary for
167 * pgdat and memmap is calculated and two memory areas of the size are
168 * allocated - one in the node and the other in lowmem; then, the area
169 * in the node is remapped to the lowmem area.
170 *
171 * As pgdat and memmap must be allocated in lowmem anyway, this
172 * doesn't waste lowmem address space; however, the actual lowmem
173 * which gets remapped over is wasted. The amount shouldn't be
 174 * problematic on machines where this feature will be used.
175 *
176 * Initialization failure isn't fatal. alloc_remap() is used
177 * opportunistically and the callers will fall back to other memory
178 * allocation mechanisms on failure.
179 */
180void __init init_alloc_remap(int nid, u64 start, u64 end)
268{ 181{
269 int nid; 182 unsigned long start_pfn = start >> PAGE_SHIFT;
270 unsigned long size, reserve_pages = 0; 183 unsigned long end_pfn = end >> PAGE_SHIFT;
271 184 unsigned long size, pfn;
272 for_each_online_node(nid) { 185 u64 node_pa, remap_pa;
273 u64 node_kva_target; 186 void *remap_va;
274 u64 node_kva_final;
275
276 /*
277 * The acpi/srat node info can show hot-add memroy zones
278 * where memory could be added but not currently present.
279 */
280 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
281 nid, node_start_pfn[nid], node_end_pfn[nid]);
282 if (node_start_pfn[nid] > max_pfn)
283 continue;
284 if (!node_end_pfn[nid])
285 continue;
286 if (node_end_pfn[nid] > max_pfn)
287 node_end_pfn[nid] = max_pfn;
288
289 /* ensure the remap includes space for the pgdat. */
290 size = node_remap_size[nid] + sizeof(pg_data_t);
291
292 /* convert size to large (pmd size) pages, rounding up */
293 size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
294 /* now the roundup is correct, convert to PAGE_SIZE pages */
295 size = size * PTRS_PER_PTE;
296
297 node_kva_target = round_down(node_end_pfn[nid] - size,
298 PTRS_PER_PTE);
299 node_kva_target <<= PAGE_SHIFT;
300 do {
301 node_kva_final = memblock_find_in_range(node_kva_target,
302 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
303 ((u64)size)<<PAGE_SHIFT,
304 LARGE_PAGE_BYTES);
305 node_kva_target -= LARGE_PAGE_BYTES;
306 } while (node_kva_final == MEMBLOCK_ERROR &&
307 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
308
309 if (node_kva_final == MEMBLOCK_ERROR)
310 panic("Can not get kva ram\n");
311
312 node_remap_size[nid] = size;
313 node_remap_offset[nid] = reserve_pages;
314 reserve_pages += size;
315 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
316 " node %d at %llx\n",
317 size, nid, node_kva_final>>PAGE_SHIFT);
318
319 /*
320 * prevent kva address below max_low_pfn want it on system
321 * with less memory later.
322 * layout will be: KVA address , KVA RAM
323 *
324 * we are supposed to only record the one less then max_low_pfn
325 * but we could have some hole in high memory, and it will only
326 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
327 * to use it as free.
328 * So memblock_x86_reserve_range here, hope we don't run out of that array
329 */
330 memblock_x86_reserve_range(node_kva_final,
331 node_kva_final+(((u64)size)<<PAGE_SHIFT),
332 "KVA RAM");
333
334 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
335 }
336 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
337 reserve_pages);
338 return reserve_pages;
339}
340 187
341static void init_remap_allocator(int nid) 188 /*
 342{ 189 * The acpi/srat node info can show hot-add memory zones where
343 node_remap_start_vaddr[nid] = pfn_to_kaddr( 190 * memory could be added but not currently present.
344 kva_start_pfn + node_remap_offset[nid]); 191 */
345 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + 192 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
346 (node_remap_size[nid] * PAGE_SIZE); 193 nid, start_pfn, end_pfn);
347 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + 194
348 ALIGN(sizeof(pg_data_t), PAGE_SIZE); 195 /* calculate the necessary space aligned to large page size */
349 196 size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
350 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, 197 size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
351 (ulong) node_remap_start_vaddr[nid], 198 size = ALIGN(size, LARGE_PAGE_BYTES);
352 (ulong) node_remap_end_vaddr[nid]); 199
200 /* allocate node memory and the lowmem remap area */
201 node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
202 if (node_pa == MEMBLOCK_ERROR) {
203 pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
204 size, nid);
205 return;
206 }
207 memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
208
209 remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
210 max_low_pfn << PAGE_SHIFT,
211 size, LARGE_PAGE_BYTES);
212 if (remap_pa == MEMBLOCK_ERROR) {
213 pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
214 size, nid);
215 memblock_x86_free_range(node_pa, node_pa + size);
216 return;
217 }
218 memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
219 remap_va = phys_to_virt(remap_pa);
220
221 /* perform actual remap */
222 for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
223 set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
224 (node_pa >> PAGE_SHIFT) + pfn,
225 PAGE_KERNEL_LARGE);
226
227 /* initialize remap allocator parameters */
228 node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
229 node_remap_start_vaddr[nid] = remap_va;
230 node_remap_end_vaddr[nid] = remap_va + size;
231 node_remap_alloc_vaddr[nid] = remap_va;
232
233 printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
234 nid, node_pa, node_pa + size, remap_va, remap_va + size);
353} 235}
354 236
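To get a feel for the sizes init_alloc_remap() deals with, a rough worked example (assumptions: non-PAE i386, so LARGE_PAGE_BYTES == 4 MiB, and sizeof(struct page) == 32 bytes):

/*
 *   node with 1 GiB of memory        -> 262144 pages
 *   memmap: 262144 * 32 bytes        ->   8 MiB
 *   pg_data_t, page aligned          ->   4 KiB
 *   total, rounded to 4 MiB pages    ->  12 MiB
 *
 * init_alloc_remap() then reserves ~12 MiB inside the node plus an
 * equally sized lowmem window that gets remapped over it.
 */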
355void __init initmem_init(void) 237void __init initmem_init(void)
356{ 238{
357 int nid; 239 x86_numa_init();
358 long kva_target_pfn;
359
360 /*
361 * When mapping a NUMA machine we allocate the node_mem_map arrays
362 * from node local memory. They are then mapped directly into KVA
363 * between zone normal and vmalloc space. Calculate the size of
364 * this space and use it to adjust the boundary between ZONE_NORMAL
365 * and ZONE_HIGHMEM.
366 */
367
368 get_memcfg_numa();
369 numa_init_array();
370
371 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
372 240
373 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
374 do {
375 kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
376 max_low_pfn<<PAGE_SHIFT,
377 kva_pages<<PAGE_SHIFT,
378 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
379 kva_target_pfn -= PTRS_PER_PTE;
380 } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
381
382 if (kva_start_pfn == MEMBLOCK_ERROR)
383 panic("Can not get kva space\n");
384
385 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
386 kva_start_pfn, max_low_pfn);
387 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
388
389 /* avoid clash with initrd */
390 memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
391 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
392 "KVA PG");
393#ifdef CONFIG_HIGHMEM 241#ifdef CONFIG_HIGHMEM
394 highstart_pfn = highend_pfn = max_pfn; 242 highstart_pfn = highend_pfn = max_pfn;
395 if (max_pfn > max_low_pfn) 243 if (max_pfn > max_low_pfn)
@@ -409,51 +257,9 @@ void __init initmem_init(void)
409 257
410 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", 258 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
411 (ulong) pfn_to_kaddr(max_low_pfn)); 259 (ulong) pfn_to_kaddr(max_low_pfn));
412 for_each_online_node(nid) {
413 init_remap_allocator(nid);
414
415 allocate_pgdat(nid);
416 }
417 remap_numa_kva();
418 260
419 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", 261 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
420 (ulong) pfn_to_kaddr(highstart_pfn)); 262 (ulong) pfn_to_kaddr(highstart_pfn));
421 for_each_online_node(nid)
422 propagate_e820_map_node(nid);
423
424 for_each_online_node(nid) {
425 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
426 NODE_DATA(nid)->node_id = nid;
427 }
428 263
429 setup_bootmem_allocator(); 264 setup_bootmem_allocator();
430} 265}
431
432#ifdef CONFIG_MEMORY_HOTPLUG
433static int paddr_to_nid(u64 addr)
434{
435 int nid;
436 unsigned long pfn = PFN_DOWN(addr);
437
438 for_each_node(nid)
439 if (node_start_pfn[nid] <= pfn &&
440 pfn < node_end_pfn[nid])
441 return nid;
442
443 return -1;
444}
445
446/*
447 * This function is used to ask node id BEFORE memmap and mem_section's
448 * initialization (pfn_to_nid() can't be used yet).
449 * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
450 */
451int memory_add_physaddr_to_nid(u64 addr)
452{
453 int nid = paddr_to_nid(addr);
454 return (nid >= 0) ? nid : 0;
455}
456
457EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
458#endif
459
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index e8c00cc72033..dd27f401f0a0 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -2,646 +2,13 @@
2 * Generic VM initialization for x86-64 NUMA setups. 2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */ 4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h> 5#include <linux/bootmem.h>
10#include <linux/memblock.h>
11#include <linux/mmzone.h>
12#include <linux/ctype.h>
13#include <linux/module.h>
14#include <linux/nodemask.h>
15#include <linux/sched.h>
16#include <linux/acpi.h>
17
18#include <asm/e820.h>
19#include <asm/proto.h>
20#include <asm/dma.h>
21#include <asm/acpi.h>
22#include <asm/amd_nb.h>
23 6
24#include "numa_internal.h" 7#include "numa_internal.h"
25 8
26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
27EXPORT_SYMBOL(node_data);
28
29nodemask_t numa_nodes_parsed __initdata;
30
31struct memnode memnode;
32
33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size;
35
36static struct numa_meminfo numa_meminfo __initdata;
37
38static int numa_distance_cnt;
39static u8 *numa_distance;
40
41/*
42 * Given a shift value, try to populate memnodemap[]
43 * Returns :
44 * 1 if OK
45 * 0 if memnodmap[] too small (of shift too small)
46 * -1 if node overlap or lost ram (shift too big)
47 */
48static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
49{
50 unsigned long addr, end;
51 int i, res = -1;
52
53 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
54 for (i = 0; i < mi->nr_blks; i++) {
55 addr = mi->blk[i].start;
56 end = mi->blk[i].end;
57 if (addr >= end)
58 continue;
59 if ((end >> shift) >= memnodemapsize)
60 return 0;
61 do {
62 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
63 return -1;
64 memnodemap[addr >> shift] = mi->blk[i].nid;
65 addr += (1UL << shift);
66 } while (addr < end);
67 res = 1;
68 }
69 return res;
70}
71
72static int __init allocate_cachealigned_memnodemap(void)
73{
74 unsigned long addr;
75
76 memnodemap = memnode.embedded_map;
77 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
78 return 0;
79
80 addr = 0x8000;
81 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
82 nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
83 nodemap_size, L1_CACHE_BYTES);
84 if (nodemap_addr == MEMBLOCK_ERROR) {
85 printk(KERN_ERR
86 "NUMA: Unable to allocate Memory to Node hash map\n");
87 nodemap_addr = nodemap_size = 0;
88 return -1;
89 }
90 memnodemap = phys_to_virt(nodemap_addr);
91 memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
92
93 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
94 nodemap_addr, nodemap_addr + nodemap_size);
95 return 0;
96}
97
98/*
99 * The LSB of all start and end addresses in the node map is the value of the
100 * maximum possible shift.
101 */
102static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
103{
104 int i, nodes_used = 0;
105 unsigned long start, end;
106 unsigned long bitfield = 0, memtop = 0;
107
108 for (i = 0; i < mi->nr_blks; i++) {
109 start = mi->blk[i].start;
110 end = mi->blk[i].end;
111 if (start >= end)
112 continue;
113 bitfield |= start;
114 nodes_used++;
115 if (end > memtop)
116 memtop = end;
117 }
118 if (nodes_used <= 1)
119 i = 63;
120 else
121 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
122 memnodemapsize = (memtop >> i)+1;
123 return i;
124}
125
126static int __init compute_hash_shift(const struct numa_meminfo *mi)
127{
128 int shift;
129
130 shift = extract_lsb_from_nodes(mi);
131 if (allocate_cachealigned_memnodemap())
132 return -1;
133 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
134 shift);
135
136 if (populate_memnodemap(mi, shift) != 1) {
137 printk(KERN_INFO "Your memory is not aligned you need to "
138 "rebuild your kernel with a bigger NODEMAPSIZE "
139 "shift=%d\n", shift);
140 return -1;
141 }
142 return shift;
143}
144
145int __meminit __early_pfn_to_nid(unsigned long pfn)
146{
147 return phys_to_nid(pfn << PAGE_SHIFT);
148}
149
150static void * __init early_node_mem(int nodeid, unsigned long start,
151 unsigned long end, unsigned long size,
152 unsigned long align)
153{
154 unsigned long mem;
155
156 /*
157 * put it on high as possible
158 * something will go with NODE_DATA
159 */
160 if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
161 start = MAX_DMA_PFN<<PAGE_SHIFT;
162 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
163 end > (MAX_DMA32_PFN<<PAGE_SHIFT))
164 start = MAX_DMA32_PFN<<PAGE_SHIFT;
165 mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
166 if (mem != MEMBLOCK_ERROR)
167 return __va(mem);
168
169 /* extend the search scope */
170 end = max_pfn_mapped << PAGE_SHIFT;
171 start = MAX_DMA_PFN << PAGE_SHIFT;
172 mem = memblock_find_in_range(start, end, size, align);
173 if (mem != MEMBLOCK_ERROR)
174 return __va(mem);
175
176 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
177 size, nodeid);
178
179 return NULL;
180}
181
182static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
183 struct numa_meminfo *mi)
184{
185 /* ignore zero length blks */
186 if (start == end)
187 return 0;
188
189 /* whine about and ignore invalid blks */
190 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
191 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
192 nid, start, end);
193 return 0;
194 }
195
196 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
197 pr_err("NUMA: too many memblk ranges\n");
198 return -EINVAL;
199 }
200
201 mi->blk[mi->nr_blks].start = start;
202 mi->blk[mi->nr_blks].end = end;
203 mi->blk[mi->nr_blks].nid = nid;
204 mi->nr_blks++;
205 return 0;
206}
207
208/**
209 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
210 * @idx: Index of memblk to remove
211 * @mi: numa_meminfo to remove memblk from
212 *
213 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
214 * decrementing @mi->nr_blks.
215 */
216void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
217{
218 mi->nr_blks--;
219 memmove(&mi->blk[idx], &mi->blk[idx + 1],
220 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
221}
222
223/**
224 * numa_add_memblk - Add one numa_memblk to numa_meminfo
225 * @nid: NUMA node ID of the new memblk
226 * @start: Start address of the new memblk
227 * @end: End address of the new memblk
228 *
229 * Add a new memblk to the default numa_meminfo.
230 *
231 * RETURNS:
232 * 0 on success, -errno on failure.
233 */
234int __init numa_add_memblk(int nid, u64 start, u64 end)
235{
236 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
237}
238
239/* Initialize bootmem allocator for a node */
240void __init
241setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
242{
243 unsigned long start_pfn, last_pfn, nodedata_phys;
244 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
245 int nid;
246
247 if (!end)
248 return;
249
250 /*
251 * Don't confuse VM with a node that doesn't have the
252 * minimum amount of memory:
253 */
254 if (end && (end - start) < NODE_MIN_SIZE)
255 return;
256
257 start = roundup(start, ZONE_ALIGN);
258
259 printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
260 start, end);
261
262 start_pfn = start >> PAGE_SHIFT;
263 last_pfn = end >> PAGE_SHIFT;
264
265 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
266 SMP_CACHE_BYTES);
267 if (node_data[nodeid] == NULL)
268 return;
269 nodedata_phys = __pa(node_data[nodeid]);
270 memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
271 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
272 nodedata_phys + pgdat_size - 1);
273 nid = phys_to_nid(nodedata_phys);
274 if (nid != nodeid)
275 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
276
277 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
278 NODE_DATA(nodeid)->node_id = nodeid;
279 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
280 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
281
282 node_set_online(nodeid);
283}
284
285/**
286 * numa_cleanup_meminfo - Cleanup a numa_meminfo
287 * @mi: numa_meminfo to clean up
288 *
289 * Sanitize @mi by merging and removing unncessary memblks. Also check for
290 * conflicts and clear unused memblks.
291 *
292 * RETURNS:
293 * 0 on success, -errno on failure.
294 */
295int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
296{
297 const u64 low = 0;
298 const u64 high = (u64)max_pfn << PAGE_SHIFT;
299 int i, j, k;
300
301 for (i = 0; i < mi->nr_blks; i++) {
302 struct numa_memblk *bi = &mi->blk[i];
303
304 /* make sure all blocks are inside the limits */
305 bi->start = max(bi->start, low);
306 bi->end = min(bi->end, high);
307
308 /* and there's no empty block */
309 if (bi->start == bi->end) {
310 numa_remove_memblk_from(i--, mi);
311 continue;
312 }
313
314 for (j = i + 1; j < mi->nr_blks; j++) {
315 struct numa_memblk *bj = &mi->blk[j];
316 unsigned long start, end;
317
318 /*
319 * See whether there are overlapping blocks. Whine
320 * about but allow overlaps of the same nid. They
321 * will be merged below.
322 */
323 if (bi->end > bj->start && bi->start < bj->end) {
324 if (bi->nid != bj->nid) {
325 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
326 bi->nid, bi->start, bi->end,
327 bj->nid, bj->start, bj->end);
328 return -EINVAL;
329 }
330 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
331 bi->nid, bi->start, bi->end,
332 bj->start, bj->end);
333 }
334
335 /*
336 * Join together blocks on the same node, holes
337 * between which don't overlap with memory on other
338 * nodes.
339 */
340 if (bi->nid != bj->nid)
341 continue;
342 start = max(min(bi->start, bj->start), low);
343 end = min(max(bi->end, bj->end), high);
344 for (k = 0; k < mi->nr_blks; k++) {
345 struct numa_memblk *bk = &mi->blk[k];
346
347 if (bi->nid == bk->nid)
348 continue;
349 if (start < bk->end && end > bk->start)
350 break;
351 }
352 if (k < mi->nr_blks)
353 continue;
354 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
355 bi->nid, bi->start, bi->end, bj->start, bj->end,
356 start, end);
357 bi->start = start;
358 bi->end = end;
359 numa_remove_memblk_from(j--, mi);
360 }
361 }
362
363 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
364 mi->blk[i].start = mi->blk[i].end = 0;
365 mi->blk[i].nid = NUMA_NO_NODE;
366 }
367
368 return 0;
369}
370
371/*
372 * Set nodes, which have memory in @mi, in *@nodemask.
373 */
374static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
375 const struct numa_meminfo *mi)
376{
377 int i;
378
379 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
380 if (mi->blk[i].start != mi->blk[i].end &&
381 mi->blk[i].nid != NUMA_NO_NODE)
382 node_set(mi->blk[i].nid, *nodemask);
383}
384
385/**
386 * numa_reset_distance - Reset NUMA distance table
387 *
388 * The current table is freed. The next numa_set_distance() call will
389 * create a new one.
390 */
391void __init numa_reset_distance(void)
392{
393 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
394
395 /* numa_distance could be 1LU marking allocation failure, test cnt */
396 if (numa_distance_cnt)
397 memblock_x86_free_range(__pa(numa_distance),
398 __pa(numa_distance) + size);
399 numa_distance_cnt = 0;
400 numa_distance = NULL; /* enable table creation */
401}
402
403static int __init numa_alloc_distance(void)
404{
405 nodemask_t nodes_parsed;
406 size_t size;
407 int i, j, cnt = 0;
408 u64 phys;
409
410 /* size the new table and allocate it */
411 nodes_parsed = numa_nodes_parsed;
412 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
413
414 for_each_node_mask(i, nodes_parsed)
415 cnt = i;
416 cnt++;
417 size = cnt * cnt * sizeof(numa_distance[0]);
418
419 phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
420 size, PAGE_SIZE);
421 if (phys == MEMBLOCK_ERROR) {
422 pr_warning("NUMA: Warning: can't allocate distance table!\n");
423 /* don't retry until explicitly reset */
424 numa_distance = (void *)1LU;
425 return -ENOMEM;
426 }
427 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
428
429 numa_distance = __va(phys);
430 numa_distance_cnt = cnt;
431
432 /* fill with the default distances */
433 for (i = 0; i < cnt; i++)
434 for (j = 0; j < cnt; j++)
435 numa_distance[i * cnt + j] = i == j ?
436 LOCAL_DISTANCE : REMOTE_DISTANCE;
437 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
438
439 return 0;
440}
441
442/**
443 * numa_set_distance - Set NUMA distance from one NUMA to another
444 * @from: the 'from' node to set distance
445 * @to: the 'to' node to set distance
446 * @distance: NUMA distance
447 *
448 * Set the distance from node @from to @to to @distance. If distance table
449 * doesn't exist, one which is large enough to accommodate all the currently
450 * known nodes will be created.
451 *
452 * If such table cannot be allocated, a warning is printed and further
453 * calls are ignored until the distance table is reset with
454 * numa_reset_distance().
455 *
456 * If @from or @to is higher than the highest known node at the time of
457 * table creation or @distance doesn't make sense, the call is ignored.
458 * This is to allow simplification of specific NUMA config implementations.
459 */
460void __init numa_set_distance(int from, int to, int distance)
461{
462 if (!numa_distance && numa_alloc_distance() < 0)
463 return;
464
465 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
466 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
467 from, to, distance);
468 return;
469 }
470
471 if ((u8)distance != distance ||
472 (from == to && distance != LOCAL_DISTANCE)) {
473 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
474 from, to, distance);
475 return;
476 }
477
478 numa_distance[from * numa_distance_cnt + to] = distance;
479}
480
481int __node_distance(int from, int to)
482{
483 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
484 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
485 return numa_distance[from * numa_distance_cnt + to];
486}
487EXPORT_SYMBOL(__node_distance);
488
489/*
490 * Sanity check to catch more bad NUMA configurations (they are amazingly
491 * common). Make sure the nodes cover all memory.
492 */
493static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
494{
495 unsigned long numaram, e820ram;
496 int i;
497
498 numaram = 0;
499 for (i = 0; i < mi->nr_blks; i++) {
500 unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
501 unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
502 numaram += e - s;
503 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
504 if ((long)numaram < 0)
505 numaram = 0;
506 }
507
508 e820ram = max_pfn - (memblock_x86_hole_size(0,
509 max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
510 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
511 if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
512 printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
513 (numaram << PAGE_SHIFT) >> 20,
514 (e820ram << PAGE_SHIFT) >> 20);
515 return false;
516 }
517 return true;
518}
519
520static int __init numa_register_memblks(struct numa_meminfo *mi)
521{
522 int i, nid;
523
524 /* Account for nodes with cpus and no memory */
525 node_possible_map = numa_nodes_parsed;
526 numa_nodemask_from_meminfo(&node_possible_map, mi);
527 if (WARN_ON(nodes_empty(node_possible_map)))
528 return -EINVAL;
529
530 memnode_shift = compute_hash_shift(mi);
531 if (memnode_shift < 0) {
532 printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
533 return -EINVAL;
534 }
535
536 for (i = 0; i < mi->nr_blks; i++)
537 memblock_x86_register_active_regions(mi->blk[i].nid,
538 mi->blk[i].start >> PAGE_SHIFT,
539 mi->blk[i].end >> PAGE_SHIFT);
540
541 /* for out of order entries */
542 sort_node_map();
543 if (!numa_meminfo_cover_memory(mi))
544 return -EINVAL;
545
546 /* Finally register nodes. */
547 for_each_node_mask(nid, node_possible_map) {
548 u64 start = (u64)max_pfn << PAGE_SHIFT;
549 u64 end = 0;
550
551 for (i = 0; i < mi->nr_blks; i++) {
552 if (nid != mi->blk[i].nid)
553 continue;
554 start = min(mi->blk[i].start, start);
555 end = max(mi->blk[i].end, end);
556 }
557
558 if (start < end)
559 setup_node_bootmem(nid, start, end);
560 }
561
562 return 0;
563}
564
565/**
566 * dummy_numma_init - Fallback dummy NUMA init
567 *
568 * Used if there's no underlying NUMA architecture, NUMA initialization
569 * fails, or NUMA is disabled on the command line.
570 *
571 * Must online at least one node and add memory blocks that cover all
572 * allowed memory. This function must not fail.
573 */
574static int __init dummy_numa_init(void)
575{
576 printk(KERN_INFO "%s\n",
577 numa_off ? "NUMA turned off" : "No NUMA configuration found");
578 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
579 0LU, max_pfn << PAGE_SHIFT);
580
581 node_set(0, numa_nodes_parsed);
582 numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
583
584 return 0;
585}
586
587static int __init numa_init(int (*init_func)(void))
588{
589 int i;
590 int ret;
591
592 for (i = 0; i < MAX_LOCAL_APIC; i++)
593 set_apicid_to_node(i, NUMA_NO_NODE);
594
595 nodes_clear(numa_nodes_parsed);
596 nodes_clear(node_possible_map);
597 nodes_clear(node_online_map);
598 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
599 remove_all_active_ranges();
600 numa_reset_distance();
601
602 ret = init_func();
603 if (ret < 0)
604 return ret;
605 ret = numa_cleanup_meminfo(&numa_meminfo);
606 if (ret < 0)
607 return ret;
608
609 numa_emulation(&numa_meminfo, numa_distance_cnt);
610
611 ret = numa_register_memblks(&numa_meminfo);
612 if (ret < 0)
613 return ret;
614
615 for (i = 0; i < nr_cpu_ids; i++) {
616 int nid = early_cpu_to_node(i);
617
618 if (nid == NUMA_NO_NODE)
619 continue;
620 if (!node_online(nid))
621 numa_clear_node(i);
622 }
623 numa_init_array();
624 return 0;
625}
626
627void __init initmem_init(void) 9void __init initmem_init(void)
628{ 10{
629 int ret; 11 x86_numa_init();
630
631 if (!numa_off) {
632#ifdef CONFIG_ACPI_NUMA
633 ret = numa_init(x86_acpi_numa_init);
634 if (!ret)
635 return;
636#endif
637#ifdef CONFIG_AMD_NUMA
638 ret = numa_init(amd_numa_init);
639 if (!ret)
640 return;
641#endif
642 }
643
644 numa_init(dummy_numa_init);
645} 12}
646 13
647unsigned long __init numa_free_all_bootmem(void) 14unsigned long __init numa_free_all_bootmem(void)
@@ -656,12 +23,3 @@ unsigned long __init numa_free_all_bootmem(void)
656 23
657 return pages; 24 return pages;
658} 25}
659
660int __cpuinit numa_cpu_node(int cpu)
661{
662 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
663
664 if (apicid != BAD_APICID)
665 return __apicid_to_node[apicid];
666 return NUMA_NO_NODE;
667}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index ad091e4cff17..d0ed086b6247 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -5,6 +5,7 @@
5#include <linux/errno.h> 5#include <linux/errno.h>
6#include <linux/topology.h> 6#include <linux/topology.h>
7#include <linux/memblock.h> 7#include <linux/memblock.h>
8#include <linux/bootmem.h>
8#include <asm/dma.h> 9#include <asm/dma.h>
9 10
10#include "numa_internal.h" 11#include "numa_internal.h"
@@ -84,7 +85,13 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
84 nr_nodes = MAX_NUMNODES; 85 nr_nodes = MAX_NUMNODES;
85 } 86 }
86 87
87 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; 88 /*
89 * Calculate target node size. x86_32 freaks on __udivdi3() so do
90 * the division in ulong number of pages and convert back.
91 */
92 size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
93 size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
94
88 /* 95 /*
89 * Calculate the number of big nodes that can be allocated as a result 96 * Calculate the number of big nodes that can be allocated as a result
90 * of consolidating the remainder. 97 * of consolidating the remainder.
@@ -226,7 +233,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
226 */ 233 */
227 while (nodes_weight(physnode_mask)) { 234 while (nodes_weight(physnode_mask)) {
228 for_each_node_mask(i, physnode_mask) { 235 for_each_node_mask(i, physnode_mask) {
229 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; 236 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
230 u64 start, limit, end; 237 u64 start, limit, end;
231 int phys_blk; 238 int phys_blk;
232 239
@@ -298,7 +305,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
298{ 305{
299 static struct numa_meminfo ei __initdata; 306 static struct numa_meminfo ei __initdata;
300 static struct numa_meminfo pi __initdata; 307 static struct numa_meminfo pi __initdata;
301 const u64 max_addr = max_pfn << PAGE_SHIFT; 308 const u64 max_addr = PFN_PHYS(max_pfn);
302 u8 *phys_dist = NULL; 309 u8 *phys_dist = NULL;
303 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); 310 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
304 int max_emu_nid, dfl_phys_nid; 311 int max_emu_nid, dfl_phys_nid;
@@ -342,8 +349,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
342 if (numa_dist_cnt) { 349 if (numa_dist_cnt) {
343 u64 phys; 350 u64 phys;
344 351
345 phys = memblock_find_in_range(0, 352 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
346 (u64)max_pfn_mapped << PAGE_SHIFT,
347 phys_size, PAGE_SIZE); 353 phys_size, PAGE_SIZE);
348 if (phys == MEMBLOCK_ERROR) { 354 if (phys == MEMBLOCK_ERROR) {
349 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); 355 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
@@ -454,10 +460,9 @@ void __cpuinit numa_remove_cpu(int cpu)
454 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 460 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
455} 461}
456#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 462#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
457static void __cpuinit numa_set_cpumask(int cpu, int enable) 463static void __cpuinit numa_set_cpumask(int cpu, bool enable)
458{ 464{
459 struct cpumask *mask; 465 int nid, physnid;
460 int nid, physnid, i;
461 466
462 nid = early_cpu_to_node(cpu); 467 nid = early_cpu_to_node(cpu);
463 if (nid == NUMA_NO_NODE) { 468 if (nid == NUMA_NO_NODE) {
@@ -467,28 +472,21 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
467 472
468 physnid = emu_nid_to_phys[nid]; 473 physnid = emu_nid_to_phys[nid];
469 474
470 for_each_online_node(i) { 475 for_each_online_node(nid) {
471 if (emu_nid_to_phys[nid] != physnid) 476 if (emu_nid_to_phys[nid] != physnid)
472 continue; 477 continue;
473 478
474 mask = debug_cpumask_set_cpu(cpu, enable); 479 debug_cpumask_set_cpu(cpu, nid, enable);
475 if (!mask)
476 return;
477
478 if (enable)
479 cpumask_set_cpu(cpu, mask);
480 else
481 cpumask_clear_cpu(cpu, mask);
482 } 480 }
483} 481}
484 482
485void __cpuinit numa_add_cpu(int cpu) 483void __cpuinit numa_add_cpu(int cpu)
486{ 484{
487 numa_set_cpumask(cpu, 1); 485 numa_set_cpumask(cpu, true);
488} 486}
489 487
490void __cpuinit numa_remove_cpu(int cpu) 488void __cpuinit numa_remove_cpu(int cpu)
491{ 489{
492 numa_set_cpumask(cpu, 0); 490 numa_set_cpumask(cpu, false);
493} 491}
494#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 492#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
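
The __udivdi3 note added to split_nodes_interleave() above refers to gcc lowering a 64-bit division on x86_32 to a libgcc helper the kernel does not provide; dividing a page count instead keeps the operands within unsigned long, and PFN_PHYS() converts the quotient back to bytes. A minimal sketch of the same trick (span_per_node is a hypothetical name, not part of the patch):

#include <linux/pfn.h>		/* PFN_PHYS() */

/* Divide a physical span evenly without emitting a u64/u64 division. */
static inline u64 span_per_node(u64 span, unsigned int nr_nodes)
{
	unsigned long pages = span >> PAGE_SHIFT;	/* fits in ulong */

	return PFN_PHYS(pages / nr_nodes);		/* back to bytes */
}
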
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index ef2d97377d7c..7178c3afe05e 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -19,6 +19,14 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi); 19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void); 20void __init numa_reset_distance(void);
21 21
22void __init x86_numa_init(void);
23
24#ifdef CONFIG_X86_64
25static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
26#else
27void __init init_alloc_remap(int nid, u64 start, u64 end);
28#endif
29
22#ifdef CONFIG_NUMA_EMU 30#ifdef CONFIG_NUMA_EMU
23void __init numa_emulation(struct numa_meminfo *numa_meminfo, 31void __init numa_emulation(struct numa_meminfo *numa_meminfo,
24 int numa_dist_cnt); 32 int numa_dist_cnt);
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 38e6d174c497..9f0614daea85 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
414 unsigned char *p; 414 unsigned char *p;
415 struct prefix_bits prf; 415 struct prefix_bits prf;
416 int i; 416 int i;
417 unsigned long rv;
418 417
419 p = (unsigned char *)ins_addr; 418 p = (unsigned char *)ins_addr;
420 p += skip_prefix(p, &prf); 419 p += skip_prefix(p, &prf);
421 p += get_opcode(p, &opcode); 420 p += get_opcode(p, &opcode);
422 for (i = 0; i < ARRAY_SIZE(reg_rop); i++) 421 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
423 if (reg_rop[i] == opcode) { 422 if (reg_rop[i] == opcode)
424 rv = REG_READ;
425 goto do_work; 423 goto do_work;
426 }
427 424
428 for (i = 0; i < ARRAY_SIZE(reg_wop); i++) 425 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
429 if (reg_wop[i] == opcode) { 426 if (reg_wop[i] == opcode)
430 rv = REG_WRITE;
431 goto do_work; 427 goto do_work;
432 }
433 428
434 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " 429 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
435 "0x%02x\n", opcode); 430 "0x%02x\n", opcode);
@@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
474 unsigned char *p; 469 unsigned char *p;
475 struct prefix_bits prf; 470 struct prefix_bits prf;
476 int i; 471 int i;
477 unsigned long rv;
478 472
479 p = (unsigned char *)ins_addr; 473 p = (unsigned char *)ins_addr;
480 p += skip_prefix(p, &prf); 474 p += skip_prefix(p, &prf);
481 p += get_opcode(p, &opcode); 475 p += get_opcode(p, &opcode);
482 for (i = 0; i < ARRAY_SIZE(imm_wop); i++) 476 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
483 if (imm_wop[i] == opcode) { 477 if (imm_wop[i] == opcode)
484 rv = IMM_WRITE;
485 goto do_work; 478 goto do_work;
486 }
487 479
488 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " 480 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
489 "0x%02x\n", opcode); 481 "0x%02x\n", opcode);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat.c
index 8e9d3394f6d4..81dbfdeb080d 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat.c
@@ -26,8 +26,6 @@
26 26
27int acpi_numa __initdata; 27int acpi_numa __initdata;
28 28
29static struct bootnode nodes_add[MAX_NUMNODES];
30
31static __init int setup_node(int pxm) 29static __init int setup_node(int pxm)
32{ 30{
33 return acpi_map_pxm_to_node(pxm); 31 return acpi_map_pxm_to_node(pxm);
@@ -37,7 +35,6 @@ static __init void bad_srat(void)
37{ 35{
38 printk(KERN_ERR "SRAT: SRAT not used.\n"); 36 printk(KERN_ERR "SRAT: SRAT not used.\n");
39 acpi_numa = -1; 37 acpi_numa = -1;
40 memset(nodes_add, 0, sizeof(nodes_add));
41} 38}
42 39
43static __init inline int srat_disabled(void) 40static __init inline int srat_disabled(void)
@@ -131,73 +128,17 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
131 pxm, apic_id, node); 128 pxm, apic_id, node);
132} 129}
133 130
134#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 131#ifdef CONFIG_MEMORY_HOTPLUG
135static inline int save_add_info(void) {return 1;} 132static inline int save_add_info(void) {return 1;}
136#else 133#else
137static inline int save_add_info(void) {return 0;} 134static inline int save_add_info(void) {return 0;}
138#endif 135#endif
139/*
140 * Update nodes_add[]
141 * This code supports one contiguous hot add area per node
142 */
143static void __init
144update_nodes_add(int node, unsigned long start, unsigned long end)
145{
146 unsigned long s_pfn = start >> PAGE_SHIFT;
147 unsigned long e_pfn = end >> PAGE_SHIFT;
148 int changed = 0;
149 struct bootnode *nd = &nodes_add[node];
150
151 /* I had some trouble with strange memory hotadd regions breaking
152 the boot. Be very strict here and reject anything unexpected.
153 If you want working memory hotadd write correct SRATs.
154
155 The node size check is a basic sanity check to guard against
156 mistakes */
157 if ((signed long)(end - start) < NODE_MIN_SIZE) {
158 printk(KERN_ERR "SRAT: Hotplug area too small\n");
159 return;
160 }
161
162 /* This check might be a bit too strict, but I'm keeping it for now. */
163 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
164 printk(KERN_ERR
165 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
166 s_pfn, e_pfn);
167 return;
168 }
169
170 /* Looks good */
171
172 if (nd->start == nd->end) {
173 nd->start = start;
174 nd->end = end;
175 changed = 1;
176 } else {
177 if (nd->start == end) {
178 nd->start = start;
179 changed = 1;
180 }
181 if (nd->end == start) {
182 nd->end = end;
183 changed = 1;
184 }
185 if (!changed)
186 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
187 }
188
189 if (changed) {
190 node_set(node, numa_nodes_parsed);
191 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
192 nd->start, nd->end);
193 }
194}
195 136
196/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 137/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
197void __init 138void __init
198acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 139acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
199{ 140{
200 unsigned long start, end; 141 u64 start, end;
201 int node, pxm; 142 int node, pxm;
202 143
203 if (srat_disabled()) 144 if (srat_disabled())
@@ -226,11 +167,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
226 return; 167 return;
227 } 168 }
228 169
229 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 170 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
230 start, end); 171 start, end);
231
232 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
233 update_nodes_add(node, start, end);
234} 172}
235 173
236void __init acpi_numa_arch_fixup(void) {} 174void __init acpi_numa_arch_fixup(void) {}
@@ -244,17 +182,3 @@ int __init x86_acpi_numa_init(void)
244 return ret; 182 return ret;
245 return srat_disabled() ? -EINVAL : 0; 183 return srat_disabled() ? -EINVAL : 0;
246} 184}
247
248#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
249int memory_add_physaddr_to_nid(u64 start)
250{
251 int i, ret = 0;
252
253 for_each_node(i)
254 if (nodes_add[i].start <= start && nodes_add[i].end > start)
255 ret = i;
256
257 return ret;
258}
259EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
260#endif
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
deleted file mode 100644
index 364f36bdfad8..000000000000
--- a/arch/x86/mm/srat_32.c
+++ /dev/null
@@ -1,288 +0,0 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/memblock.h>
29#include <linux/mmzone.h>
30#include <linux/acpi.h>
31#include <linux/nodemask.h>
32#include <asm/srat.h>
33#include <asm/topology.h>
34#include <asm/smp.h>
35#include <asm/e820.h>
36
37/*
38 * proximity macros and definitions
39 */
40#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
41#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
42#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
43#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
44/* bitmap length; _PXM is at most 255 */
45#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
46static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
47
48#define MAX_CHUNKS_PER_NODE 3
49#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
50struct node_memory_chunk_s {
51 unsigned long start_pfn;
52 unsigned long end_pfn;
53 u8 pxm; // proximity domain of node
54 u8 nid; // which cnode contains this chunk?
55 u8 bank; // which mem bank on this node
56};
57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
58
59static int __initdata num_memory_chunks; /* total number of memory chunks */
60static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
61
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
76/* Identify CPU proximity domains */
77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
79{
80 if (srat_disabled())
81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
87
88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
89 return; /* empty entry */
90
91 /* mark this node as "seen" in node bitmap */
92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
93
94 /* don't need to check apic_id here, because it is always 8 bits */
95 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
96
97 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
98 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
99}
100
101/*
102 * Identify memory proximity domains and hot-remove capabilities.
103 * Fill node memory chunk list structure.
104 */
105void __init
106acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
107{
108 unsigned long long paddr, size;
109 unsigned long start_pfn, end_pfn;
110 u8 pxm;
111 struct node_memory_chunk_s *p, *q, *pend;
112
113 if (srat_disabled())
114 return;
115 if (memory_affinity->header.length !=
116 sizeof(struct acpi_srat_mem_affinity)) {
117 bad_srat();
118 return;
119 }
120
121 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
122 return; /* empty entry */
123
124 pxm = memory_affinity->proximity_domain & 0xff;
125
126 /* mark this node as "seen" in node bitmap */
127 BMAP_SET(pxm_bitmap, pxm);
128
129 /* calculate info for memory chunk structure */
130 paddr = memory_affinity->base_address;
131 size = memory_affinity->length;
132
133 start_pfn = paddr >> PAGE_SHIFT;
134 end_pfn = (paddr + size) >> PAGE_SHIFT;
135
136
137 if (num_memory_chunks >= MAXCHUNKS) {
138 printk(KERN_WARNING "Too many mem chunks in SRAT."
139 " Ignoring %lld MBytes at %llx\n",
140 size/(1024*1024), paddr);
141 return;
142 }
143
144 /* Insertion sort based on base address */
145 pend = &node_memory_chunk[num_memory_chunks];
146 for (p = &node_memory_chunk[0]; p < pend; p++) {
147 if (start_pfn < p->start_pfn)
148 break;
149 }
150 if (p < pend) {
151 for (q = pend; q >= p; q--)
152 *(q + 1) = *q;
153 }
154 p->start_pfn = start_pfn;
155 p->end_pfn = end_pfn;
156 p->pxm = pxm;
157
158 num_memory_chunks++;
159
160 printk(KERN_DEBUG "Memory range %08lx to %08lx"
161 " in proximity domain %02x %s\n",
162 start_pfn, end_pfn,
163 pxm,
164 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
165 "enabled and removable" : "enabled" ) );
166}
167
168/* Callback for SLIT parsing */
169void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
170{
171}
172
173void acpi_numa_arch_fixup(void)
174{
175}
176/*
177 * The SRAT table always lists ascending addresses, so can always
178 * assume that the first "start" address that you see is the real
179 * start of the node, and that the current "end" address is after
180 * the previous one.
181 */
182static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
183{
184 /*
185 * Only add present memory as told by the e820.
186 * There is no guarantee from the SRAT that the memory it
187 * enumerates is present at boot time because it represents
188 * *possible* memory hotplug areas the same as normal RAM.
189 */
190 if (memory_chunk->start_pfn >= max_pfn) {
191 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
192 memory_chunk->start_pfn, memory_chunk->end_pfn);
193 return -1;
194 }
195 if (memory_chunk->nid != nid)
196 return -1;
197
198 if (!node_has_online_mem(nid))
199 node_start_pfn[nid] = memory_chunk->start_pfn;
200
201 if (node_start_pfn[nid] > memory_chunk->start_pfn)
202 node_start_pfn[nid] = memory_chunk->start_pfn;
203
204 if (node_end_pfn[nid] < memory_chunk->end_pfn)
205 node_end_pfn[nid] = memory_chunk->end_pfn;
206
207 return 0;
208}
209
210int __init get_memcfg_from_srat(void)
211{
212 int i, j, nid;
213
214 if (srat_disabled())
215 goto out_fail;
216
217 if (acpi_numa_init() < 0)
218 goto out_fail;
219
220 if (num_memory_chunks == 0) {
221 printk(KERN_DEBUG
222 "could not find any ACPI SRAT memory areas.\n");
223 goto out_fail;
224 }
225
226 /* Calculate total number of nodes in system from PXM bitmap and create
227 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
228 * to specify the range of _PXM values.)
229 */
230 /*
231 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
232 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
233 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
234 * approaches MAX_PXM_DOMAINS for i386.
235 */
236 nodes_clear(node_online_map);
237 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
238 if (BMAP_TEST(pxm_bitmap, i)) {
239 int nid = acpi_map_pxm_to_node(i);
240 node_set_online(nid);
241 }
242 }
243 BUG_ON(num_online_nodes() == 0);
244
245 /* set cnode id in memory chunk structure */
246 for (i = 0; i < num_memory_chunks; i++)
247 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
248
249 printk(KERN_DEBUG "pxm bitmap: ");
250 for (i = 0; i < sizeof(pxm_bitmap); i++) {
251 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
252 }
253 printk(KERN_CONT "\n");
254 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
255 num_online_nodes());
256 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
257 num_memory_chunks);
258
259 for (i = 0; i < MAX_LOCAL_APIC; i++)
260 set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
261
262 for (j = 0; j < num_memory_chunks; j++){
263 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
264 printk(KERN_DEBUG
265 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
266 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
267 if (node_read_chunk(chunk->nid, chunk))
268 continue;
269
270 memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn,
271 min(chunk->end_pfn, max_pfn));
272 }
273 /* for out of order entries in SRAT */
274 sort_node_map();
275
276 for_each_online_node(nid) {
277 unsigned long start = node_start_pfn[nid];
278 unsigned long end = min(node_end_pfn[nid], max_pfn);
279
280 memory_present(nid, start, end);
281 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
282 }
283 return 1;
284out_fail:
285 printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
286 " table\n");
287 return 0;
288}
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
new file mode 100644
index 000000000000..90568c33ddb0
--- /dev/null
+++ b/arch/x86/net/Makefile
@@ -0,0 +1,4 @@
1#
2# Arch-specific network modules
3#
4obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
new file mode 100644
index 000000000000..66870223f8c5
--- /dev/null
+++ b/arch/x86/net/bpf_jit.S
@@ -0,0 +1,140 @@
1/* bpf_jit.S : BPF JIT helper functions
2 *
3 * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 */
10#include <linux/linkage.h>
11#include <asm/dwarf2.h>
12
13/*
14 * Calling convention :
15 * rdi : skb pointer
16 * esi : offset of byte(s) to fetch in skb (can be scratched)
17 * r8 : copy of skb->data
18 * r9d : hlen = skb->len - skb->data_len
19 */
20#define SKBDATA %r8
21
22sk_load_word_ind:
23 .globl sk_load_word_ind
24
25 add %ebx,%esi /* offset += X */
26# test %esi,%esi /* if (offset < 0) goto bpf_error; */
27 js bpf_error
28
29sk_load_word:
30 .globl sk_load_word
31
32 mov %r9d,%eax # hlen
33 sub %esi,%eax # hlen - offset
34 cmp $3,%eax
35 jle bpf_slow_path_word
36 mov (SKBDATA,%rsi),%eax
37 bswap %eax /* ntohl() */
38 ret
39
40
41sk_load_half_ind:
42 .globl sk_load_half_ind
43
44 add %ebx,%esi /* offset += X */
45 js bpf_error
46
47sk_load_half:
48 .globl sk_load_half
49
50 mov %r9d,%eax
51 sub %esi,%eax # hlen - offset
52 cmp $1,%eax
53 jle bpf_slow_path_half
54 movzwl (SKBDATA,%rsi),%eax
55 rol $8,%ax # ntohs()
56 ret
57
58sk_load_byte_ind:
59 .globl sk_load_byte_ind
60 add %ebx,%esi /* offset += X */
61 js bpf_error
62
63sk_load_byte:
64 .globl sk_load_byte
65
66 cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */
67 jle bpf_slow_path_byte
68 movzbl (SKBDATA,%rsi),%eax
69 ret
70
71/**
72 * sk_load_byte_msh - BPF_S_LDX_B_MSH helper
73 *
74 * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf)
75 * Must preserve A accumulator (%eax)
76 * Inputs : %esi is the offset value, already known positive
77 */
78ENTRY(sk_load_byte_msh)
79 CFI_STARTPROC
80 cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */
81 jle bpf_slow_path_byte_msh
82 movzbl (SKBDATA,%rsi),%ebx
83 and $15,%bl
84 shl $2,%bl
85 ret
86 CFI_ENDPROC
87ENDPROC(sk_load_byte_msh)
88
89bpf_error:
90# force a return 0 from jit handler
91 xor %eax,%eax
92 mov -8(%rbp),%rbx
93 leaveq
94 ret
95
96/* rsi contains offset and can be scratched */
97#define bpf_slow_path_common(LEN) \
98 push %rdi; /* save skb */ \
99 push %r9; \
100 push SKBDATA; \
101/* rsi already has offset */ \
102 mov $LEN,%ecx; /* len */ \
103 lea -12(%rbp),%rdx; \
104 call skb_copy_bits; \
105 test %eax,%eax; \
106 pop SKBDATA; \
107 pop %r9; \
108 pop %rdi
109
110
111bpf_slow_path_word:
112 bpf_slow_path_common(4)
113 js bpf_error
114 mov -12(%rbp),%eax
115 bswap %eax
116 ret
117
118bpf_slow_path_half:
119 bpf_slow_path_common(2)
120 js bpf_error
121 mov -12(%rbp),%ax
122 rol $8,%ax
123 movzwl %ax,%eax
124 ret
125
126bpf_slow_path_byte:
127 bpf_slow_path_common(1)
128 js bpf_error
129 movzbl -12(%rbp),%eax
130 ret
131
132bpf_slow_path_byte_msh:
 133 xchg %eax,%ebx /* don't lose A, X is about to be scratched */
134 bpf_slow_path_common(1)
135 js bpf_error
136 movzbl -12(%rbp),%eax
137 and $15,%al
138 shl $2,%al
139 xchg %eax,%ebx
140 ret
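
Each helper above follows the same shape: compare the requested offset against the linear headlen kept in r9d, read straight out of skb->data (cached in r8) when the bytes fit, and otherwise branch to a bpf_slow_path_* stub built around skb_copy_bits(); a negative offset or a failed copy ends up in bpf_error, which makes the whole filter return 0. A C restatement of sk_load_word, for illustration only (load_word is a hypothetical name, not part of the patch):

#include <linux/errno.h>
#include <linux/string.h>
#include <linux/skbuff.h>

static int load_word(const struct sk_buff *skb, int offset, u32 *val)
{
	unsigned int hlen = skb_headlen(skb);	/* skb->len - skb->data_len */
	__be32 raw;

	if (offset < 0)
		return -EFAULT;			/* bpf_error path */
	if (offset + sizeof(raw) <= hlen)
		memcpy(&raw, skb->data + offset, sizeof(raw));	/* fast path */
	else if (skb_copy_bits(skb, offset, &raw, sizeof(raw)) < 0)
		return -EFAULT;			/* slow path failed */
	*val = ntohl(raw);			/* A is kept in host order */
	return 0;
}
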
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
new file mode 100644
index 000000000000..bfab3fa10edc
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -0,0 +1,654 @@
1/* bpf_jit_comp.c : BPF JIT compiler
2 *
3 * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 */
10#include <linux/moduleloader.h>
11#include <asm/cacheflush.h>
12#include <linux/netdevice.h>
13#include <linux/filter.h>
14
15/*
16 * Conventions :
17 * EAX : BPF A accumulator
18 * EBX : BPF X accumulator
19 * RDI : pointer to skb (first argument given to JIT function)
20 * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n)
21 * ECX,EDX,ESI : scratch registers
22 * r9d : skb->len - skb->data_len (headlen)
23 * r8 : skb->data
24 * -8(RBP) : saved RBX value
25 * -16(RBP)..-80(RBP) : BPF_MEMWORDS values
26 */
27int bpf_jit_enable __read_mostly;
28
29/*
30 * assembly code in arch/x86/net/bpf_jit.S
31 */
32extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
33extern u8 sk_load_word_ind[], sk_load_half_ind[], sk_load_byte_ind[];
34
35static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
36{
37 if (len == 1)
38 *ptr = bytes;
39 else if (len == 2)
40 *(u16 *)ptr = bytes;
41 else {
42 *(u32 *)ptr = bytes;
43 barrier();
44 }
45 return ptr + len;
46}
47
48#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0)
49
50#define EMIT1(b1) EMIT(b1, 1)
51#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
52#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
53#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
54#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0)
55
56#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */
57#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */
58
59static inline bool is_imm8(int value)
60{
61 return value <= 127 && value >= -128;
62}
63
64static inline bool is_near(int offset)
65{
66 return offset <= 127 && offset >= -128;
67}
68
69#define EMIT_JMP(offset) \
70do { \
71 if (offset) { \
72 if (is_near(offset)) \
73 EMIT2(0xeb, offset); /* jmp .+off8 */ \
74 else \
75 EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \
76 } \
77} while (0)
78
79/* list of x86 cond jumps opcodes (. + s8)
80 * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
81 */
82#define X86_JB 0x72
83#define X86_JAE 0x73
84#define X86_JE 0x74
85#define X86_JNE 0x75
86#define X86_JBE 0x76
87#define X86_JA 0x77
88
89#define EMIT_COND_JMP(op, offset) \
90do { \
91 if (is_near(offset)) \
92 EMIT2(op, offset); /* jxx .+off8 */ \
93 else { \
94 EMIT2(0x0f, op + 0x10); \
95 EMIT(offset, 4); /* jxx .+off32 */ \
96 } \
97} while (0)
98
99#define COND_SEL(CODE, TOP, FOP) \
100 case CODE: \
101 t_op = TOP; \
102 f_op = FOP; \
103 goto cond_branch
104
105
106#define SEEN_DATAREF 1 /* might call external helpers */
107#define SEEN_XREG 2 /* ebx is used */
108#define SEEN_MEM 4 /* use mem[] for temporary storage */
109
110static inline void bpf_flush_icache(void *start, void *end)
111{
112 mm_segment_t old_fs = get_fs();
113
114 set_fs(KERNEL_DS);
115 smp_wmb();
116 flush_icache_range((unsigned long)start, (unsigned long)end);
117 set_fs(old_fs);
118}
119
120
121void bpf_jit_compile(struct sk_filter *fp)
122{
123 u8 temp[64];
124 u8 *prog;
125 unsigned int proglen, oldproglen = 0;
126 int ilen, i;
127 int t_offset, f_offset;
128 u8 t_op, f_op, seen = 0, pass;
129 u8 *image = NULL;
130 u8 *func;
131 int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */
132 unsigned int cleanup_addr; /* epilogue code offset */
133 unsigned int *addrs;
134 const struct sock_filter *filter = fp->insns;
135 int flen = fp->len;
136
137 if (!bpf_jit_enable)
138 return;
139
140 addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL);
141 if (addrs == NULL)
142 return;
143
 144 /* Before the first pass, make a rough estimate of addrs[]:
 145 * each bpf instruction is translated to less than 64 bytes
146 */
147 for (proglen = 0, i = 0; i < flen; i++) {
148 proglen += 64;
149 addrs[i] = proglen;
150 }
151 cleanup_addr = proglen; /* epilogue address */
152
153 for (pass = 0; pass < 10; pass++) {
154 /* no prologue/epilogue for trivial filters (RET something) */
155 proglen = 0;
156 prog = temp;
157
158 if (seen) {
159 EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */
160 EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */
161 /* note : must save %rbx in case bpf_error is hit */
162 if (seen & (SEEN_XREG | SEEN_DATAREF))
163 EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */
164 if (seen & SEEN_XREG)
 165 CLEAR_X(); /* make sure we don't leak kernel memory */
166
167 /*
168 * If this filter needs to access skb data,
 169 * load r9 and r8 with:
170 * r9 = skb->len - skb->data_len
171 * r8 = skb->data
172 */
173 if (seen & SEEN_DATAREF) {
174 if (offsetof(struct sk_buff, len) <= 127)
175 /* mov off8(%rdi),%r9d */
176 EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len));
177 else {
178 /* mov off32(%rdi),%r9d */
179 EMIT3(0x44, 0x8b, 0x8f);
180 EMIT(offsetof(struct sk_buff, len), 4);
181 }
182 if (is_imm8(offsetof(struct sk_buff, data_len)))
183 /* sub off8(%rdi),%r9d */
184 EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len));
185 else {
186 EMIT3(0x44, 0x2b, 0x8f);
187 EMIT(offsetof(struct sk_buff, data_len), 4);
188 }
189
190 if (is_imm8(offsetof(struct sk_buff, data)))
191 /* mov off8(%rdi),%r8 */
192 EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data));
193 else {
194 /* mov off32(%rdi),%r8 */
195 EMIT3(0x4c, 0x8b, 0x87);
196 EMIT(offsetof(struct sk_buff, data), 4);
197 }
198 }
199 }
200
201 switch (filter[0].code) {
202 case BPF_S_RET_K:
203 case BPF_S_LD_W_LEN:
204 case BPF_S_ANC_PROTOCOL:
205 case BPF_S_ANC_IFINDEX:
206 case BPF_S_ANC_MARK:
207 case BPF_S_ANC_RXHASH:
208 case BPF_S_ANC_CPU:
209 case BPF_S_ANC_QUEUE:
210 case BPF_S_LD_W_ABS:
211 case BPF_S_LD_H_ABS:
212 case BPF_S_LD_B_ABS:
213 /* first instruction sets A register (or is RET 'constant') */
214 break;
215 default:
 216 /* make sure we don't leak kernel information to user */
217 CLEAR_A(); /* A = 0 */
218 }
219
220 for (i = 0; i < flen; i++) {
221 unsigned int K = filter[i].k;
222
223 switch (filter[i].code) {
224 case BPF_S_ALU_ADD_X: /* A += X; */
225 seen |= SEEN_XREG;
226 EMIT2(0x01, 0xd8); /* add %ebx,%eax */
227 break;
228 case BPF_S_ALU_ADD_K: /* A += K; */
229 if (!K)
230 break;
231 if (is_imm8(K))
232 EMIT3(0x83, 0xc0, K); /* add imm8,%eax */
233 else
234 EMIT1_off32(0x05, K); /* add imm32,%eax */
235 break;
236 case BPF_S_ALU_SUB_X: /* A -= X; */
237 seen |= SEEN_XREG;
238 EMIT2(0x29, 0xd8); /* sub %ebx,%eax */
239 break;
240 case BPF_S_ALU_SUB_K: /* A -= K */
241 if (!K)
242 break;
243 if (is_imm8(K))
244 EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */
245 else
246 EMIT1_off32(0x2d, K); /* sub imm32,%eax */
247 break;
248 case BPF_S_ALU_MUL_X: /* A *= X; */
249 seen |= SEEN_XREG;
250 EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */
251 break;
252 case BPF_S_ALU_MUL_K: /* A *= K */
253 if (is_imm8(K))
254 EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */
255 else {
256 EMIT2(0x69, 0xc0); /* imul imm32,%eax */
257 EMIT(K, 4);
258 }
259 break;
260 case BPF_S_ALU_DIV_X: /* A /= X; */
261 seen |= SEEN_XREG;
262 EMIT2(0x85, 0xdb); /* test %ebx,%ebx */
263 if (pc_ret0 != -1)
264 EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4));
265 else {
266 EMIT_COND_JMP(X86_JNE, 2 + 5);
267 CLEAR_A();
268 EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */
269 }
270 EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */
271 break;
272 case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */
273 EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */
274 EMIT(K, 4);
275 EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */
276 break;
277 case BPF_S_ALU_AND_X:
278 seen |= SEEN_XREG;
279 EMIT2(0x21, 0xd8); /* and %ebx,%eax */
280 break;
281 case BPF_S_ALU_AND_K:
282 if (K >= 0xFFFFFF00) {
283 EMIT2(0x24, K & 0xFF); /* and imm8,%al */
284 } else if (K >= 0xFFFF0000) {
285 EMIT2(0x66, 0x25); /* and imm16,%ax */
286 EMIT2(K, 2);
287 } else {
288 EMIT1_off32(0x25, K); /* and imm32,%eax */
289 }
290 break;
291 case BPF_S_ALU_OR_X:
292 seen |= SEEN_XREG;
293 EMIT2(0x09, 0xd8); /* or %ebx,%eax */
294 break;
295 case BPF_S_ALU_OR_K:
296 if (is_imm8(K))
297 EMIT3(0x83, 0xc8, K); /* or imm8,%eax */
298 else
299 EMIT1_off32(0x0d, K); /* or imm32,%eax */
300 break;
301 case BPF_S_ALU_LSH_X: /* A <<= X; */
302 seen |= SEEN_XREG;
303 EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */
304 break;
305 case BPF_S_ALU_LSH_K:
306 if (K == 0)
307 break;
308 else if (K == 1)
309 EMIT2(0xd1, 0xe0); /* shl %eax */
310 else
311 EMIT3(0xc1, 0xe0, K);
312 break;
313 case BPF_S_ALU_RSH_X: /* A >>= X; */
314 seen |= SEEN_XREG;
315 EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */
316 break;
317 case BPF_S_ALU_RSH_K: /* A >>= K; */
318 if (K == 0)
319 break;
320 else if (K == 1)
321 EMIT2(0xd1, 0xe8); /* shr %eax */
322 else
323 EMIT3(0xc1, 0xe8, K);
324 break;
325 case BPF_S_ALU_NEG:
326 EMIT2(0xf7, 0xd8); /* neg %eax */
327 break;
328 case BPF_S_RET_K:
329 if (!K) {
330 if (pc_ret0 == -1)
331 pc_ret0 = i;
332 CLEAR_A();
333 } else {
334 EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
335 }
 336 /* fall through */
337 case BPF_S_RET_A:
338 if (seen) {
339 if (i != flen - 1) {
340 EMIT_JMP(cleanup_addr - addrs[i]);
341 break;
342 }
343 if (seen & SEEN_XREG)
344 EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */
345 EMIT1(0xc9); /* leaveq */
346 }
347 EMIT1(0xc3); /* ret */
348 break;
349 case BPF_S_MISC_TAX: /* X = A */
350 seen |= SEEN_XREG;
351 EMIT2(0x89, 0xc3); /* mov %eax,%ebx */
352 break;
353 case BPF_S_MISC_TXA: /* A = X */
354 seen |= SEEN_XREG;
355 EMIT2(0x89, 0xd8); /* mov %ebx,%eax */
356 break;
357 case BPF_S_LD_IMM: /* A = K */
358 if (!K)
359 CLEAR_A();
360 else
361 EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
362 break;
363 case BPF_S_LDX_IMM: /* X = K */
364 seen |= SEEN_XREG;
365 if (!K)
366 CLEAR_X();
367 else
368 EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */
369 break;
370 case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */
371 seen |= SEEN_MEM;
372 EMIT3(0x8b, 0x45, 0xf0 - K*4);
373 break;
374 case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */
375 seen |= SEEN_XREG | SEEN_MEM;
376 EMIT3(0x8b, 0x5d, 0xf0 - K*4);
377 break;
378 case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */
379 seen |= SEEN_MEM;
380 EMIT3(0x89, 0x45, 0xf0 - K*4);
381 break;
382 case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */
383 seen |= SEEN_XREG | SEEN_MEM;
384 EMIT3(0x89, 0x5d, 0xf0 - K*4);
385 break;
386 case BPF_S_LD_W_LEN: /* A = skb->len; */
387 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
388 if (is_imm8(offsetof(struct sk_buff, len)))
389 /* mov off8(%rdi),%eax */
390 EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len));
391 else {
392 EMIT2(0x8b, 0x87);
393 EMIT(offsetof(struct sk_buff, len), 4);
394 }
395 break;
396 case BPF_S_LDX_W_LEN: /* X = skb->len; */
397 seen |= SEEN_XREG;
398 if (is_imm8(offsetof(struct sk_buff, len)))
399 /* mov off8(%rdi),%ebx */
400 EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len));
401 else {
402 EMIT2(0x8b, 0x9f);
403 EMIT(offsetof(struct sk_buff, len), 4);
404 }
405 break;
406 case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */
407 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
408 if (is_imm8(offsetof(struct sk_buff, protocol))) {
409 /* movzwl off8(%rdi),%eax */
410 EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol));
411 } else {
412 EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
413 EMIT(offsetof(struct sk_buff, protocol), 4);
414 }
415 EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */
416 break;
417 case BPF_S_ANC_IFINDEX:
418 if (is_imm8(offsetof(struct sk_buff, dev))) {
419 /* movq off8(%rdi),%rax */
420 EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev));
421 } else {
422 EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */
423 EMIT(offsetof(struct sk_buff, dev), 4);
424 }
425 EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */
426 EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6));
427 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
428 EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */
429 EMIT(offsetof(struct net_device, ifindex), 4);
430 break;
431 case BPF_S_ANC_MARK:
432 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
433 if (is_imm8(offsetof(struct sk_buff, mark))) {
434 /* mov off8(%rdi),%eax */
435 EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark));
436 } else {
437 EMIT2(0x8b, 0x87);
438 EMIT(offsetof(struct sk_buff, mark), 4);
439 }
440 break;
441 case BPF_S_ANC_RXHASH:
442 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4);
443 if (is_imm8(offsetof(struct sk_buff, rxhash))) {
444 /* mov off8(%rdi),%eax */
445 EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash));
446 } else {
447 EMIT2(0x8b, 0x87);
448 EMIT(offsetof(struct sk_buff, rxhash), 4);
449 }
450 break;
451 case BPF_S_ANC_QUEUE:
452 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
453 if (is_imm8(offsetof(struct sk_buff, queue_mapping))) {
454 /* movzwl off8(%rdi),%eax */
455 EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping));
456 } else {
457 EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
458 EMIT(offsetof(struct sk_buff, queue_mapping), 4);
459 }
460 break;
461 case BPF_S_ANC_CPU:
462#ifdef CONFIG_SMP
463 EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */
464 EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */
465#else
466 CLEAR_A();
467#endif
468 break;
469 case BPF_S_LD_W_ABS:
470 func = sk_load_word;
471common_load: seen |= SEEN_DATAREF;
472 if ((int)K < 0)
473 goto out;
474 t_offset = func - (image + addrs[i]);
475 EMIT1_off32(0xbe, K); /* mov imm32,%esi */
476 EMIT1_off32(0xe8, t_offset); /* call */
477 break;
478 case BPF_S_LD_H_ABS:
479 func = sk_load_half;
480 goto common_load;
481 case BPF_S_LD_B_ABS:
482 func = sk_load_byte;
483 goto common_load;
484 case BPF_S_LDX_B_MSH:
485 if ((int)K < 0) {
486 if (pc_ret0 != -1) {
487 EMIT_JMP(addrs[pc_ret0] - addrs[i]);
488 break;
489 }
490 CLEAR_A();
491 EMIT_JMP(cleanup_addr - addrs[i]);
492 break;
493 }
494 seen |= SEEN_DATAREF | SEEN_XREG;
495 t_offset = sk_load_byte_msh - (image + addrs[i]);
496 EMIT1_off32(0xbe, K); /* mov imm32,%esi */
497 EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */
498 break;
499 case BPF_S_LD_W_IND:
500 func = sk_load_word_ind;
501common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG;
502 t_offset = func - (image + addrs[i]);
503 EMIT1_off32(0xbe, K); /* mov imm32,%esi */
504 EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */
505 break;
506 case BPF_S_LD_H_IND:
507 func = sk_load_half_ind;
508 goto common_load_ind;
509 case BPF_S_LD_B_IND:
510 func = sk_load_byte_ind;
511 goto common_load_ind;
512 case BPF_S_JMP_JA:
513 t_offset = addrs[i + K] - addrs[i];
514 EMIT_JMP(t_offset);
515 break;
516 COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE);
517 COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB);
518 COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE);
519 COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE);
520 COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE);
521 COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB);
522 COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE);
523 COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE);
524
525cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
526 t_offset = addrs[i + filter[i].jt] - addrs[i];
527
528 /* same targets, can avoid doing the test :) */
529 if (filter[i].jt == filter[i].jf) {
530 EMIT_JMP(t_offset);
531 break;
532 }
533
534 switch (filter[i].code) {
535 case BPF_S_JMP_JGT_X:
536 case BPF_S_JMP_JGE_X:
537 case BPF_S_JMP_JEQ_X:
538 seen |= SEEN_XREG;
539 EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */
540 break;
541 case BPF_S_JMP_JSET_X:
542 seen |= SEEN_XREG;
543 EMIT2(0x85, 0xd8); /* test %ebx,%eax */
544 break;
545 case BPF_S_JMP_JEQ_K:
546 if (K == 0) {
547 EMIT2(0x85, 0xc0); /* test %eax,%eax */
548 break;
549 }
550 case BPF_S_JMP_JGT_K:
551 case BPF_S_JMP_JGE_K:
552 if (K <= 127)
553 EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */
554 else
555 EMIT1_off32(0x3d, K); /* cmp imm32,%eax */
556 break;
557 case BPF_S_JMP_JSET_K:
558 if (K <= 0xFF)
559 EMIT2(0xa8, K); /* test imm8,%al */
560 else if (!(K & 0xFFFF00FF))
561 EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */
562 else if (K <= 0xFFFF) {
563 EMIT2(0x66, 0xa9); /* test imm16,%ax */
564 EMIT(K, 2);
565 } else {
566 EMIT1_off32(0xa9, K); /* test imm32,%eax */
567 }
568 break;
569 }
570 if (filter[i].jt != 0) {
571 if (filter[i].jf)
572 t_offset += is_near(f_offset) ? 2 : 6;
573 EMIT_COND_JMP(t_op, t_offset);
574 if (filter[i].jf)
575 EMIT_JMP(f_offset);
576 break;
577 }
578 EMIT_COND_JMP(f_op, f_offset);
579 break;
580 default:
581 /* hmm, too complex filter, give up with jit compiler */
582 goto out;
583 }
584 ilen = prog - temp;
585 if (image) {
586 if (unlikely(proglen + ilen > oldproglen)) {
 587 pr_err("bpf_jit_compile fatal error\n");
588 kfree(addrs);
589 module_free(NULL, image);
590 return;
591 }
592 memcpy(image + proglen, temp, ilen);
593 }
594 proglen += ilen;
595 addrs[i] = proglen;
596 prog = temp;
597 }
598 /* last bpf instruction is always a RET :
599 * use it to give the cleanup instruction(s) addr
600 */
601 cleanup_addr = proglen - 1; /* ret */
602 if (seen)
603 cleanup_addr -= 1; /* leaveq */
604 if (seen & SEEN_XREG)
605 cleanup_addr -= 4; /* mov -8(%rbp),%rbx */
606
607 if (image) {
608 WARN_ON(proglen != oldproglen);
609 break;
610 }
611 if (proglen == oldproglen) {
612 image = module_alloc(max_t(unsigned int,
613 proglen,
614 sizeof(struct work_struct)));
615 if (!image)
616 goto out;
617 }
618 oldproglen = proglen;
619 }
620 if (bpf_jit_enable > 1)
621 pr_err("flen=%d proglen=%u pass=%d image=%p\n",
622 flen, proglen, pass, image);
623
624 if (image) {
625 if (bpf_jit_enable > 1)
626 print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS,
627 16, 1, image, proglen, false);
628
629 bpf_flush_icache(image, image + proglen);
630
631 fp->bpf_func = (void *)image;
632 }
633out:
634 kfree(addrs);
635 return;
636}
637
638static void jit_free_defer(struct work_struct *arg)
639{
640 module_free(NULL, arg);
641}
642
 643/* This can run from softirq context, so we must use a work_struct
 644 * to call module_free() from process context
645 */
646void bpf_jit_free(struct sk_filter *fp)
647{
648 if (fp->bpf_func != sk_run_filter) {
649 struct work_struct *work = (struct work_struct *)fp->bpf_func;
650
651 INIT_WORK(work, jit_free_defer);
652 schedule_work(work);
653 }
654}
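
The code generator above is driven entirely by the EMIT*() macros near the top of the file: opcode bytes are packed into an integer low byte first and written out through emit_code(), so every case in the big switch is spelling out x86 machine code byte by byte. The JIT stays disabled unless the net.core.bpf_jit_enable sysctl is set, and a value greater than 1 additionally hex-dumps the generated image, as the bpf_jit_enable > 1 branches show. A standalone toy that mimics the packing, assuming a little-endian host (which the x86 target guarantees):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Mimics the kernel's emit_code(): the low byte of 'bytes' goes out first. */
static uint8_t *emit_code(uint8_t *ptr, uint32_t bytes, unsigned int len)
{
	memcpy(ptr, &bytes, len);		/* little-endian host assumed */
	return ptr + len;
}

int main(void)
{
	uint8_t image[16], *prog = image;
	unsigned int K = 5;

	/* EMIT3(0x83, 0xc0, K) from the JIT: "add $K,%eax" */
	prog = emit_code(prog, 0x83 + (0xc0 << 8) + (K << 16), 3);

	for (uint8_t *p = image; p < prog; p++)
		printf("%02x ", *p);		/* prints: 83 c0 05 */
	printf("\n");
	return 0;
}
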
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 2d49d4e19a36..a5b64ab4cd6e 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -16,17 +16,6 @@
16#include <asm/stacktrace.h> 16#include <asm/stacktrace.h>
17#include <linux/compat.h> 17#include <linux/compat.h>
18 18
19static void backtrace_warning_symbol(void *data, char *msg,
20 unsigned long symbol)
21{
22 /* Ignore warnings */
23}
24
25static void backtrace_warning(void *data, char *msg)
26{
27 /* Ignore warnings */
28}
29
30static int backtrace_stack(void *data, char *name) 19static int backtrace_stack(void *data, char *name)
31{ 20{
32 /* Yes, we want all stacks */ 21 /* Yes, we want all stacks */
@@ -42,8 +31,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
42} 31}
43 32
44static struct stacktrace_ops backtrace_ops = { 33static struct stacktrace_ops backtrace_ops = {
45 .warning = backtrace_warning,
46 .warning_symbol = backtrace_warning_symbol,
47 .stack = backtrace_stack, 34 .stack = backtrace_stack,
48 .address = backtrace_address, 35 .address = backtrace_address,
49 .walk_stack = print_context_stack, 36 .walk_stack = print_context_stack,
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index bd33620b0071..e6fd8473fb7b 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -280,12 +280,9 @@ void __init pci_direct_init(int type)
280 280
281int __init pci_direct_probe(void) 281int __init pci_direct_probe(void)
282{ 282{
283 struct resource *region, *region2;
284
285 if ((pci_probe & PCI_PROBE_CONF1) == 0) 283 if ((pci_probe & PCI_PROBE_CONF1) == 0)
286 goto type2; 284 goto type2;
287 region = request_region(0xCF8, 8, "PCI conf1"); 285 if (!request_region(0xCF8, 8, "PCI conf1"))
288 if (!region)
289 goto type2; 286 goto type2;
290 287
291 if (pci_check_type1()) { 288 if (pci_check_type1()) {
@@ -293,16 +290,14 @@ int __init pci_direct_probe(void)
293 port_cf9_safe = true; 290 port_cf9_safe = true;
294 return 1; 291 return 1;
295 } 292 }
296 release_resource(region); 293 release_region(0xCF8, 8);
297 294
298 type2: 295 type2:
299 if ((pci_probe & PCI_PROBE_CONF2) == 0) 296 if ((pci_probe & PCI_PROBE_CONF2) == 0)
300 return 0; 297 return 0;
301 region = request_region(0xCF8, 4, "PCI conf2"); 298 if (!request_region(0xCF8, 4, "PCI conf2"))
302 if (!region)
303 return 0; 299 return 0;
304 region2 = request_region(0xC000, 0x1000, "PCI conf2"); 300 if (!request_region(0xC000, 0x1000, "PCI conf2"))
305 if (!region2)
306 goto fail2; 301 goto fail2;
307 302
308 if (pci_check_type2()) { 303 if (pci_check_type2()) {
@@ -311,8 +306,8 @@ int __init pci_direct_probe(void)
311 return 2; 306 return 2;
312 } 307 }
313 308
314 release_resource(region2); 309 release_region(0xC000, 0x1000);
315 fail2: 310 fail2:
316 release_resource(region); 311 release_region(0xCF8, 4);
317 return 0; 312 return 0;
318} 313}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 8201165bae28..372e9b8989b3 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -602,7 +602,9 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
602 || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN && 602 || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN &&
603 device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX) 603 device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)
604 || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN && 604 || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN &&
605 device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)) { 605 device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)
606 || (device >= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MIN &&
607 device <= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MAX)) {
606 r->name = "PIIX/ICH"; 608 r->name = "PIIX/ICH";
607 r->get = pirq_piix_get; 609 r->get = pirq_piix_get;
608 r->set = pirq_piix_set; 610 r->set = pirq_piix_set;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index e282886616a0..750c346ef50a 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -606,6 +606,16 @@ static void __init __pci_mmcfg_init(int early)
606 if (list_empty(&pci_mmcfg_list)) 606 if (list_empty(&pci_mmcfg_list))
607 return; 607 return;
608 608
609 if (pcibios_last_bus < 0) {
610 const struct pci_mmcfg_region *cfg;
611
612 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
613 if (cfg->segment)
614 break;
615 pcibios_last_bus = cfg->end_bus;
616 }
617 }
618
609 if (pci_mmcfg_arch_init()) 619 if (pci_mmcfg_arch_init())
610 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 620 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
611 else { 621 else {
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index e37b407a0ee8..8214724ce54d 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -108,7 +108,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
108 } 108 }
109 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0, 109 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
110 (type == PCI_CAP_ID_MSIX) ? 110 (type == PCI_CAP_ID_MSIX) ?
111 "msi-x" : "msi"); 111 "msi-x" : "msi",
112 DOMID_SELF);
112 if (irq < 0) 113 if (irq < 0)
113 goto error; 114 goto error;
114 dev_dbg(&dev->dev, 115 dev_dbg(&dev->dev,
@@ -148,7 +149,8 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
148 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, 149 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
149 (type == PCI_CAP_ID_MSIX) ? 150 (type == PCI_CAP_ID_MSIX) ?
150 "pcifront-msi-x" : 151 "pcifront-msi-x" :
151 "pcifront-msi"); 152 "pcifront-msi",
153 DOMID_SELF);
152 if (irq < 0) 154 if (irq < 0)
153 goto free; 155 goto free;
154 i++; 156 i++;
@@ -190,9 +192,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
190 192
191 list_for_each_entry(msidesc, &dev->msi_list, list) { 193 list_for_each_entry(msidesc, &dev->msi_list, list) {
192 struct physdev_map_pirq map_irq; 194 struct physdev_map_pirq map_irq;
195 domid_t domid;
196
197 domid = ret = xen_find_device_domain_owner(dev);
198 /* N.B. Casting int's -ENODEV to uint16_t results in 0xFFED,
199 * hence check ret value for < 0. */
200 if (ret < 0)
201 domid = DOMID_SELF;
193 202
194 memset(&map_irq, 0, sizeof(map_irq)); 203 memset(&map_irq, 0, sizeof(map_irq));
195 map_irq.domid = DOMID_SELF; 204 map_irq.domid = domid;
196 map_irq.type = MAP_PIRQ_TYPE_MSI; 205 map_irq.type = MAP_PIRQ_TYPE_MSI;
197 map_irq.index = -1; 206 map_irq.index = -1;
198 map_irq.pirq = -1; 207 map_irq.pirq = -1;
@@ -215,14 +224,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
215 224
216 ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); 225 ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
217 if (ret) { 226 if (ret) {
218 dev_warn(&dev->dev, "xen map irq failed %d\n", ret); 227 dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n",
228 ret, domid);
219 goto out; 229 goto out;
220 } 230 }
221 231
222 ret = xen_bind_pirq_msi_to_irq(dev, msidesc, 232 ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
223 map_irq.pirq, map_irq.index, 233 map_irq.pirq, map_irq.index,
224 (type == PCI_CAP_ID_MSIX) ? 234 (type == PCI_CAP_ID_MSIX) ?
225 "msi-x" : "msi"); 235 "msi-x" : "msi",
236 domid);
226 if (ret < 0) 237 if (ret < 0)
227 goto out; 238 goto out;
228 } 239 }
@@ -461,3 +472,78 @@ void __init xen_setup_pirqs(void)
461 } 472 }
462} 473}
463#endif 474#endif
475
476#ifdef CONFIG_XEN_DOM0
477struct xen_device_domain_owner {
478 domid_t domain;
479 struct pci_dev *dev;
480 struct list_head list;
481};
482
483static DEFINE_SPINLOCK(dev_domain_list_spinlock);
484static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
485
486static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
487{
488 struct xen_device_domain_owner *owner;
489
490 list_for_each_entry(owner, &dev_domain_list, list) {
491 if (owner->dev == dev)
492 return owner;
493 }
494 return NULL;
495}
496
497int xen_find_device_domain_owner(struct pci_dev *dev)
498{
499 struct xen_device_domain_owner *owner;
500 int domain = -ENODEV;
501
502 spin_lock(&dev_domain_list_spinlock);
503 owner = find_device(dev);
504 if (owner)
505 domain = owner->domain;
506 spin_unlock(&dev_domain_list_spinlock);
507 return domain;
508}
509EXPORT_SYMBOL_GPL(xen_find_device_domain_owner);
510
511int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
512{
513 struct xen_device_domain_owner *owner;
514
515 owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
516 if (!owner)
517 return -ENODEV;
518
519 spin_lock(&dev_domain_list_spinlock);
520 if (find_device(dev)) {
521 spin_unlock(&dev_domain_list_spinlock);
522 kfree(owner);
523 return -EEXIST;
524 }
525 owner->domain = domain;
526 owner->dev = dev;
527 list_add_tail(&owner->list, &dev_domain_list);
528 spin_unlock(&dev_domain_list_spinlock);
529 return 0;
530}
531EXPORT_SYMBOL_GPL(xen_register_device_domain_owner);
532
533int xen_unregister_device_domain_owner(struct pci_dev *dev)
534{
535 struct xen_device_domain_owner *owner;
536
537 spin_lock(&dev_domain_list_spinlock);
538 owner = find_device(dev);
539 if (!owner) {
540 spin_unlock(&dev_domain_list_spinlock);
541 return -ENODEV;
542 }
543 list_del(&owner->list);
544 spin_unlock(&dev_domain_list_spinlock);
545 kfree(owner);
546 return 0;
547}
548EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner);
549#endif
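
The dev_domain_list registry added above is what lets xen_initdom_setup_msi_irqs() target a guest domain instead of dom0: xen_find_device_domain_owner() returns the recorded owner, or -ENODEV, in which case the MSI setup falls back to DOMID_SELF. The registering side is not part of this patch; a backend that assigns a PCI device to a guest (xen-pciback is the expected caller) might do something along these lines, assuming the new functions are exported via <asm/xen/pci.h> (assign_to_guest is a hypothetical name):

#include <linux/pci.h>
#include <asm/xen/pci.h>	/* xen_register_device_domain_owner() */

static int assign_to_guest(struct pci_dev *dev, uint16_t guest_domid)
{
	int err = xen_register_device_domain_owner(dev, guest_domid);

	if (err)
		return err;	/* -EEXIST if the device already has an owner */
	/* ... hand the device to the guest via pciback/xenstore ... */
	return 0;
}
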
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index dc701ea58546..e70be38ce039 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -74,6 +74,7 @@
74 compatible = "intel,ce4100-pci", "pci"; 74 compatible = "intel,ce4100-pci", "pci";
75 device_type = "pci"; 75 device_type = "pci";
76 bus-range = <1 1>; 76 bus-range = <1 1>;
77 reg = <0x0800 0x0 0x0 0x0 0x0>;
77 ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>; 78 ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>;
78 79
79 interrupt-parent = <&ioapic2>; 80 interrupt-parent = <&ioapic2>;
@@ -346,7 +347,7 @@
346 "pciclass0c03"; 347 "pciclass0c03";
347 348
348 reg = <0x16800 0x0 0x0 0x0 0x0>; 349 reg = <0x16800 0x0 0x0 0x0 0x0>;
349 interrupts = <22 3>; 350 interrupts = <22 1>;
350 }; 351 };
351 352
352 usb@d,1 { 353 usb@d,1 {
@@ -356,7 +357,7 @@
356 "pciclass0c03"; 357 "pciclass0c03";
357 358
358 reg = <0x16900 0x0 0x0 0x0 0x0>; 359 reg = <0x16900 0x0 0x0 0x0 0x0>;
359 interrupts = <22 3>; 360 interrupts = <22 1>;
360 }; 361 };
361 362
362 sata@e,0 { 363 sata@e,0 {
@@ -366,7 +367,7 @@
366 "pciclass0106"; 367 "pciclass0106";
367 368
368 reg = <0x17000 0x0 0x0 0x0 0x0>; 369 reg = <0x17000 0x0 0x0 0x0 0x0>;
369 interrupts = <23 3>; 370 interrupts = <23 1>;
370 }; 371 };
371 372
372 flash@f,0 { 373 flash@f,0 {
@@ -412,6 +413,7 @@
412 #address-cells = <2>; 413 #address-cells = <2>;
413 #size-cells = <1>; 414 #size-cells = <1>;
414 compatible = "isa"; 415 compatible = "isa";
416 reg = <0xf800 0x0 0x0 0x0 0x0>;
415 ranges = <1 0 0 0 0 0x100>; 417 ranges = <1 0 0 0 0 0x100>;
416 418
417 rtc@70 { 419 rtc@70 {
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 0fe27d7c6258..b30aa26a8df2 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -145,17 +145,6 @@ static void virt_efi_reset_system(int reset_type,
145 data_size, data); 145 data_size, data);
146} 146}
147 147
148static efi_status_t virt_efi_set_virtual_address_map(
149 unsigned long memory_map_size,
150 unsigned long descriptor_size,
151 u32 descriptor_version,
152 efi_memory_desc_t *virtual_map)
153{
154 return efi_call_virt4(set_virtual_address_map,
155 memory_map_size, descriptor_size,
156 descriptor_version, virtual_map);
157}
158
159static efi_status_t __init phys_efi_set_virtual_address_map( 148static efi_status_t __init phys_efi_set_virtual_address_map(
160 unsigned long memory_map_size, 149 unsigned long memory_map_size,
161 unsigned long descriptor_size, 150 unsigned long descriptor_size,
@@ -468,11 +457,25 @@ void __init efi_init(void)
468#endif 457#endif
469} 458}
470 459
460void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
461{
462 u64 addr, npages;
463
464 addr = md->virt_addr;
465 npages = md->num_pages;
466
467 memrange_efi_to_native(&addr, &npages);
468
469 if (executable)
470 set_memory_x(addr, npages);
471 else
472 set_memory_nx(addr, npages);
473}
474
471static void __init runtime_code_page_mkexec(void) 475static void __init runtime_code_page_mkexec(void)
472{ 476{
473 efi_memory_desc_t *md; 477 efi_memory_desc_t *md;
474 void *p; 478 void *p;
475 u64 addr, npages;
476 479
477 /* Make EFI runtime service code area executable */ 480 /* Make EFI runtime service code area executable */
478 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 481 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
@@ -481,10 +484,7 @@ static void __init runtime_code_page_mkexec(void)
481 if (md->type != EFI_RUNTIME_SERVICES_CODE) 484 if (md->type != EFI_RUNTIME_SERVICES_CODE)
482 continue; 485 continue;
483 486
484 addr = md->virt_addr; 487 efi_set_executable(md, true);
485 npages = md->num_pages;
486 memrange_efi_to_native(&addr, &npages);
487 set_memory_x(addr, npages);
488 } 488 }
489} 489}
490 490
@@ -498,13 +498,42 @@ static void __init runtime_code_page_mkexec(void)
498 */ 498 */
499void __init efi_enter_virtual_mode(void) 499void __init efi_enter_virtual_mode(void)
500{ 500{
501 efi_memory_desc_t *md; 501 efi_memory_desc_t *md, *prev_md = NULL;
502 efi_status_t status; 502 efi_status_t status;
503 unsigned long size; 503 unsigned long size;
504 u64 end, systab, addr, npages, end_pfn; 504 u64 end, systab, addr, npages, end_pfn;
505 void *p, *va; 505 void *p, *va, *new_memmap = NULL;
506 int count = 0;
506 507
507 efi.systab = NULL; 508 efi.systab = NULL;
509
510 /* Merge contiguous regions of the same type and attribute */
511 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
512 u64 prev_size;
513 md = p;
514
515 if (!prev_md) {
516 prev_md = md;
517 continue;
518 }
519
520 if (prev_md->type != md->type ||
521 prev_md->attribute != md->attribute) {
522 prev_md = md;
523 continue;
524 }
525
526 prev_size = prev_md->num_pages << EFI_PAGE_SHIFT;
527
528 if (md->phys_addr == (prev_md->phys_addr + prev_size)) {
529 prev_md->num_pages += md->num_pages;
530 md->type = EFI_RESERVED_TYPE;
531 md->attribute = 0;
532 continue;
533 }
534 prev_md = md;
535 }
536
508 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 537 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
509 md = p; 538 md = p;
510 if (!(md->attribute & EFI_MEMORY_RUNTIME)) 539 if (!(md->attribute & EFI_MEMORY_RUNTIME))
@@ -541,15 +570,21 @@ void __init efi_enter_virtual_mode(void)
541 systab += md->virt_addr - md->phys_addr; 570 systab += md->virt_addr - md->phys_addr;
542 efi.systab = (efi_system_table_t *) (unsigned long) systab; 571 efi.systab = (efi_system_table_t *) (unsigned long) systab;
543 } 572 }
573 new_memmap = krealloc(new_memmap,
574 (count + 1) * memmap.desc_size,
575 GFP_KERNEL);
576 memcpy(new_memmap + (count * memmap.desc_size), md,
577 memmap.desc_size);
578 count++;
544 } 579 }
545 580
546 BUG_ON(!efi.systab); 581 BUG_ON(!efi.systab);
547 582
548 status = phys_efi_set_virtual_address_map( 583 status = phys_efi_set_virtual_address_map(
549 memmap.desc_size * memmap.nr_map, 584 memmap.desc_size * count,
550 memmap.desc_size, 585 memmap.desc_size,
551 memmap.desc_version, 586 memmap.desc_version,
552 memmap.phys_map); 587 (efi_memory_desc_t *)__pa(new_memmap));
553 588
554 if (status != EFI_SUCCESS) { 589 if (status != EFI_SUCCESS) {
555 printk(KERN_ALERT "Unable to switch EFI into virtual mode " 590 printk(KERN_ALERT "Unable to switch EFI into virtual mode "
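Aside on the new_memmap accumulation a few lines above (not part of the patch itself): the loop grows the buffer with krealloc() and appends one descriptor per runtime region before handing __pa(new_memmap) to the firmware. A minimal, generic sketch of that append pattern, with the NULL check the patch omits added back in:

#include <linux/slab.h>
#include <linux/string.h>

/* Append one fixed-size record to a buffer grown with krealloc().
 * Returns the (possibly relocated) buffer; *count advances on success. */
static void *append_record(void *buf, size_t *count, size_t rec_size,
			   const void *rec)
{
	void *nbuf = krealloc(buf, (*count + 1) * rec_size, GFP_KERNEL);

	if (!nbuf)
		return buf;		/* keep the old buffer; record dropped */

	memcpy(nbuf + *count * rec_size, rec, rec_size);
	(*count)++;
	return nbuf;
}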
@@ -572,11 +607,12 @@ void __init efi_enter_virtual_mode(void)
572 efi.set_variable = virt_efi_set_variable; 607 efi.set_variable = virt_efi_set_variable;
573 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; 608 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
574 efi.reset_system = virt_efi_reset_system; 609 efi.reset_system = virt_efi_reset_system;
575 efi.set_virtual_address_map = virt_efi_set_virtual_address_map; 610 efi.set_virtual_address_map = NULL;
576 if (__supported_pte_mask & _PAGE_NX) 611 if (__supported_pte_mask & _PAGE_NX)
577 runtime_code_page_mkexec(); 612 runtime_code_page_mkexec();
578 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); 613 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
579 memmap.map = NULL; 614 memmap.map = NULL;
615 kfree(new_memmap);
580} 616}
581 617
582/* 618/*
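A standalone sketch of the merge pass added to efi_enter_virtual_mode() above, written over a plain array instead of the byte-stride descriptor walk and with illustrative type names; the kernel variant instead marks absorbed descriptors EFI_RESERVED_TYPE in place and filters them out later when new_memmap is built:

#include <linux/types.h>

#define PAGE_SHIFT_4K	12		/* EFI_PAGE_SHIFT */

struct region {				/* stand-in for efi_memory_desc_t */
	u32 type;
	u64 attribute;
	u64 phys_addr;
	u64 num_pages;
};

/* Coalesce physically contiguous regions of identical type and attribute.
 * Returns how many regions remain; survivors are compacted to the front. */
static size_t merge_regions(struct region *r, size_t n)
{
	size_t out = 0, i;

	for (i = 0; i < n; i++) {
		if (out &&
		    r[out - 1].type == r[i].type &&
		    r[out - 1].attribute == r[i].attribute &&
		    r[out - 1].phys_addr +
			(r[out - 1].num_pages << PAGE_SHIFT_4K) == r[i].phys_addr) {
			r[out - 1].num_pages += r[i].num_pages;	/* absorb into predecessor */
			continue;
		}
		r[out++] = r[i];
	}
	return out;
}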
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac0621a7ac3d..2649426a7905 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -41,22 +41,7 @@
41static pgd_t save_pgd __initdata; 41static pgd_t save_pgd __initdata;
42static unsigned long efi_flags __initdata; 42static unsigned long efi_flags __initdata;
43 43
44static void __init early_mapping_set_exec(unsigned long start, 44static void __init early_code_mapping_set_exec(int executable)
45 unsigned long end,
46 int executable)
47{
48 unsigned long num_pages;
49
50 start &= PMD_MASK;
51 end = (end + PMD_SIZE - 1) & PMD_MASK;
52 num_pages = (end - start) >> PAGE_SHIFT;
53 if (executable)
54 set_memory_x((unsigned long)__va(start), num_pages);
55 else
56 set_memory_nx((unsigned long)__va(start), num_pages);
57}
58
59static void __init early_runtime_code_mapping_set_exec(int executable)
60{ 45{
61 efi_memory_desc_t *md; 46 efi_memory_desc_t *md;
62 void *p; 47 void *p;
@@ -67,11 +52,8 @@ static void __init early_runtime_code_mapping_set_exec(int executable)
67 /* Make EFI runtime service code area executable */ 52 /* Make EFI runtime service code area executable */
68 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 53 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
69 md = p; 54 md = p;
70 if (md->type == EFI_RUNTIME_SERVICES_CODE) { 55 if (md->type == EFI_RUNTIME_SERVICES_CODE)
71 unsigned long end; 56 efi_set_executable(md, executable);
72 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
73 early_mapping_set_exec(md->phys_addr, end, executable);
74 }
75 } 57 }
76} 58}
77 59
@@ -79,7 +61,7 @@ void __init efi_call_phys_prelog(void)
79{ 61{
80 unsigned long vaddress; 62 unsigned long vaddress;
81 63
82 early_runtime_code_mapping_set_exec(1); 64 early_code_mapping_set_exec(1);
83 local_irq_save(efi_flags); 65 local_irq_save(efi_flags);
84 vaddress = (unsigned long)__va(0x0UL); 66 vaddress = (unsigned long)__va(0x0UL);
85 save_pgd = *pgd_offset_k(0x0UL); 67 save_pgd = *pgd_offset_k(0x0UL);
@@ -95,7 +77,7 @@ void __init efi_call_phys_epilog(void)
95 set_pgd(pgd_offset_k(0x0UL), save_pgd); 77 set_pgd(pgd_offset_k(0x0UL), save_pgd);
96 __flush_tlb_all(); 78 __flush_tlb_all();
97 local_irq_restore(efi_flags); 79 local_irq_restore(efi_flags);
98 early_runtime_code_mapping_set_exec(0); 80 early_code_mapping_set_exec(0);
99} 81}
100 82
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, 83void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
@@ -107,8 +89,10 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
107 return ioremap(phys_addr, size); 89 return ioremap(phys_addr, size);
108 90
109 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); 91 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
110 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) 92 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) {
111 return NULL; 93 unsigned long top = last_map_pfn << PAGE_SHIFT;
94 efi_ioremap(top, size - (top - phys_addr), type);
95 }
112 96
113 return (void __iomem *)__va(phys_addr); 97 return (void __iomem *)__va(phys_addr);
114} 98}
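The efi_ioremap() change above retries when init_memory_mapping() stops short of the requested end, so the remainder still lands in the direct map before __va() is returned. A hedged sketch of the same idea written as a loop rather than the patch's tail recursion; init_memory_mapping() is the kernel routine used by the patch, and the forward-progress guard is an addition here, not something the patch contains:

static void __init map_range_fully(unsigned long phys, unsigned long size)
{
	unsigned long end = phys + size;

	while (phys < end) {
		unsigned long top = init_memory_mapping(phys, end) << PAGE_SHIFT;

		if (top <= phys)	/* no forward progress: give up */
			break;
		phys = top;		/* map whatever is still uncovered */
	}
}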
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 5c0207bf959b..7000e74b3087 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -97,11 +97,11 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table)
97 pentry->freq_hz, pentry->irq); 97 pentry->freq_hz, pentry->irq);
98 if (!pentry->irq) 98 if (!pentry->irq)
99 continue; 99 continue;
100 mp_irq.type = MP_IOAPIC; 100 mp_irq.type = MP_INTSRC;
101 mp_irq.irqtype = mp_INT; 101 mp_irq.irqtype = mp_INT;
102/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ 102/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
103 mp_irq.irqflag = 5; 103 mp_irq.irqflag = 5;
104 mp_irq.srcbus = 0; 104 mp_irq.srcbus = MP_BUS_ISA;
105 mp_irq.srcbusirq = pentry->irq; /* IRQ */ 105 mp_irq.srcbusirq = pentry->irq; /* IRQ */
106 mp_irq.dstapic = MP_APIC_ALL; 106 mp_irq.dstapic = MP_APIC_ALL;
107 mp_irq.dstirq = pentry->irq; 107 mp_irq.dstirq = pentry->irq;
@@ -168,10 +168,10 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
168 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { 168 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
169 pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", 169 pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
170 totallen, (u32)pentry->phys_addr, pentry->irq); 170 totallen, (u32)pentry->phys_addr, pentry->irq);
171 mp_irq.type = MP_IOAPIC; 171 mp_irq.type = MP_INTSRC;
172 mp_irq.irqtype = mp_INT; 172 mp_irq.irqtype = mp_INT;
173 mp_irq.irqflag = 0xf; /* level trigger and active low */ 173 mp_irq.irqflag = 0xf; /* level trigger and active low */
174 mp_irq.srcbus = 0; 174 mp_irq.srcbus = MP_BUS_ISA;
175 mp_irq.srcbusirq = pentry->irq; /* IRQ */ 175 mp_irq.srcbusirq = pentry->irq; /* IRQ */
176 mp_irq.dstapic = MP_APIC_ALL; 176 mp_irq.dstapic = MP_APIC_ALL;
177 mp_irq.dstirq = pentry->irq; 177 mp_irq.dstirq = pentry->irq;
@@ -194,7 +194,7 @@ static unsigned long __init mrst_calibrate_tsc(void)
194 return 0; 194 return 0;
195} 195}
196 196
197void __init mrst_time_init(void) 197static void __init mrst_time_init(void)
198{ 198{
199 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); 199 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
200 switch (mrst_timer_options) { 200 switch (mrst_timer_options) {
@@ -216,7 +216,7 @@ void __init mrst_time_init(void)
216 apbt_time_init(); 216 apbt_time_init();
217} 217}
218 218
219void __cpuinit mrst_arch_setup(void) 219static void __cpuinit mrst_arch_setup(void)
220{ 220{
221 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) 221 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
222 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; 222 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
@@ -282,7 +282,7 @@ void __init x86_mrst_early_setup(void)
282 /* Avoid searching for BIOS MP tables */ 282 /* Avoid searching for BIOS MP tables */
283 x86_init.mpparse.find_smp_config = x86_init_noop; 283 x86_init.mpparse.find_smp_config = x86_init_noop;
284 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 284 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
285 285 set_bit(MP_BUS_ISA, mp_bus_not_pci);
286} 286}
287 287
288/* 288/*
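The set_bit(MP_BUS_ISA, mp_bus_not_pci) added above records that the bus the new MP_INTSRC entries point at is not PCI, so the IO-APIC setup code applies ISA-style routing defaults to those sources. A small illustrative sketch of that bookkeeping; the bound and names below are made up, only DECLARE_BITMAP() and set_bit() are real kernel APIs:

#include <linux/bitops.h>
#include <linux/types.h>

#define MAX_MP_BUSSES_EX 260			/* illustrative bound */
static DECLARE_BITMAP(bus_not_pci, MAX_MP_BUSSES_EX);

static inline void mark_bus_isa(int busid)
{
	set_bit(busid, bus_not_pci);		/* mirrors set_bit(MP_BUS_ISA, mp_bus_not_pci) */
}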
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index c2a8cab65e5d..81c5e2165c24 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,4 +1,2 @@
1obj-$(CONFIG_OLPC) += olpc.o 1obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o
2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o 2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
3obj-$(CONFIG_OLPC) += olpc_ofw.o
4obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index edaf3fe8dc5e..0060fd59ea00 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -18,6 +18,7 @@
18#include <linux/io.h> 18#include <linux/io.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/platform_device.h> 20#include <linux/platform_device.h>
21#include <linux/of.h>
21 22
22#include <asm/geode.h> 23#include <asm/geode.h>
23#include <asm/setup.h> 24#include <asm/setup.h>
@@ -187,41 +188,43 @@ err:
187} 188}
188EXPORT_SYMBOL_GPL(olpc_ec_cmd); 189EXPORT_SYMBOL_GPL(olpc_ec_cmd);
189 190
190static bool __init check_ofw_architecture(void) 191static bool __init check_ofw_architecture(struct device_node *root)
191{ 192{
192 size_t propsize; 193 const char *olpc_arch;
193 char olpc_arch[5]; 194 int propsize;
194 const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 };
195 void *res[] = { &propsize };
196 195
197 if (olpc_ofw("getprop", args, res)) { 196 olpc_arch = of_get_property(root, "architecture", &propsize);
198 printk(KERN_ERR "ofw: getprop call failed!\n");
199 return false;
200 }
201 return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; 197 return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
202} 198}
203 199
204static u32 __init get_board_revision(void) 200static u32 __init get_board_revision(struct device_node *root)
205{ 201{
206 size_t propsize; 202 int propsize;
207 __be32 rev; 203 const __be32 *rev;
208 const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; 204
209 void *res[] = { &propsize }; 205 rev = of_get_property(root, "board-revision-int", &propsize);
210 206 if (propsize != 4)
211 if (olpc_ofw("getprop", args, res) || propsize != 4) { 207 return 0;
212 printk(KERN_ERR "ofw: getprop call failed!\n"); 208
213 return cpu_to_be32(0); 209 return be32_to_cpu(*rev);
214 }
215 return be32_to_cpu(rev);
216} 210}
217 211
218static bool __init platform_detect(void) 212static bool __init platform_detect(void)
219{ 213{
220 if (!check_ofw_architecture()) 214 struct device_node *root = of_find_node_by_path("/");
215 bool success;
216
217 if (!root)
221 return false; 218 return false;
222 olpc_platform_info.flags |= OLPC_F_PRESENT; 219
223 olpc_platform_info.boardrev = get_board_revision(); 220 success = check_ofw_architecture(root);
224 return true; 221 if (success) {
222 olpc_platform_info.boardrev = get_board_revision(root);
223 olpc_platform_info.flags |= OLPC_F_PRESENT;
224 }
225
226 of_node_put(root);
227 return success;
225} 228}
226 229
227static int __init add_xo1_platform_devices(void) 230static int __init add_xo1_platform_devices(void)
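The olpc.c rework above replaces raw olpc_ofw() getprop calls with the generic device-tree accessors, taking and dropping a reference on the root node. A minimal sketch of that lookup pattern, assuming only the standard of_* API and the "architecture" property named in the patch:

#include <linux/of.h>
#include <linux/string.h>

static bool __init root_is_olpc(void)
{
	struct device_node *root = of_find_node_by_path("/");
	const char *arch;
	int len;
	bool ret = false;

	if (!root)
		return false;

	arch = of_get_property(root, "architecture", &len);
	if (arch && len == 5 && !strncmp(arch, "OLPC", 5))
		ret = true;

	of_node_put(root);	/* balance of_find_node_by_path() */
	return ret;
}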
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index 044bda5b3174..d39f63d017d2 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -19,7 +19,9 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/bootmem.h> 20#include <linux/bootmem.h>
21#include <linux/of.h> 21#include <linux/of.h>
22#include <linux/of_platform.h>
22#include <linux/of_pdt.h> 23#include <linux/of_pdt.h>
24#include <asm/olpc.h>
23#include <asm/olpc_ofw.h> 25#include <asm/olpc_ofw.h>
24 26
25static phandle __init olpc_dt_getsibling(phandle node) 27static phandle __init olpc_dt_getsibling(phandle node)
@@ -180,3 +182,20 @@ void __init olpc_dt_build_devicetree(void)
180 pr_info("PROM DT: Built device tree with %u bytes of memory.\n", 182 pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
181 prom_early_allocated); 183 prom_early_allocated);
182} 184}
185
186/* A list of DT node/bus matches that we want to expose as platform devices */
187static struct of_device_id __initdata of_ids[] = {
188 { .compatible = "olpc,xo1-battery" },
189 { .compatible = "olpc,xo1-dcon" },
190 { .compatible = "olpc,xo1-rtc" },
191 {},
192};
193
194static int __init olpc_create_platform_devices(void)
195{
196 if (machine_is_olpc())
197 return of_platform_bus_probe(NULL, of_ids, NULL);
198 else
199 return 0;
200}
201device_initcall(olpc_create_platform_devices);
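With of_platform_bus_probe() registering platform devices for the compatible strings listed above, a driver can bind to them through an of_match_table. A hedged sketch of such a consumer; the driver and function names are illustrative, only the "olpc,xo1-rtc" compatible comes from the patch:

#include <linux/init.h>
#include <linux/of.h>
#include <linux/platform_device.h>

static int xo1_rtc_probe(struct platform_device *pdev)
{
	dev_info(&pdev->dev, "bound via device tree\n");
	return 0;
}

static const struct of_device_id xo1_rtc_of_match[] = {
	{ .compatible = "olpc,xo1-rtc" },	/* matches the table in olpc_dt.c */
	{ },
};

static struct platform_driver xo1_rtc_driver = {
	.probe	= xo1_rtc_probe,
	.driver	= {
		.name		= "xo1-rtc",
		.of_match_table	= xo1_rtc_of_match,
	},
};

static int __init xo1_rtc_init(void)
{
	return platform_driver_register(&xo1_rtc_driver);
}
device_initcall(xo1_rtc_init);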
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 7cb6424317f6..c58e0ea39ef5 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -699,16 +699,17 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
699 struct mm_struct *mm, 699 struct mm_struct *mm,
700 unsigned long va, unsigned int cpu) 700 unsigned long va, unsigned int cpu)
701{ 701{
702 int tcpu;
703 int uvhub;
704 int locals = 0; 702 int locals = 0;
705 int remotes = 0; 703 int remotes = 0;
706 int hubs = 0; 704 int hubs = 0;
705 int tcpu;
706 int tpnode;
707 struct bau_desc *bau_desc; 707 struct bau_desc *bau_desc;
708 struct cpumask *flush_mask; 708 struct cpumask *flush_mask;
709 struct ptc_stats *stat; 709 struct ptc_stats *stat;
710 struct bau_control *bcp; 710 struct bau_control *bcp;
711 struct bau_control *tbcp; 711 struct bau_control *tbcp;
712 struct hub_and_pnode *hpp;
712 713
713 /* kernel was booted 'nobau' */ 714 /* kernel was booted 'nobau' */
714 if (nobau) 715 if (nobau)
@@ -750,11 +751,18 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
750 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; 751 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
751 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 752 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
752 753
753 /* cpu statistics */
754 for_each_cpu(tcpu, flush_mask) { 754 for_each_cpu(tcpu, flush_mask) {
755 uvhub = uv_cpu_to_blade_id(tcpu); 755 /*
756 bau_uvhub_set(uvhub, &bau_desc->distribution); 756 * The distribution vector is a bit map of pnodes, relative
757 if (uvhub == bcp->uvhub) 757 * to the partition base pnode (and the partition base nasid
758 * in the header).
759 * Translate cpu to pnode and hub using an array stored
760 * in local memory.
761 */
762 hpp = &bcp->socket_master->target_hub_and_pnode[tcpu];
763 tpnode = hpp->pnode - bcp->partition_base_pnode;
764 bau_uvhub_set(tpnode, &bau_desc->distribution);
765 if (hpp->uvhub == bcp->uvhub)
758 locals++; 766 locals++;
759 else 767 else
760 remotes++; 768 remotes++;
@@ -855,7 +863,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
855 * an interrupt, but causes an error message to be returned to 863 * an interrupt, but causes an error message to be returned to
856 * the sender. 864 * the sender.
857 */ 865 */
858static void uv_enable_timeouts(void) 866static void __init uv_enable_timeouts(void)
859{ 867{
860 int uvhub; 868 int uvhub;
861 int nuvhubs; 869 int nuvhubs;
@@ -1326,10 +1334,10 @@ static int __init uv_ptc_init(void)
1326} 1334}
1327 1335
1328/* 1336/*
1329 * initialize the sending side's sending buffers 1337 * Initialize the sending side's sending buffers.
1330 */ 1338 */
1331static void 1339static void
1332uv_activation_descriptor_init(int node, int pnode) 1340uv_activation_descriptor_init(int node, int pnode, int base_pnode)
1333{ 1341{
1334 int i; 1342 int i;
1335 int cpu; 1343 int cpu;
@@ -1352,11 +1360,11 @@ uv_activation_descriptor_init(int node, int pnode)
1352 n = pa >> uv_nshift; 1360 n = pa >> uv_nshift;
1353 m = pa & uv_mmask; 1361 m = pa & uv_mmask;
1354 1362
1363 /* the 14-bit pnode */
1355 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, 1364 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
1356 (n << UV_DESC_BASE_PNODE_SHIFT | m)); 1365 (n << UV_DESC_BASE_PNODE_SHIFT | m));
1357
1358 /* 1366 /*
1359 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each 1367 * Initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
1360 * cpu even though we only use the first one; one descriptor can 1368 * cpu even though we only use the first one; one descriptor can
1361 * describe a broadcast to 256 uv hubs. 1369 * describe a broadcast to 256 uv hubs.
1362 */ 1370 */
@@ -1365,12 +1373,13 @@ uv_activation_descriptor_init(int node, int pnode)
1365 memset(bd2, 0, sizeof(struct bau_desc)); 1373 memset(bd2, 0, sizeof(struct bau_desc));
1366 bd2->header.sw_ack_flag = 1; 1374 bd2->header.sw_ack_flag = 1;
1367 /* 1375 /*
1368 * base_dest_nodeid is the nasid of the first uvhub 1376 * The base_dest_nasid set in the message header is the nasid
1369 * in the partition. The bit map will indicate uvhub numbers, 1377 * of the first uvhub in the partition. The bit map will
1370 * which are 0-N in a partition. Pnodes are unique system-wide. 1378 * indicate destination pnode numbers relative to that base.
1379 * They may not be consecutive if nasid striding is being used.
1371 */ 1380 */
1372 bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode); 1381 bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
1373 bd2->header.dest_subnodeid = 0x10; /* the LB */ 1382 bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
1374 bd2->header.command = UV_NET_ENDPOINT_INTD; 1383 bd2->header.command = UV_NET_ENDPOINT_INTD;
1375 bd2->header.int_both = 1; 1384 bd2->header.int_both = 1;
1376 /* 1385 /*
@@ -1442,7 +1451,7 @@ uv_payload_queue_init(int node, int pnode)
1442/* 1451/*
1443 * Initialization of each UV hub's structures 1452 * Initialization of each UV hub's structures
1444 */ 1453 */
1445static void __init uv_init_uvhub(int uvhub, int vector) 1454static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode)
1446{ 1455{
1447 int node; 1456 int node;
1448 int pnode; 1457 int pnode;
@@ -1450,11 +1459,11 @@ static void __init uv_init_uvhub(int uvhub, int vector)
1450 1459
1451 node = uvhub_to_first_node(uvhub); 1460 node = uvhub_to_first_node(uvhub);
1452 pnode = uv_blade_to_pnode(uvhub); 1461 pnode = uv_blade_to_pnode(uvhub);
1453 uv_activation_descriptor_init(node, pnode); 1462 uv_activation_descriptor_init(node, pnode, base_pnode);
1454 uv_payload_queue_init(node, pnode); 1463 uv_payload_queue_init(node, pnode);
1455 /* 1464 /*
1456 * the below initialization can't be in firmware because the 1465 * The below initialization can't be in firmware because the
1457 * messaging IRQ will be determined by the OS 1466 * messaging IRQ will be determined by the OS.
1458 */ 1467 */
1459 apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; 1468 apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
1460 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, 1469 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
@@ -1491,10 +1500,11 @@ calculate_destination_timeout(void)
1491/* 1500/*
1492 * initialize the bau_control structure for each cpu 1501 * initialize the bau_control structure for each cpu
1493 */ 1502 */
1494static int __init uv_init_per_cpu(int nuvhubs) 1503static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode)
1495{ 1504{
1496 int i; 1505 int i;
1497 int cpu; 1506 int cpu;
1507 int tcpu;
1498 int pnode; 1508 int pnode;
1499 int uvhub; 1509 int uvhub;
1500 int have_hmaster; 1510 int have_hmaster;
@@ -1528,6 +1538,15 @@ static int __init uv_init_per_cpu(int nuvhubs)
1528 bcp = &per_cpu(bau_control, cpu); 1538 bcp = &per_cpu(bau_control, cpu);
1529 memset(bcp, 0, sizeof(struct bau_control)); 1539 memset(bcp, 0, sizeof(struct bau_control));
1530 pnode = uv_cpu_hub_info(cpu)->pnode; 1540 pnode = uv_cpu_hub_info(cpu)->pnode;
1541 if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) {
1542 printk(KERN_EMERG
1543 "cpu %d pnode %d-%d beyond %d; BAU disabled\n",
1544 cpu, pnode, base_part_pnode,
1545 UV_DISTRIBUTION_SIZE);
1546 return 1;
1547 }
1548 bcp->osnode = cpu_to_node(cpu);
1549 bcp->partition_base_pnode = uv_partition_base_pnode;
1531 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; 1550 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1532 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); 1551 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1533 bdp = &uvhub_descs[uvhub]; 1552 bdp = &uvhub_descs[uvhub];
@@ -1536,7 +1555,7 @@ static int __init uv_init_per_cpu(int nuvhubs)
1536 bdp->pnode = pnode; 1555 bdp->pnode = pnode;
1537 /* kludge: 'assuming' one node per socket, and assuming that 1556 /* kludge: 'assuming' one node per socket, and assuming that
1538 disabling a socket just leaves a gap in node numbers */ 1557 disabling a socket just leaves a gap in node numbers */
1539 socket = (cpu_to_node(cpu) & 1); 1558 socket = bcp->osnode & 1;
1540 bdp->socket_mask |= (1 << socket); 1559 bdp->socket_mask |= (1 << socket);
1541 sdp = &bdp->socket[socket]; 1560 sdp = &bdp->socket[socket];
1542 sdp->cpu_number[sdp->num_cpus] = cpu; 1561 sdp->cpu_number[sdp->num_cpus] = cpu;
@@ -1585,6 +1604,20 @@ static int __init uv_init_per_cpu(int nuvhubs)
1585nextsocket: 1604nextsocket:
1586 socket++; 1605 socket++;
1587 socket_mask = (socket_mask >> 1); 1606 socket_mask = (socket_mask >> 1);
1607 /* each socket gets a local array of pnodes/hubs */
1608 bcp = smaster;
1609 bcp->target_hub_and_pnode = kmalloc_node(
1610 sizeof(struct hub_and_pnode) *
1611 num_possible_cpus(), GFP_KERNEL, bcp->osnode);
1612 memset(bcp->target_hub_and_pnode, 0,
1613 sizeof(struct hub_and_pnode) *
1614 num_possible_cpus());
1615 for_each_present_cpu(tcpu) {
1616 bcp->target_hub_and_pnode[tcpu].pnode =
1617 uv_cpu_hub_info(tcpu)->pnode;
1618 bcp->target_hub_and_pnode[tcpu].uvhub =
1619 uv_cpu_hub_info(tcpu)->numa_blade_id;
1620 }
1588 } 1621 }
1589 } 1622 }
1590 kfree(uvhub_descs); 1623 kfree(uvhub_descs);
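Aside on the per-socket allocation above (struct hub_and_pnode comes from this patch series; the helper below is illustrative): kmalloc_node() followed by memset() is equivalent to one kzalloc_node() call, and the explicit node argument is what keeps the table in memory local to the socket that reads it on every flush:

#include <linux/slab.h>

static struct hub_and_pnode *alloc_target_table(unsigned int ncpus, int node)
{
	/* zeroed, node-local allocation in one call */
	return kzalloc_node(ncpus * sizeof(struct hub_and_pnode),
			    GFP_KERNEL, node);
}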
@@ -1637,21 +1670,22 @@ static int __init uv_bau_init(void)
1637 spin_lock_init(&disable_lock); 1670 spin_lock_init(&disable_lock);
1638 congested_cycles = microsec_2_cycles(congested_response_us); 1671 congested_cycles = microsec_2_cycles(congested_response_us);
1639 1672
1640 if (uv_init_per_cpu(nuvhubs)) {
1641 nobau = 1;
1642 return 0;
1643 }
1644
1645 uv_partition_base_pnode = 0x7fffffff; 1673 uv_partition_base_pnode = 0x7fffffff;
1646 for (uvhub = 0; uvhub < nuvhubs; uvhub++) 1674 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1647 if (uv_blade_nr_possible_cpus(uvhub) && 1675 if (uv_blade_nr_possible_cpus(uvhub) &&
1648 (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) 1676 (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
1649 uv_partition_base_pnode = uv_blade_to_pnode(uvhub); 1677 uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
1678 }
1679
1680 if (uv_init_per_cpu(nuvhubs, uv_partition_base_pnode)) {
1681 nobau = 1;
1682 return 0;
1683 }
1650 1684
1651 vector = UV_BAU_MESSAGE; 1685 vector = UV_BAU_MESSAGE;
1652 for_each_possible_blade(uvhub) 1686 for_each_possible_blade(uvhub)
1653 if (uv_blade_nr_possible_cpus(uvhub)) 1687 if (uv_blade_nr_possible_cpus(uvhub))
1654 uv_init_uvhub(uvhub, vector); 1688 uv_init_uvhub(uvhub, vector, uv_partition_base_pnode);
1655 1689
1656 uv_enable_timeouts(); 1690 uv_enable_timeouts();
1657 alloc_intr_gate(vector, uv_bau_message_intr1); 1691 alloc_intr_gate(vector, uv_bau_message_intr1);
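Putting the tlb_uv.c changes together: the flush path no longer sets distribution bits by blade id but by pnode offset from the partition base, looked up in a node-local per-cpu table built at init time. A simplified sketch of those two pieces under illustrative types; the real code keeps the table in the socket master's bau_control:

struct hp {				/* stand-in for struct hub_and_pnode */
	int uvhub;
	int pnode;
};

/* Fill the cpu -> (uvhub, pnode) table once, at init time. */
static void build_cpu_map(struct hp *map, int ncpus,
			  const int *cpu_pnode, const int *cpu_uvhub)
{
	int cpu;

	for (cpu = 0; cpu < ncpus; cpu++) {
		map[cpu].pnode = cpu_pnode[cpu];
		map[cpu].uvhub = cpu_uvhub[cpu];
	}
}

/* Bit to set in the BAU distribution vector for 'cpu': pnodes are expressed
 * relative to the lowest pnode in the partition, matching the base_dest_nasid
 * placed in the descriptor header. */
static int distribution_bit(const struct hp *map, int cpu,
			    int partition_base_pnode)
{
	return map[cpu].pnode - partition_base_pnode;
}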
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 9daf5d1af9f1..0eb90184515f 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -40,7 +40,6 @@ static struct clocksource clocksource_uv = {
40 .rating = 400, 40 .rating = 400,
41 .read = uv_read_rtc, 41 .read = uv_read_rtc,
42 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, 42 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
43 .shift = 10,
44 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 43 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
45}; 44};
46 45
@@ -372,14 +371,11 @@ static __init int uv_rtc_setup_clock(void)
372 if (!is_uv_system()) 371 if (!is_uv_system())
373 return -ENODEV; 372 return -ENODEV;
374 373
375 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
376 clocksource_uv.shift);
377
378 /* If single blade, prefer tsc */ 374 /* If single blade, prefer tsc */
379 if (uv_num_possible_blades() == 1) 375 if (uv_num_possible_blades() == 1)
380 clocksource_uv.rating = 250; 376 clocksource_uv.rating = 250;
381 377
382 rc = clocksource_register(&clocksource_uv); 378 rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second);
383 if (rc) 379 if (rc)
384 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); 380 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
385 else 381 else
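The uv_time.c change drops the hand-computed mult/shift in favour of clocksource_register_hz(), which derives both from the clock frequency at registration time. A minimal sketch of that API under illustrative driver names:

#include <linux/clocksource.h>

static cycle_t example_read(struct clocksource *cs)
{
	return 0;	/* a real driver reads its free-running counter here */
}

static struct clocksource example_cs = {
	.name	= "example-rtc",
	.rating	= 400,
	.read	= example_read,
	.mask	= CLOCKSOURCE_MASK(56),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init example_cs_register(unsigned long hz)
{
	/* mult and shift are computed internally from hz */
	return clocksource_register_hz(&example_cs, hz);
}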
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b6552b189bcd..bef0bc962400 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -11,7 +11,7 @@ vdso-install-$(VDSO32-y) += $(vdso32-images)
11 11
12 12
13# files to link into the vdso 13# files to link into the vdso
14vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o 14vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
15 15
16# files to link into kernel 16# files to link into kernel
17obj-$(VDSO64-y) += vma.o vdso.o 17obj-$(VDSO64-y) += vma.o vdso.o
@@ -37,11 +37,24 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
37$(obj)/%.so: $(obj)/%.so.dbg FORCE 37$(obj)/%.so: $(obj)/%.so.dbg FORCE
38 $(call if_changed,objcopy) 38 $(call if_changed,objcopy)
39 39
40#
41# Don't omit frame pointers for ease of userspace debugging, but do
42# optimize sibling calls.
43#
40CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ 44CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
41 $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) 45 $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
46 -fno-omit-frame-pointer -foptimize-sibling-calls
42 47
43$(vobjs): KBUILD_CFLAGS += $(CFL) 48$(vobjs): KBUILD_CFLAGS += $(CFL)
44 49
50#
51# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
52#
53CFLAGS_REMOVE_vdso-note.o = -pg
54CFLAGS_REMOVE_vclock_gettime.o = -pg
55CFLAGS_REMOVE_vgetcpu.o = -pg
56CFLAGS_REMOVE_vvar.o = -pg
57
45targets += vdso-syms.lds 58targets += vdso-syms.lds
46obj-$(VDSO64-y) += vdso-syms.lds 59obj-$(VDSO64-y) += vdso-syms.lds
47 60
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754cc3c5..a724905fdae7 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -2,7 +2,7 @@
2 * Copyright 2006 Andi Kleen, SUSE Labs. 2 * Copyright 2006 Andi Kleen, SUSE Labs.
3 * Subject to the GNU Public License, v.2 3 * Subject to the GNU Public License, v.2
4 * 4 *
5 * Fast user context implementation of clock_gettime and gettimeofday. 5 * Fast user context implementation of clock_gettime, gettimeofday, and time.
6 * 6 *
7 * The code should have no internal unresolved relocations. 7 * The code should have no internal unresolved relocations.
8 * Check with readelf after changing. 8 * Check with readelf after changing.
@@ -22,9 +22,8 @@
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/unistd.h> 23#include <asm/unistd.h>
24#include <asm/io.h> 24#include <asm/io.h>
25#include "vextern.h"
26 25
27#define gtod vdso_vsyscall_gtod_data 26#define gtod (&VVAR(vsyscall_gtod_data))
28 27
29notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) 28notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
30{ 29{
@@ -56,22 +55,6 @@ notrace static noinline int do_realtime(struct timespec *ts)
56 return 0; 55 return 0;
57} 56}
58 57
59/* Copy of the version in kernel/time.c which we cannot directly access */
60notrace static void
61vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
62{
63 while (nsec >= NSEC_PER_SEC) {
64 nsec -= NSEC_PER_SEC;
65 ++sec;
66 }
67 while (nsec < 0) {
68 nsec += NSEC_PER_SEC;
69 --sec;
70 }
71 ts->tv_sec = sec;
72 ts->tv_nsec = nsec;
73}
74
75notrace static noinline int do_monotonic(struct timespec *ts) 58notrace static noinline int do_monotonic(struct timespec *ts)
76{ 59{
77 unsigned long seq, ns, secs; 60 unsigned long seq, ns, secs;
@@ -82,7 +65,17 @@ notrace static noinline int do_monotonic(struct timespec *ts)
82 secs += gtod->wall_to_monotonic.tv_sec; 65 secs += gtod->wall_to_monotonic.tv_sec;
83 ns += gtod->wall_to_monotonic.tv_nsec; 66 ns += gtod->wall_to_monotonic.tv_nsec;
84 } while (unlikely(read_seqretry(&gtod->lock, seq))); 67 } while (unlikely(read_seqretry(&gtod->lock, seq)));
85 vset_normalized_timespec(ts, secs, ns); 68
69 /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec
70 * are all guaranteed to be nonnegative.
71 */
72 while (ns >= NSEC_PER_SEC) {
73 ns -= NSEC_PER_SEC;
74 ++secs;
75 }
76 ts->tv_sec = secs;
77 ts->tv_nsec = ns;
78
86 return 0; 79 return 0;
87} 80}
88 81
@@ -107,7 +100,17 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
107 secs += gtod->wall_to_monotonic.tv_sec; 100 secs += gtod->wall_to_monotonic.tv_sec;
108 ns += gtod->wall_to_monotonic.tv_nsec; 101 ns += gtod->wall_to_monotonic.tv_nsec;
109 } while (unlikely(read_seqretry(&gtod->lock, seq))); 102 } while (unlikely(read_seqretry(&gtod->lock, seq)));
110 vset_normalized_timespec(ts, secs, ns); 103
104 /* wall_time_nsec and wall_to_monotonic.tv_nsec are
105 * guaranteed to be between 0 and NSEC_PER_SEC.
106 */
107 if (ns >= NSEC_PER_SEC) {
108 ns -= NSEC_PER_SEC;
109 ++secs;
110 }
111 ts->tv_sec = secs;
112 ts->tv_nsec = ns;
113
111 return 0; 114 return 0;
112} 115}
113 116
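The open-coded normalization that replaces vset_normalized_timespec() above relies on both nanosecond inputs already being non-negative, so carrying into the seconds field needs only subtraction and never the negative branch. A small sketch of the invariant, with NSEC_PER_SEC as defined in <linux/time.h>:

#define NSEC_PER_SEC	1000000000L

/* Both inputs are in [0, NSEC_PER_SEC) in the coarse path, so the sum stays
 * below 2 * NSEC_PER_SEC and a single conditional suffices; the hires path
 * can accumulate more than one second from vgetns(), hence its while loop. */
static inline void carry_nsec(unsigned long *secs, unsigned long *ns)
{
	while (*ns >= NSEC_PER_SEC) {
		*ns -= NSEC_PER_SEC;
		(*secs)++;
	}
}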
@@ -157,3 +160,32 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
157} 160}
158int gettimeofday(struct timeval *, struct timezone *) 161int gettimeofday(struct timeval *, struct timezone *)
159 __attribute__((weak, alias("__vdso_gettimeofday"))); 162 __attribute__((weak, alias("__vdso_gettimeofday")));
163
164/* This will break when the xtime seconds get inaccurate, but that is
165 * unlikely */
166
167static __always_inline long time_syscall(long *t)
168{
169 long secs;
170 asm volatile("syscall"
171 : "=a" (secs)
172 : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");
173 return secs;
174}
175
176notrace time_t __vdso_time(time_t *t)
177{
178 time_t result;
179
180 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
181 return time_syscall(t);
182
183 /* This is atomic on x86_64 so we don't need any locks. */
184 result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec);
185
186 if (t)
187 *t = result;
188 return result;
189}
190int time(time_t *t)
191 __attribute__((weak, alias("__vdso_time")));
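Once time/__vdso_time are exported (see the vdso.lds.S hunk below), a libc that knows about the symbol can satisfy time() without entering the kernel, and the function above falls back to the real syscall itself when the vsyscall gtod data is disabled. A plain userspace check, assuming nothing beyond standard C and a libc that may route through the vDSO:

#include <stdio.h>
#include <time.h>

int main(void)
{
	time_t now = time(NULL);	/* may be served by __vdso_time */

	printf("epoch seconds: %lld\n", (long long)now);
	return 0;
}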
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 4e5dd3b4de7f..b96b2677cad8 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -23,15 +23,10 @@ VERSION {
23 __vdso_gettimeofday; 23 __vdso_gettimeofday;
24 getcpu; 24 getcpu;
25 __vdso_getcpu; 25 __vdso_getcpu;
26 time;
27 __vdso_time;
26 local: *; 28 local: *;
27 }; 29 };
28} 30}
29 31
30VDSO64_PRELINK = VDSO_PRELINK; 32VDSO64_PRELINK = VDSO_PRELINK;
31
32/*
33 * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL.
34 */
35#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x;
36#include "vextern.h"
37#undef VEXTERN
diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h
deleted file mode 100644
index 1683ba2ae3e8..000000000000
--- a/arch/x86/vdso/vextern.h
+++ /dev/null
@@ -1,16 +0,0 @@
1#ifndef VEXTERN
2#include <asm/vsyscall.h>
3#define VEXTERN(x) \
4 extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
5#endif
6
7#define VMAGIC 0xfeedbabeabcdefabUL
8
9/* Any kernel variables used in the vDSO must be exported in the main
10 kernel's vmlinux.lds.S/vsyscall.h/proper __section and
11 put into vextern.h and be referenced as a pointer with vdso prefix.
12 The main kernel later fills in the values. */
13
14VEXTERN(jiffies)
15VEXTERN(vgetcpu_mode)
16VEXTERN(vsyscall_gtod_data)
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 9fbc6b20026b..5463ad558573 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -11,14 +11,13 @@
11#include <linux/time.h> 11#include <linux/time.h>
12#include <asm/vsyscall.h> 12#include <asm/vsyscall.h>
13#include <asm/vgtod.h> 13#include <asm/vgtod.h>
14#include "vextern.h"
15 14
16notrace long 15notrace long
17__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) 16__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
18{ 17{
19 unsigned int p; 18 unsigned int p;
20 19
21 if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { 20 if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
22 /* Load per CPU data from RDTSCP */ 21 /* Load per CPU data from RDTSCP */
23 native_read_tscp(&p); 22 native_read_tscp(&p);
24 } else { 23 } else {
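For reference on the vgetcpu.c path above: when VGETCPU_RDTSCP is selected, the cpu and node come out of the IA32_TSC_AUX value that native_read_tscp() returns, using the packing vgetcpu.c itself applies a few lines further down. A sketch of that decode, with the bit layout taken from the existing code:

/* The kernel programs TSC_AUX as (node << 12) | cpu for each CPU. */
static inline void decode_tscp_aux(unsigned int p,
				   unsigned int *cpu, unsigned int *node)
{
	if (cpu)
		*cpu = p & 0xfff;	/* low 12 bits: cpu number */
	if (node)
		*node = p >> 12;	/* remaining bits: node number */
}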
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 4b5d26f108bb..7abd2be0f9b9 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -15,9 +15,6 @@
15#include <asm/proto.h> 15#include <asm/proto.h>
16#include <asm/vdso.h> 16#include <asm/vdso.h>
17 17
18#include "vextern.h" /* Just for VMAGIC. */
19#undef VEXTERN
20
21unsigned int __read_mostly vdso_enabled = 1; 18unsigned int __read_mostly vdso_enabled = 1;
22 19
23extern char vdso_start[], vdso_end[]; 20extern char vdso_start[], vdso_end[];
@@ -26,20 +23,10 @@ extern unsigned short vdso_sync_cpuid;
26static struct page **vdso_pages; 23static struct page **vdso_pages;
27static unsigned vdso_size; 24static unsigned vdso_size;
28 25
29static inline void *var_ref(void *p, char *name)
30{
31 if (*(void **)p != (void *)VMAGIC) {
32 printk("VDSO: variable %s broken\n", name);
33 vdso_enabled = 0;
34 }
35 return p;
36}
37
38static int __init init_vdso_vars(void) 26static int __init init_vdso_vars(void)
39{ 27{
40 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; 28 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
41 int i; 29 int i;
42 char *vbase;
43 30
44 vdso_size = npages << PAGE_SHIFT; 31 vdso_size = npages << PAGE_SHIFT;
45 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 32 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
@@ -54,20 +41,6 @@ static int __init init_vdso_vars(void)
54 copy_page(page_address(p), vdso_start + i*PAGE_SIZE); 41 copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
55 } 42 }
56 43
57 vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
58 if (!vbase)
59 goto oom;
60
61 if (memcmp(vbase, "\177ELF", 4)) {
62 printk("VDSO: I'm broken; not ELF\n");
63 vdso_enabled = 0;
64 }
65
66#define VEXTERN(x) \
67 *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
68#include "vextern.h"
69#undef VEXTERN
70 vunmap(vbase);
71 return 0; 44 return 0;
72 45
73 oom: 46 oom:
diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c
deleted file mode 100644
index 1b7e703684f9..000000000000
--- a/arch/x86/vdso/vvar.c
+++ /dev/null
@@ -1,12 +0,0 @@
1/* Define pointer to external vDSO variables.
2 These are part of the vDSO. The kernel fills in the real addresses
3 at boot time. This is done because when the vdso is linked the
4 kernel isn't yet and we don't know the final addresses. */
5#include <linux/kernel.h>
6#include <linux/time.h>
7#include <asm/vsyscall.h>
8#include <asm/timex.h>
9#include <asm/vgtod.h>
10
11#define VEXTERN(x) typeof (__ ## x) *const vdso_ ## x = (void *)VMAGIC;
12#include "vextern.h"
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 1c7121ba18ff..5cc821cb2e09 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -39,6 +39,7 @@ config XEN_MAX_DOMAIN_MEMORY
39config XEN_SAVE_RESTORE 39config XEN_SAVE_RESTORE
40 bool 40 bool
41 depends on XEN 41 depends on XEN
42 select HIBERNATE_CALLBACKS
42 default y 43 default y
43 44
44config XEN_DEBUG_FS 45config XEN_DEBUG_FS
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 49dbd78ec3cb..dd7b88f2ec7a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -235,9 +235,10 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
235 *dx &= maskedx; 235 *dx &= maskedx;
236} 236}
237 237
238static __init void xen_init_cpuid_mask(void) 238static void __init xen_init_cpuid_mask(void)
239{ 239{
240 unsigned int ax, bx, cx, dx; 240 unsigned int ax, bx, cx, dx;
241 unsigned int xsave_mask;
241 242
242 cpuid_leaf1_edx_mask = 243 cpuid_leaf1_edx_mask =
243 ~((1 << X86_FEATURE_MCE) | /* disable MCE */ 244 ~((1 << X86_FEATURE_MCE) | /* disable MCE */
@@ -249,24 +250,16 @@ static __init void xen_init_cpuid_mask(void)
249 cpuid_leaf1_edx_mask &= 250 cpuid_leaf1_edx_mask &=
250 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ 251 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
251 (1 << X86_FEATURE_ACPI)); /* disable ACPI */ 252 (1 << X86_FEATURE_ACPI)); /* disable ACPI */
252
253 ax = 1; 253 ax = 1;
254 cx = 0;
255 xen_cpuid(&ax, &bx, &cx, &dx); 254 xen_cpuid(&ax, &bx, &cx, &dx);
256 255
257 /* cpuid claims we support xsave; try enabling it to see what happens */ 256 xsave_mask =
258 if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { 257 (1 << (X86_FEATURE_XSAVE % 32)) |
259 unsigned long cr4; 258 (1 << (X86_FEATURE_OSXSAVE % 32));
260
261 set_in_cr4(X86_CR4_OSXSAVE);
262
263 cr4 = read_cr4();
264 259
265 if ((cr4 & X86_CR4_OSXSAVE) == 0) 260 /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
266 cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); 261 if ((cx & xsave_mask) != xsave_mask)
267 262 cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
268 clear_in_cr4(X86_CR4_OSXSAVE);
269 }
270} 263}
271 264
272static void xen_set_debugreg(int reg, unsigned long val) 265static void xen_set_debugreg(int reg, unsigned long val)
@@ -407,7 +400,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
407/* 400/*
408 * load_gdt for early boot, when the gdt is only mapped once 401 * load_gdt for early boot, when the gdt is only mapped once
409 */ 402 */
410static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) 403static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
411{ 404{
412 unsigned long va = dtr->address; 405 unsigned long va = dtr->address;
413 unsigned int size = dtr->size + 1; 406 unsigned int size = dtr->size + 1;
@@ -669,7 +662,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
669 * Version of write_gdt_entry for use at early boot-time needed to 662 * Version of write_gdt_entry for use at early boot-time needed to
670 * update an entry as simply as possible. 663 * update an entry as simply as possible.
671 */ 664 */
672static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, 665static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
673 const void *desc, int type) 666 const void *desc, int type)
674{ 667{
675 switch (type) { 668 switch (type) {
@@ -940,18 +933,18 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
940 return ret; 933 return ret;
941} 934}
942 935
943static const struct pv_info xen_info __initdata = { 936static const struct pv_info xen_info __initconst = {
944 .paravirt_enabled = 1, 937 .paravirt_enabled = 1,
945 .shared_kernel_pmd = 0, 938 .shared_kernel_pmd = 0,
946 939
947 .name = "Xen", 940 .name = "Xen",
948}; 941};
949 942
950static const struct pv_init_ops xen_init_ops __initdata = { 943static const struct pv_init_ops xen_init_ops __initconst = {
951 .patch = xen_patch, 944 .patch = xen_patch,
952}; 945};
953 946
954static const struct pv_cpu_ops xen_cpu_ops __initdata = { 947static const struct pv_cpu_ops xen_cpu_ops __initconst = {
955 .cpuid = xen_cpuid, 948 .cpuid = xen_cpuid,
956 949
957 .set_debugreg = xen_set_debugreg, 950 .set_debugreg = xen_set_debugreg,
@@ -1011,7 +1004,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1011 .end_context_switch = xen_end_context_switch, 1004 .end_context_switch = xen_end_context_switch,
1012}; 1005};
1013 1006
1014static const struct pv_apic_ops xen_apic_ops __initdata = { 1007static const struct pv_apic_ops xen_apic_ops __initconst = {
1015#ifdef CONFIG_X86_LOCAL_APIC 1008#ifdef CONFIG_X86_LOCAL_APIC
1016 .startup_ipi_hook = paravirt_nop, 1009 .startup_ipi_hook = paravirt_nop,
1017#endif 1010#endif
@@ -1062,7 +1055,7 @@ int xen_panic_handler_init(void)
1062 return 0; 1055 return 0;
1063} 1056}
1064 1057
1065static const struct machine_ops __initdata xen_machine_ops = { 1058static const struct machine_ops xen_machine_ops __initconst = {
1066 .restart = xen_restart, 1059 .restart = xen_restart,
1067 .halt = xen_machine_halt, 1060 .halt = xen_machine_halt,
1068 .power_off = xen_machine_halt, 1061 .power_off = xen_machine_halt,
@@ -1339,7 +1332,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1339 return NOTIFY_OK; 1332 return NOTIFY_OK;
1340} 1333}
1341 1334
1342static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { 1335static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
1343 .notifier_call = xen_hvm_cpu_notify, 1336 .notifier_call = xen_hvm_cpu_notify,
1344}; 1337};
1345 1338
@@ -1388,7 +1381,7 @@ bool xen_hvm_need_lapic(void)
1388} 1381}
1389EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); 1382EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
1390 1383
1391const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { 1384const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
1392 .name = "Xen HVM", 1385 .name = "Xen HVM",
1393 .detect = xen_hvm_platform, 1386 .detect = xen_hvm_platform,
1394 .init_platform = xen_hvm_guest_init, 1387 .init_platform = xen_hvm_guest_init,
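The xen_init_cpuid_mask() change above replaces the CR4.OSXSAVE probe with a pure CPUID test: both the XSAVE and OSXSAVE bits must already be set by the hypervisor, or both are masked from the guest. A freestanding sketch of that test; the leaf-1 ECX bit numbers are the architectural ones:

#include <stdbool.h>
#include <stdint.h>

#define X86_FEATURE_XSAVE_BIT	26	/* CPUID.1:ECX */
#define X86_FEATURE_OSXSAVE_BIT	27

static bool xsave_advertisable(uint32_t leaf1_ecx)
{
	uint32_t mask = (1u << X86_FEATURE_XSAVE_BIT) |
			(1u << X86_FEATURE_OSXSAVE_BIT);

	return (leaf1_ecx & mask) == mask;	/* require both, or hide both */
}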
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 6a6fe8939645..8bbb465b6f0a 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -113,7 +113,7 @@ static void xen_halt(void)
113 xen_safe_halt(); 113 xen_safe_halt();
114} 114}
115 115
116static const struct pv_irq_ops xen_irq_ops __initdata = { 116static const struct pv_irq_ops xen_irq_ops __initconst = {
117 .save_fl = PV_CALLEE_SAVE(xen_save_fl), 117 .save_fl = PV_CALLEE_SAVE(xen_save_fl),
118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), 118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), 119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c82df6c9c0f0..dc708dcc62f1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -75,67 +75,12 @@
75#include "mmu.h" 75#include "mmu.h"
76#include "debugfs.h" 76#include "debugfs.h"
77 77
78#define MMU_UPDATE_HISTO 30
79
80/* 78/*
81 * Protects atomic reservation decrease/increase against concurrent increases. 79 * Protects atomic reservation decrease/increase against concurrent increases.
82 * Also protects non-atomic updates of current_pages and balloon lists. 80 * Also protects non-atomic updates of current_pages and balloon lists.
83 */ 81 */
84DEFINE_SPINLOCK(xen_reservation_lock); 82DEFINE_SPINLOCK(xen_reservation_lock);
85 83
86#ifdef CONFIG_XEN_DEBUG_FS
87
88static struct {
89 u32 pgd_update;
90 u32 pgd_update_pinned;
91 u32 pgd_update_batched;
92
93 u32 pud_update;
94 u32 pud_update_pinned;
95 u32 pud_update_batched;
96
97 u32 pmd_update;
98 u32 pmd_update_pinned;
99 u32 pmd_update_batched;
100
101 u32 pte_update;
102 u32 pte_update_pinned;
103 u32 pte_update_batched;
104
105 u32 mmu_update;
106 u32 mmu_update_extended;
107 u32 mmu_update_histo[MMU_UPDATE_HISTO];
108
109 u32 prot_commit;
110 u32 prot_commit_batched;
111
112 u32 set_pte_at;
113 u32 set_pte_at_batched;
114 u32 set_pte_at_pinned;
115 u32 set_pte_at_current;
116 u32 set_pte_at_kernel;
117} mmu_stats;
118
119static u8 zero_stats;
120
121static inline void check_zero(void)
122{
123 if (unlikely(zero_stats)) {
124 memset(&mmu_stats, 0, sizeof(mmu_stats));
125 zero_stats = 0;
126 }
127}
128
129#define ADD_STATS(elem, val) \
130 do { check_zero(); mmu_stats.elem += (val); } while(0)
131
132#else /* !CONFIG_XEN_DEBUG_FS */
133
134#define ADD_STATS(elem, val) do { (void)(val); } while(0)
135
136#endif /* CONFIG_XEN_DEBUG_FS */
137
138
139/* 84/*
140 * Identity map, in addition to plain kernel map. This needs to be 85 * Identity map, in addition to plain kernel map. This needs to be
141 * large enough to allocate page table pages to allocate the rest. 86 * large enough to allocate page table pages to allocate the rest.
@@ -243,11 +188,6 @@ static bool xen_page_pinned(void *ptr)
243 return PagePinned(page); 188 return PagePinned(page);
244} 189}
245 190
246static bool xen_iomap_pte(pte_t pte)
247{
248 return pte_flags(pte) & _PAGE_IOMAP;
249}
250
251void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) 191void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
252{ 192{
253 struct multicall_space mcs; 193 struct multicall_space mcs;
@@ -257,7 +197,7 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
257 u = mcs.args; 197 u = mcs.args;
258 198
259 /* ptep might be kmapped when using 32-bit HIGHPTE */ 199 /* ptep might be kmapped when using 32-bit HIGHPTE */
260 u->ptr = arbitrary_virt_to_machine(ptep).maddr; 200 u->ptr = virt_to_machine(ptep).maddr;
261 u->val = pte_val_ma(pteval); 201 u->val = pte_val_ma(pteval);
262 202
263 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); 203 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
@@ -266,11 +206,6 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
266} 206}
267EXPORT_SYMBOL_GPL(xen_set_domain_pte); 207EXPORT_SYMBOL_GPL(xen_set_domain_pte);
268 208
269static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
270{
271 xen_set_domain_pte(ptep, pteval, DOMID_IO);
272}
273
274static void xen_extend_mmu_update(const struct mmu_update *update) 209static void xen_extend_mmu_update(const struct mmu_update *update)
275{ 210{
276 struct multicall_space mcs; 211 struct multicall_space mcs;
@@ -279,27 +214,17 @@ static void xen_extend_mmu_update(const struct mmu_update *update)
279 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 214 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
280 215
281 if (mcs.mc != NULL) { 216 if (mcs.mc != NULL) {
282 ADD_STATS(mmu_update_extended, 1);
283 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
284
285 mcs.mc->args[1]++; 217 mcs.mc->args[1]++;
286
287 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
288 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
289 else
290 ADD_STATS(mmu_update_histo[0], 1);
291 } else { 218 } else {
292 ADD_STATS(mmu_update, 1);
293 mcs = __xen_mc_entry(sizeof(*u)); 219 mcs = __xen_mc_entry(sizeof(*u));
294 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 220 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
295 ADD_STATS(mmu_update_histo[1], 1);
296 } 221 }
297 222
298 u = mcs.args; 223 u = mcs.args;
299 *u = *update; 224 *u = *update;
300} 225}
301 226
302void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) 227static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
303{ 228{
304 struct mmu_update u; 229 struct mmu_update u;
305 230
@@ -312,17 +237,13 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
312 u.val = pmd_val_ma(val); 237 u.val = pmd_val_ma(val);
313 xen_extend_mmu_update(&u); 238 xen_extend_mmu_update(&u);
314 239
315 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
316
317 xen_mc_issue(PARAVIRT_LAZY_MMU); 240 xen_mc_issue(PARAVIRT_LAZY_MMU);
318 241
319 preempt_enable(); 242 preempt_enable();
320} 243}
321 244
322void xen_set_pmd(pmd_t *ptr, pmd_t val) 245static void xen_set_pmd(pmd_t *ptr, pmd_t val)
323{ 246{
324 ADD_STATS(pmd_update, 1);
325
326 /* If page is not pinned, we can just update the entry 247 /* If page is not pinned, we can just update the entry
327 directly */ 248 directly */
328 if (!xen_page_pinned(ptr)) { 249 if (!xen_page_pinned(ptr)) {
@@ -330,8 +251,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
330 return; 251 return;
331 } 252 }
332 253
333 ADD_STATS(pmd_update_pinned, 1);
334
335 xen_set_pmd_hyper(ptr, val); 254 xen_set_pmd_hyper(ptr, val);
336} 255}
337 256
@@ -344,35 +263,34 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
344 set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); 263 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
345} 264}
346 265
347void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 266static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
348 pte_t *ptep, pte_t pteval)
349{ 267{
350 if (xen_iomap_pte(pteval)) { 268 struct mmu_update u;
351 xen_set_iomap_pte(ptep, pteval); 269
352 goto out; 270 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
353 } 271 return false;
354 272
355 ADD_STATS(set_pte_at, 1); 273 xen_mc_batch();
356// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
357 ADD_STATS(set_pte_at_current, mm == current->mm);
358 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
359 274
360 if (mm == current->mm || mm == &init_mm) { 275 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
361 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 276 u.val = pte_val_ma(pteval);
362 struct multicall_space mcs; 277 xen_extend_mmu_update(&u);
363 mcs = xen_mc_entry(0);
364 278
365 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 279 xen_mc_issue(PARAVIRT_LAZY_MMU);
366 ADD_STATS(set_pte_at_batched, 1); 280
367 xen_mc_issue(PARAVIRT_LAZY_MMU); 281 return true;
368 goto out; 282}
369 } else 283
370 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) 284static void xen_set_pte(pte_t *ptep, pte_t pteval)
371 goto out; 285{
372 } 286 if (!xen_batched_set_pte(ptep, pteval))
373 xen_set_pte(ptep, pteval); 287 native_set_pte(ptep, pteval);
288}
374 289
375out: return; 290static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
291 pte_t *ptep, pte_t pteval)
292{
293 xen_set_pte(ptep, pteval);
376} 294}
377 295
378pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 296pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
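xen_batched_set_pte() above introduces a single pattern then reused by xen_set_pte() and xen_pte_clear(): queue the update into the open lazy-MMU multicall batch if there is one, otherwise fall back to the native write. A simplified sketch of that shape, with a stand-in flag for paravirt_get_lazy_mode() and an elided batch body:

#include <stdbool.h>

typedef unsigned long xpte_t;		/* illustrative stand-in for pte_t */

static bool in_lazy_mmu;		/* stands in for paravirt_get_lazy_mode() */

/* Try to queue {ptep, val} into the pending hypercall batch.
 * Returns false when no batch is open, so the caller writes directly. */
static bool queue_pte_update(xpte_t *ptep, xpte_t val)
{
	if (!in_lazy_mmu)
		return false;
	/* ... append an mmu_update entry and issue it with the batch ... */
	return true;
}

static void set_pte_batched_or_native(xpte_t *ptep, xpte_t val)
{
	if (!queue_pte_update(ptep, val))
		*ptep = val;		/* unbatched: plain (native) store */
}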
@@ -389,13 +307,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
389 307
390 xen_mc_batch(); 308 xen_mc_batch();
391 309
392 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 310 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
393 u.val = pte_val_ma(pte); 311 u.val = pte_val_ma(pte);
394 xen_extend_mmu_update(&u); 312 xen_extend_mmu_update(&u);
395 313
396 ADD_STATS(prot_commit, 1);
397 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
398
399 xen_mc_issue(PARAVIRT_LAZY_MMU); 314 xen_mc_issue(PARAVIRT_LAZY_MMU);
400} 315}
401 316
@@ -463,7 +378,7 @@ static pteval_t iomap_pte(pteval_t val)
463 return val; 378 return val;
464} 379}
465 380
466pteval_t xen_pte_val(pte_t pte) 381static pteval_t xen_pte_val(pte_t pte)
467{ 382{
468 pteval_t pteval = pte.pte; 383 pteval_t pteval = pte.pte;
469 384
@@ -480,7 +395,7 @@ pteval_t xen_pte_val(pte_t pte)
480} 395}
481PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 396PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
482 397
483pgdval_t xen_pgd_val(pgd_t pgd) 398static pgdval_t xen_pgd_val(pgd_t pgd)
484{ 399{
485 return pte_mfn_to_pfn(pgd.pgd); 400 return pte_mfn_to_pfn(pgd.pgd);
486} 401}
@@ -511,7 +426,7 @@ void xen_set_pat(u64 pat)
511 WARN_ON(pat != 0x0007010600070106ull); 426 WARN_ON(pat != 0x0007010600070106ull);
512} 427}
513 428
514pte_t xen_make_pte(pteval_t pte) 429static pte_t xen_make_pte(pteval_t pte)
515{ 430{
516 phys_addr_t addr = (pte & PTE_PFN_MASK); 431 phys_addr_t addr = (pte & PTE_PFN_MASK);
517 432
@@ -565,13 +480,13 @@ pte_t xen_make_pte_debug(pteval_t pte)
565 if (io_page && 480 if (io_page &&
566 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { 481 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
567 other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT; 482 other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
568 WARN(addr != other_addr, 483 WARN_ONCE(addr != other_addr,
569 "0x%lx is using VM_IO, but it is 0x%lx!\n", 484 "0x%lx is using VM_IO, but it is 0x%lx!\n",
570 (unsigned long)addr, (unsigned long)other_addr); 485 (unsigned long)addr, (unsigned long)other_addr);
571 } else { 486 } else {
572 pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP; 487 pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
573 other_addr = (_pte.pte & PTE_PFN_MASK); 488 other_addr = (_pte.pte & PTE_PFN_MASK);
574 WARN((addr == other_addr) && (!io_page) && (!iomap_set), 489 WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
575 "0x%lx is missing VM_IO (and wasn't fixed)!\n", 490 "0x%lx is missing VM_IO (and wasn't fixed)!\n",
576 (unsigned long)addr); 491 (unsigned long)addr);
577 } 492 }
@@ -581,20 +496,20 @@ pte_t xen_make_pte_debug(pteval_t pte)
581PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug); 496PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
582#endif 497#endif
583 498
584pgd_t xen_make_pgd(pgdval_t pgd) 499static pgd_t xen_make_pgd(pgdval_t pgd)
585{ 500{
586 pgd = pte_pfn_to_mfn(pgd); 501 pgd = pte_pfn_to_mfn(pgd);
587 return native_make_pgd(pgd); 502 return native_make_pgd(pgd);
588} 503}
589PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); 504PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
590 505
591pmdval_t xen_pmd_val(pmd_t pmd) 506static pmdval_t xen_pmd_val(pmd_t pmd)
592{ 507{
593 return pte_mfn_to_pfn(pmd.pmd); 508 return pte_mfn_to_pfn(pmd.pmd);
594} 509}
595PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); 510PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
596 511
597void xen_set_pud_hyper(pud_t *ptr, pud_t val) 512static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
598{ 513{
599 struct mmu_update u; 514 struct mmu_update u;
600 515
@@ -607,17 +522,13 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
607 u.val = pud_val_ma(val); 522 u.val = pud_val_ma(val);
608 xen_extend_mmu_update(&u); 523 xen_extend_mmu_update(&u);
609 524
610 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
611
612 xen_mc_issue(PARAVIRT_LAZY_MMU); 525 xen_mc_issue(PARAVIRT_LAZY_MMU);
613 526
614 preempt_enable(); 527 preempt_enable();
615} 528}
616 529
617void xen_set_pud(pud_t *ptr, pud_t val) 530static void xen_set_pud(pud_t *ptr, pud_t val)
618{ 531{
619 ADD_STATS(pud_update, 1);
620
621 /* If page is not pinned, we can just update the entry 532 /* If page is not pinned, we can just update the entry
622 directly */ 533 directly */
623 if (!xen_page_pinned(ptr)) { 534 if (!xen_page_pinned(ptr)) {
@@ -625,56 +536,28 @@ void xen_set_pud(pud_t *ptr, pud_t val)
625 return; 536 return;
626 } 537 }
627 538
628 ADD_STATS(pud_update_pinned, 1);
629
630 xen_set_pud_hyper(ptr, val); 539 xen_set_pud_hyper(ptr, val);
631} 540}
632 541
633void xen_set_pte(pte_t *ptep, pte_t pte)
634{
635 if (xen_iomap_pte(pte)) {
636 xen_set_iomap_pte(ptep, pte);
637 return;
638 }
639
640 ADD_STATS(pte_update, 1);
641// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
642 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
643
644#ifdef CONFIG_X86_PAE 542#ifdef CONFIG_X86_PAE
645 ptep->pte_high = pte.pte_high; 543static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
646 smp_wmb();
647 ptep->pte_low = pte.pte_low;
648#else
649 *ptep = pte;
650#endif
651}
652
653#ifdef CONFIG_X86_PAE
654void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
655{ 544{
656 if (xen_iomap_pte(pte)) {
657 xen_set_iomap_pte(ptep, pte);
658 return;
659 }
660
661 set_64bit((u64 *)ptep, native_pte_val(pte)); 545 set_64bit((u64 *)ptep, native_pte_val(pte));
662} 546}
663 547
664void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 548static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
665{ 549{
666 ptep->pte_low = 0; 550 if (!xen_batched_set_pte(ptep, native_make_pte(0)))
667 smp_wmb(); /* make sure low gets written first */ 551 native_pte_clear(mm, addr, ptep);
668 ptep->pte_high = 0;
669} 552}
670 553
671void xen_pmd_clear(pmd_t *pmdp) 554static void xen_pmd_clear(pmd_t *pmdp)
672{ 555{
673 set_pmd(pmdp, __pmd(0)); 556 set_pmd(pmdp, __pmd(0));
674} 557}
675#endif /* CONFIG_X86_PAE */ 558#endif /* CONFIG_X86_PAE */
676 559
677pmd_t xen_make_pmd(pmdval_t pmd) 560static pmd_t xen_make_pmd(pmdval_t pmd)
678{ 561{
679 pmd = pte_pfn_to_mfn(pmd); 562 pmd = pte_pfn_to_mfn(pmd);
680 return native_make_pmd(pmd); 563 return native_make_pmd(pmd);
@@ -682,13 +565,13 @@ pmd_t xen_make_pmd(pmdval_t pmd)
682PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); 565PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
683 566
684#if PAGETABLE_LEVELS == 4 567#if PAGETABLE_LEVELS == 4
685pudval_t xen_pud_val(pud_t pud) 568static pudval_t xen_pud_val(pud_t pud)
686{ 569{
687 return pte_mfn_to_pfn(pud.pud); 570 return pte_mfn_to_pfn(pud.pud);
688} 571}
689PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); 572PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
690 573
691pud_t xen_make_pud(pudval_t pud) 574static pud_t xen_make_pud(pudval_t pud)
692{ 575{
693 pud = pte_pfn_to_mfn(pud); 576 pud = pte_pfn_to_mfn(pud);
694 577
@@ -696,7 +579,7 @@ pud_t xen_make_pud(pudval_t pud)
696} 579}
697PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); 580PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
698 581
699pgd_t *xen_get_user_pgd(pgd_t *pgd) 582static pgd_t *xen_get_user_pgd(pgd_t *pgd)
700{ 583{
701 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); 584 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
702 unsigned offset = pgd - pgd_page; 585 unsigned offset = pgd - pgd_page;
@@ -728,7 +611,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
728 * 2. It is always pinned 611 * 2. It is always pinned
729 * 3. It has no user pagetable attached to it 612 * 3. It has no user pagetable attached to it
730 */ 613 */
731void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) 614static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
732{ 615{
733 preempt_disable(); 616 preempt_disable();
734 617
@@ -741,12 +624,10 @@ void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
741 preempt_enable(); 624 preempt_enable();
742} 625}
743 626
744void xen_set_pgd(pgd_t *ptr, pgd_t val) 627static void xen_set_pgd(pgd_t *ptr, pgd_t val)
745{ 628{
746 pgd_t *user_ptr = xen_get_user_pgd(ptr); 629 pgd_t *user_ptr = xen_get_user_pgd(ptr);
747 630
748 ADD_STATS(pgd_update, 1);
749
750 /* If page is not pinned, we can just update the entry 631 /* If page is not pinned, we can just update the entry
751 directly */ 632 directly */
752 if (!xen_page_pinned(ptr)) { 633 if (!xen_page_pinned(ptr)) {
@@ -758,9 +639,6 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
758 return; 639 return;
759 } 640 }
760 641
761 ADD_STATS(pgd_update_pinned, 1);
762 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
763
764 /* If it's pinned, then we can at least batch the kernel and 642 /* If it's pinned, then we can at least batch the kernel and
765 user updates together. */ 643 user updates together. */
766 xen_mc_batch(); 644 xen_mc_batch();
@@ -1054,7 +932,7 @@ void xen_mm_pin_all(void)
1054 * that's before we have page structures to store the bits. So do all 932 * that's before we have page structures to store the bits. So do all
1055 * the book-keeping now. 933 * the book-keeping now.
1056 */ 934 */
1057static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, 935static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
1058 enum pt_level level) 936 enum pt_level level)
1059{ 937{
1060 SetPagePinned(page); 938 SetPagePinned(page);
@@ -1162,14 +1040,14 @@ void xen_mm_unpin_all(void)
1162 spin_unlock(&pgd_lock); 1040 spin_unlock(&pgd_lock);
1163} 1041}
1164 1042
1165void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1043static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1166{ 1044{
1167 spin_lock(&next->page_table_lock); 1045 spin_lock(&next->page_table_lock);
1168 xen_pgd_pin(next); 1046 xen_pgd_pin(next);
1169 spin_unlock(&next->page_table_lock); 1047 spin_unlock(&next->page_table_lock);
1170} 1048}
1171 1049
1172void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1050static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1173{ 1051{
1174 spin_lock(&mm->page_table_lock); 1052 spin_lock(&mm->page_table_lock);
1175 xen_pgd_pin(mm); 1053 xen_pgd_pin(mm);
@@ -1187,7 +1065,7 @@ static void drop_other_mm_ref(void *info)
1187 1065
1188 active_mm = percpu_read(cpu_tlbstate.active_mm); 1066 active_mm = percpu_read(cpu_tlbstate.active_mm);
1189 1067
1190 if (active_mm == mm) 1068 if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1191 leave_mm(smp_processor_id()); 1069 leave_mm(smp_processor_id());
1192 1070
1193 /* If this cpu still has a stale cr3 reference, then make sure 1071 /* If this cpu still has a stale cr3 reference, then make sure
@@ -1256,7 +1134,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1256 * pagetable because of lazy tlb flushing. This means we need to 1134 * pagetable because of lazy tlb flushing. This means we need to
1257 * switch all CPUs off this pagetable before we can unpin it. 1135 * switch all CPUs off this pagetable before we can unpin it.
1258 */ 1136 */
1259void xen_exit_mmap(struct mm_struct *mm) 1137static void xen_exit_mmap(struct mm_struct *mm)
1260{ 1138{
1261 get_cpu(); /* make sure we don't move around */ 1139 get_cpu(); /* make sure we don't move around */
1262 xen_drop_mm_ref(mm); 1140 xen_drop_mm_ref(mm);
@@ -1271,13 +1149,27 @@ void xen_exit_mmap(struct mm_struct *mm)
1271 spin_unlock(&mm->page_table_lock); 1149 spin_unlock(&mm->page_table_lock);
1272} 1150}
1273 1151
1274static __init void xen_pagetable_setup_start(pgd_t *base) 1152static void __init xen_pagetable_setup_start(pgd_t *base)
1275{ 1153{
1276} 1154}
1277 1155
1156static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1157{
1158 /* reserve the range used */
1159 native_pagetable_reserve(start, end);
1160
1161 /* set as RW the rest */
1162 printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
1163 PFN_PHYS(pgt_buf_top));
1164 while (end < PFN_PHYS(pgt_buf_top)) {
1165 make_lowmem_page_readwrite(__va(end));
1166 end += PAGE_SIZE;
1167 }
1168}
1169
1278static void xen_post_allocator_init(void); 1170static void xen_post_allocator_init(void);
1279 1171
1280static __init void xen_pagetable_setup_done(pgd_t *base) 1172static void __init xen_pagetable_setup_done(pgd_t *base)
1281{ 1173{
1282 xen_setup_shared_info(); 1174 xen_setup_shared_info();
1283 xen_post_allocator_init(); 1175 xen_post_allocator_init();
@@ -1473,16 +1365,20 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1473#endif 1365#endif
1474} 1366}
1475 1367
1476static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1477{
1478 unsigned long pfn = pte_pfn(pte);
1479
1480#ifdef CONFIG_X86_32 1368#ifdef CONFIG_X86_32
1369static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1370{
1481 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 1371 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1482 if (pte_val_ma(*ptep) & _PAGE_PRESENT) 1372 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1483 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & 1373 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1484 pte_val_ma(pte)); 1374 pte_val_ma(pte));
1485#endif 1375
1376 return pte;
1377}
1378#else /* CONFIG_X86_64 */
1379static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1380{
1381 unsigned long pfn = pte_pfn(pte);
1486 1382
1487 /* 1383 /*
1488 * If the new pfn is within the range of the newly allocated 1384 * If the new pfn is within the range of the newly allocated
@@ -1491,16 +1387,17 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1491 * it is RO. 1387 * it is RO.
1492 */ 1388 */
1493 if (((!is_early_ioremap_ptep(ptep) && 1389 if (((!is_early_ioremap_ptep(ptep) &&
1494 pfn >= pgt_buf_start && pfn < pgt_buf_end)) || 1390 pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
1495 (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) 1391 (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
1496 pte = pte_wrprotect(pte); 1392 pte = pte_wrprotect(pte);
1497 1393
1498 return pte; 1394 return pte;
1499} 1395}
1396#endif /* CONFIG_X86_64 */
1500 1397
1501/* Init-time set_pte while constructing initial pagetables, which 1398/* Init-time set_pte while constructing initial pagetables, which
1502 doesn't allow RO pagetable pages to be remapped RW */ 1399 doesn't allow RO pagetable pages to be remapped RW */
1503static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) 1400static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1504{ 1401{
1505 pte = mask_rw_pte(ptep, pte); 1402 pte = mask_rw_pte(ptep, pte);
1506 1403
@@ -1518,7 +1415,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1518 1415
1519/* Early in boot, while setting up the initial pagetable, assume 1416/* Early in boot, while setting up the initial pagetable, assume
1520 everything is pinned. */ 1417 everything is pinned. */
1521static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) 1418static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1522{ 1419{
1523#ifdef CONFIG_FLATMEM 1420#ifdef CONFIG_FLATMEM
1524 BUG_ON(mem_map); /* should only be used early */ 1421 BUG_ON(mem_map); /* should only be used early */
@@ -1528,7 +1425,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1528} 1425}
1529 1426
1530/* Used for pmd and pud */ 1427/* Used for pmd and pud */
1531static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) 1428static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1532{ 1429{
1533#ifdef CONFIG_FLATMEM 1430#ifdef CONFIG_FLATMEM
1534 BUG_ON(mem_map); /* should only be used early */ 1431 BUG_ON(mem_map); /* should only be used early */
@@ -1538,13 +1435,13 @@ static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1538 1435
1539/* Early release_pte assumes that all pts are pinned, since there's 1436/* Early release_pte assumes that all pts are pinned, since there's
1540 only init_mm and anything attached to that is pinned. */ 1437 only init_mm and anything attached to that is pinned. */
1541static __init void xen_release_pte_init(unsigned long pfn) 1438static void __init xen_release_pte_init(unsigned long pfn)
1542{ 1439{
1543 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 1440 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1544 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1441 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1545} 1442}
1546 1443
1547static __init void xen_release_pmd_init(unsigned long pfn) 1444static void __init xen_release_pmd_init(unsigned long pfn)
1548{ 1445{
1549 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1446 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1550} 1447}
@@ -1670,7 +1567,7 @@ static void set_page_prot(void *addr, pgprot_t prot)
1670 BUG(); 1567 BUG();
1671} 1568}
1672 1569
1673static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) 1570static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1674{ 1571{
1675 unsigned pmdidx, pteidx; 1572 unsigned pmdidx, pteidx;
1676 unsigned ident_pte; 1573 unsigned ident_pte;
@@ -1753,7 +1650,7 @@ static void convert_pfn_mfn(void *v)
1753 * of the physical mapping once some sort of allocator has been set 1650 * of the physical mapping once some sort of allocator has been set
1754 * up. 1651 * up.
1755 */ 1652 */
1756__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 1653pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1757 unsigned long max_pfn) 1654 unsigned long max_pfn)
1758{ 1655{
1759 pud_t *l3; 1656 pud_t *l3;
@@ -1824,7 +1721,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1824static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); 1721static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1825static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); 1722static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1826 1723
1827static __init void xen_write_cr3_init(unsigned long cr3) 1724static void __init xen_write_cr3_init(unsigned long cr3)
1828{ 1725{
1829 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); 1726 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1830 1727
@@ -1861,7 +1758,7 @@ static __init void xen_write_cr3_init(unsigned long cr3)
1861 pv_mmu_ops.write_cr3 = &xen_write_cr3; 1758 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1862} 1759}
1863 1760
1864__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 1761pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1865 unsigned long max_pfn) 1762 unsigned long max_pfn)
1866{ 1763{
1867 pmd_t *kernel_pmd; 1764 pmd_t *kernel_pmd;
@@ -1967,7 +1864,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1967#endif 1864#endif
1968} 1865}
1969 1866
1970__init void xen_ident_map_ISA(void) 1867void __init xen_ident_map_ISA(void)
1971{ 1868{
1972 unsigned long pa; 1869 unsigned long pa;
1973 1870
@@ -1990,7 +1887,7 @@ __init void xen_ident_map_ISA(void)
1990 xen_flush_tlb(); 1887 xen_flush_tlb();
1991} 1888}
1992 1889
1993static __init void xen_post_allocator_init(void) 1890static void __init xen_post_allocator_init(void)
1994{ 1891{
1995#ifdef CONFIG_XEN_DEBUG 1892#ifdef CONFIG_XEN_DEBUG
1996 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); 1893 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
@@ -2027,7 +1924,7 @@ static void xen_leave_lazy_mmu(void)
2027 preempt_enable(); 1924 preempt_enable();
2028} 1925}
2029 1926
2030static const struct pv_mmu_ops xen_mmu_ops __initdata = { 1927static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2031 .read_cr2 = xen_read_cr2, 1928 .read_cr2 = xen_read_cr2,
2032 .write_cr2 = xen_write_cr2, 1929 .write_cr2 = xen_write_cr2,
2033 1930
@@ -2100,6 +1997,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
2100 1997
2101void __init xen_init_mmu_ops(void) 1998void __init xen_init_mmu_ops(void)
2102{ 1999{
2000 x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
2103 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; 2001 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2104 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; 2002 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2105 pv_mmu_ops = xen_mmu_ops; 2003 pv_mmu_ops = xen_mmu_ops;
@@ -2351,7 +2249,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2351 struct remap_data *rmd = data; 2249 struct remap_data *rmd = data;
2352 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); 2250 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2353 2251
2354 rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; 2252 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2355 rmd->mmu_update->val = pte_val_ma(pte); 2253 rmd->mmu_update->val = pte_val_ma(pte);
2356 rmd->mmu_update++; 2254 rmd->mmu_update++;
2357 2255
@@ -2405,7 +2303,6 @@ out:
2405EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); 2303EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2406 2304
2407#ifdef CONFIG_XEN_DEBUG_FS 2305#ifdef CONFIG_XEN_DEBUG_FS
2408
2409static int p2m_dump_open(struct inode *inode, struct file *filp) 2306static int p2m_dump_open(struct inode *inode, struct file *filp)
2410{ 2307{
2411 return single_open(filp, p2m_dump_show, NULL); 2308 return single_open(filp, p2m_dump_show, NULL);
@@ -2417,65 +2314,4 @@ static const struct file_operations p2m_dump_fops = {
2417 .llseek = seq_lseek, 2314 .llseek = seq_lseek,
2418 .release = single_release, 2315 .release = single_release,
2419}; 2316};
2420 2317#endif /* CONFIG_XEN_DEBUG_FS */
2421static struct dentry *d_mmu_debug;
2422
2423static int __init xen_mmu_debugfs(void)
2424{
2425 struct dentry *d_xen = xen_init_debugfs();
2426
2427 if (d_xen == NULL)
2428 return -ENOMEM;
2429
2430 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2431
2432 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2433
2434 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2435 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2436 &mmu_stats.pgd_update_pinned);
2437 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2438 &mmu_stats.pgd_update_pinned);
2439
2440 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2441 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2442 &mmu_stats.pud_update_pinned);
2443 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2444 &mmu_stats.pud_update_pinned);
2445
2446 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2447 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2448 &mmu_stats.pmd_update_pinned);
2449 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2450 &mmu_stats.pmd_update_pinned);
2451
2452 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2453// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2454// &mmu_stats.pte_update_pinned);
2455 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2456 &mmu_stats.pte_update_pinned);
2457
2458 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2459 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2460 &mmu_stats.mmu_update_extended);
2461 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2462 mmu_stats.mmu_update_histo, 20);
2463
2464 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2465 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2466 &mmu_stats.set_pte_at_batched);
2467 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2468 &mmu_stats.set_pte_at_current);
2469 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2470 &mmu_stats.set_pte_at_kernel);
2471
2472 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2473 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2474 &mmu_stats.prot_commit_batched);
2475
2476 debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
2477 return 0;
2478}
2479fs_initcall(xen_mmu_debugfs);
2480
2481#endif /* CONFIG_XEN_DEBUG_FS */
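The mmu.c hunks above make the pv_mmu_ops helpers static, drop the ADD_STATS/debugfs MMU counters, and add xen_mapping_pagetable_reserve() behind the new x86_init.mapping.pagetable_reserve hook so the unused tail of the early page-table buffer (pgt_buf_end up to pgt_buf_top) is returned to read-write. A minimal sketch, assuming the generic x86 code calls the hook once the kernel mapping is built; the real call site is in arch/x86/mm/init.c and may differ in detail:

	/* Illustrative only: reserve what the page-table buffer actually
	 * consumed and let the per-platform hook handle the rest. Under
	 * Xen the override above also re-marks [pgt_buf_end, pgt_buf_top)
	 * as RW so those pages can be reused as normal memory. */
	if (pgt_buf_end > pgt_buf_start)
		x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
						   PFN_PHYS(pgt_buf_end));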
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 537bb9aab777..73809bb951b4 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -15,43 +15,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
15 15
16void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 16void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
17 17
18
19void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
20void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
21void xen_exit_mmap(struct mm_struct *mm);
22
23pteval_t xen_pte_val(pte_t);
24pmdval_t xen_pmd_val(pmd_t);
25pgdval_t xen_pgd_val(pgd_t);
26
27pte_t xen_make_pte(pteval_t);
28pmd_t xen_make_pmd(pmdval_t);
29pgd_t xen_make_pgd(pgdval_t);
30
31void xen_set_pte(pte_t *ptep, pte_t pteval);
32void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
33 pte_t *ptep, pte_t pteval);
34
35#ifdef CONFIG_X86_PAE
36void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
37void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
38void xen_pmd_clear(pmd_t *pmdp);
39#endif /* CONFIG_X86_PAE */
40
41void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
42void xen_set_pud(pud_t *ptr, pud_t val);
43void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
44void xen_set_pud_hyper(pud_t *ptr, pud_t val);
45
46#if PAGETABLE_LEVELS == 4
47pudval_t xen_pud_val(pud_t pud);
48pud_t xen_make_pud(pudval_t pudval);
49void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
50void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
51#endif
52
53pgd_t *xen_get_user_pgd(pgd_t *pgd);
54
55pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 18pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
56void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 19void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
57 pte_t *ptep, pte_t pte); 20 pte_t *ptep, pte_t pte);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 141eb0de8b06..58efeb9d5440 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -522,11 +522,20 @@ static bool __init __early_alloc_p2m(unsigned long pfn)
522 /* Boundary cross-over for the edges: */ 522 /* Boundary cross-over for the edges: */
523 if (idx) { 523 if (idx) {
524 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); 524 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
525 unsigned long *mid_mfn_p;
525 526
526 p2m_init(p2m); 527 p2m_init(p2m);
527 528
528 p2m_top[topidx][mididx] = p2m; 529 p2m_top[topidx][mididx] = p2m;
529 530
531 /* For save/restore we need the MFN of the P2M saved */
532
533 mid_mfn_p = p2m_top_mfn_p[topidx];
534 WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
535 "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
536 topidx, mididx);
537 mid_mfn_p[mididx] = virt_to_mfn(p2m);
538
530 } 539 }
531 return idx != 0; 540 return idx != 0;
532} 541}
@@ -549,12 +558,29 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
549 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) 558 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
550 { 559 {
551 unsigned topidx = p2m_top_index(pfn); 560 unsigned topidx = p2m_top_index(pfn);
552 if (p2m_top[topidx] == p2m_mid_missing) { 561 unsigned long *mid_mfn_p;
553 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); 562 unsigned long **mid;
563
564 mid = p2m_top[topidx];
565 mid_mfn_p = p2m_top_mfn_p[topidx];
566 if (mid == p2m_mid_missing) {
567 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
554 568
555 p2m_mid_init(mid); 569 p2m_mid_init(mid);
556 570
557 p2m_top[topidx] = mid; 571 p2m_top[topidx] = mid;
572
573 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
574 }
575 /* And the save/restore P2M tables.. */
576 if (mid_mfn_p == p2m_mid_missing_mfn) {
577 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
578 p2m_mid_mfn_init(mid_mfn_p);
579
580 p2m_top_mfn_p[topidx] = mid_mfn_p;
581 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
582 /* Note: we don't set mid_mfn_p[mididx] here,
583 * look in __early_alloc_p2m */
558 } 584 }
559 } 585 }
560 586
@@ -650,7 +676,7 @@ static unsigned long mfn_hash(unsigned long mfn)
650} 676}
651 677
652/* Add an MFN override for a particular page */ 678/* Add an MFN override for a particular page */
653int m2p_add_override(unsigned long mfn, struct page *page) 679int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
654{ 680{
655 unsigned long flags; 681 unsigned long flags;
656 unsigned long pfn; 682 unsigned long pfn;
@@ -662,7 +688,6 @@ int m2p_add_override(unsigned long mfn, struct page *page)
662 if (!PageHighMem(page)) { 688 if (!PageHighMem(page)) {
663 address = (unsigned long)__va(pfn << PAGE_SHIFT); 689 address = (unsigned long)__va(pfn << PAGE_SHIFT);
664 ptep = lookup_address(address, &level); 690 ptep = lookup_address(address, &level);
665
666 if (WARN(ptep == NULL || level != PG_LEVEL_4K, 691 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
667 "m2p_add_override: pfn %lx not mapped", pfn)) 692 "m2p_add_override: pfn %lx not mapped", pfn))
668 return -EINVAL; 693 return -EINVAL;
@@ -674,18 +699,17 @@ int m2p_add_override(unsigned long mfn, struct page *page)
674 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) 699 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
675 return -ENOMEM; 700 return -ENOMEM;
676 701
677 if (!PageHighMem(page)) 702 if (clear_pte && !PageHighMem(page))
678 /* Just zap old mapping for now */ 703 /* Just zap old mapping for now */
679 pte_clear(&init_mm, address, ptep); 704 pte_clear(&init_mm, address, ptep);
680
681 spin_lock_irqsave(&m2p_override_lock, flags); 705 spin_lock_irqsave(&m2p_override_lock, flags);
682 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); 706 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
683 spin_unlock_irqrestore(&m2p_override_lock, flags); 707 spin_unlock_irqrestore(&m2p_override_lock, flags);
684 708
685 return 0; 709 return 0;
686} 710}
687 711EXPORT_SYMBOL_GPL(m2p_add_override);
688int m2p_remove_override(struct page *page) 712int m2p_remove_override(struct page *page, bool clear_pte)
689{ 713{
690 unsigned long flags; 714 unsigned long flags;
691 unsigned long mfn; 715 unsigned long mfn;
@@ -713,7 +737,7 @@ int m2p_remove_override(struct page *page)
713 spin_unlock_irqrestore(&m2p_override_lock, flags); 737 spin_unlock_irqrestore(&m2p_override_lock, flags);
714 set_phys_to_machine(pfn, page->index); 738 set_phys_to_machine(pfn, page->index);
715 739
716 if (!PageHighMem(page)) 740 if (clear_pte && !PageHighMem(page))
717 set_pte_at(&init_mm, address, ptep, 741 set_pte_at(&init_mm, address, ptep,
718 pfn_pte(pfn, PAGE_KERNEL)); 742 pfn_pte(pfn, PAGE_KERNEL));
719 /* No tlb flush necessary because the caller already 743 /* No tlb flush necessary because the caller already
@@ -721,6 +745,7 @@ int m2p_remove_override(struct page *page)
721 745
722 return 0; 746 return 0;
723} 747}
748EXPORT_SYMBOL_GPL(m2p_remove_override);
724 749
725struct page *m2p_find_override(unsigned long mfn) 750struct page *m2p_find_override(unsigned long mfn)
726{ 751{
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index bfd0632fe65e..b480d4207a4c 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -36,7 +36,7 @@ int __init pci_xen_swiotlb_detect(void)
36 36
37 /* If running as PV guest, either iommu=soft, or swiotlb=force will 37 /* If running as PV guest, either iommu=soft, or swiotlb=force will
38 * activate this IOMMU. If running as PV privileged, activate it 38 * activate this IOMMU. If running as PV privileged, activate it
39 * irregardlesss. 39 * irregardless.
40 */ 40 */
41 if ((xen_initial_domain() || swiotlb || swiotlb_force) && 41 if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
42 (xen_pv_domain())) 42 (xen_pv_domain()))
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index fa0269a99377..be1a464f6d66 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -50,7 +50,7 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
50 */ 50 */
51#define EXTRA_MEM_RATIO (10) 51#define EXTRA_MEM_RATIO (10)
52 52
53static __init void xen_add_extra_mem(unsigned long pages) 53static void __init xen_add_extra_mem(unsigned long pages)
54{ 54{
55 unsigned long pfn; 55 unsigned long pfn;
56 56
@@ -166,7 +166,7 @@ static unsigned long __init xen_set_identity(const struct e820entry *list,
166 if (last > end) 166 if (last > end)
167 continue; 167 continue;
168 168
169 if (entry->type == E820_RAM) { 169 if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
170 if (start > start_pci) 170 if (start > start_pci)
171 identity += set_phys_range_identity( 171 identity += set_phys_range_identity(
172 PFN_UP(start_pci), PFN_DOWN(start)); 172 PFN_UP(start_pci), PFN_DOWN(start));
@@ -227,7 +227,11 @@ char * __init xen_memory_setup(void)
227 227
228 memcpy(map_raw, map, sizeof(map)); 228 memcpy(map_raw, map, sizeof(map));
229 e820.nr_map = 0; 229 e820.nr_map = 0;
230#ifdef CONFIG_X86_32
230 xen_extra_mem_start = mem_end; 231 xen_extra_mem_start = mem_end;
232#else
233 xen_extra_mem_start = max((1ULL << 32), mem_end);
234#endif
231 for (i = 0; i < memmap.nr_entries; i++) { 235 for (i = 0; i < memmap.nr_entries; i++) {
232 unsigned long long end; 236 unsigned long long end;
233 237
@@ -336,7 +340,7 @@ static void __init fiddle_vdso(void)
336#endif 340#endif
337} 341}
338 342
339static __cpuinit int register_callback(unsigned type, const void *func) 343static int __cpuinit register_callback(unsigned type, const void *func)
340{ 344{
341 struct callback_register callback = { 345 struct callback_register callback = {
342 .type = type, 346 .type = type,
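The setup.c hunks treat E820_UNUSABLE regions like RAM when building the 1:1 P2M ranges and, on 64-bit, start the extra (balloon) memory region no lower than the 4 GiB boundary. A small worked example of that clamp with assumed values, not taken from the patch:

	/* A domain with 2 GiB of populated RAM ends well below 4 GiB, so
	 * the extra region is pushed up to 0x100000000 and stays clear of
	 * the legacy 32-bit PCI/MMIO hole. */
	unsigned long long mem_end     = 2ULL << 30;                /* 0x80000000  */
	unsigned long long extra_start = max(1ULL << 32, mem_end);  /* 0x100000000 */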
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 30612441ed99..41038c01de40 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -46,18 +46,17 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
47 47
48/* 48/*
49 * Reschedule call back. Nothing to do, 49 * Reschedule call back.
50 * all the work is done automatically when
51 * we return from the interrupt.
52 */ 50 */
53static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
54{ 52{
55 inc_irq_stat(irq_resched_count); 53 inc_irq_stat(irq_resched_count);
54 scheduler_ipi();
56 55
57 return IRQ_HANDLED; 56 return IRQ_HANDLED;
58} 57}
59 58
60static __cpuinit void cpu_bringup(void) 59static void __cpuinit cpu_bringup(void)
61{ 60{
62 int cpu = smp_processor_id(); 61 int cpu = smp_processor_id();
63 62
@@ -85,7 +84,7 @@ static __cpuinit void cpu_bringup(void)
85 wmb(); /* make sure everything is out */ 84 wmb(); /* make sure everything is out */
86} 85}
87 86
88static __cpuinit void cpu_bringup_and_idle(void) 87static void __cpuinit cpu_bringup_and_idle(void)
89{ 88{
90 cpu_bringup(); 89 cpu_bringup();
91 cpu_idle(); 90 cpu_idle();
@@ -242,7 +241,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
242 } 241 }
243} 242}
244 243
245static __cpuinit int 244static int __cpuinit
246cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 245cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
247{ 246{
248 struct vcpu_guest_context *ctxt; 247 struct vcpu_guest_context *ctxt;
@@ -486,7 +485,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
486 return IRQ_HANDLED; 485 return IRQ_HANDLED;
487} 486}
488 487
489static const struct smp_ops xen_smp_ops __initdata = { 488static const struct smp_ops xen_smp_ops __initconst = {
490 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 489 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
491 .smp_prepare_cpus = xen_smp_prepare_cpus, 490 .smp_prepare_cpus = xen_smp_prepare_cpus,
492 .smp_cpus_done = xen_smp_cpus_done, 491 .smp_cpus_done = xen_smp_cpus_done,
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 2e2d370a47b1..5158c505bef9 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -26,8 +26,6 @@
26 26
27#include "xen-ops.h" 27#include "xen-ops.h"
28 28
29#define XEN_SHIFT 22
30
31/* Xen may fire a timer up to this many ns early */ 29/* Xen may fire a timer up to this many ns early */
32#define TIMER_SLOP 100000 30#define TIMER_SLOP 100000
33#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
@@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = {
211 .rating = 400, 209 .rating = 400,
212 .read = xen_clocksource_get_cycles, 210 .read = xen_clocksource_get_cycles,
213 .mask = ~0, 211 .mask = ~0,
214 .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
215 .shift = XEN_SHIFT,
216 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 212 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
217}; 213};
218 214
@@ -439,16 +435,16 @@ void xen_timer_resume(void)
439 } 435 }
440} 436}
441 437
442static const struct pv_time_ops xen_time_ops __initdata = { 438static const struct pv_time_ops xen_time_ops __initconst = {
443 .sched_clock = xen_clocksource_read, 439 .sched_clock = xen_clocksource_read,
444}; 440};
445 441
446static __init void xen_time_init(void) 442static void __init xen_time_init(void)
447{ 443{
448 int cpu = smp_processor_id(); 444 int cpu = smp_processor_id();
449 struct timespec tp; 445 struct timespec tp;
450 446
451 clocksource_register(&xen_clocksource); 447 clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
452 448
453 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { 449 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
454 /* Successfully turned off 100Hz tick, so we have the 450 /* Successfully turned off 100Hz tick, so we have the
@@ -468,7 +464,7 @@ static __init void xen_time_init(void)
468 xen_setup_cpu_clockevents(); 464 xen_setup_cpu_clockevents();
469} 465}
470 466
471__init void xen_init_time_ops(void) 467void __init xen_init_time_ops(void)
472{ 468{
473 pv_time_ops = xen_time_ops; 469 pv_time_ops = xen_time_ops;
474 470
@@ -490,7 +486,7 @@ static void xen_hvm_setup_cpu_clockevents(void)
490 xen_setup_cpu_clockevents(); 486 xen_setup_cpu_clockevents();
491} 487}
492 488
493__init void xen_hvm_init_time_ops(void) 489void __init xen_hvm_init_time_ops(void)
494{ 490{
495 /* vector callback is needed otherwise we cannot receive interrupts 491 /* vector callback is needed otherwise we cannot receive interrupts
496 * on cpu > 0 and at this point we don't know how many cpus are 492 * on cpu > 0 and at this point we don't know how many cpus are
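The time.c hunks drop the hand-coded XEN_SHIFT mult/shift pair and register the clocksource with clocksource_register_hz(); because the Xen clocksource already counts nanoseconds, the stated frequency is NSEC_PER_SEC. A hedged sketch of the scaling the clocksource core derives on the caller's behalf (the exact helper arguments, such as the conversion range, may differ between kernel versions):

	u32 mult, shift;

	/* ns = (cycles * mult) >> shift; with a 1 GHz "counter" this is an
	 * identity-like scaling, which is what 1 << XEN_SHIFT encoded by
	 * hand before. The 600-second range here is illustrative. */
	clocks_calc_mult_shift(&mult, &shift, NSEC_PER_SEC, NSEC_PER_SEC, 600);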
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 3112f55638c4..97dfdc8757b3 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -74,7 +74,7 @@ static inline void xen_hvm_smp_init(void) {}
74 74
75#ifdef CONFIG_PARAVIRT_SPINLOCKS 75#ifdef CONFIG_PARAVIRT_SPINLOCKS
76void __init xen_init_spinlocks(void); 76void __init xen_init_spinlocks(void);
77__cpuinit void xen_init_lock_cpu(int cpu); 77void __cpuinit xen_init_lock_cpu(int cpu);
78void xen_uninit_lock_cpu(int cpu); 78void xen_uninit_lock_cpu(int cpu);
79#else 79#else
80static inline void xen_init_spinlocks(void) 80static inline void xen_init_spinlocks(void)